In [None]:
# default_exp core

# 00_Core

> This module contains helper functions for preparing the data

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastai.tabular.all import *

We'll use the `ADULT_SAMPLE` dataset for all of our examples. The goal with this dataset is to predict whether an individual makes above or below $50,000.

In [None]:
# Fetch the ADULT_SAMPLE dataset and read its `adult.csv` into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

Next let's name our variables

In [None]:
# Target column to predict
dep_var = 'salary'
# Feature columns, split into categorical and continuous groups
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# Preprocessing steps handed to `TabularPandas` in the next cell
procs = [Categorify, FillMissing, Normalize]

Then decide how we want to split the data and build our `DataLoaders`!

In [None]:
# Hold out rows 800-999 as the validation set
splits = IndexSplitter(list(range(800,1000)))(range_of(df))
# Reuse `dep_var` instead of repeating the target column name as a literal
to = TabularPandas(df, procs, cat_names, cont_names, y_names=dep_var, splits=splits)
dls = to.dataloaders()

Now let's train an initial model to use

In [None]:
# Build a tabular model (`layers=[200,100]`) that reports accuracy,
# then train it briefly: one epoch at a learning rate of 1e-2
learn = tabular_learner(dls, layers=[200,100], metrics=accuracy)
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.360043,0.461972,0.8,00:07


# _prepare_data
> Prepares test data in the format `SHAP` expects. You can pass in a `DataFrame` or a `TabDataLoader`. If nothing is passed in, it defaults to your validation data, randomly sampled down to `n_samples` rows

In [None]:
#export
def _prepare_data(learn:Learner, test_data=None, n_samples:int=128):
  "Prepares train and test data for `SHAP`, pass in a learner with optional data"
  no_user_provided_test_data = test_data is None
  if isinstance(test_data, pd.DataFrame):
    dl = learn.dls.test_dl(test_data)
  elif isinstance(test_data, TabDataLoader):
    dl = test_data
  elif test_data is None:
    try:
      dl = learn.dls[1]
    except IndexError:
      print('No validation dataloader, using `train`')
      dl = learn.dls[0]
  else:
    raise ValueError('Input is not supported. Please use either a `DataFrame` or `TabularDataLoader`')
  test_data = pd.merge(dl.cats, dl.conts, left_index=True, right_index=True)
  return test_data.sample(n=n_samples) if ((len(test_data) > n_samples) and no_user_provided_test_data) else test_data

First, an example with a `DataFrame`

In [None]:
# Raw rows go in; `_prepare_data` turns them into a test `DataLoader` via `learn.dls.test_dl`
X_test = _prepare_data(learn, df.iloc[:100])
X_test.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,age,fnlwgt,education-num
0,5,8,3,0,6,5,1,1,1,0.763259,-0.83812,0.751095
1,5,13,1,5,2,5,1,1,1,0.396758,0.445856,1.533399
2,5,12,1,0,5,3,1,1,2,-0.043043,-0.886805,-0.031209
3,6,15,3,11,1,2,1,1,1,-0.043043,-0.728833,1.924551
4,7,6,3,9,6,3,1,1,2,0.250158,-1.018478,-0.031209


In [None]:
#hide
# User-supplied data is never sub-sampled, so all 100 rows come back
test_eq(len(X_test), 100)

We can see that we are now working with the transformed data! Now let's do the same with a `DataLoader`

In [None]:
# Build the test `DataLoader` ourselves this time and hand it over as-is
dl = learn.dls.test_dl(df.iloc[:100])
X_test = _prepare_data(learn, dl)
X_test.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,age,fnlwgt,education-num
0,5,8,3,0,6,5,1,1,1,0.763259,-0.83812,0.751095
1,5,13,1,5,2,5,1,1,1,0.396758,0.445856,1.533399
2,5,12,1,0,5,3,1,1,2,-0.043043,-0.886805,-0.031209
3,6,15,3,11,1,2,1,1,1,-0.043043,-0.728833,1.924551
4,7,6,3,9,6,3,1,1,2,0.250158,-1.018478,-0.031209


In [None]:
#hide
# A pre-built `DataLoader` is not sub-sampled either: still 100 rows
test_eq(len(X_test), 100)

And finally, with no data passed in at all

In [None]:
# No data given: the validation `DataLoader` is used and a random sample of
# `n_samples` (128 by default) rows is drawn, so the rows shown vary between runs
X_test = _prepare_data(learn)
X_test.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,age,fnlwgt,education-num
859,8,16,5,2,5,3,1,1,1,-1.215847,0.407894,-0.031209
866,5,16,3,4,1,5,1,1,2,-1.069247,0.096357,-0.031209
997,7,1,4,0,2,5,1,1,2,2.229264,-0.922008,-0.031209
911,5,7,5,0,2,5,1,1,1,-0.556145,-0.551339,-1.986969
871,7,10,3,0,1,5,1,1,1,0.103557,0.860053,1.142247


In [None]:
#hide
# The validation split holds 200 rows (indices 800-999), more than the default `n_samples` of 128
test_eq(len(X_test), 128)

In [None]:
#hide
# Unsupported input types must be rejected with a `ValueError`
test_fail(lambda: _prepare_data(learn, 'test'), contains='Input is not supported')

# _predict

Now we need to grab predictions based on what `SHAP` throws back. This is a basic function you can use to get your predictions. We can't include it in the library as we need access to your current `Learner`.

In [None]:
#export
def _predict(learn:TabularLearner, data:np.array):
  """Predict function for some data on a fastai model

  `data` is a 2D array whose first columns are the categorical variables and
  whose last columns are the continuous ones; the column counts are taken from
  the training dataset's `cat_names` and `cont_names`.

  Returns the model output as a numpy array. No activation is applied here,
  so the values are whatever `learn.model` emits.

  Side effect: `learn.model` is moved to the GPU when one is available. The
  model is run in eval mode and its previous train/eval mode is restored after.
  """
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = learn.model.to(device)
  train_ds = learn.dls[0].dataset
  n_cat, n_cont = len(train_ds.cat_names), len(train_ds.cont_names)
  # Slice from an explicit start: with zero continuous columns `data[:, -0:]`
  # would select every column instead of none
  cont_start = data.shape[1] - n_cont
  x_cat = torch.from_numpy(data[:, :n_cat]).to(device, torch.int64)
  x_cont = torch.from_numpy(data[:, cont_start:]).to(device, torch.float32)
  # Run in eval mode so dropout/batchnorm layers do not behave as in training
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      pred_probs = model(x_cat, x_cont).cpu().numpy()
  finally:
    model.train(was_training)
  return pred_probs

`SHAP` will expect a numpy array for our data, so let's work with that and get some predictions!

In [None]:
# Take the first five prepared rows as a plain array (categorical columns first, continuous last)
data = X_test.iloc[:5].to_numpy()
pred_probs = _predict(learn, data)

In [None]:
#hide
# One row of model outputs per input row, with two output columns here
test_eq(pred_probs.shape, (5,2))

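Let's take a look at those predictions. Note that these are the raw model outputs rather than probabilities, since `_predict` applies no activation to them.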

In [None]:
pred_probs.shape

(5, 2)

In [None]:
pred_probs

array([[-0.2988554,  0.5680708],
       [ 1.0022082, -2.168028 ],
       [ 1.0071597, -3.1777158],
       [ 0.694771 , -1.3450477],
       [ 0.6915872, -0.1694549]], dtype=float32)

And now we can do whatever we need to!