# Create a Learner for inference

In [1]:
from fastai.gen_doc.nbdoc import *

In this tutorial, we'll see how the same API allows you to create an empty [`DataBunch`](/basic_data.html#DataBunch) for a [`Learner`](/basic_train.html#Learner) at inference time (once you have trained your model) and how to call the `predict` method to get the predictions on a single item.

In [2]:
jekyll_note("""As usual, this page is generated from a notebook that you can find in the <code>docs_src</code> folder of the
<a href="https://github.com/fastai/fastai">fastai repo</a>. We use the saved models from <a href="/tutorial.data.html">this tutorial</a> to
have this notebook run quickly.""")

<div markdown="span" class="alert alert-info" role="alert"><i class="fa fa-info-circle"></i> <b>Note: </b>As usual, this page is generated from a notebook that you can find in the <code>docs_src</code> folder of the
<a href="https://github.com/fastai/fastai">fastai repo</a>. We use the saved models from <a href="/tutorial.data.html">this tutorial</a> to
have this notebook run quickly.</div>

## Tabular

Last application brings us to tabular data. First let's import everything we'll need.

In [3]:
from fastai.tabular import *

We'll use a sample of the [adult dataset](https://archive.ics.uci.edu/ml/datasets/adult) here. Once we read the csv file, we'll need to specify the dependant variable, the categorical variables, the continuous variables and the processors we want to use.

In [4]:
adult = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(adult/'adult.csv')


In [5]:
def train_test_split(df, train_percent=.8, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    train = df.loc[perm[:train_end]]
    test = df.loc[perm[train_end:]]
    return train, test

In [45]:
df = df.reset_index(drop=True)

In [46]:
df.head(n=2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,>=50k
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,1
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,1


### Create a train and test dataframe

In [47]:
np.random.seed(42)
train_df, test_df = train_test_split(df)

In [49]:
train_df.head(n=2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,>=50k
22286,37,Self-emp-not-inc,184655,HS-grad,9.0,Never-married,Craft-repair,Not-in-family,White,Male,0,0,35,United-States,0
21040,36,Private,156400,Some-college,10.0,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0


In [123]:
idxs=list(train_df.index.values)
sorted(idxs)[:10]


[0, 1, 2, 4, 5, 6, 7, 8, 10, 11]

In [124]:
#20% for validation set
valid_idx = random.sample(idxs, int(len(train_df)*0.2))

In [128]:
sorted(valid_idx)[:10]

[18, 25, 27, 28, 56, 70, 88, 98, 99, 101]

In [125]:
assert set(valid_idx).issubset(idxs)

In [126]:
#get the items not in valid_idx
train_set=list(set(valid_idx).symmetric_difference(idxs))

In [127]:
sorted(train_set)[:10]

[0, 1, 2, 4, 5, 6, 7, 8, 10, 11]

#### caution needed with using train dataframe without consecutively numbered indexes 

- will get an error when create TabularDataBunch.from_df unless reset index for training dataframe

In [129]:
#fastai methodology as per fastai.data_block.ItemList.split_by_idx()
train_idx= list(np.setdiff1d(arange_of(idxs), valid_idx))

In [131]:
len(train_idx),len(train_set),len(valid_idx),len(idxs), len(train_idx)-len(train_set)

(21859, 20839, 5209, 26048, 1020)

In [111]:
#items in fastai train set - but not in dataframe index:
difference=list(set(train_idx).symmetric_difference(set(train_set)))
len(difference)

9463

These indexes are not actually contained in the original dataframe

In [132]:
difference[0] in idxs

False

### Reset train_df index for fastai

In [138]:
train_df = train_df.reset_index(drop=True)
idxs=list(train_df.index.values)

In [139]:
valid_idx = random.sample(idxs, int(len(train_df)*0.2))

In [140]:
train_idx= list(np.setdiff1d(arange_of(idxs), valid_idx))

In [144]:
assert (len(train_idx)+len(valid_idx)) == len(idxs)

In [146]:
#### now we have the correct number of indexes in our training data

In [147]:
dep_var = '>=50k'


In [148]:
def unique_deps(x:Series)->List:
    od = OrderedDict.fromkeys(x)
    res = list(OrderedDict.fromkeys(x).keys())
    res.sort()
    return res, od

In [149]:
classes, od =unique_deps(df[dep_var].values)

In [150]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
cont_names = ['education-num', 'hours-per-week', 'age', 'capital-loss', 'fnlwgt', 'capital-gain']
procs = [FillMissing, Categorify, Normalize]

Then we can use the data block API to grab everything together before using `data.show_batch()`

In [153]:
data = TabularDataBunch.from_df( path=adult, df=train_df, dep_var=dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names, cont_names=cont_names, test_df=test_df)


--label_from_df() cols: >=50k
--label_from_df() cols: >=50k


We define a [`Learner`](/basic_train.html#Learner) object that we fit and then save the model.

In [154]:
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit(1, 1e-2)
learn.save('mini_train')

epoch,train_loss,valid_loss,accuracy
1,0.342515,0.323958,0.848147


In [155]:
preds, y = learn.get_preds(DatasetType.Test)

In [156]:
probs = np.exp(preds)

#### Is the code below OK?

- ie are the predictions we get in cell above in originial order of the test_df?

In [157]:
indexes=list(test_df.index.values)

In [158]:
#get classes
d = {}
p = {}
for indx, prob in zip(indexes, probs):
    max_idx = np.argmax(prob)
    max_val = prob[max_idx].item()
    p[indx] = max_val
    prob_c = classes[max_idx]
    d[indx] = prob_c

In [160]:
df_preds=pd.DataFrame([d, p])
df_preds=df_preds.T

In [163]:
df_preds.rename(columns={0: dep_var, 1: 'Probability'}, inplace=True)

In [164]:
df_preds.head(n=2)

Unnamed: 0,>=50k,Probability
3,1.0,2.313252
9,1.0,2.694515
