In [1]:
from fastai2.tabular.all import *

In [2]:
# We use the Adult dataset sample: the task is to predict whether a person
# earns more or less than $50k per year from some general data about them.
# `path` is the dataset directory returned by untar_data.
path = untar_data(URLs.ADULT_SAMPLE)

In [3]:
# List the contents of the dataset directory.
path.ls()

(#3) [Path('C:/Users/Archel/.fastai/data/adult_sample/adult.csv'),Path('C:/Users/Archel/.fastai/data/adult_sample/export.pkl'),Path('C:/Users/Archel/.fastai/data/adult_sample/models')]

In [5]:
# Load the raw CSV into a pandas DataFrame and preview its first rows.
df = pd.read_csv(path/'adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [6]:
# Build the DataLoaders straight from the CSV file, predicting 'salary'.
# The preprocessing steps listed in `procs` are:
#   - Categorify: take every categorical variable, build a map from integers
#     to its unique categories, then replace the values by the corresponding index.
#   - FillMissing: fill the missing values in the continuous variables with the
#     median of the existing values (a specific value can be chosen instead).
#   - Normalize: normalize the continuous variables (subtract the mean and
#     divide by the std).
# NOTE: `dls` is rebound further down from the TabularPandas object.
dls = TabularDataLoaders.from_csv(path/'adult.csv', path=path, y_names='salary',
                                 cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race'], # categorical variables
                                 cont_names = ['age', 'fnlwgt', 'education-num'], # continuous variables
                                 procs = [Categorify, FillMissing, Normalize])

In [7]:
# Define the train/validation split: hold out a random 20% of the rows of `df`
# for validation. The fixed seed makes the split (and everything computed from
# it) reproducible when the notebook is re-run on a fresh kernel.
splits = RandomSplitter(valid_pct=.2, seed=42)(range_of(df))

In [8]:
# The same preprocessing expressed through the TabularPandas class, which lets
# us pass the explicit train/validation `splits`.
cat_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_cols = ['age', 'fnlwgt', 'education-num']
to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_cols,
    cont_names=cont_cols,
    y_names='salary',
    splits=splits,
)

In [9]:
# Inspect the first two rows of the processed inputs held by `to`.
to.xs.iloc[:2]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
14979,8,16,3,9,1,5,1,-0.115734,-0.024419,-0.028953
2244,1,2,5,1,4,5,1,-1.581914,-1.037998,-1.205548


In [10]:
# Build the DataLoaders again, this time from the TabularPandas object and
# with a batch size of 64 (this rebinds `dls`).
dls = to.dataloaders(bs=64)

In [11]:
# Display a sample batch from the DataLoaders.
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,Assoc-acdm,Divorced,Craft-repair,Unmarried,White,False,29.0,236937.999495,12.0,<50k
1,Self-emp-inc,HS-grad,Divorced,Adm-clerical,Unmarried,White,False,30.0,224497.999283,9.0,<50k
2,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,34.0,202450.000084,9.0,<50k
3,Local-gov,Bachelors,Divorced,Other-service,Not-in-family,White,False,61.0,34631.99648,13.0,<50k
4,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,False,41.0,182108.000111,14.0,>=50k
5,Private,HS-grad,Never-married,Adm-clerical,Not-in-family,Black,False,27.0,401723.000557,9.0,<50k
6,Private,HS-grad,Separated,Tech-support,Not-in-family,White,False,33.0,243674.00059,9.0,<50k
7,Private,HS-grad,Divorced,Transport-moving,Not-in-family,White,False,47.0,192053.000078,9.0,<50k
8,Private,Bachelors,Married-civ-spouse,#na#,Husband,White,False,39.0,269721.997942,13.0,<50k
9,Private,Bachelors,Never-married,Adm-clerical,Own-child,White,False,24.0,96178.000196,13.0,<50k


In [12]:
# Create a tabular learner on `dls`, reporting accuracy as the metric.
learn = tabular_learner(dls, metrics=accuracy)

In [13]:
# learn.fine_tune is not applicable here since there is no pretrained model;
# train for a single epoch with fit_one_cycle instead.
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,accuracy,time
0,0.359015,0.356381,0.835381,00:05


In [14]:
# Show a few rows with the target next to the model's prediction.
learn.show_results()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary,salary_pred
0,7.0,10.0,4.0,5.0,2.0,5.0,1.0,-0.555588,-0.263346,1.147642,0.0,0.0
1,5.0,12.0,3.0,11.0,1.0,5.0,1.0,2.303463,-0.945895,-0.421151,0.0,0.0
2,6.0,12.0,3.0,13.0,1.0,5.0,1.0,1.277137,0.183074,-0.421151,1.0,1.0
3,5.0,12.0,6.0,15.0,2.0,2.0,1.0,-0.335661,-0.832811,-0.421151,0.0,0.0
4,5.0,16.0,3.0,4.0,1.0,3.0,1.0,-0.262352,0.721141,-0.028953,1.0,0.0
5,5.0,12.0,1.0,2.0,3.0,3.0,1.0,1.350446,-0.163313,-0.421151,0.0,0.0
6,5.0,7.0,6.0,8.0,5.0,5.0,1.0,-0.482279,-0.842742,-1.989945,0.0,0.0
7,5.0,12.0,3.0,11.0,1.0,5.0,1.0,0.837283,-0.321881,-0.421151,1.0,0.0
8,7.0,12.0,3.0,13.0,1.0,5.0,1.0,1.570373,-0.79099,-0.421151,1.0,0.0


In [15]:
# Run the model on a single row (the first row of `df`) and unpack the three
# values returned by predict.
row, cl, probs = learn.predict(df.iloc[0])

In [16]:
# Display the predicted class and the per-class probabilities for that row.
cl, probs

(tensor(0), tensor([0.5486, 0.4514]))

In [17]:
# To get predictions on a new dataset, use the test_dl method of the DataLoaders.
# The DataFrame passed in does not need to contain the dependent variable.
# drop() returns a new DataFrame, so `df` itself is left untouched and no
# explicit copy or in-place mutation is needed.
test_df = df.drop(columns=['salary'])
dl = learn.dls.test_dl(test_df)

In [18]:
# Predict on the test DataLoader. The result is a pair whose second element is
# None here — presumably because test_df carries no 'salary' targets.
learn.get_preds(dl=dl)

(tensor([[0.5486, 0.4514],
         [0.3945, 0.6055],
         [0.9765, 0.0235],
         ...,
         [0.6647, 0.3353],
         [0.7085, 0.2915],
         [0.6510, 0.3490]]),
 None)

In [19]:
# Export the processed data as features plus flattened label arrays so that
# classifiers from other libraries can be trained on it.
# Note: the "test" variables hold the validation split of `to`.
X_train = to.train.xs
y_train = to.train.ys.values.ravel()
X_test = to.valid.xs
y_test = to.valid.ys.values.ravel()

In [33]:
# Sanity check: flattening the labels with ravel() gives the same shape as
# selecting the first column with [:,0].
to.train.ys.values.ravel().shape == to.train.ys.values[:,0].shape

True