# Tabular models

In [1]:
from fastai.tabular import *
from pathlib import Path
from sklearn.metrics import cohen_kappa_score

In [2]:
def sklearn_qwk(y_true, y_pred) -> np.float64:
    """
    Function for measuring Quadratic Weighted Kappa with scikit-learn
    
    :param y_true: The ground truth labels
    :param y_pred: Our predicted labels
    
    :return The Quadratic Weighted Kappa Score (QWK)
    """
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")

Tabular data should be in a Pandas `DataFrame`.

In [3]:
path = Path('/home/jupyter/tutorials/data/data-science-bowl-2019')

specs = pd.read_csv(path/'specs.csv')
sample_submission = pd.read_csv(path/'sample_submission.csv')
test = pd.read_csv(path/'test.csv')
train = pd.read_csv(path/'train.csv')
train_labels = pd.read_csv(path/'train_labels.csv')

The outcomes in this competition are grouped into 4 groups (labeled accuracy_group in the data):

    3: the assessment was solved on the first attempt
    2: the assessment was solved on the second attempt
    1: the assessment was solved after 3 or more attempts
    0: the assessment was never solved

In [4]:
df_summary = train_labels.groupby('accuracy_group').count()
df_summary.game_session.nlargest(10)

accuracy_group
3    8845
0    4229
1    2411
2    2205
Name: game_session, dtype: int64

In [5]:
sample_submission

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3
...,...,...
995,fee254cf,3
996,ff57e602,3
997,ffc73fb2,3
998,ffe00ca8,3


The groups are imbalanced as approximately 1/2 of the activities were solved on the first attempt. Approximately 1/4 were never solved whilst the remained are evenly balanced between those taking 1 attempt and those taking 2.

In [6]:
train_labels.title.unique()

array(['Mushroom Sorter (Assessment)', 'Bird Measurer (Assessment)', 'Cauldron Filler (Assessment)',
       'Chest Sorter (Assessment)', 'Cart Balancer (Assessment)'], dtype=object)

In [7]:
train_labels = train_labels[['game_session','accuracy_group']]
train_labels 

Unnamed: 0,game_session,accuracy_group
0,6bdf9623adc94d89,3
1,77b8ee947eb84b4e,0
2,901acc108f55a5a1,3
3,9501794defd84e4d,2
4,a9ef3ecb3d1acc6a,3
...,...,...
17685,c996482b11d149dd,3
17686,b05a02b52d5c1f4c,3
17687,5448d652309a6324,1
17688,a6885ab824fbc32c,0


In [8]:
test[(test.installation_id == 'ffe774cc') & (test.type == 'Assessment')]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
1156371,3bfd1a65,46ff9d3ad2be09f2,2019-09-28T21:20:08.980Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",ffe774cc,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156372,db02c830,46ff9d3ad2be09f2,2019-09-28T21:20:09.273Z,"{""event_count"":2,""game_time"":248,""event_code"":...",ffe774cc,2,2025,248,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156373,a1e4395d,46ff9d3ad2be09f2,2019-09-28T21:20:09.325Z,"{""description"":""Pull three mushrooms out of th...",ffe774cc,3,3010,248,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156374,a52b92d5,46ff9d3ad2be09f2,2019-09-28T21:20:13.338Z,"{""description"":""Pull three mushrooms out of th...",ffe774cc,4,3110,4445,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156375,a1e4395d,46ff9d3ad2be09f2,2019-09-28T21:20:13.348Z,"{""description"":""To pick a mushroom, pull it ou...",ffe774cc,5,3010,4445,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156376,28ed704e,46ff9d3ad2be09f2,2019-09-28T21:20:14.631Z,"{""height"":3,""coordinates"":{""x"":832,""y"":464,""st...",ffe774cc,6,4025,5739,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156377,a52b92d5,46ff9d3ad2be09f2,2019-09-28T21:20:14.634Z,"{""description"":""To pick a mushroom, pull it ou...",ffe774cc,7,3110,5739,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156378,9d29771f,46ff9d3ad2be09f2,2019-09-28T21:20:14.645Z,"{""description"":""That's one!"",""identifier"":""Dot...",ffe774cc,8,3021,5739,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156379,c74f40cd,46ff9d3ad2be09f2,2019-09-28T21:20:15.328Z,"{""description"":""That's one!"",""identifier"":""Dot...",ffe774cc,9,3121,6433,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156380,7da34a02,46ff9d3ad2be09f2,2019-09-28T21:20:16.519Z,"{""coordinates"":{""x"":951,""y"":453,""stage_width"":...",ffe774cc,10,4070,7608,Mushroom Sorter (Assessment),Assessment,TREETOPCITY


In [9]:
train = train.merge(train_labels,how='left',on='game_session')

In [22]:
installations = list(train.installation_id[~train.accuracy_group.isna()].unique())
len(installations)

3614

In [24]:
train = train[train.installation_id.isin(installations)]
train

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,accuracy_group
1538,27253bdc,34ba1a28d02ba8ba,2019-08-06T04:57:18.904Z,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,
1539,27253bdc,4b57c9a59474a1b9,2019-08-06T04:57:45.301Z,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,
1540,77261ab5,2b9d5af79bcdb79f,2019-08-06T04:58:14.538Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,
1541,b2dba42b,2b9d5af79bcdb79f,2019-08-06T04:58:14.615Z,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,29,Sandcastle Builder (Activity),Activity,MAGMAPEAK,
1542,1325467d,2b9d5af79bcdb79f,2019-08-06T04:58:16.680Z,"{""coordinates"":{""x"":273,""y"":650,""stage_width"":...",0006a69f,3,4070,2137,Sandcastle Builder (Activity),Activity,MAGMAPEAK,
...,...,...,...,...,...,...,...,...,...,...,...,...
11337821,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1.0
11337822,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1.0
11337823,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1.0
11337824,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1.0


Verifying each game session only has 1 game at a time on there.

In [35]:
g = train.groupby('game_session').agg({'title': lambda x: len(x.unique())})
g.title.nlargest(4)

Unnamed: 0_level_0,title
game_session,Unnamed: 1_level_1
000050630c4b081b,Dino Drink
00005be8058d8e35,Balancing Act
0000d473b2f78883,Sandcastle Builder (Activity)
0000e00444c302d9,Magma Peak - Level 1
00011d7f4b48ed4b,Crystal Caves - Level 3
...,...
fffda980b07029a3,Flower Waterer (Activity)
fffe19a8d3973f9d,Tree Top City - Level 1
fffe897c0216edaf,Costume Box
fffe9cd0cc5b076b,Leaf Leader


In [42]:
final = train.groupby('game_session').agg({'timestamp':'min', 
                                           'title':'min', 
                                           'accuracy_group':'min',
                                           'installation_id':'min'})

In [45]:
final.dropna(inplace=True)
final.sort_values(by=['installation_id', 'timestamp'],inplace=True)
train.sort_values(by=['installation_id', 'timestamp'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [46]:
final

Unnamed: 0_level_0,timestamp,title,accuracy_group,installation_id
game_session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
901acc108f55a5a1,2019-08-06T05:22:01.344Z,Mushroom Sorter (Assessment),3.0,0006a69f
77b8ee947eb84b4e,2019-08-06T05:35:19.167Z,Bird Measurer (Assessment),0.0,0006a69f
6bdf9623adc94d89,2019-08-06T05:37:50.020Z,Mushroom Sorter (Assessment),3.0,0006a69f
9501794defd84e4d,2019-08-06T20:34:53.812Z,Mushroom Sorter (Assessment),2.0,0006a69f
a9ef3ecb3d1acc6a,2019-08-06T20:49:59.095Z,Bird Measurer (Assessment),3.0,0006a69f
...,...,...,...,...
460e8bdc2822b340,2019-07-30T12:12:49.516Z,Chest Sorter (Assessment),3.0,ffc90c32
b05a02b52d5c1f4c,2019-08-07T01:06:38.407Z,Cauldron Filler (Assessment),3.0,ffd2871d
dadd1a4d8ac68ab0,2019-09-09T15:34:41.704Z,Cauldron Filler (Assessment),1.0,ffeb0b1b
a6885ab824fbc32c,2019-09-22T01:58:20.095Z,Mushroom Sorter (Assessment),0.0,ffeb0b1b


In [3]:
dep_var = 'salary'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [FillMissing, Categorify, Normalize]

In [4]:
test = TabularList.from_df(df.iloc[800:1000].copy(), path=path, cat_names=cat_names, cont_names=cont_names)

In [5]:
data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(list(range(800,1000)))
                           .label_from_df(cols=dep_var)
                           .add_test(test)
                           .databunch())

In [6]:
data.show_batch(rows=10)

workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,target
Local-gov,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,False,1.203,-0.291,-0.4224,<50k
Private,Some-college,Never-married,Adm-clerical,Own-child,White,False,-1.4357,0.7079,-0.0312,<50k
Self-emp-not-inc,Some-college,Married-civ-spouse,Sales,Husband,White,False,1.7161,0.1362,-0.0312,<50k
Private,Some-college,Widowed,Other-service,Unmarried,White,False,1.936,0.5105,-0.0312,<50k
Private,Some-college,Separated,Adm-clerical,Unmarried,Black,False,-0.6294,-0.1218,-0.0312,<50k
Federal-gov,Bachelors,Never-married,Exec-managerial,Not-in-family,White,False,0.4701,1.8052,1.1422,<50k
Private,Assoc-voc,Married-civ-spouse,Adm-clerical,Husband,White,False,0.5434,0.4552,0.3599,<50k
Private,10th,Married-civ-spouse,Sales,Husband,White,False,0.4701,-0.7334,-1.5958,<50k
Private,HS-grad,Never-married,Machine-op-inspct,Not-in-family,White,False,-0.5561,1.3525,-0.4224,<50k
Private,9th,Married-civ-spouse,Transport-moving,Husband,White,False,1.7894,-0.8497,-1.9869,<50k


In [9]:
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [10]:
learn.fit(5, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.372202,0.385182,0.845,00:03
1,0.351467,0.383389,0.82,00:03
2,0.353323,0.363336,0.825,00:03
3,0.368288,0.365168,0.83,00:03
4,0.357279,0.356136,0.835,00:03


## Inference

In [11]:
row = df.iloc[0]

In [12]:
learn.predict(row)

(Category <50k, tensor(0), tensor([0.5045, 0.4955]))