In [1]:
import os

from fastai import *
from fastai.tabular import *
from utils.models import FastAICV

In [2]:
import pandas as pd
import numpy as np

# Location of the Titanic CSV files, relative to this notebook.
path = "../data/titanic/"
# Read both splits with the same reader; train is loaded first, then test.
train, test = (pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

In [4]:
# Run the memory-reduction helper on train, then test; it prints a per-column
# dtype report (see the output below) and returns the frame to keep.
train, test = map(reduce_mem_usage, (train, test))

Memory usage of properties dataframe is : 0.081695556640625  MB
******************************
Column:  PassengerId
dtype before:  int64
min for this col:  1
max for this col:  891
dtype after:  uint16
******************************
******************************
Column:  Survived
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
******************************
Column:  Pclass
dtype before:  int64
min for this col:  1
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  Age
dtype before:  float64
min for this col:  0.42
max for this col:  80.0
dtype after:  float32
******************************
******************************
Column:  SibSp
dtype before:  int64
min for this col:  0
max for this col:  8
dtype after:  uint8
******************************
******************************
Column:  Parch
dtype before:  int64
min for this col:  0
max for this col:  6
dtype 

In [5]:
# Per-column missing-value totals and percentages for the training frame.
missing_data(train)

Unnamed: 0,Total,Percent
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [6]:
# Same missing-value summary for the test frame.
missing_data(test)

Unnamed: 0,Total,Percent
Cabin,327,78.229665
Age,86,20.574163
Fare,1,0.239234
Embarked,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [7]:
# Lower-case every column name so both frames share one naming scheme.
train.columns = list(map(str.lower, train.columns))
test.columns = list(map(str.lower, test.columns))

In [8]:
# Columns removed from both frames before modelling.
drop_cols = ['name', 'ticket', 'cabin']

In [9]:
# Remove the unused columns from each frame in place.
train.drop(columns=drop_cols, inplace=True)
test.drop(columns=drop_cols, inplace=True)

In [10]:
# Build the categorical / numeric column lists from the training frame.
cats, nums = get_cats_nums(train)
# The row identifier and the target are not model inputs.
nums = [col for col in nums if col not in {'passengerid', 'survived'}]

In [11]:
# Categorical column names returned by get_cats_nums.
cats

['sex', 'embarked']

In [12]:
# Numeric feature names after removing 'passengerid' and 'survived'.
nums

['pclass', 'age', 'sibsp', 'parch', 'fare']

In [13]:
# Missing-value summary restricted to the categorical training columns.
missing_data(train[cats])

Unnamed: 0,Total,Percent
embarked,2,0.224467
sex,0,0.0


In [14]:
# Missing-value summary restricted to the numeric training columns.
missing_data(train[nums])

Unnamed: 0,Total,Percent
age,177,19.86532
fare,0,0.0
parch,0,0.0
sibsp,0,0.0
pclass,0,0.0


In [15]:
# Replace missing numeric values with the sentinel -1 in both frames.
for frame in (train, test):
    for col in nums:
        frame[col] = frame[col].fillna(-1)

In [16]:
for cat in cats:
    # Most frequent training value; the test frame is filled with the same
    # value so no statistic is taken from the test data.
    most_common = train[cat].mode()[0]
    train[cat] = train[cat].fillna(most_common)
    test[cat] = test[cat].fillna(most_common)

In [17]:
# Check that no categorical gaps remain in the test frame after filling.
missing_data(test[cats])

Unnamed: 0,Total,Percent
embarked,0,0.0
sex,0,0.0


In [18]:
# Convert every categorical feature to pandas' category dtype in both frames.
train[cats] = train[cats].astype({name: 'category' for name in cats})
test[cats] = test[cats].astype({name: 'category' for name in cats})

In [19]:
# Column dtypes of the training frame after filling and the category cast.
train.dtypes

passengerid      uint16
survived          uint8
pclass            uint8
sex            category
age             float32
sibsp             uint8
parch             uint8
fare            float32
embarked       category
dtype: object

In [20]:
# Column dtypes of the test frame, for comparison with the training frame.
test.dtypes

passengerid      uint16
pclass            uint8
sex            category
age             float32
sibsp             uint8
parch             uint8
fare            float32
embarked       category
dtype: object

In [21]:
from sklearn.model_selection import KFold, StratifiedKFold
# Three shuffled, stratified folds; the fixed seed keeps the split repeatable.
skf = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=13,
)

In [22]:
# split() returns a one-shot generator. Materialise it so the cell that builds
# skf_folds can be re-run without silently producing an empty list of folds.
skf_splitted = list(skf.split(train[cats+nums], train['survived']))

In [23]:
# Keep each fold as a two-item list: [fitting indices, held-out indices].
skf_folds = [[fit_idx, held_out_idx] for fit_idx, held_out_idx in skf_splitted]

In [24]:
# Inspect the index arrays of the three folds.
skf_folds

[[array([  1,   2,   3,   4, ..., 884, 886, 888, 889]),
  array([  0,   5,  14,  21, ..., 880, 885, 887, 890])],
 [array([  0,   2,   3,   4, ..., 887, 888, 889, 890]),
  array([  1,   7,   8,   9, ..., 881, 882, 884, 886])],
 [array([  0,   1,   5,   7, ..., 885, 886, 887, 890]),
  array([  2,   3,   4,   6, ..., 871, 883, 888, 889])]]

In [25]:
# Preprocessors handed to FastAICV, in this order.
# NOTE(review): the fastai tabular docs list them as [FillMissing, Categorify, Normalize];
# here FillMissing comes last. Numeric and categorical gaps are already filled in earlier
# cells, so it presumably has nothing left to fill - confirm the ordering is intentional.
procs=[Categorify, Normalize, FillMissing] #don't use the idea first of filling

In [26]:
from fastai.metrics import accuracy

In [27]:
# Cross-validated fastai wrapper configured with the folds, feature lists and
# preprocessors defined above.
fastai_model = FastAICV(
    folds=skf_folds,
    cat_names=cats,
    cont_names=nums,
    procs=procs,
    bs=64,
    metric=accuracy,
    metric_mode='max',  # presumably "higher is better" - confirm in utils.models
)

In [28]:
# Re-check the categorical names that were passed to FastAICV as cat_names.
cats

['sex', 'embarked']

In [29]:
# Re-check the training dtypes right before fitting.
train.dtypes

passengerid      uint16
survived          uint8
pclass            uint8
sex            category
age             float32
sibsp             uint8
parch             uint8
fare            float32
embarked       category
dtype: object

In [30]:
## Define the architecture settings that are forwarded to fit_predict below.

# Hidden layer sizes - presumably a single hidden layer of 100 units (confirm in utils.models).
layers = [100]
# Passed as `ps`; presumably the dropout probability per hidden layer (confirm).
ps=[0.01]
# Passed as `emb_drop`; presumably dropout applied to the embeddings (confirm).
emb_drop=0.01

In [31]:
# Categorical feature names, displayed once more before training.
cats

['sex', 'embarked']

In [32]:
# Train across the folds and predict for the test frame; the call's return
# value (the FastAICV object, per the output below) is shown as the cell output.
fastai_model.fit_predict(
    train,
    test,
    y='survived',
    epochs=10,
    lr=1e-2,
    wd=None,
    layers=layers,
    ps=ps,
    emb_drop=emb_drop,
)

<utils.models.FastAICV at 0x7f2b2cda4c90>

In [33]:
# Average of the per-fold scores; reused later in the submission file name.
cv_score = np.mean(fastai_model.model_scores_)
cv_score

0.8237935

In [34]:
# Individual fold scores behind the averaged cv_score.
fastai_model.model_scores_

[tensor(0.8148), tensor(0.8316), tensor(0.8249)]

In [35]:
# Look at the sample submission to see the expected columns and value format.
pd.read_csv(path+"gender_submission.csv")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [36]:
# Preview the predictions; the displayed values are floats between 0 and 1.
fastai_model.predict()

array([0.134284, 0.302446, 0.075522, 0.133599, ..., 0.965989, 0.101174, 0.082911, 0.549341])

In [37]:
# predict() yields floats in [0, 1] (see its displayed output), while the sample
# submission holds 0/1 labels. Convert the scores to class labels so the
# submission file is in the expected format instead of raw probabilities.
survival_scores = np.asarray(fastai_model.predict())
test['Survived'] = (survival_scores > 0.5).astype(int)

In [38]:
# Restore the original capitalisation of the id column for the submission file.
test.rename({'passengerid': 'PassengerId'}, axis='columns', inplace=True)

In [39]:
# Output directory for submission files, relative to the working directory.
sub_path = "submission/titanic_submission/"

In [40]:
# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail here after all the training has already run.
os.makedirs(sub_path, exist_ok=True)
# The CV score is embedded in the file name to tell submissions apart.
submission_name = "fastai_titanic_{0:.3f}.csv".format(cv_score)
test[['PassengerId', 'Survived']].to_csv(sub_path + submission_name, index=False)