**Important: This notebook will only work with fastai-0.7.x. Do not try to run any fastai-1.x code from this path in the repository because it will load fastai-0.7.x**

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [3]:
PATH = "data/titanic/"

In [4]:
!ls {PATH}

test.csv  train.csv


In [5]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)

In [6]:
df_raw.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [7]:
df_raw['Surname'] = df_raw['Name'].str.split(',').str[0]

In [8]:
df_raw.head()
df_raw = df_raw.drop('Name', axis=1)

The categorical variables are currently stored as strings, which is inefficient, and doesn't provide the numeric coding required for a random forest. Therefore we call train_cats to convert strings to pandas categories.

In [9]:
train_cats(df_raw)

In [10]:
df_raw.isnull().sum().sort_index()/len(df_raw)

Age            0.198653
Cabin          0.771044
Embarked       0.002245
Fare           0.000000
Parch          0.000000
PassengerId    0.000000
Pclass         0.000000
Sex            0.000000
SibSp          0.000000
Surname        0.000000
Survived       0.000000
Ticket         0.000000
dtype: float64

But let's save this file for now, since it's already in format can we be stored and accessed efficiently.

In [11]:
os.makedirs('tmp', exist_ok=True)
df_raw.reset_index().to_feather('tmp/titanic-raw')

In the future we can simply read it from this fast format

In [12]:
df_raw = pd.read_feather('tmp/titanic-raw')

We'll replace categories with their numeric codes, handle missing continuous values, and split the dependent variable into a separate variable.

In [13]:
df, y, nas = proc_df(df_raw, 'Survived')
df['Fare_na']=False

In [14]:
set_rf_samples(20000)

In [15]:
m = RandomForestClassifier(n_estimators=150, min_samples_leaf=5, n_jobs=-1, oob_score=True)
m.fit(df, y)
m.score(df,y)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


0.941638608305275

In [16]:
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df_test['Surname'] = df_test['Name'].str.split(',').str[0]
df_test = df_test.drop('Name', axis=1)
train_cats(df_test)

In [17]:
os.makedirs('tmp', exist_ok=True)
df_test.reset_index().to_feather('tmp/titanic-test')

In [18]:
df_test = pd.read_feather('tmp/titanic-test')
df_test
df, _, nas = proc_df(df_test)
df

Unnamed: 0,index,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Age_na,Fare_na
0,0,892,3,2,34.5,0,0,153,7.8292,0,2,173,False,False
1,1,893,3,1,47.0,1,0,222,7.0000,0,3,340,False,False
2,2,894,2,2,62.0,0,0,74,9.6875,0,2,232,False,False
3,3,895,3,2,27.0,0,0,148,8.6625,0,3,345,False,False
4,4,896,3,1,22.0,1,1,139,12.2875,0,3,151,False,False
5,5,897,3,2,14.0,0,0,262,9.2250,0,3,313,False,False
6,6,898,3,1,30.0,0,0,159,7.6292,0,2,74,False,False
7,7,899,2,2,26.0,1,1,85,29.0000,0,3,49,False,False
8,8,900,3,1,18.0,0,0,101,7.2292,0,1,4,False,False
9,9,901,3,2,21.0,2,0,270,24.1500,0,3,91,False,False


In [19]:
predicted_labels = m.predict(df)

In [20]:
predicted_labels

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 

In [21]:
file = open("titanic-result.csv","w")

file.write("PassengerId,Survived\n")
i=892
for label in predicted_labels:
    file.write(f'{i},{label}\n')
    i=i+1
file.close()