In [1]:
import os

import numpy as np
import pandas as pd

# Folder holding the competition CSVs, relative to the notebook's working directory.
path = "../data/titanic/"
# Read the training and test tables in one pass over the two file names.
train, test = (pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv"))

In [2]:
from utils.models import CatBoostCV, LGBMCV
from utils.models_aj import RandomForestCV
from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

In [3]:
# Run the project's memory-reduction helper on train first, then test
# (same call order as before, so the printed per-column log is unchanged).
train, test = map(reduce_mem_usage, (train, test))

Memory usage of properties dataframe is : 0.081695556640625  MB
******************************
Column:  PassengerId
dtype before:  int64
min for this col:  1
max for this col:  891
dtype after:  uint16
******************************
******************************
Column:  Survived
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
******************************
Column:  Pclass
dtype before:  int64
min for this col:  1
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  Age
dtype before:  float64
min for this col:  0.42
max for this col:  80.0
dtype after:  float32
******************************
******************************
Column:  SibSp
dtype before:  int64
min for this col:  0
max for this col:  8
dtype after:  uint8
******************************
******************************
Column:  Parch
dtype before:  int64
min for this col:  0
max for this col:  6
dtype 

In [4]:
# Normalize column labels to lowercase on both frames so later cells can
# refer to columns by a single, consistent spelling.
train.columns = train.columns.str.lower()
test.columns = test.columns.str.lower()

In [5]:
# Preview the first rows of the training frame (column names are lowercase by now).
train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283302,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.099998,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# List the training columns before deciding which ones to drop.
train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [7]:
# Columns that are not used as model features in this notebook.
drop_cols = ['name', 'ticket', 'cabin']
# Rebind to the reduced frames rather than mutating in place.
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [8]:
# Ask the project helper for the two column groups of the training frame
# (the printed output shows string columns in `cats`, numeric ones in `nums`).
cats, nums = get_cats_nums(train)

# Model inputs: every categorical column followed by the numeric ones,
# leaving out the row identifier and the target.
feats = cats + [col for col in nums if col not in ('passengerid', 'survived')]

# Show the three lists, one per line.
for column_group in (cats, nums, feats):
    print(column_group)

['sex', 'embarked']
['passengerid', 'survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
['sex', 'embarked', 'pclass', 'age', 'sibsp', 'parch', 'fare']


# Missing values

In [9]:
# Per-column missing-value summary for the training set
# (the helper's output lists a Total and a Percent for each column).
missing_data(train)

Unnamed: 0,Total,Percent
age,177,19.86532
embarked,2,0.224467
fare,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
survived,0,0.0
passengerid,0,0.0


In [10]:
# Same missing-value summary for the test set; the columns with gaps differ
# from train, so the imputation cell has to cover both sets of columns.
missing_data(test)

Unnamed: 0,Total,Percent
age,86,20.574163
fare,1,0.239234
embarked,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
passengerid,0,0.0


In [11]:
# Impute missing values with statistics taken from the training set only, so the
# test set never influences preprocessing.
# Each statistic is computed once, from the observed (not yet imputed) training
# values, and that exact value is reused for both frames. Recomputing it after
# the training column has been filled can drift by floating-point rounding.
age_fill = train['age'].mean()
fare_fill = train['fare'].mean()
embarked_fill = train['embarked'].mode()[0]  # most frequent value; ties resolve to the first

train['age'] = train['age'].fillna(age_fill)
test['age'] = test['age'].fillna(age_fill)

train['fare'] = train['fare'].fillna(fare_fill)
test['fare'] = test['fare'].fillna(fare_fill)

train['embarked'] = train['embarked'].fillna(embarked_fill)
test['embarked'] = test['embarked'].fillna(embarked_fill)

# Preprocessing

In [12]:
from sklearn.preprocessing import OrdinalEncoder

In [13]:
# Learn the category -> integer-code mapping from the training set only,
# then apply that same mapping to both frames.
enc = OrdinalEncoder().fit(train[cats])
for frame in (train, test):
    frame[cats] = enc.transform(frame[cats])

In [27]:
# Row count per ordinal-encoded `embarked` value in the training set.
# NOTE(review): this cell's execution count (27) is out of sequence with its
# neighbours; confirm the output is still current after a Restart & Run All.
train.groupby('embarked').size()

embarked
0.0    168
1.0     77
2.0    646
dtype: int64

In [14]:
# Frequency encoding
# For every categorical column, count how often each (ordinal-encoded) value occurs
# in the training set and store that count in a new `<col>_freq` column on both frames.
# Counts come from train only, so the test set never influences the encoding.
# The grouping key is the loop variable: column names were lowercased earlier, so a
# hard-coded 'Embarked' would raise KeyError and would ignore the other columns.
# NOTE: the new columns are not part of `feats`; extend `feats` explicitly if the
# model should use them.
for col in cats:
    freq = train.groupby(col).size()
    train[col + '_freq'] = train[col].map(freq)
    # A value never seen in train has no count; treat it as frequency 0.
    test[col + '_freq'] = test[col].map(freq).fillna(0)

# Model

In [15]:
from sklearn.model_selection import KFold, StratifiedKFold

In [16]:
# Hyperparameters forwarded to RandomForestCV through **params.
params = {
    'random_state': 13,
    'n_estimators': 250,
    'n_jobs': -1, # all cores
}

# 3-fold stratified split on the target, shuffled with a fixed seed so the folds
# are reproducible between runs.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# split() returns a one-shot generator. Materialize the (train_idx, valid_idx) pairs
# so the folds survive being iterated more than once (e.g. a second fit with the same
# object); an exhausted generator would silently yield no folds.
# NOTE(review): assumes RandomForestCV accepts any iterable of index pairs for `cv` — confirm.
skf_splitted = list(skf.split(train[feats], train['survived']))
train[cats] = train[cats].astype('category')

rf_model = RandomForestCV(cv=skf_splitted, obj='binary', **params)

rf_model.fit(train[feats], train.survived)

<utils.models_aj.RandomForestCV at 0x7fbf12cf1128>

In [17]:
# Average the scores stored on the fitted model; the value is reused later
# to tag the submission file name.
cv_score = np.mean(rf_model.model_scores_)
cv_score

0.78942235128709

In [18]:
# Individual scores behind the mean above.
# NOTE(review): presumably one score per CV fold (three values for n_splits=3);
# the metric itself is defined inside RandomForestCV — confirm there.
rf_model.model_scores_

[0.7716517857142857, 0.8133525562825774, 0.7832627118644069]

In [19]:
# Feature-importance table exposed by the fitted model; the output has one
# importance_<i> column per i in 0..2 and one row per entry in `feats`.
rf_model.feature_importances_

Unnamed: 0,feature_names,importance_0,importance_1,importance_2
0,sex,0.286612,0.250872,0.22351
1,embarked,0.037905,0.03747,0.037196
2,pclass,0.075554,0.083909,0.099763
3,age,0.244539,0.271475,0.265466
4,sibsp,0.051744,0.050885,0.051668
5,parch,0.038646,0.041907,0.044733
6,fare,0.265,0.263481,0.277663


# Test set

In [21]:
# Mirror the dtype conversion applied to train[cats] before fitting, so the
# test features reach the model in the same form.
test[cats] = test[cats].astype('category')

In [22]:
# Predict on the test features and store the result under the capitalized
# column name that the submission file uses.
test['Survived'] = rf_model.predict(test[feats])

In [23]:
# Restore the original capitalization of the id column for the submission file.
test = test.rename(columns={'passengerid': 'PassengerId'})

In [24]:
# Output directory for submission CSVs, relative to the notebook's working directory.
sub_path = "submission/titanic_submission/"

In [25]:
# Write the submission file (PassengerId, Survived). The mean CV score is embedded
# in the file name so submissions from different runs can be told apart.
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(sub_path, exist_ok=True)
sub_file = os.path.join(sub_path, "rf_titanic_{0:.3f}.csv".format(cv_score))
test[['PassengerId', 'Survived']].to_csv(sub_file, index=False)