In [1]:
import os

import numpy as np
import pandas as pd

# Directory holding the Titanic CSV files, relative to this notebook.
path = "../data/titanic/"

# Read both splits with the same reader; train is loaded first, then test.
train, test = (pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv"))

In [2]:
from utils.models import CatBoostCV, LGBMCV
from utils.models import RandomForestCV
from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

In [3]:
# Shrink both frames with the project helper from utils.eda. The log it prints
# shows per-column dtype downcasts (e.g. int64 -> uint16, float64 -> float32).
# map() is consumed left to right by the unpacking, so train is processed
# before test.
train, test = map(reduce_mem_usage, (train, test))

Memory usage of properties dataframe is : 0.081695556640625  MB
******************************
Column:  PassengerId
dtype before:  int64
min for this col:  1
max for this col:  891
dtype after:  uint16
******************************
******************************
Column:  Survived
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
******************************
Column:  Pclass
dtype before:  int64
min for this col:  1
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  Age
dtype before:  float64
min for this col:  0.42
max for this col:  80.0
dtype after:  float32
******************************
******************************
Column:  SibSp
dtype before:  int64
min for this col:  0
max for this col:  8
dtype after:  uint8
******************************
******************************
Column:  Parch
dtype before:  int64
min for this col:  0
max for this col:  6
dtype 

In [4]:
# Normalise column labels to lower case in both splits so later cells can
# refer to columns without worrying about the CSV's original capitalisation.
for frame in (train, test):
    frame.columns = [label.lower() for label in frame.columns]

In [5]:
# Preview the first rows of the training split to confirm the lower-cased
# column labels took effect.
train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283302,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.099998,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Show the current column labels before the free-text columns are dropped.
train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [7]:
# Columns removed from both splits before modelling.
drop_cols = ['name', 'ticket', 'cabin']

# Drop in place so `train` and `test` keep referring to the same objects.
for frame in (train, test):
    frame.drop(columns=drop_cols, inplace=True)

In [8]:
# Ask the project helper (utils.eda) for two lists of column names derived
# from the training split: categorical ones and numeric ones.
cats, nums = get_cats_nums(train)

# One list per line, categorical first.
print(cats, nums, sep="\n")

['sex', 'embarked']
['passengerid', 'survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']


# Missing values

In [9]:
# Summarise missing values in the training split; the utils.eda helper renders
# a table with a Total count and a Percent share per column.
missing_data(train)

Unnamed: 0,Total,Percent
age,177,19.86532
embarked,2,0.224467
fare,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
survived,0,0.0
passengerid,0,0.0


In [10]:
# Same missing-value summary for the test split (Total and Percent per column).
missing_data(test)

Unnamed: 0,Total,Percent
age,86,20.574163
fare,1,0.239234
embarked,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
passengerid,0,0.0


In [11]:
# Impute missing values with statistics learned from the training split only,
# so no information from the test split leaks into preprocessing.
#
# Each statistic is computed exactly once, *before* any filling happens, and
# the same value is then written into both splits. Recomputing the mean from
# the already-imputed train column (as a second `train[...].mean()` call would)
# is redundant and, on float32 columns, can differ from the first value by
# rounding, giving train and test slightly different fill values.
fill_values = {
    'age': train['age'].mean(),               # numeric -> mean
    'fare': train['fare'].mean(),             # numeric -> mean
    'embarked': train['embarked'].mode()[0],  # categorical -> most frequent value
}

for col, fill_value in fill_values.items():
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

# Preprocessing

## Label encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Label encoding: each categorical column gets an integer-coded twin whose
# name carries a '_0' suffix.
enc = LabelEncoder()
new_cols_0 = [str(col) + '_0' for col in cats]

for col, new_col in zip(cats, new_cols_0):
    # The shared encoder is refit on the training values of every column and
    # the fitted mapping is then applied to the matching test column.
    train[new_col] = enc.fit_transform(train[col])
    test[new_col] = enc.transform(test[col])

## Frequency encoding

In [14]:
# Frequency encoding: each categorical column gets a twin (suffix '_1') that
# holds the share of training rows carrying the same category value.
new_cols_1 = []
for col in cats:
    new_col = str(col) + '_1'
    # Frequencies are learned from the training split only and then applied
    # to both splits.
    encoding = train.groupby(col).size() / len(train)
    train[new_col] = train[col].map(encoding)
    # A category that never occurs in train has no entry in `encoding`, and
    # Series.map would silently yield NaN for it. Its training frequency is
    # zero by definition, so fill with 0.0 instead of leaving a NaN feature.
    test[new_col] = test[col].map(encoding).fillna(0.0)
    new_cols_1.append(new_col)

In [15]:
# The raw categorical columns have served their purpose: remove them in place
# from both splits.
for frame in (train, test):
    frame.drop(columns=cats, inplace=True)

# From here on `cats` names the encoded columns: label-encoded ones first,
# frequency-encoded ones after.
cats = new_cols_0 + new_cols_1

In [16]:
# The identifier and the target are not model inputs.
non_features = ('passengerid', 'survived')

# Final feature order: encoded categoricals, then the remaining numeric columns.
feats = cats + [name for name in nums if name not in non_features]
print(feats)

['sex_0', 'embarked_0', 'sex_1', 'embarked_1', 'pclass', 'age', 'sibsp', 'parch', 'fare']


# Model

In [17]:
from sklearn.model_selection import KFold, StratifiedKFold

In [18]:
# Keyword arguments forwarded to RandomForestCV (project class, utils.models).
params = {
    'random_state': 13,
    'n_estimators': 500,
    'n_jobs': -1, # all cores
}

# Three stratified, shuffled folds with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# StratifiedKFold.split() returns a one-shot generator. Materialise it into a
# list so the folds can be iterated more than once (e.g. when this cell's
# fit() is re-run, or if the wrapper walks `cv` several times); an exhausted
# generator would otherwise silently yield zero folds.
skf_splitted = list(skf.split(train[feats], train['survived']))

# Encoded categorical columns are handed to the model as pandas 'category' dtype.
train[cats] = train[cats].astype('category')

rf_model = RandomForestCV(cv=skf_splitted, obj='binary', **params)

# Last expression of the cell: fit() returns the fitted wrapper, which the
# notebook displays.
rf_model.fit(train[feats], train.survived)

<utils.models.RandomForestCV at 0x7fd014be9b38>

In [19]:
# Average the per-fold scores into a single CV score; it is reused later in
# the submission file name.
cv_score = np.mean(rf_model.model_scores_)
cv_score

0.7913640058319203

In [20]:
# Per-fold validation scores, one entry per CV split (three here). The metric
# itself is defined inside RandomForestCV.
rf_model.model_scores_

[0.7716517857142857, 0.814849221968243, 0.7875910098132322]

In [21]:
# Feature importances as a table: one row per feature and one importance_<k>
# column per CV fold.
rf_model.feature_importances_

Unnamed: 0,feature_names,importance_0,importance_1,importance_2
0,sex_0,0.158087,0.134828,0.130236
1,embarked_0,0.020453,0.020508,0.021187
2,sex_1,0.158496,0.139198,0.127236
3,embarked_1,0.01963,0.01891,0.017334
4,pclass,0.079792,0.077209,0.100129
5,age,0.232701,0.259037,0.257557
6,sibsp,0.049304,0.053855,0.047069
7,parch,0.033952,0.039593,0.042052
8,fare,0.247586,0.256862,0.257199


# Test set

In [22]:
# Mirror the training-time dtype change: encoded categorical columns become
# pandas 'category' dtype in the test split as well.
# NOTE(review): the categories are inferred from the test values alone, so
# they match the training categories only when both splits contain the same
# set of values -- confirm whether RandomForestCV relies on category codes.
test[cats] = test[cats].astype(dict.fromkeys(cats, 'category'))

In [23]:
# Store the model's predictions for the test split in a new 'Survived' column.
# NOTE(review): how predict() combines the per-fold models, and whether it
# returns class labels or probabilities, is defined in utils.models -- confirm.
test['Survived'] = rf_model.predict(test[feats])

In [24]:
# Give the id column back its original capitalised header so the exported
# file uses 'PassengerId' alongside 'Survived'.
id_header = {'passengerid': 'PassengerId'}
test.rename(columns=id_header, inplace=True)

In [25]:
# Output directory for submission files, relative to the notebook's working
# directory; the trailing slash matters because the file name is appended by
# plain string concatenation.
sub_path = "submission/titanic_submission/"

In [26]:
# Write the submission file: passenger id plus predicted label, no index column.
# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists first (a no-op when it is already there).
os.makedirs(sub_path, exist_ok=True)

# The CV score (three decimals) is embedded in the file name so different
# runs can be told apart.
submission_file = sub_path + "rf_titanic_{0:.3f}.csv".format(cv_score)
test[['PassengerId', 'Survived']].to_csv(submission_file, index=False)