
### This notebook combines simple feature-engineering methods.
### Random Forest

## Intro

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
# Read the Kaggle Titanic training and test splits into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)

In [4]:
# Ticket and Cabin are not used by any later cell, so discard them up front.
train = train.drop(columns=['Ticket', 'Cabin'])

In [5]:
# Preview the first five training rows, transposed so each column reads as a row.
train.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Fare,7.25,71.2833,7.925,53.1,8.05
Embarked,S,C,S,S,S


## Feature engineering

In [6]:
# Mirror the training frame: Ticket and Cabin are discarded from test as well.
test = test.drop(columns=['Ticket', 'Cabin'])

In [7]:
# Preview the first five test rows, transposed so each column reads as a row.
test.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,892,893,894,895,896
Pclass,3,3,2,3,3
Name,"Kelly, Mr. James","Wilkes, Mrs. James (Ellen Needs)","Myles, Mr. Thomas Francis","Wirz, Mr. Albert","Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
Sex,male,female,male,male,female
Age,34.5,47,62,27,22
SibSp,0,1,0,0,1
Parch,0,0,0,0,1
Fare,7.8292,7,9.6875,8.6625,12.2875
Embarked,Q,S,Q,S,S


In [8]:
# Count missing values per column in the training frame.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [9]:
# Count missing values per column in the test frame.
test.isnull().sum()

PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [10]:
# Impute missing values by assigning the filled column back to its frame.
# The previous form, `df[col].fillna(..., inplace=True)`, is chained assignment:
# recent pandas versions warn about it and, with Copy-on-Write enabled, the
# parent frame is left unchanged and the NaNs survive.
# NOTE(review): the Age and Fare medians are still computed per frame, as
# before; consider reusing the training-set statistics for the test frame.
train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())
# 'S' is the hard-coded fill value for the missing Embarked entries in train.
train['Embarked'] = train['Embarked'].fillna('S')
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

In [11]:
# Bucket Fare into four ordinal codes (0-3) with the same right-closed
# intervals for both frames:
#   (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, (31, inf) -> 3
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
train['Fare'] = pd.cut(train['Fare'], bins=fare_edges, labels=False).astype(int)

test['Fare'] = pd.cut(test['Fare'], bins=fare_edges, labels=False).astype(int)

In [12]:
# Derive Title from Name: take the piece after the first ", " and keep what
# precedes its first "." (e.g. "Braund, Mr. Owen Harris" -> "Mr").
for frame in (train, test):
    name_parts = frame['Name'].str.split(", ", expand=True)
    title_parts = name_parts[1].str.split(".", expand=True)
    frame['Title'] = title_parts[0]

In [13]:
# List the distinct titles extracted from the training names.
train['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [14]:
# Collapse rare honorifics into the common titles. One mapping is applied to
# both frames so their title vocabularies cannot drift apart.
# Fix: 'Mme' was listed under both 'Miss' and 'Mrs'; because the 'Miss'
# replacement ran first, the 'Mrs' entry was unreachable and 'Mme' (Madame)
# always became 'Miss'. It now maps to 'Mrs'. The duplicated 'Major' entry is
# removed as well. Titles absent from the mapping are left as they are.
title_map = {
    'Don': 'Mr', 'Major': 'Mr', 'Sir': 'Mr', 'Col': 'Mr', 'Capt': 'Mr',
    'Jonkheer': 'Mr',
    'Mlle': 'Miss', 'Ms': 'Miss',
    'Mme': 'Mrs', 'Dona': 'Mrs', 'Lady': 'Mrs', 'Countess': 'Mrs',
    'the Countess': 'Mrs',
}
train['Title'] = train['Title'].replace(title_map)

test['Title'] = test['Title'].replace(title_map)

In [15]:
# Bucket Age into five ordinal codes (0-4) using 16-year-wide, right-closed
# intervals: (-inf, 16], (16, 32], (32, 48], (48, 64], (64, inf).
# The codes are kept as floats so the later dummy columns keep their
# 'Age_0.0' ... 'Age_4.0' labels.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
train['Age'] = pd.cut(train['Age'], bins=age_edges, labels=False).astype(float)

test['Age'] = pd.cut(test['Age'], bins=age_edges, labels=False).astype(float)

In [16]:
# Show the distinct titles now present in each frame.
train['Title'].unique(), test['Title'].unique()

(array(['Mr', 'Mrs', 'Miss', 'Master', 'Rev', 'Dr'], dtype=object),
 array(['Mr', 'Mrs', 'Miss', 'Master', 'Rev', 'Dr'], dtype=object))

In [17]:
# Family is the sum of the Parch and SibSp counts, computed for both frames.
for frame in (train, test):
    frame['Family'] = frame['Parch'].add(frame['SibSp'])

In [18]:
# One-hot encode the categorical features and append the indicator columns to
# each frame. The source columns are kept here. Each prefix is the source
# column's own name.
dummy_cols = ['Pclass', 'Sex', 'Family', 'Embarked', 'Age', 'Fare', 'Title']
train = pd.concat(
    [train] + [pd.get_dummies(train[col], prefix=col) for col in dummy_cols],
    axis=1)
test = pd.concat(
    [test] + [pd.get_dummies(test[col], prefix=col) for col in dummy_cols],
    axis=1)
# Fix: the frames are encoded independently, so a category that occurs in only
# one of them would leave the model with missing, extra, or misordered feature
# columns. Align test to train's layout (minus the target): indicators missing
# from test are added as all-zero, test-only indicators are dropped.
test = test.reindex(
    columns=[col for col in train.columns if col != 'Survived'],
    fill_value=0)

In [19]:
# The raw columns are redundant once their one-hot encodings exist. Name and
# PassengerId are removed too, so only indicator columns (plus Survived in
# train) remain.
raw_cols = ['Pclass', 'Name', 'Sex', 'Age', 'Fare', 'SibSp',
            'Parch', 'Embarked', 'PassengerId', 'Title', 'Family']
train = train.drop(columns=raw_cols)
test = test.drop(columns=raw_cols)

In [20]:
# Compare frame dimensions; train still carries the Survived column at this point.
train.shape, test.shape

((891, 33), (418, 32))

In [21]:
# Inspect the first 25 rows of the encoded training frame.
train.head(25)

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Family_0,Family_1,Family_2,Family_3,...,Fare_0,Fare_1,Fare_2,Fare_3,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev
0,0,0,0,1,0,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
1,1,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,1,0,0,1,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,1,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,0,1,0,1,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
5,0,0,0,1,0,1,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
6,0,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
7,0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
8,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
9,1,0,1,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0


In [22]:
# Show the column labels of both frames for a visual comparison.
train.columns, test.columns

(Index(['Survived', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
        'Sex_male', 'Family_0', 'Family_1', 'Family_2', 'Family_3', 'Family_4',
        'Family_5', 'Family_6', 'Family_7', 'Family_10', 'Embarked_C',
        'Embarked_Q', 'Embarked_S', 'Age_0.0', 'Age_1.0', 'Age_2.0', 'Age_3.0',
        'Age_4.0', 'Fare_0', 'Fare_1', 'Fare_2', 'Fare_3', 'Title_Dr',
        'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rev'],
       dtype='object'),
 Index(['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
        'Family_0', 'Family_1', 'Family_2', 'Family_3', 'Family_4', 'Family_5',
        'Family_6', 'Family_7', 'Family_10', 'Embarked_C', 'Embarked_Q',
        'Embarked_S', 'Age_0.0', 'Age_1.0', 'Age_2.0', 'Age_3.0', 'Age_4.0',
        'Fare_0', 'Fare_1', 'Fare_2', 'Fare_3', 'Title_Dr', 'Title_Master',
        'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rev'],
       dtype='object'))

In [23]:
# Separate the target from the features: pop() returns the Survived column
# and removes it from train in the same step.
y = train.pop('Survived')

In [24]:
# Report every column that appears in only one of the two frames.
# Fix: the previous one-directional difference (test - train) could not reveal
# a feature present in train but absent from test. The symmetric difference
# covers both directions; an empty set means the column sets match.
print(set(test.columns) ^ set(train.columns))

set()


In [25]:
# Re-check dimensions now that Survived has been removed from train.
train.shape, test.shape

((891, 32), (418, 32))

In [26]:
# Preview the first five rows of the training feature matrix, transposed.
train.head().T

Unnamed: 0,0,1,2,3,4
Pclass_1,0,1,0,1,0
Pclass_2,0,0,0,0,0
Pclass_3,1,0,1,0,1
Sex_female,0,1,1,1,0
Sex_male,1,0,0,0,1
Family_0,0,0,1,0,1
Family_1,1,1,0,1,0
Family_2,0,0,0,0,0
Family_3,0,0,0,0,0
Family_4,0,0,0,0,0


In [27]:
# Preview the first five rows of the test feature matrix, transposed.
test.head().T

Unnamed: 0,0,1,2,3,4
Pclass_1,0,0,0,0,0
Pclass_2,0,0,1,0,0
Pclass_3,1,1,0,1,1
Sex_female,0,1,0,0,1
Sex_male,1,0,1,1,0
Family_0,1,0,1,1,0
Family_1,0,1,0,0,0
Family_2,0,0,0,0,1
Family_3,0,0,0,0,0
Family_4,0,0,0,0,0


## Training machine learning model

In [28]:
# Baseline forest with hand-picked hyperparameters and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    criterion='gini',
    random_state=42,
)
rf.fit(train, y)
y_rf = rf.predict(test)
# Accuracy measured on the training data itself, expressed in percent.
train_accuracy = rf.score(train, y)
best_rf = round(train_accuracy * 100, 2)
best_rf

88.44

In [29]:
# Exhaustive search over tree depth (5..19) and forest size (100..450 in steps
# of 50), i.e. 15 x 8 = 120 candidates, each scored with 10-fold CV.
depth_grid = list(range(5, 20))
size_grid = list(range(100, 500, 50))
rf_params = {'max_depth': depth_grid, 'n_estimators': size_grid}

rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=10,
    n_jobs=-1,   # use every available core
    verbose=True,
)
rf_grid.fit(train, y)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  3.3min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=15,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=50, n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_dep

In [30]:
# Winning hyperparameter combination and the score the search recorded for it.
rf_grid.best_params_, rf_grid.best_score_

({'max_depth': 5, 'n_estimators': 250}, 0.8260381593714927)

In [31]:
# Refit a forest with the hyperparameters selected by the grid search.
# Fix: the values were previously hand-copied from the search output
# (max_depth=5, n_estimators=250), which silently goes stale whenever the
# search result changes. They are now read from rf_grid.best_params_.
rf_new = RandomForestClassifier(random_state=42, criterion='gini',
                                **rf_grid.best_params_)
rf_new.fit(train, y)
y_rf = rf_new.predict(test)
# Accuracy on the training data (in percent). This is not a held-out estimate;
# rf_grid.best_score_ holds the cross-validated figure.
best_rf = round(rf_new.score(train, y) * 100, 2)
best_rf

83.95

In [32]:
# PassengerId was dropped from `test` during feature engineering, so re-read
# the original file to recover it. Predictions are paired with ids by row order.
tit_test = pd.read_csv('/kaggle/input/titanic/test.csv')
final_rf = pd.DataFrame({
    'PassengerId': tit_test['PassengerId'],
    'Survived': y_rf,
})
final_rf.to_csv('submission_rf.csv', index=False)