# Contents 

---

- [Libraries and Data Imports](#Libraries)
- [Data Cleaning](#Data-Cleaning)
- [Model Prep](#Model-Prep)
- [Modeling](#Modeling)
- [Kaggle Submission](#Kaggle-Submission)

## Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Load Data

In [2]:
# Read train.csv from the local datasets folder and preview its first row.
train = pd.read_csv(filepath_or_buffer='./datasets/train.csv')
train.head(n=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [3]:
# Read test.csv from the local datasets folder and preview its first row.
test = pd.read_csv(filepath_or_buffer='./datasets/test.csv')
test.head(n=1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q


## Data Cleaning

In [4]:
# Number of missing values in each column of the training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Keep only the rows that have an Embarked value; .copy() gives an independent
# frame so later column assignments do not touch a view of the original.
train = train.dropna(subset=['Embarked']).copy()

In [6]:
# Re-check missing values per column now that rows without Embarked are gone.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [7]:
# Number of missing values in each column of the test frame.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
# Show the test rows whose Fare is missing.
test.loc[test['Fare'].isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [9]:
# Mean training Fare for each Pclass value.
train['Fare'].groupby(train['Pclass']).mean()

Pclass
1    84.193516
2    20.662183
3    13.675550
Name: Fare, dtype: float64

In [10]:
# Fill every missing test Fare with the mean training Fare of that row's
# Pclass. The value is derived from the data instead of being typed in by hand,
# so it is not truncated and does not assume which class the missing row is in.
fare_by_class = train.groupby('Pclass')['Fare'].mean()
test['Fare'] = test['Fare'].fillna(test['Pclass'].map(fare_by_class))
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [11]:
# Pearson correlation between Age and Survived, computed on those two columns
# only. Calling DataFrame.corr() on the whole frame raises in pandas >= 2.0
# because the frame still holds string columns (Name, Sex, Ticket, ...) and
# numeric_only now defaults to False. Missing Age values are ignored pairwise,
# which is the same treatment DataFrame.corr() applies.
train['Age'].corr(train['Survived'])

-0.08244586804341383

In [12]:
# Fraction of rows with a missing Age, reported as (train, test).
tuple(frame['Age'].isna().mean() for frame in (train, test))

(0.19910011248593926, 0.20574162679425836)

In [13]:
# Replace missing Age values with the sentinel 999 in both frames so the
# column has no nulls left.
train.loc[train['Age'].isna(), 'Age'] = 999
test.loc[test['Age'].isna(), 'Age'] = 999

### Feature Engineering

In [14]:
# Overall share of training rows with Survived == 1 (mean of the 0/1 column).
train.loc[:, 'Survived'].mean()

0.38245219347581555

In [15]:
# Mean of Survived restricted to the training rows that have a Cabin value.
train.dropna(subset=['Cabin'])['Survived'].mean()

0.6633663366336634

In [16]:
# Fraction of rows with a missing Cabin, reported as (train, test).
tuple(frame['Cabin'].isna().mean() for frame in (train, test))

(0.7727784026996626, 0.7822966507177034)

In [17]:
# Peek at the first five raw Cabin values.
train['Cabin'].iloc[:5]

0     NaN
1     C85
2     NaN
3    C123
4     NaN
Name: Cabin, dtype: object

In [18]:
# Collapse Cabin into a 0/1 flag: 1 where a cabin value is present, 0 where it
# is missing.
# NOTE: this overwrites the raw column, so running the cell a second time would
# see no nulls and set every row to 1.
train['Cabin'] = train['Cabin'].notna().astype(int)
test['Cabin'] = test['Cabin'].notna().astype(int)

In [19]:
# One-hot encode Sex and Embarked in both frames, dropping the first level of
# each so the remaining indicator columns are not redundant.
train = pd.get_dummies(data=train, columns=['Sex', 'Embarked'], drop_first=True)
test = pd.get_dummies(data=test, columns=['Sex', 'Embarked'], drop_first=True)

## Model Prep

In [20]:
# Columns used as model inputs.
features = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
    'Cabin', 'Sex_male', 'Embarked_Q', 'Embarked_S',
]

# Predictor matrix and target vector taken from the training frame.
X = train.loc[:, features]
y = train.loc[:, 'Survived']

In [21]:
# Disabled scratch line (not executed): would draw three names at random from
# `features`.
#np.random.choice(features, size = 3)

In [22]:
# Confirm that no predictor column still contains missing values.
X.isna().sum()

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin         0
Sex_male      0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [23]:
# Relative frequency of each Survived value, for reading model accuracy
# against the majority-class share.
train.loc[:, 'Survived'].value_counts(normalize=True)

0    0.617548
1    0.382452
Name: Survived, dtype: float64

In [24]:
# Split predictors and target into train and hold-out parts using
# train_test_split's default sizes; the fixed seed keeps the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

## Modeling

In [25]:
# Default-parameter random forest. The seed is fixed so that its
# cross-validation score is the same on every Restart & Run All and matches
# the seeding already used for the grid-search estimator.
rf = RandomForestClassifier(random_state=42)

In [26]:
# Default-parameter extra-trees classifier. The seed is fixed so that its
# cross-validation score is the same on every Restart & Run All and matches
# the seeding already used for the grid-search estimator.
et = ExtraTreesClassifier(random_state=42)

### Model Evaluation

In [27]:
# Mean cross-validated score of the random forest on the training split.
cross_val_score(estimator=rf, X=X_train, y=y_train).mean()

0.8273818875547075

In [28]:
# Mean cross-validated score of the extra-trees model on the training split.
cross_val_score(estimator=et, X=X_train, y=y_train).mean()

0.8108629783413758

### Grid Search

In [29]:
# Hyper-parameter grid searched for the random forest: tree depth, minimum
# leaf size, and number of trees.
params = dict(
    max_depth=[None, 5, 6, 7],
    min_samples_leaf=[1, 2, 3, 4],
    n_estimators=[75, 100, 125],
)

# Exhaustive cross-validated search over the grid with a seeded forest.
gs = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=params,
)
gs.fit(X_train, y_train)

# Report the best mean CV score, then display the winning parameter set.
print(gs.best_score_)
gs.best_params_

0.8378745370889911


{'max_depth': 6, 'min_samples_leaf': 3, 'n_estimators': 125}

## Kaggle Submission

In [30]:
# Predict on the test frame with the fitted grid search, using the same
# feature columns the model was trained on.
pred = gs.predict(test.loc[:, features])

In [31]:
# Store the predictions as a Survived column on the test frame and preview
# the first five rows.
test['Survived'] = pred
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S,Survived
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,0,1,1,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,0,0,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,0,1,1,0,0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,0,1,0,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,0,0,0,1,1


In [32]:
# Write the two submission columns, without the index, to the same ./datasets
# folder the inputs were read from. The previous path '.datasets/...' was
# missing the slash and pointed at a hidden '.datasets' directory instead.
test[['PassengerId', 'Survived']].to_csv('./datasets/submission.csv', index=False)