## Developing the RF Model

In [309]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score

sns.set()
plt.rcParams["figure.figsize"] = (10, 8)
SEED = 42

train = pd.read_csv("assets/train.csv")
test = pd.read_csv("assets/test.csv")

X = train.copy()
y = X.pop("Survived")

In [310]:
num_features = set([c for c in X.columns if X[c].dtype != "object"])
cat_features = set([c for c in X.columns if c not in num_features])
print("Numerical Features: {}\nCategorical Features: {}".format(num_features, cat_features))

Numerical Features: {'Age', 'Fare', 'SibSp', 'PassengerId', 'Pclass', 'Parch'}
Categorical Features: {'Cabin', 'Sex', 'Name', 'Ticket', 'Embarked'}


#### Pre-Processing

In [311]:
X.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [312]:
X["Age"].fillna(X["Age"].mean(), inplace=True)

In [313]:
X[X["Embarked"].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [314]:
X["Embarked"].value_counts(normalize=True)

S    0.724409
C    0.188976
Q    0.086614
Name: Embarked, dtype: float64

In [315]:
X[X["Pclass"] == 1]["Embarked"].value_counts(normalize=True)

S    0.593458
C    0.397196
Q    0.009346
Name: Embarked, dtype: float64

In [316]:
X["Embarked"].fillna(X["Embarked"].mode()[0], inplace=True)

In [317]:
X.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

Deal with missing Cabin values later...

In [318]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 11), (179, 11), (712,), (179,))

In [319]:
def train_model(X_train, X_test, y_train, y_test):
    rf = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=SEED)
    rf.fit(X_train, y_train)
    yhat = rf.predict(X_test)
    
    roc = roc_auc_score(y_test, yhat)
    mse = mean_squared_error(y_test, yhat)
    accuracy = rf.score(X_test, y_test) #r2
    oob_score = rf.oob_score_ #oob
    
    return roc, mse, accuracy, oob_score

### Feature Engineering

In [320]:
X_train.describe() # numerical features only

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,712.0,712.0,712.0,712.0
mean,448.234551,2.330056,29.538225,0.553371,0.379213,32.586276
std,256.731423,0.824584,12.994548,1.176404,0.791669,51.969529
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,224.75,2.0,22.0,0.0,0.0,7.925
50%,453.5,3.0,29.699118,0.0,0.0,14.4542
75%,673.5,3.0,35.0,1.0,0.0,30.5
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [321]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [322]:
def engineer_numerical_features(df):
    temp = df.copy()
    
    family_size = df["SibSp"] + df["Parch"]
    temp["FamilySize"] = family_size
    
    age_class = df["Age"] * df["Pclass"]
    temp["AgeClass"] = age_class
    
    temp.drop(columns=cat_features, inplace=True)
    
    return temp


X_train_num = engineer_numerical_features(X_train)
X_test_num = engineer_numerical_features(X_test)
X_train_num.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,FamilySize,AgeClass
331,332,1,45.5,0,0,28.5,0,45.5
733,734,2,23.0,0,0,13.0,0,46.0
382,383,3,32.0,0,0,7.925,0,96.0
704,705,3,26.0,1,0,7.8542,1,78.0
813,814,3,6.0,4,2,31.275,6,18.0


In [323]:
X_train_num.isnull().sum().sum(), X_test_num.isnull().sum().sum() # confirm no missing values

(0, 0)

### Train Model - Numerical Only + Added Features

In [324]:
train_model(X_train_num, X_test_num, y_train, y_test)

(0.8124195624195624,
 0.1708312290502793,
 0.2955465366795367,
 0.1045799307589006)

### Categorical Variables

In [325]:
X_train["Name"].str.split(', ', expand=True).head()

Unnamed: 0,0,1
331,Partner,Mr. Austen
733,Berriman,Mr. William John
382,Tikkanen,Mr. Juho
704,Hansen,Mr. Henrik Juul
813,Andersson,Miss. Ebba Iris Alfrida


In [360]:
train_titles = X_train["Name"].str.split(', ', expand=True)[1].str.split(". ", expand=True)[0]
train_titles.value_counts()

Mr        419
Miss      143
Mrs        96
Master     33
Rev         5
Dr          5
Major       2
Col         2
Mlle        2
th          1
Ms          1
Mme         1
Capt        1
Lady        1
Name: 0, dtype: int64

In [361]:
test_titles = X_test["Name"].str.split(', ', expand=True)[1].str.split(". ", expand=True)[0]
test_titles.value_counts()

Mr          98
Miss        39
Mrs         29
Master       7
Dr           2
Rev          1
Don          1
Sir          1
Jonkheer     1
Name: 0, dtype: int64

Titles existing in either the training set or the test set, but not both.

In [373]:
set(set(train_titles.value_counts().index) - set(test_titles.value_counts().index)) | set(set(test_titles.value_counts().index) - set(train_titles.value_counts().index))

{'Capt',
 'Col',
 'Don',
 'Jonkheer',
 'Lady',
 'Major',
 'Mlle',
 'Mme',
 'Ms',
 'Sir',
 'th'}

In [384]:
def replace_rare_titles(df):
    title = df["Title"]
    if title in ["Capt", "Col", "Don", "Jonkheer", "Major", "Sir"]:
        return "Mr"
    elif title in ["Mme", "th", "Lady"]:
        return 'Mrs'
    elif title in ["Mlle", "Ms"]:
        return 'Miss'
    elif title =='Dr':
        if df['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

In [396]:
def substrings_in_string(big_string, substrings):
    if pd.isna(big_string):
        return "Unknown"
    for substring in substrings:
        if big_string.find(substring) != -1:
            if substring == "T":
                return "A"
            else:
                return substring

In [386]:
def list_decks():
    cabins = X_train["Cabin"].value_counts().index
    
    temp = []
    for c in cabins:
        for i in c:
            if i.isalpha():
                temp.append(i)

    return np.unique(temp)

In [387]:
cabin_list = np.append(list_decks(),"Unknown")
cabin_list

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown'], dtype='<U7')

In [397]:
def engineer_categorical_features(df):
    temp = df.copy()
    
    titles = df["Name"].str.split(', ', expand=True)[1].str.split(". ", expand=True)[0]
    temp["Title"] = titles
    
    deck = df["Cabin"].map(lambda x: substrings_in_string(x, cabin_list))
    temp["Deck"] = deck
    
    temp["SexBinary"] = 0
    temp.loc[temp["Sex"] == "female", "SexBinary"] = 1
    
    temp["Title"] = temp.apply(replace_rare_titles, axis=1)
    
    one_hot_cols = ["Embarked", "Title", "Deck"]
    
    for o in one_hot_cols:
        dummies = pd.get_dummies(temp[o], prefix=o)
        temp = pd.concat([temp, dummies], axis=1)
    
    temp.drop(columns=list(cat_features) + ["Title", "Deck"], inplace=True)
    
    return temp

X_train_all = engineer_categorical_features(X_train)
X_test_all = engineer_categorical_features(X_test)
X_train_all.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,SexBinary,Embarked_C,Embarked_Q,Embarked_S,...,Title_Mrs,Title_Rev,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Unknown
331,332,1,45.5,0,0,28.5,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
733,734,2,23.0,0,0,13.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
382,383,3,32.0,0,0,7.925,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
704,705,3,26.0,1,0,7.8542,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
813,814,3,6.0,4,2,31.275,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [398]:
X_train_all.columns, X_test_all.columns

(Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'SexBinary',
        'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
        'Title_Mr', 'Title_Mrs', 'Title_Rev', 'Deck_A', 'Deck_B', 'Deck_C',
        'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Unknown'],
       dtype='object'),
 Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'SexBinary',
        'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
        'Title_Mr', 'Title_Mrs', 'Title_Rev', 'Deck_A', 'Deck_B', 'Deck_C',
        'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Unknown'],
       dtype='object'))

### Train Model - All Features

In [399]:
X_train_all.shape, X_test_all.shape

((712, 23), (179, 23))

In [400]:
X_train_all.columns, X_test_all.columns

(Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'SexBinary',
        'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
        'Title_Mr', 'Title_Mrs', 'Title_Rev', 'Deck_A', 'Deck_B', 'Deck_C',
        'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Unknown'],
       dtype='object'),
 Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'SexBinary',
        'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
        'Title_Mr', 'Title_Mrs', 'Title_Rev', 'Deck_A', 'Deck_B', 'Deck_C',
        'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Unknown'],
       dtype='object'))

In [401]:
train_model(X_train_all, X_test_all, y_train, y_test)

(0.8843629343629343, 0.1321069162011173, 0.455233242985843, 0.3901324702585224)

**To Do**

- Experiment with grouping titles - i.e. Rev should be Mr

**Resources**

* https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial