In [485]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
df = pd.read_csv('/Users/alantrinh/Code/Spiced Academy/data/train.csv')

# create and fit the model
X = df.loc[:, df.columns != 'Survived']
y = df['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [195]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
234,235,2,"Leyson, Mr. Robert William Norman",male,24.0,0,0,C.A. 29566,10.5000,,S
471,472,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S
669,670,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52.0000,C126,S
557,558,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.5250,,C
517,518,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
...,...,...,...,...,...,...,...,...,...,...,...
740,741,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30.0000,D45,S
172,173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
778,779,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
237,238,2,"Collyer, Miss. Marjorie ""Lottie""",female,8.0,0,2,C.A. 31921,26.2500,,S


In [196]:
X_train['female'] = X_train['Sex'].replace({'female':1,'male':0})
X_train.drop('Sex', inplace=True, axis=1)
X_train

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female
234,235,2,"Leyson, Mr. Robert William Norman",24.0,0,0,C.A. 29566,10.5000,,S,0
471,472,3,"Cacic, Mr. Luka",38.0,0,0,315089,8.6625,,S,0
669,670,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",,1,0,19996,52.0000,C126,S,1
557,558,1,"Robbins, Mr. Victor",,0,0,PC 17757,227.5250,,C,0
517,518,3,"Ryan, Mr. Patrick",,0,0,371110,24.1500,,Q,0
...,...,...,...,...,...,...,...,...,...,...,...
740,741,1,"Hawksford, Mr. Walter James",,0,0,16988,30.0000,D45,S,0
172,173,3,"Johnson, Miss. Eleanor Ileen",1.0,1,1,347742,11.1333,,S,1
778,779,3,"Kilgannon, Mr. Thomas J",,0,0,36865,7.7375,,Q,0
237,238,2,"Collyer, Miss. Marjorie ""Lottie""",8.0,0,2,C.A. 31921,26.2500,,S,1


In [197]:
X_train['title'] = X_train['Name'].str.extract('\, (.*?)\.', expand=False).to_frame()
X_train['title'].value_counts()

Mr              418
Miss            140
Mrs             101
Master           36
Dr                4
Rev               3
Col               2
Capt              1
Jonkheer          1
the Countess      1
Lady              1
Sir               1
Ms                1
Major             1
Don               1
Name: title, dtype: int64

In [198]:
X_train[X_train['title'].str.contains('Jonkheer|Sir|Lady|the Countess|Don')].join(y_train)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,title,Survived
822,823,1,"Reuchlin, Jonkheer. John George",38.0,0,0,19972,0.0,,S,0,Jonkheer,0
759,760,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",33.0,0,0,110152,86.5,B77,S,1,the Countess,1
556,557,1,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",48.0,1,0,11755,39.6,A16,C,1,Lady,1
599,600,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",49.0,1,0,PC 17485,56.9292,A20,C,0,Sir,1
30,31,1,"Uruchurtu, Don. Manuel E",40.0,0,0,PC 17601,27.7208,,C,0,Don,0


In [300]:
def categorise_titles(dataframe):
    dataframe['noble'] = dataframe['title'].str.contains('Jonkheer|Sir|Lady|the Countess|Don').astype(int).to_frame()
    dataframe['military'] = dataframe['title'].str.contains('Col|Capt|Major').astype(int).to_frame()
    dataframe['doctor'] = dataframe['title'].str.contains('Dr').astype(int).to_frame()
    dataframe['church'] = dataframe['title'].str.contains('Rev').astype(int).to_frame()
categorise_titles(X_train)
X_train

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,title,title_tmp,noble,military,doctor,church
234,235,2,"Leyson, Mr. Robert William Norman",24.0,0,0,C.A. 29566,10.5000,,S,0,Mr,common,0,0,0,0
471,472,3,"Cacic, Mr. Luka",38.0,0,0,315089,8.6625,,S,0,Mr,common,0,0,0,0
669,670,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",,1,0,19996,52.0000,C126,S,1,Mrs,common,0,0,0,0
557,558,1,"Robbins, Mr. Victor",,0,0,PC 17757,227.5250,,C,0,Mr,common,0,0,0,0
517,518,3,"Ryan, Mr. Patrick",,0,0,371110,24.1500,,Q,0,Mr,common,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,741,1,"Hawksford, Mr. Walter James",,0,0,16988,30.0000,D45,S,0,Mr,common,0,0,0,0
172,173,3,"Johnson, Miss. Eleanor Ileen",1.0,1,1,347742,11.1333,,S,1,Miss,common,0,0,0,0
778,779,3,"Kilgannon, Mr. Thomas J",,0,0,36865,7.7375,,Q,0,Mr,common,0,0,0,0
237,238,2,"Collyer, Miss. Marjorie ""Lottie""",8.0,0,2,C.A. 31921,26.2500,,S,1,Miss,common,0,0,0,0


In [307]:
X_train['Fare'].info()

<class 'pandas.core.series.Series'>
Int64Index: 712 entries, 234 to 815
Series name: Fare
Non-Null Count  Dtype  
--------------  -----  
712 non-null    float64
dtypes: float64(1)
memory usage: 27.3 KB


In [377]:
df_age_mean = X_train.groupby('title')[['Age']].mean()
df_age_mean
X_train['age_filled'] = np.where(X_train['Age'].isna(), X_train['title'].map(df_age_mean['Age']), X_train['Age'])
#X_train['age_filled'] = X_train['Age'].fillna(X_train.groupby('title')['Age'].transform('mean'))
X_train[['Age', 'age_filled', 'title']][(X_train['Age'].isna()) & (X_train['title'] == 'Master')]

Unnamed: 0,Age,age_filled,title
709,,4.981875,Master
159,,4.981875,Master
176,,4.981875,Master
65,,4.981875,Master


In [391]:
def categorise_embarked(dataframe):
    dataframe['C'] = dataframe['title'].str.contains('C').astype(int).to_frame()
    dataframe['Q'] = dataframe['title'].str.contains('Q').astype(int).to_frame()
    dataframe['S'] = dataframe['title'].str.contains('S').astype(int).to_frame()
categorise_embarked(X_train)
X_train

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,...,title,title_tmp,noble,military,doctor,church,age_filled,C,Q,S
234,235,2,"Leyson, Mr. Robert William Norman",24.0,0,0,C.A. 29566,10.5000,,S,...,Mr,common,0,0,0,0,24.000000,0,0,0
471,472,3,"Cacic, Mr. Luka",38.0,0,0,315089,8.6625,,S,...,Mr,common,0,0,0,0,38.000000,0,0,0
669,670,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",,1,0,19996,52.0000,C126,S,...,Mrs,common,0,0,0,0,36.209302,0,0,0
557,558,1,"Robbins, Mr. Victor",,0,0,PC 17757,227.5250,,C,...,Mr,common,0,0,0,0,32.217666,0,0,0
517,518,3,"Ryan, Mr. Patrick",,0,0,371110,24.1500,,Q,...,Mr,common,0,0,0,0,32.217666,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,741,1,"Hawksford, Mr. Walter James",,0,0,16988,30.0000,D45,S,...,Mr,common,0,0,0,0,32.217666,0,0,0
172,173,3,"Johnson, Miss. Eleanor Ileen",1.0,1,1,347742,11.1333,,S,...,Miss,common,0,0,0,0,1.000000,0,0,0
778,779,3,"Kilgannon, Mr. Thomas J",,0,0,36865,7.7375,,Q,...,Mr,common,0,0,0,0,32.217666,0,0,0
237,238,2,"Collyer, Miss. Marjorie ""Lottie""",8.0,0,2,C.A. 31921,26.2500,,S,...,Miss,common,0,0,0,0,8.000000,0,0,0


In [421]:
X_train['small_family'] = (((X_train['Parch'] > 0) & (X_train['Parch'] < 4)) | ((X_train['SibSp'] > 0) & (X_train['SibSp'] < 3))).astype(int)
X_train

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,...,title_tmp,noble,military,doctor,church,age_filled,C,Q,S,small_family
234,235,2,"Leyson, Mr. Robert William Norman",24.0,0,0,C.A. 29566,10.5000,,S,...,common,0,0,0,0,24.000000,0,0,0,0
471,472,3,"Cacic, Mr. Luka",38.0,0,0,315089,8.6625,,S,...,common,0,0,0,0,38.000000,0,0,0,0
669,670,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",,1,0,19996,52.0000,C126,S,...,common,0,0,0,0,36.209302,0,0,0,1
557,558,1,"Robbins, Mr. Victor",,0,0,PC 17757,227.5250,,C,...,common,0,0,0,0,32.217666,0,0,0,0
517,518,3,"Ryan, Mr. Patrick",,0,0,371110,24.1500,,Q,...,common,0,0,0,0,32.217666,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,741,1,"Hawksford, Mr. Walter James",,0,0,16988,30.0000,D45,S,...,common,0,0,0,0,32.217666,0,0,0,0
172,173,3,"Johnson, Miss. Eleanor Ileen",1.0,1,1,347742,11.1333,,S,...,common,0,0,0,0,1.000000,0,0,0,1
778,779,3,"Kilgannon, Mr. Thomas J",,0,0,36865,7.7375,,Q,...,common,0,0,0,0,32.217666,0,0,0,0
237,238,2,"Collyer, Miss. Marjorie ""Lottie""",8.0,0,2,C.A. 31921,26.2500,,S,...,common,0,0,0,0,8.000000,0,0,0,1


In [444]:
df_fare_mean = X_train.groupby('Pclass')[['Fare']].mean()
df_fare_mean
X_train['fare_filled'] = np.where(X_train['Fare'].isna(), X_train['Pclass'].map(df_fare_mean['Fare']), X_train['Fare'])
X_train[['Fare', 'fare_filled', 'Pclass']]#[(X_train['Age'].isna()) & (X_train['title'] == 'Master')]

Unnamed: 0,Fare,fare_filled,Pclass
234,10.5000,10.5000,2
471,8.6625,8.6625,3
669,52.0000,52.0000,1
557,227.5250,227.5250,1
517,24.1500,24.1500,3
...,...,...,...
740,30.0000,30.0000,1
172,11.1333,11.1333,3
778,7.7375,7.7375,3
237,26.2500,26.2500,2


In [494]:
X_train_final=X_train[['Pclass', 'female', 'noble', 'military', 'doctor', 'church', 'fare_filled', 'age_filled', 'C', 'Q', 'S', 'small_family']]
#X_train_final=X_train[['Pclass', 'female', 'fare_filled', 'age_filled', 'C', 'Q', 'S']]
X_train_final.describe()

Unnamed: 0,Pclass,female,noble,military,doctor,church,fare_filled,age_filled,C,Q,S,small_family
count,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0
mean,2.301966,0.344101,0.007022,0.005618,0.005618,0.004213,32.558567,29.564763,0.005618,0.0,0.001404,0.391854
std,0.838656,0.475408,0.083564,0.074795,0.074795,0.06482,51.515889,13.70759,0.074795,0.0,0.037477,0.488508
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.67,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,7.8958,21.158784,0.0,0.0,0.0,0.0
50%,3.0,0.0,0.0,0.0,0.0,0.0,14.5,30.5,0.0,0.0,0.0,0.0
75%,3.0,1.0,0.0,0.0,0.0,0.0,31.275,36.0,0.0,0.0,0.0,1.0
max,3.0,1.0,1.0,1.0,1.0,1.0,512.3292,80.0,1.0,0.0,1.0,1.0


In [500]:
m = LogisticRegression(max_iter=1000)
m.fit(X_train_final, y_train)
m.score(X_train_final, y_train)

0.7949438202247191

In [530]:
m_rf = RandomForestClassifier(max_depth=8, n_estimators=1000)
m_rf.fit(X_train_final, y_train)
m_rf.score(X_train_final, y_train)

0.9087078651685393

In [274]:
X_val['female'] = X_val['Sex'].replace({'female':1,'male':0})
X_val.drop('Sex', inplace=True, axis=1)

KeyError: 'Sex'

In [498]:
X_val['title'] = X_val['Name'].str.extract('\, (.*?)\.', expand=False).to_frame()
categorise_titles(X_val)

X_val['age_filled'] = np.where(X_val['Age'].isna(), X_val['title'].map(df_age_mean['Age']), X_val['Age'])

categorise_embarked(X_val)

X_val['small_family'] = (((X_val['Parch'] > 0) & (X_val['Parch'] < 4)) | ((X_val['SibSp'] > 0) & (X_val['SibSp'] < 3))).astype(int)

X_val['fare_filled'] = np.where(X_val['Fare'].isna(), X_val['Pclass'].map(df_fare_mean['Fare']), X_val['Fare'])

X_val_final=X_val[['Pclass', 'female', 'noble', 'military', 'doctor', 'church', 'fare_filled', 'age_filled', 'C', 'Q', 'S', 'small_family']]
#X_val_final=X_val[['Pclass', 'female', 'fare_filled', 'age_filled', 'C', 'Q', 'S']]
X_val_final

Unnamed: 0,Pclass,female,noble,military,doctor,church,fare_filled,age_filled,C,Q,S,small_family
13,3,0,0,0,0,0,31.2750,39.000000,0,0,0,1
312,2,1,0,0,0,0,26.0000,26.000000,0,0,0,1
823,3,1,0,0,0,0,12.4750,27.000000,0,0,0,1
610,3,1,0,0,0,0,31.2750,39.000000,0,0,0,1
142,3,1,0,0,0,0,15.8500,24.000000,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
507,1,0,0,0,0,0,26.5500,32.217666,0,0,0,0
651,2,1,0,0,0,0,23.0000,18.000000,0,0,0,1
771,3,0,0,0,0,0,7.8542,48.000000,0,0,0,0
754,2,1,0,0,0,0,65.0000,48.000000,0,0,0,1


In [501]:
m.score(X_val_final, y_val)

0.7932960893854749

In [529]:
m_rf.score(X_val_final, y_val)

0.8212290502793296

In [435]:
X_test = pd.read_csv('/Users/alantrinh/Code/Spiced Academy/data/test.csv')

In [436]:
X_test['female'] = X_test['Sex'].replace({'female':1,'male':0})
X_test.drop('Sex', inplace=True, axis=1)

In [503]:
X_test['title'] = X_test['Name'].str.extract('\, (.*?)\.', expand=False).to_frame()
categorise_titles(X_test)

X_test['age_filled'] = np.where(X_test['Age'].isna(), X_test['title'].map(df_age_mean['Age']), X_test['Age'])

categorise_embarked(X_test)

X_test['small_family'] = (((X_test['Parch'] > 0) & (X_test['Parch'] < 4)) | ((X_test['SibSp'] > 0) & (X_test['SibSp'] < 3))).astype(int)

X_test['fare_filled'] = np.where(X_test['Fare'].isna(), X_test['Pclass'].map(df_fare_mean['Fare']), X_test['Fare'])

X_test_final=X_test[['Pclass', 'female', 'noble', 'military', 'doctor', 'church', 'fare_filled', 'age_filled', 'C', 'Q', 'S', 'small_family']]
#X_test_final=X_test[['Pclass', 'female', 'fare_filled', 'age_filled', 'C', 'Q', 'S']]
X_test_final

Unnamed: 0,Pclass,female,noble,military,doctor,church,fare_filled,age_filled,C,Q,S,small_family
0,3,0,0,0,0,0,7.8292,34.500000,0,0,0,0
1,3,1,0,0,0,0,7.0000,47.000000,0,0,0,1
2,2,0,0,0,0,0,9.6875,62.000000,0,0,0,0
3,3,0,0,0,0,0,8.6625,27.000000,0,0,0,0
4,3,1,0,0,0,0,12.2875,22.000000,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,0,0,0,0,8.0500,32.217666,0,0,0,0
414,1,1,1,0,0,0,108.9000,39.000000,0,0,0,0
415,3,0,0,0,0,0,7.2500,38.500000,0,0,0,0
416,3,0,0,0,0,0,8.0500,32.217666,0,0,0,0


In [531]:
test_submission = X_test[['PassengerId']].join(pd.DataFrame(m_rf.predict(X_test_final), columns=['Survived']))
test_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [532]:
test_submission.to_csv('/Users/alantrinh/Code/Spiced Academy/garlic-boosting-student-code/02_week_project/test_submission_rf_depth_8.csv', index=False)