In [768]:
import pandas as pd
import numpy as np
df = pd.read_csv('train.csv')
dftest = pd.read_csv('test.csv')
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [769]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [770]:
dftest.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [771]:
df['Cabin']=df['Cabin'].str[0] # Change Cabin data to the first letter, corresponds with dec

In [772]:
X = df.drop('Survived', axis=1)
# X = df[['Age', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Fare']]
# X = df[['Age', 'Sex', 'Fare', 'Pclass', 'Embarked']]
X = df[['Age', 'Sex', 'Fare', 'Pclass', 'Embarked', 'SibSp', 'Parch', 'Cabin']]
dftest['Cabin']=df['Cabin'].str[0] # Change Cabin data to the first letter, corresponds with deck
XPREDICT = dftest[['Age', 'Sex', 'Fare', 'Pclass', 'Embarked', 'SibSp', 'Parch', 'Cabin']]

In [773]:
XPREDICT.head() # used for final survival prediction

Unnamed: 0,Age,Sex,Fare,Pclass,Embarked,SibSp,Parch,Cabin
0,34.5,male,7.8292,3,Q,0,0,
1,47.0,female,7.0,3,S,1,0,C
2,62.0,male,9.6875,2,Q,0,0,
3,27.0,male,8.6625,3,S,0,0,C
4,22.0,female,12.2875,3,S,1,1,


In [774]:
y = df['Survived']

In [775]:
X.head()

Unnamed: 0,Age,Sex,Fare,Pclass,Embarked,SibSp,Parch,Cabin
0,22.0,male,7.25,3,S,1,0,
1,38.0,female,71.2833,1,C,1,0,C
2,26.0,female,7.925,3,S,0,0,
3,35.0,female,53.1,1,S,1,0,C
4,35.0,male,8.05,3,S,0,0,


In [776]:
X.shape #matrix, multi-dimensional (M, N)

(891, 8)

In [777]:
y.shape #vector, single array (M, )

(891,)

### 3. Train-Test-Split

In [778]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#80% -> training, 20% -> testing

In [779]:
X_test.head()

Unnamed: 0,Age,Sex,Fare,Pclass,Embarked,SibSp,Parch,Cabin
709,,male,15.2458,3,C,1,1,
439,31.0,male,10.5,2,S,0,0,
840,20.0,male,7.925,3,S,0,0,
720,6.0,female,33.0,2,S,0,1,
39,14.0,female,11.2417,3,C,1,0,


In [780]:
# Pre - feature engineering - convert Sex to numbers (could do this later with ColumTransformer, just gave it a shot as an alternative method)

In [781]:
femalecol=pd.get_dummies(X_train['Sex'])['female']
X_train = pd.concat ((X_train,femalecol),1)
X_train = X_train.drop('Sex', axis=1)

femalecol2=pd.get_dummies(X_test['Sex'])['female']
X_test=pd.concat ((X_test,femalecol2),1)
X_test = X_test.drop('Sex', axis=1)

testfemalecol=pd.get_dummies(XPREDICT['Sex'])['female']
XPREDICT = pd.concat ((XPREDICT,testfemalecol),1)
XPREDICT = XPREDICT.drop('Sex', axis=1)
XPREDICT.head()


Unnamed: 0,Age,Fare,Pclass,Embarked,SibSp,Parch,Cabin,female
0,34.5,7.8292,3,Q,0,0,,0
1,47.0,7.0,3,S,1,0,C,1
2,62.0,9.6875,2,Q,0,0,,0
3,27.0,8.6625,3,S,0,0,C,0
4,22.0,12.2875,3,S,1,1,,1


In [782]:
impute_then_onehotencode2 = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse=False, drop='first', handle_unknown='error')
)

In [783]:
impute_then_onehotencode = make_pipeline(
    SimpleImputer(strategy='constant',fill_value='0'),
    OneHotEncoder(sparse=False, drop='first', handle_unknown='error')
)

In [784]:
#  ColumnTransformer 

fe = ColumnTransformer([
    ('ageimputer', SimpleImputer(strategy='median'), ['Age']),
    ('femalesex', 'passthrough', ['female']),
    ('fareimputer', SimpleImputer(strategy='median'), ['Fare']),
    ('onehotclass', OneHotEncoder(sparse=False, drop='first', handle_unknown='error'), ['Pclass']), 
    ('onehotcabin', impute_then_onehotencode, ['Cabin']),
     
    # uncertain about usefulness of Embarkation port in the model
    # removing it triggers a mild increase in logistic regression test accuracy (0.804->0.821) and a mild decrease in random forest test accuracy (0.821 to 0.81)
    ('onehotembarked', impute_then_onehotencode2, ['Embarked']), 
    
    ('do_nothing1', 'passthrough', ['SibSp']),
    ('do_nothing2', 'passthrough', ['Parch']),
])

In [785]:
fe.fit(X_train)
Xtrans = fe.transform(X_train)

In [786]:
#print(Xtrans)
#Xtrans.shape

In [787]:
# 1. create logistic regression model
model = LogisticRegression(max_iter=500)
# 2. train with training data
model.fit(Xtrans, y_train)

LogisticRegression(max_iter=500)

In [788]:
round(model.score(Xtrans, y_train), 3)  # --> train accuracy

0.819

In [789]:
Xtrans_test = fe.transform(X_test)

In [790]:
round(model.score(Xtrans_test, y_test), 3)  # --> test accuracy

0.804

In [791]:
# Alternative Model - Random Forest
m = RandomForestClassifier(max_depth=8, n_estimators=500, random_state=166)   # these hyperparameters seem to be beneficial for the accuracy scores
# if no random_state is provided, score may change slightly every time the model is called 
m.fit(Xtrans, y_train)

RandomForestClassifier(max_depth=8, n_estimators=500, random_state=166)

In [792]:
round(m.score(Xtrans, y_train), 3)  # --> train accuracy

0.904

In [793]:
round(m.score(Xtrans_test, y_test), 3)  # --> test accuracy -- value changes with multiple runs without random_state; results change too

0.821

In [794]:
#inspect the model parameters

model.coef_

array([[-0.02969131,  2.59166813,  0.00452539,  0.09089537, -1.09394095,
         0.3026584 ,  0.77337482,  0.29448522,  1.02650031,  1.59957192,
         0.76479533, -0.47274946, -0.1550561 , -0.14450186, -0.51276279,
        -0.29295845, -0.13017988]])

In [795]:
#cross-val score

accuracy = cross_val_score(model, Xtrans, y_train, cv=5, scoring='accuracy')
print("cross-validation scores", accuracy)  # 0,803 mean score

cross-validation scores [0.7972028  0.81818182 0.79577465 0.74647887 0.83098592]


In [796]:
#cross-val score - alternative model

accuracy = cross_val_score(m, Xtrans, y_train, cv=5, scoring='accuracy')
print("cross-validation scores", accuracy)  # 0,803 mean score

cross-validation scores [0.81818182 0.84615385 0.82394366 0.81690141 0.84507042]


In [797]:
to_predict = fe.transform(XPREDICT)       # prepare data for final prediction
prediction = model.predict(to_predict)    # run logistic regression prediction

In [798]:
XPREDICTFINAL = dftest['PassengerId']   # save prediction into csv
XPREDICTFINAL = pd.concat ((XPREDICTFINAL,pd.Series(prediction).rename('Survived')),1)
XPREDICTFINAL.to_csv ('submission.csv', index = False, header=True)

In [799]:
prediction = m.predict(to_predict)       # random forest prediction

XPREDICTFINAL = dftest['PassengerId']   # save prediction into csv
XPREDICTFINAL = pd.concat ((XPREDICTFINAL,pd.Series(prediction).rename('Survived')),1)
XPREDICTFINAL.to_csv ('submission2.csv', index = False, header=True)