In [402]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np

from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

train = pd.read_csv("/Users/camilledunning/Desktop/titanic-challenge/titanic/train.csv")
family_column = train['SibSp'] + train['Parch']
train['Family'] = family_column
train = train[['Survived', 'Pclass', 'Sex', 'Age', 'Family', 'Embarked', 'Fare']]
train['Age'] = train['Age'].interpolate()
train['Fare'] = train['Fare'].interpolate()
train

Unnamed: 0,Survived,Pclass,Sex,Age,Family,Embarked,Fare
0,0,3,male,22.0,1,S,7.2500
1,1,1,female,38.0,1,C,71.2833
2,1,3,female,26.0,0,S,7.9250
3,1,1,female,35.0,1,S,53.1000
4,0,3,male,35.0,0,S,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,S,13.0000
887,1,1,female,19.0,0,S,30.0000
888,0,3,female,22.5,3,S,23.4500
889,1,1,male,26.0,0,C,30.0000


In [403]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,Family,Fare
count,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.726061,0.904602,32.204208
std,0.486592,0.836071,13.902353,1.613459,49.693429
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,21.0,0.0,7.9104
50%,0.0,3.0,28.5,0.0,14.4542
75%,1.0,3.0,38.0,1.0,31.0
max,1.0,3.0,80.0,10.0,512.3292


# Some Essential Info About the Survivors

In [404]:
print(str(round(np.mean(train['Survived']) * 100)) + "% of the passengers on the RMS Titanic survived.\n")
print(str(round((sum((train[train['Sex'] == 'female'])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors were female.\n")
print(str(round((sum((train[train['Pclass'] == 1])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors were first class.")
print(str(round((sum((train[train['Pclass'] == 2])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors were second class.")
print(str(round((sum((train[train['Pclass'] == 3])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors were third class.\n")
print(str(round((sum((train[train['Age'] <= 20])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors were 20 or younger.")
print(str(round((sum((train[(train['Age'] > 20) & (train['Age'] < 50)])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors were between 20 and 50.")
print(str(round((sum((train[train['Age'] >= 50])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors were 50 or older.\n")
print(str(round((sum((train[train['Family'] == 0])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors had no family members aboard.")
print(str(round((sum((train[train['Family'] >= 3])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors had three or more family members aboard.\n")
print(str(round((sum((train[train['Embarked'] == 'S'])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors embarked from Southampton.")
print(str(round((sum((train[train['Embarked'] == 'C'])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors embarked from Cherbourg.")
print(str(round((sum((train[train['Embarked'] == 'Q'])['Survived']) / sum(train['Survived'])) * 100)) + "% of the survivors embarked from Queenstown.")

38.0% of the passengers on the RMS Titanic survived.

68% of the survivors were female.

40% of the survivors were first class.
25% of the survivors were second class.
35% of the survivors were third class.

27% of the survivors were 20 or younger.
64% of the survivors were between 20 and 50.
9% of the survivors were 50 or older.

48% of the survivors had no family members aboard.
9% of the survivors had three or more family members aboard.

63% of the survivors embarked from Southampton.
27% of the survivors embarked from Cherbourg.
9% of the survivors embarked from Queenstown.


# Visualizing this Info

In [405]:
survivors = train[train['Survived'] == 1]
female_survivors = survivors[survivors['Sex'] == 'female']
male_survivors = survivors[survivors['Sex'] == 'male']
classes = ['First Class', 'Second Class', 'Third Class']
female_classes = female_survivors['Pclass'].value_counts(sort=False, normalize=True).to_list()
male_classes = male_survivors['Pclass'].value_counts(sort=False, normalize=True).to_list()
fig = go.Figure(data=[
    go.Bar(name='Female', x=classes, y=female_classes),
    go.Bar(name='Male', x=classes, y=male_classes)])
fig.update_layout(barmode='stack', width=400, height=400, title="Class and Sex of Survivors Ratios")
fig.show()


In [406]:
s_port = survivors[survivors['Embarked'] == 'S']
c_port = survivors[survivors['Embarked'] == 'C']
q_port = survivors[survivors['Embarked'] == 'Q']

s_classes = s_port['Pclass'].value_counts(sort=False, normalize=True).to_list()
c_classes = c_port['Pclass'].value_counts(sort=False, normalize=True).to_list()
q_classes = q_port['Pclass'].value_counts(sort=False, normalize=True).to_list()

fig = go.Figure(data=[
    go.Bar(name='Southampton', x=classes, y=s_classes),
    go.Bar(name='Cherbourg', x=classes, y=c_classes),
    go.Bar(name='Queenstown', x=classes, y=q_classes)])
fig.update_layout(barmode='stack', width=450, height=400, title="Class and Embarking Port of Survivors Ratios")
fig.show()


In [407]:
fig = px.histogram(train, x='Age', y='Survived', color='Survived', marginal='box', opacity=0.75, 
                   hover_data=train.columns, title='Ages of Survived and Dead Groups')
fig.update_layout(width=700, height=400)
fig.show()

In [408]:
fig = px.histogram(train, x='Survived', y='Family', color='Survived', marginal='box', opacity=0.75, 
                   hover_data=train.columns, orientation='h', title='Number of Family Members Aboard for Survived and Dead Groups')
fig.update_layout(width=700, height=400)
fig.show()

In [409]:
fig = px.histogram(train, x='Fare', y='Survived', color='Survived', marginal='box', opacity=0.75,
                  hover_data=train.columns, title='Fare Distribution Among Survivors and Non-Survivors')
fig.update_layout(width=700, height=400)
fig.show()

# Preparing the Data for a Machine Learning Model and Feature Selection

In [410]:
titanic_dummies = pd.get_dummies(train, columns=['Pclass', 'Sex', 'Embarked'], prefix=['Class', 'Sex', 'Port'])
titanic_dummies

Unnamed: 0,Survived,Age,Family,Fare,Class_1,Class_2,Class_3,Sex_female,Sex_male,Port_C,Port_Q,Port_S
0,0,22.0,1,7.2500,0,0,1,0,1,0,0,1
1,1,38.0,1,71.2833,1,0,0,1,0,1,0,0
2,1,26.0,0,7.9250,0,0,1,1,0,0,0,1
3,1,35.0,1,53.1000,1,0,0,1,0,0,0,1
4,0,35.0,0,8.0500,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,13.0000,0,1,0,0,1,0,0,1
887,1,19.0,0,30.0000,1,0,0,1,0,0,0,1
888,0,22.5,3,23.4500,0,0,1,1,0,0,0,1
889,1,26.0,0,30.0000,1,0,0,0,1,1,0,0


In [411]:
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
sel.fit_transform(titanic_dummies)
fitted = titanic_dummies[titanic_dummies.columns[sel.get_support(indices=True)]]
fitted

Unnamed: 0,Survived,Age,Family,Fare,Class_1,Class_2,Class_3,Sex_female,Sex_male,Port_S
0,0,22.0,1,7.2500,0,0,1,0,1,1
1,1,38.0,1,71.2833,1,0,0,1,0,0
2,1,26.0,0,7.9250,0,0,1,1,0,1
3,1,35.0,1,53.1000,1,0,0,1,0,1
4,0,35.0,0,8.0500,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,13.0000,0,1,0,0,1,1
887,1,19.0,0,30.0000,1,0,0,1,0,1
888,0,22.5,3,23.4500,0,0,1,1,0,1
889,1,26.0,0,30.0000,1,0,0,0,1,0


In [412]:
print('Original DF shape vs feature-selected DF shape: ' + str(titanic_dummies.shape) + ', ' + str(fitted.shape))

Original DF shape vs feature-selected DF shape: (891, 12), (891, 10)


# MODEL 1
## Linear Support Vector Classifier

In [413]:
SVC_classifier = SVC(kernel='linear')
features = fitted[fitted.columns[1:]]
label = fitted[fitted.columns[0]]

X_train, X_test, Y_train, Y_test = train_test_split(features, label, test_size=0.2)
SVC_classifier.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [414]:
y_pred = SVC_classifier.predict(X_test)
y_pred

array([1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0])

### K-Fold Cross Validation to Get Accuracy

In [415]:
def cross_val(model, X_test, Y_test, cv):
    cross_val_scores = cross_val_score(model, X_test, Y_test, cv=cv)
    print("10-Fold Cross Validation Scores for Linear SVC: " + str(list(cross_val_scores)))
    print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_scores.mean(), cross_val_scores.std() * 2))
    
cross_val(SVC_classifier, X_test, Y_test, 10)

10-Fold Cross Validation Scores for Linear SVC: [0.7777777777777778, 0.7222222222222222, 0.6666666666666666, 0.8333333333333334, 0.7777777777777778, 0.4444444444444444, 0.6111111111111112, 0.8333333333333334, 0.7777777777777778, 0.8235294117647058]
Accuracy: 0.73 (+/- 0.23)


### Plot Confusion Matrix

In [431]:
tn, fp, fn, tp = confusion_matrix(Y_test, y_pred).ravel()
print((tn, fp, fn, tp))

def plot_confusion_matrix(Y_true, Y_pred):
    cm = list(confusion_matrix(Y_true, y_pred))
    x = ['Pred. Not Survived', 'Pred. Survived']
    y = ['Not Survived', 'Survived']
    cm_text = [['TN', 'FP'], ['FN', 'TP']]
    fig = ff.create_annotated_heatmap(cm, x=x, y=y, annotation_text=cm_text, colorscale="aggrnyl")
    fig.update_layout(title="Confusion Matrix", width=400, height=400)
    fig.show()

plot_confusion_matrix(Y_test, y_pred)

(83, 21, 29, 46)


### Compute Metrics from this Confusion Matrix