In [58]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [59]:
# Read the Titanic passenger table from the working directory; the cells
# below all derive their data from `df`.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
df.head(n=5)  # preview the first five rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [60]:
# Class balance of the target: number of passengers per 'Survived' value.
df.loc[:, 'Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [61]:
# Keep the candidate predictors plus the target, and discard every row
# that has a missing value in any of them.
df_subset = (
    df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']]
    .dropna()
)
df_subset.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived
0,3,male,22.0,1,0,0
1,1,female,38.0,1,0,1
2,3,female,26.0,0,0,1
3,1,female,35.0,1,0,1
4,3,male,35.0,0,0,0


In [62]:
# One-hot encode the qualitative 'Sex' column into 'female' / 'male' indicators.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 would otherwise emit
# True/False, which no longer matches the table shown below.
dummies = pd.get_dummies(df_subset['Sex'], dtype=int)
# Remove indicator columns left over from a previous run of this cell so that
# re-executing it cannot create duplicate 'female' / 'male' columns.
df_subset = df_subset.drop(columns=dummies.columns, errors='ignore')
df_subset = pd.concat([df_subset, dummies], axis=1)  # axis=1 appends columns (axis=0 would append rows)
df_subset.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived,female,male
0,3,male,22.0,1,0,0,0,1
1,1,female,38.0,1,0,1,1,0
2,3,female,26.0,0,0,1,1,0
3,1,female,35.0,1,0,1,1,0
4,3,male,35.0,0,0,0,0,1


In [63]:
# Narrow the frame to the numeric model inputs and the target: the text
# 'Sex' column and the 'male' indicator are left out, 'female' stays.
df_subset = df_subset.loc[:, ['Pclass', 'female', 'Age', 'SibSp', 'Parch', 'Survived']]
df_subset.head()

Unnamed: 0,Pclass,female,Age,SibSp,Parch,Survived
0,3,0,22.0,1,0,0
1,1,1,38.0,1,0,1
2,3,1,26.0,0,0,1
3,1,1,35.0,1,0,1
4,3,0,35.0,0,0,0


In [64]:
# Features (X): the five predictor columns. Target (y): the 'Survived' column.
X = df_subset.loc[:, ['Pclass', 'female', 'Age', 'SibSp', 'Parch']]
y = df_subset.loc[:, 'Survived']

In [65]:
# Hold out 30% of the rows as a test set; random_state=1 fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [69]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour classifier on the training split; fit() returns
# the estimator itself, so `knn` is the fitted model.
# NOTE(review): the features are not rescaled before fitting, so 'Age' likely
# dominates the neighbour distances - consider standardising; confirm.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn.score(X_test, y_test)  # mean accuracy on the held-out test split

0.7953488372093023

In [67]:
from sklearn.metrics import confusion_matrix

# Predict the test split, then tabulate actual (rows) against predicted
# (columns) classes.
y_test_pred = knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
cm

array([[111,  23],
       [ 21,  60]], dtype=int64)

In [71]:
# Actual class counts in the test split; they equal the row totals of the
# confusion matrix.
y_test.value_counts()

0    134
1     81
Name: Survived, dtype: int64

In [68]:
# Wrap the raw matrix in a labelled DataFrame so it is easier to read:
# rows are the actual classes, columns (suffix "_p") the predicted ones.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['not survived', 'survived'],
    columns=['not survived_p', 'survived_p'],
)
conf_matrix

Unnamed: 0,not survived_p,survived_p
not survived,111,23
survived,21,60


$\text{precision} = \frac{TP}{TP + FP} = \frac{60}{60 + 23} \approx 0.72$

$\text{recall} = \frac{TP}{TP + FN} = \frac{60}{60 + 21} \approx 0.74$