# Preparing data

In [14]:
import pandas as pd

In [15]:
# Read the passenger records; the PassengerId column becomes the row index.
dataset = pd.read_csv(
    filepath_or_buffer="data.csv",
    index_col="PassengerId",
)
# Preview the first rows of the raw table.
dataset.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# Readable labels for the coded Pclass and Embarked values.
pclass_labels = {1: 'Upper', 2: 'Middle', 3: 'Lower'}
embarked_labels = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}

dataset['Pclass'] = dataset['Pclass'].replace(pclass_labels)
dataset['Embarked'] = dataset['Embarked'].replace(embarked_labels)

# Store the three discrete columns as pandas categoricals so that
# pd.get_dummies later expands them into indicator columns.
for column in ('Sex', 'Pclass', 'Embarked'):
    dataset[column] = dataset[column].astype('category')

In [17]:
# Drop the Name, Ticket and Cabin columns, then one-hot encode the
# categorical columns that remain.
# - `columns=` is used because DataFrame.drop no longer accepts `axis` as a
#   positional argument (removed in pandas 2.0).
# - `dtype=int` keeps the indicators as 0/1; pandas >= 2.0 would otherwise
#   produce boolean columns.
d_dataset = pd.get_dummies(
    dataset.drop(columns=["Name", "Ticket", "Cabin"]),
    dtype=int,
)
d_dataset.head()

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Pclass_Lower,Pclass_Middle,Pclass_Upper,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,22.0,1,0,7.25,1,0,0,0,1,0,0,1
2,1,38.0,1,0,71.2833,0,0,1,1,0,1,0,0
3,1,26.0,0,0,7.925,1,0,0,1,0,0,0,1
4,1,35.0,1,0,53.1,0,0,1,1,0,0,0,1
5,0,35.0,0,0,8.05,1,0,0,0,1,0,0,1


In [18]:
# Show every row that has a missing value in at least one column.
d_dataset.loc[d_dataset.isna().any(axis="columns")]

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Pclass_Lower,Pclass_Middle,Pclass_Upper,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6,0,,0,0,8.4583,1,0,0,0,1,0,1,0
18,1,,0,0,13.0000,0,1,0,0,1,0,0,1
20,1,,0,0,7.2250,1,0,0,1,0,1,0,0
27,0,,0,0,7.2250,1,0,0,0,1,1,0,0
29,1,,0,0,7.8792,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,0,,0,0,7.2292,1,0,0,0,1,1,0,0
864,0,,8,2,69.5500,1,0,0,1,0,0,0,1
869,0,,0,0,9.5000,1,0,0,0,1,0,0,1
879,0,,0,0,7.8958,1,0,0,0,1,0,0,1


In [19]:
# Fill the missing values in place by linear interpolation down each column
# (the pandas defaults, spelled out here).
# NOTE(review): interpolation follows row order, so a missing Age is estimated
# from the neighbouring rows in the file — confirm this is the intended
# imputation strategy.
d_dataset.interpolate(method="linear", axis=0, inplace=True)

In [20]:
# Split the encoded table into the feature matrix and the target vector.
# `columns=` replaces the positional `axis` argument, which DataFrame.drop
# no longer accepts (removed in pandas 2.0).
X = d_dataset.drop(columns=["Survived"])
Y = d_dataset["Survived"]

# SVM classifier

In [21]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [23]:
# Baseline: support-vector classifier with default hyperparameters,
# scored with 5-fold cross-validation using all available CPU cores.
model = SVC()
cross_val_score(estimator=model, X=X, y=Y, cv=5, n_jobs=-1)

array([0.58659218, 0.71348315, 0.68539326, 0.68539326, 0.69101124])

# KNeighbors classifier

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
# Baseline: k-nearest-neighbours classifier with default hyperparameters,
# scored with 5-fold cross-validation.
# The score must be computed for `model2`; passing `model` would re-evaluate
# the SVC defined earlier instead of this classifier.
model2 = KNeighborsClassifier()
cross_val_score(model2, X, Y, cv=5, n_jobs=-1)

array([0.64804469, 0.66292135, 0.71348315, 0.70786517, 0.73595506])

# Finding better hyperparameters

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
# Search space for the k-nearest-neighbours grid search:
# neighbourhood size, vote weighting scheme, and Minkowski power `p`.
parameters = dict(
    n_neighbors=[3, 5, 10, 15, 20],
    weights=['uniform', 'distance'],
    p=[1, 2, 3, 4],
)

In [30]:
# Exhaustively evaluate every combination in `parameters` for a
# k-nearest-neighbours classifier, fitting on the full feature/target set.
neighbors_model = KNeighborsClassifier()
clf = GridSearchCV(estimator=neighbors_model, param_grid=parameters)
clf.fit(X, Y)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 10, 15, 20], 'p': [1, 2, 3, 4],
                         'weights': ['uniform', 'distance']})

In [31]:
# Show the hyperparameter combination selected by the grid search.
clf.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}

In [33]:
# Rebuild the k-nearest-neighbours classifier from the grid-search winner
# instead of hand-copying the values, so this cell stays correct if the data
# or the parameter grid changes, then score it with 5-fold cross-validation.
# NOTE(review): the hyperparameters were tuned on the same X/Y, so these
# scores are likely optimistic — consider a held-out set or nested CV.
model2 = KNeighborsClassifier(**clf.best_params_)
cross_val_score(model2, X, Y, cv=5, n_jobs=-1)

array([0.68156425, 0.74719101, 0.7752809 , 0.78651685, 0.73033708])