In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [3]:
df=pd.read_csv('./titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [88]:
df.shape

(891, 12)

### Lets clean our data again, same steps as the last time

We check for which values are empty. We can observe that the majority of the cabin values are empty so we will drop the column 'Cabin'. We drop the empty rows next, to clear out all the empty and missing values

In [89]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [90]:
df.drop(['Cabin'],axis=1,inplace=True)

In [91]:
df["Age"].fillna(df["Age"].median(skipna=True), inplace=True)
df["Fare"].fillna(df["Fare"].median(skipna=True), inplace=True)

In [92]:
df['TravelAlone']=np.where((df["SibSp"]+df["Parch"])>0, 0, 1)

In [93]:
training=pd.get_dummies(df, columns=["Pclass","Embarked","Sex"])
training.drop('Sex_female', axis=1, inplace=True)
training.drop('PassengerId', axis=1, inplace=True)
training.drop('Name', axis=1, inplace=True)
training.drop('Ticket', axis=1, inplace=True)

training['IsMinor']=np.where(training['Age']<=16, 1, 0)
training

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,TravelAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male,IsMinor
0,0,22.0,1,0,7.2500,0,0,0,1,0,0,1,1,0
1,1,38.0,1,0,71.2833,0,1,0,0,1,0,0,0,0
2,1,26.0,0,0,7.9250,1,0,0,1,0,0,1,0,0
3,1,35.0,1,0,53.1000,0,1,0,0,0,0,1,0,0
4,0,35.0,0,0,8.0500,1,0,0,1,0,0,1,1,0
5,0,28.0,0,0,8.4583,1,0,0,1,0,1,0,1,0
6,0,54.0,0,0,51.8625,1,1,0,0,0,0,1,1,0
7,0,2.0,3,1,21.0750,0,0,0,1,0,0,1,1,1
8,1,27.0,0,2,11.1333,0,0,0,1,0,0,1,0,0
9,1,14.0,1,0,30.0708,0,0,1,0,1,0,0,0,1


In [94]:
from sklearn.model_selection import train_test_split
labels=training['Survived']
train_data=training.drop(['Survived'],axis=1)
x_train,x_test,y_train,y_test=train_test_split(train_data,labels,test_size=0.2)

In [95]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()

In [96]:
model.fit(x_train,y_train)
prediction=model.predict(x_test)

In [97]:
model.score(x_test,y_test)

0.7821229050279329

### Feature Engineering time....
Let's make some new data now, and improve our poor little model

In [98]:
df['Title'] = df.Name.apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))

In [99]:
df['Fam_Size'] = df.Parch + df.SibSp

In [101]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,TravelAlone,Title,Fam_Size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,Mr,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,Mrs,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,Miss,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,Mrs,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,Mr,0


In [107]:
training=pd.get_dummies(df, columns=["Pclass","Embarked","Sex","Title"])
training.drop('PassengerId', axis=1, inplace=True)
training.drop('Name', axis=1, inplace=True)
training.drop('Ticket', axis=1, inplace=True)

training['IsMinor']=np.where(training['Age']<=16, 1, 0)
training.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,TravelAlone,Fam_Size,Pclass_1,Pclass_2,Pclass_3,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,IsMinor
0,0,22.0,1,0,7.25,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,38.0,1,0,71.2833,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,26.0,0,0,7.925,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,1,35.0,1,0,53.1,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,35.0,0,0,8.05,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [109]:
from sklearn.model_selection import train_test_split
labels=training['Survived']
train_data=training.drop(['Survived'],axis=1)
x_train,x_test,y_train,y_test=train_test_split(train_data,labels,test_size=0.2)

In [110]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()

In [111]:
model.fit(x_train,y_train)
prediction=model.predict(x_test)

In [112]:
model.score(x_test,y_test)

0.8435754189944135

### Now we'll move onto some more powerful modesl

## THE MIGHTY SVM

In [113]:
from sklearn import svm

In [130]:
clf=svm.SVC(C = 30, gamma = 0.001)

In [131]:
clf.fit(x_train,y_train)

SVC(C=30, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [132]:
clf.score(x_test,y_test)

0.8212290502793296