# Working on Titanic Dataset

### Doing necessary imports

In [24]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans

### Reading Data from CSV file

In [2]:
df = pd.read_csv('/Users/ayushsaksena/Desktop/Interests/SOM/datasets/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Preprocessing

In [3]:
df.shape

(891, 12)

### Checking to see any NaN values

In [4]:
df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

### Dealing with NaN values

We can ignore the NaN values of the columns that has to be dropped from the dataset such as Cabin.

For Age column, we can deal with it in 3 ways-
1) Remove the entry
2) Use mathematical formulas of Mean/Median to replace with
3) Train a smaller Regression model to predict Ages of passengers

For the sake of simplicity of this assignment, I'll be using 2nd way.

In [5]:
medianAge = df['Age'].median()
medianAge

28.0

The median Age turned out to be 28. So filling the NaN values of 'Age' column with median Age.

In [6]:
df['Age'] = df['Age'].fillna(medianAge)

### Dropping the columns from Dataset

This step is necessary as it drops all columns that does not matter for the model training. This will elimate the chances for Overfitting and noise in the model. This step also reduces Irrelevant Features.

In [7]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'],axis='columns')
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### Using Label Encoder

Models can only be trained on Numerical Values. Due to this limitation, we have to use libaries that encode words into values that can be evaluated.

For this we have multiple options:
1) Label Encoder
2) One Hot Encoder
3) Panda's get dummies function

We'll be using Label Encoder. This is so because we wish to implement this on 'Sex' and 'Embarked' column which has only two categorical labels so Label encoder will convert the labels into integer and create a single column.

In [9]:
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df.Sex)
df['Embarked'] = label_encoder.fit_transform(df.Embarked)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


### Scaling down the columns

We'll be using MinMax Scalar to scale down the 'Age' Column to get a better fit model.

In [10]:
minMax = MinMaxScaler()
minMax.fit(df[['Age']])
df.Age = minMax.transform(df[['Age']])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,0.271174,1,0,7.25,2
1,1,1,0,0.472229,1,0,71.2833,0
2,1,3,0,0.321438,0,0,7.925,2
3,1,1,0,0.434531,1,0,53.1,2
4,0,3,1,0.434531,0,0,8.05,2


### Seperating Dataset as X and Y values

In [11]:
X = df.drop(['Survived'],axis='columns')
Y = df.Survived

## Selecting the best model and Hyperparameter Tuning

As taught in Week 4, We'll be using GridSearchCV to find the best hyperparameter for the model. It is because the dataset is small so searching through all the parameters will be the best and not take a lot of time.

We'll be testing on 3 models:
1) Logistic Regression
2) Decision Tree
3) Random Forest Classifier

### Using Logistic Regression

In [32]:
clf = GridSearchCV(LogisticRegression(solver='liblinear'),
            {'C': [1,5,10]}, cv=5)

clf.fit(X, Y)
clf.best_score_

0.7934844014813884

In [14]:
clf.best_params_

{'C': 1}

In [15]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002728,0.002045,0.000901,0.00054,1,{'C': 1},0.798883,0.797753,0.780899,0.769663,0.820225,0.793484,0.017253,1
1,0.001377,4.9e-05,0.00051,4e-05,5,{'C': 5},0.782123,0.786517,0.780899,0.769663,0.825843,0.789009,0.019237,2
2,0.001282,5.3e-05,0.000446,1.6e-05,10,{'C': 10},0.77095,0.786517,0.780899,0.769663,0.825843,0.786774,0.020514,3


### Using Decision Tree

In [17]:
clf = GridSearchCV(tree.DecisionTreeClassifier(),{
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random']
}, cv=5)
clf.fit(X, Y)
clf.best_score_

0.7867867679367271

In [18]:
clf.best_params_

{'criterion': 'entropy', 'splitter': 'best'}

In [19]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.010606,0.016974,0.001212,0.00094,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.73743,0.786517,0.786517,0.735955,0.814607,0.772205,0.03076,5
1,0.000889,2.3e-05,0.000468,5.2e-05,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.77095,0.741573,0.825843,0.786517,0.775281,0.780033,0.027294,4
2,0.001467,5.2e-05,0.00045,3.1e-05,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.759777,0.797753,0.797753,0.775281,0.803371,0.786787,0.016598,1
3,0.000867,2.2e-05,0.000417,7e-06,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.715084,0.775281,0.808989,0.764045,0.780899,0.768859,0.030697,6
4,0.001439,5.2e-05,0.000427,1.6e-05,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.776536,0.780899,0.780899,0.769663,0.814607,0.784521,0.015594,2
5,0.000883,4.3e-05,0.000433,3.8e-05,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.75419,0.780899,0.837079,0.780899,0.758427,0.782299,0.029546,3


### Using Random Forest Classifier

In [21]:
clf = GridSearchCV(RandomForestClassifier(),{
     'n_estimators': [1,5,10]
}, cv=5)
clf.fit(X, Y)
clf.best_score_

0.8092586780490867

In [22]:
clf.best_params_

{'n_estimators': 10}

In [23]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004843,0.006347,0.000971,0.000668,1,{'n_estimators': 1},0.715084,0.752809,0.803371,0.764045,0.764045,0.759871,0.028224,3
1,0.004004,0.000177,0.000728,0.000104,5,{'n_estimators': 5},0.776536,0.780899,0.853933,0.786517,0.780899,0.795757,0.02926,2
2,0.006593,0.000225,0.00077,5.4e-05,10,{'n_estimators': 10},0.759777,0.814607,0.837079,0.797753,0.837079,0.809259,0.028835,1
