# Predicting survivors in the legendary Titanic dataset

### Download the dataset here -> https://www.kaggle.com/hesh97/titanicdataset-traincsv

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.shape

(891, 12)

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Setting passenger ID as the index column
data.set_index('PassengerId',inplace=True)

# Brief description of what those columns mean

### Pclass - Passenger Class (1 = 1st, 2 = 2nd, 3 = 3rd)
### Name - Name
### Sex - Gender
### SibSp - Number of siblings/spouses aboard
### Parch - Number of parents/children aboard
### Ticket - Ticket number
### Fare - Ticket fare
### Cabin - Cabin number
### Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
### Survived - Survived (1- yes, 0- no)

In [6]:
# Most people of the class 2 and 3 don't have a cabin. Those both columns are almost related to each other.
# So I would like to drop the 'Cabin' column
# Apart from that I'll drop the 'Name' and 'Ticket' columns

In [7]:
drop = ['Cabin','Name','Ticket']
data.drop(drop,axis=1,inplace=True)

# Dealing with the null values

In [8]:
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
# Age has 177 missing values, I would like to replace all those missing values with mode of that column
data.Age.fillna(data['Age'].mode()[0],inplace=True)
data.Age.isna().sum()

0

In [10]:
# Dropping the observations with no Embarked record
data.dropna(inplace=True)

# Now let's try converting the categorical variables into numerical

In [11]:
# Label encoding the column Sex, as there are only two unique values

# Import PreProcessing
from sklearn import preprocessing 
 
label_encoder = preprocessing.LabelEncoder()

In [12]:
data['Sex'] = label_encoder.fit_transform(data['Sex'])

In [13]:
# One-hot encoding the column Embark, as there is no direct relationship between the boarding stations,
# so label encoding it won't make sense                    

In [14]:
onehot_variables = pd.get_dummies(data['Embarked'],prefix='Embarked_at')
onehot_variables.head()

Unnamed: 0_level_0,Embarked_at_C,Embarked_at_Q,Embarked_at_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,1
2,1,0,0
3,0,0,1
4,0,0,1
5,0,0,1


In [15]:
onehot_variables.rename(columns = {'Embarked_at_C':'Embarked_at_Cherbourg','Embarked_at_Q':'Embarked_at_Queenstown',
                                   'Embarked_at_S':'Embarked_at_Southampton'},inplace=True)

In [16]:
# We can drop one column from these to avoid multicollinearity
onehot_variables.drop('Embarked_at_Southampton',axis=1,inplace=True)

In [17]:
# Dropping Embarked column in the main dataframe
data.drop('Embarked',axis=1,inplace=True)

In [18]:
# Concatinating the main dataframe and the onehot variable dataframe
final_data = pd.concat([data,onehot_variables],axis=1)

In [19]:
# Moving the target feature to the end
final_data_target = final_data.Survived
final_data.drop('Survived',axis=1,inplace=True)
final_data['Survived'] = final_data_target
final_data.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_at_Cherbourg,Embarked_at_Queenstown,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,1,22.0,1,0,7.25,0,0,0
2,1,0,38.0,1,0,71.2833,1,0,1
3,3,0,26.0,0,0,7.925,0,0,1
4,1,0,35.0,1,0,53.1,0,0,1
5,3,1,35.0,0,0,8.05,0,0,0


# Removing outliers

In [20]:
from scipy import stats
z_scores = stats.zscore(final_data)

In [21]:
import numpy as np
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
new_df = final_data[filtered_entries]

# Splitting the train and test datasets

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
target = new_df.Survived
new_df.drop('Survived',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [41]:
train,test,target_train,target_test = train_test_split(new_df,target,test_size=0.2,random_state=9)

# The first model I'll be using is Random forest classifier
## By hyperparameter tuning, I'll decide which parameters are to be used for a better accuracy

In [42]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [43]:
# Number of trees in Random forest
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]

# The number of features to consider when looking for the best split
max_features = ['auto','sqrt','log2']

# Maximum number of levels in each tree (The maximum depth of the tree)
max_depth = [int(x) for x in np.linspace(10,100,10)]

# Minimum number of samples required to split a node
min_samples_split = [2,3,4,5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,5,6]

random_grid = {'n_estimators':n_estimators,
               'max_features':max_features,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf,
               'criterion':['entropy','gini']}

In [44]:
rf_classifier = RandomForestClassifier()

In [45]:
# n_iter(How many iterations do we want to check), cv or cross-validation(number of times train and test will be split)
# n_jobs given -1 to use all processors

randomCV = RandomizedSearchCV(estimator=rf_classifier,param_distributions=random_grid,n_iter=100,cv=4,random_state=9,
                              n_jobs=-1)

In [46]:
# Fitting the randomized model

randomCV.fit(train,target_train)

RandomizedSearchCV(cv=4, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3, 5, 6], 'criterion': ['entropy', 'gini']},
          pre_dispatch='2*n_jobs', random_state=9, refit=True,
          return_train_score='warn', scoring=None, verbose

In [47]:
# Checking the best parameters

randomCV.best_params_

{'n_estimators': 800,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 10,
 'criterion': 'entropy'}

In [48]:
classifier = randomCV.best_estimator_

In [49]:
test_pred = classifier.predict(test)

In [50]:
# Checking accuracy
from sklearn.metrics import accuracy_score
print("Accuracy is {} percent".format(round(100*accuracy_score(target_test,test_pred),2)))

Accuracy is 84.67 percent


In [51]:
# After randomizedsearchCV applying grid searchCV
from sklearn.model_selection import GridSearchCV

In [52]:
parameter_grid = {'n_estimators':[700,750,800,850,900],
               'max_features':['log2'],
               'max_depth':[8,10,12],
               'min_samples_split':[2,3,4],
               'min_samples_leaf':[1,2],
               'criterion':['entropy']}
parameter_grid

{'n_estimators': [700, 750, 800, 850, 900],
 'max_features': ['log2'],
 'max_depth': [8, 10, 12],
 'min_samples_split': [2, 3, 4],
 'min_samples_leaf': [1, 2],
 'criterion': ['entropy']}

In [53]:
grid_search = GridSearchCV(estimator=rf_classifier,param_grid=parameter_grid,cv=10,n_jobs=-1)
grid_search.fit(train,target_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [700, 750, 800, 850, 900], 'max_features': ['log2'], 'max_depth': [8, 10, 12], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2], 'criterion': ['entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [54]:
best_grid = grid_search.best_estimator_

In [57]:
best_grid

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=8, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [55]:
test_pred = best_grid.predict(test)

In [56]:
print("Accuracy is {} percent".format(round(100*accuracy_score(target_test,test_pred),2)))

Accuracy is 84.67 percent


# Generally after applying gridsearchCV, the accuracy increases, but here we didn't notice any significant changes in the accuracy and that's fine.

## Now I'll try to predict the survivors by applying hyperparameter tuning for logistic regression

In [93]:
# Used to specify the norm used in the penalization.
penalty = ['l2']

# Inverse of regularization strength
C = [0.5,0.8,1.0,1.2,1.4]

# Specifies if a constant should be added to the decision function.
fit_intercept = [True,False]

# Algorithm to use in the optimization problem.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Maximum number of iterations taken over the solvers to converge
max_iter = [50,100,150,200,250]

random_grid = {'penalty':penalty,
               'C':C,
               'fit_intercept':fit_intercept,
               'solver':solver,
               'max_iter':max_iter}
random_grid

{'penalty': ['l2'],
 'C': [0.5, 0.8, 1.0, 1.2, 1.4],
 'fit_intercept': [True, False],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
 'max_iter': [50, 100, 150, 200, 250]}

In [94]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [95]:
randomCV = RandomizedSearchCV(estimator=log_reg,param_distributions=random_grid,n_iter=100,cv=4,random_state=9,
                              n_jobs=-1)

In [96]:
randomCV.fit(train,target_train)

RandomizedSearchCV(cv=4, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'penalty': ['l2'], 'C': [0.5, 0.8, 1.0, 1.2, 1.4], 'fit_intercept': [True, False], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter': [50, 100, 150, 200, 250]},
          pre_dispatch='2*n_jobs', random_state=9, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [97]:
randomCV.best_params_

{'solver': 'liblinear',
 'penalty': 'l2',
 'max_iter': 200,
 'fit_intercept': False,
 'C': 1.4}

In [99]:
classifier = randomCV.best_estimator_
classifier

LogisticRegression(C=1.4, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [100]:
test_pred = classifier.predict(test)

In [101]:
print("Accuracy is {} percent".format(round(100*accuracy_score(target_test,test_pred),2)))

Accuracy is 79.33 percent


# As we can see, Random forest gave us more accuracy than Logistic regression. There are many other classification models but for this dataset, I'll stick to Random Forest.