In [103]:
##############################################
# AI1104 - Project
# Titanic Surival Prediction
# https://www.kaggle.com/c/titanic
##############################################
# Import all required libraries
import pandas as pd
import numpy as np
import io
from google.colab import files
from google.colab import drive
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [104]:
# Uploading train.csv file from local drive and reading it into a data frame
uploaded = files.upload()
tr_dframe = pd.read_csv(io.BytesIO(uploaded['train.csv']))

Saving train.csv to train.csv


In [105]:
# insight into the data
tr_dframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [106]:
# description of the data
####
# Survived
####
# It informs whether a passenger has survived or not
# It has 2 categories: 0,1
# 0: Did not survive
# 1: Survived

####
# Pclass 
####
# It refers to passanger class
# It has 3 categories: 1,2,3
# 1: First Class 
# 2: Second Class
# 3: Third Class

####
# SibSp
####
# It refers to total count of siblings/spouse who were on titanic with the passenger

####
# Parch
####
# It refers to total count of parents/children who were on titanic with the passenger

####
# Embarked
####
# It refers to the port of embarkation
# It has 3 categories: C,Q,S
# C: Cherbourg
# Q: Queenstown
# S: Southampton

In [107]:
# some analysis of data
tr_dframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [108]:
# we can convert some "object" type data to "int", for easier computation

# replacing 'male' with 0 and 'female' with 1 in 'Sex'
tr_dframe.replace({'Sex':{'male': 0, 'female': 1}}, inplace=True) 

# replacing 'C' with 0, 'Q' with 1 and 'S' with 2 in 'Embarked'
tr_dframe.replace({'Embarked':{'C': 1, 'Q': 2, 'S': 3}}, inplace=True)

In [109]:
# Some data have missing values of 'Age', 'Cabin', 'Embarked'
tr_dframe.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [110]:
# As only 2 values from 'Embarked' are missing, we can replace them with approximate values and use the column/feature for training
mode = tr_dframe['Embarked'].mode() # highest occuring data
mean = tr_dframe['Embarked'].mean() # average
print(mean)
print(mode)
print(mode[0])

2.5354330708661417
0    3.0
dtype: float64
3.0


In [111]:
# As 'Embarked' is categorical column, we can't replace missing values with mean.
# Hence we approximate the missing values, by replacing with mode value.
tr_dframe['Embarked'].fillna(mode[0], inplace = True)

In [112]:
# As not many values are missing from 'Age', we can replace them with mean values and use the column/feature for training
mean = tr_dframe['Age'].mean() # average
print(mean)
tr_dframe['Age'].fillna(mean, inplace = True)

29.69911764705882


In [113]:
# As more than half of the values are missing from 'Cabin', we can't fill the missing values with good approximation
# So we have to drop this column/feature for training

# analysis of modified data
tr_dframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    float64
dtypes: float64(3), int64(6), object(3)
memory usage: 83.7+ KB


In [114]:
# Its safe to assume that 'PassengerId', 'Name', 'Ticket' do not influence the chances of survival of a person
# Hence, we can drop these columns/features too for training

# final dataframe to be used for training/choosing model
final_tr_dframe=tr_dframe.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1)

# insight into the final dataframe
final_tr_dframe.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,3.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.925,3.0
3,1,1,1,35.0,1,0,53.1,3.0
4,0,3,0,35.0,0,0,8.05,3.0


In [115]:
# extracting features
tr_fea=final_tr_dframe.iloc[:,1:8].values

# extracting labels
tr_label=final_tr_dframe.iloc[:,0].values

In [116]:
# # hypertuning parameters of various models 
# model_params = {
#     'svm': {
#         'model': SVC(),
#         'params' : {
#             'C': [0.5,1,2],
#             'kernel': ['rbf','linear'] # were found to give better results than other kernels like 'poly', 'sigmoid'
#         }  
#     },
#     'knn': {
#          'model': KNeighborsClassifier(metric='minkowski'),
#          'params' : {
#              'p': [2]
#          }
#     },    
#     'logistic_regression' : {
#         'model': LogisticRegression(class_weight='balanced'),
#         'params': {
#             'C': [0.5,1,2],
#             'solver': ['newton-cg','lbfgs'] # were found to give better results than other solvers like 'liblinear'
#         }
#     }
# }

In [117]:
# scores = []

# for model_name, mp in model_params.items():
#     clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
#     clf.fit(tr_fea, tr_label)
#     scores.append({
#         'model': model_name,
#         'best_score': clf.best_score_,
#         'best_params': clf.best_params_
#     })
    
# df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
# df

In [118]:
# we can check cross validation score of different models, with different combinations of parameters, and choose the model with best score, for this classificatio problem
# trying hyper parameterization

In [119]:
# checking cross validation score for various combinations of model parameters

lr = LogisticRegression(solver='liblinear',class_weight='balanced',C=0.5)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='liblinear',class_weight='balanced',C=0.75)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='liblinear',class_weight='balanced',C=1)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='liblinear',class_weight='balanced',C=1.25)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='liblinear',class_weight='balanced',C=0.80)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

0.7777854497520558
0.7811499591990458
0.7800263636934279
0.7800263636934279
0.782273554704664


In [120]:
# checking cross validation score for various combinations of model parameters

lr = LogisticRegression(solver='lbfgs',class_weight='balanced',C=0.5,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='lbfgs',class_weight='balanced',C=0.75,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='lbfgs',class_weight='balanced',C=1,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='lbfgs',class_weight='balanced',C=1.25,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='lbfgs',class_weight='balanced',C=0.25,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

0.776655577176574
0.7744146632352018
0.7744146632352018
0.7755382587408197
0.7822672776348001


In [121]:
# checking cross validation score for various combinations of model parameters

lr = LogisticRegression(solver='newton-cg',class_weight='balanced',C=0.5,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='newton-cg',class_weight='balanced',C=0.75,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='newton-cg',class_weight='balanced',C=1,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='newton-cg',class_weight='balanced',C=1.25,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

lr = LogisticRegression(solver='newton-cg',class_weight='balanced',C=0.25,max_iter=150)
print(np.average(cross_val_score(lr,tr_fea,tr_label,cv=5)))

0.776655577176574
0.7744146632352018
0.7744146632352018
0.7755382587408197
0.7822672776348001


In [122]:
# checking cross validation score for various combinations of model parameters

svm = SVC(kernel='linear',C=0.5)
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='linear',C=0.75)
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='linear',C=1)
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='linear',C=1.25)
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='linear',C=0.25)
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

0.7878601468834348
0.7878601468834348
0.7878601468834348
0.7878601468834348
0.7878601468834348


In [123]:
# checking cross validation score for various combinations of model parameters

svm = SVC(kernel='rbf',C=1,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='rbf',C=1.5,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='rbf',C=1.75,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='rbf',C=5,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='rbf',C=7.5,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

0.6847529973008599
0.7015253279769004
0.7048898374238906
0.7116125792480071
0.710488983742389


In [124]:
# checking cross validation score for various combinations of model parameters

svm = SVC(kernel='sigmoid',C=0.5,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='sigmoid',C=0.75,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='sigmoid',C=1,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='sigmoid',C=1.25,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

svm = SVC(kernel='sigmoid',C=0.25,gamma='auto')
print(np.average(cross_val_score(svm,tr_fea,tr_label,cv=5)))

0.6161634548992531
0.6161634548992531
0.6161634548992531
0.6161634548992531
0.6161634548992531


In [125]:
# checking cross validation score for various combinations of model parameters

knn = KNeighborsClassifier(metric='minkowski',p=2,n_neighbors=5)
print(np.average(cross_val_score(knn,tr_fea,tr_label,cv=5)))

knn = KNeighborsClassifier(metric='minkowski',p=2,n_neighbors=3)
print(np.average(cross_val_score(knn,tr_fea,tr_label,cv=5)))

knn = KNeighborsClassifier(metric='minkowski',p=2,n_neighbors=7)
print(np.average(cross_val_score(knn,tr_fea,tr_label,cv=5)))

0.698141987320319
0.7060134329295085
0.7049149457033457


In [126]:
# Uploading test.csv file from local drive and reading it into a data frame
uploaded = files.upload()
te_dframe = pd.read_csv(io.BytesIO(uploaded['test.csv']))

Saving test.csv to test.csv


In [127]:
# insight into the data
te_dframe.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [128]:
# we can convert some "object" type data to "int", for easier computation

# replacing 'male' with 0 and 'female' with 1 in 'Sex'
te_dframe.replace({'Sex':{'male': 0, 'female': 1}}, inplace=True) 

# replacing 'C' with 0, 'Q' with 1 and 'S' with 2 in 'Embarked'
te_dframe.replace({'Embarked':{'C': 1, 'Q': 2, 'S': 3}}, inplace=True)

In [129]:
# Some data have missing values of 'Age', 'Fare', 'Cabin'
te_dframe.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [130]:
# As only 1 value from 'Fare' is missing, we can replace it with mean value 
mean = te_dframe['Embarked'].mean() # average
print(mean)
te_dframe['Fare'].fillna(mean, inplace = True)

2.401913875598086


In [131]:
# As not many values are missing from 'Age', we can replace them with mean values 
mean = te_dframe['Age'].mean() # average
print(mean)
te_dframe['Age'].fillna(mean, inplace = True)

30.272590361445783


In [132]:
# We need to drop 'Cabin', as we didn't consider it for training
# analysis of modified data
te_dframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    int64  
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 36.0+ KB


In [133]:
# Also, we need to drop 'PassengerId', 'Name', 'Ticket' as they do not influence the chances of survival of a person

# final dataframe to be used for testing model
te_fea=te_dframe.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1)

# insight into the final dataframe
final_tr_dframe.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,3.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.925,3.0
3,1,1,1,35.0,1,0,53.1,3.0
4,0,3,0,35.0,0,0,8.05,3.0


In [134]:
# model with highest cross validation score: 0.7878601468834348
####
# This would be the 'preferred/final' model for this classification problem 
####
svm = SVC(kernel='linear',C=1)
svm.fit(tr_fea,tr_label)
pred=svm.predict(te_fea)

In [135]:
# submitting csv into drive
submsn=te_dframe.iloc[:,0:1]
submsn['Survived']=pred
drive.mount('/drive')
submsn.to_csv('/drive/My Drive/BOOKS_Sem2/Submsn1.csv')
# got a public score of 0.7655 in kaggle 

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [136]:
# just observing the training accuracy and precision of this model
pred=svm.predict(tr_fea)
print('accuracy:',accuracy_score(tr_label, pred))
print('precision:',precision_score(tr_label, pred, average='binary'))

accuracy: 0.7867564534231201
precision: 0.7420382165605095


In [137]:
# just trying to check how other models perform
# model with second highest cross validation score: 0.782273554704664
lr = LogisticRegression(solver='liblinear',class_weight='balanced',C=0.80)
lr.fit(tr_fea,tr_label)
pred=svm.predict(te_fea)

In [138]:
# submitting csv into drive
submsn=te_dframe.iloc[:,0:1]
submsn['Survived']=pred
drive.mount('/drive')
submsn.to_csv('/drive/My Drive/BOOKS_Sem2/Submsn2.csv')
# got a public score of 0.7655 in kaggle 

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [139]:
# just observing the training accuracy and precision of this model
pred=lr.predict(tr_fea)
print('accuracy:',accuracy_score(tr_label, pred))
print('precision:',precision_score(tr_label, pred, average='binary'))

accuracy: 0.7845117845117845
precision: 0.7083333333333334


In [140]:
# just trying to check how other models perform
# model with third highest cross validation score: 0.7822672776348001
lr = LogisticRegression(solver='lbfgs',class_weight='balanced',C=0.25,max_iter=150)
lr.fit(tr_fea,tr_label)
pred=svm.predict(te_fea)

In [141]:
# submitting csv into drive
submsn=te_dframe.iloc[:,0:1]
submsn['Survived']=pred
drive.mount('/drive')
submsn.to_csv('/drive/My Drive/BOOKS_Sem2/Submsn3.csv')
# got a public score of 0.7655 in kaggle 

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [142]:
# just observing the training accuracy and precision of this model
pred=lr.predict(tr_fea)
print('accuracy:',accuracy_score(tr_label, pred))
print('precision:',precision_score(tr_label, pred, average='binary'))

accuracy: 0.7912457912457912
precision: 0.7108108108108108


In [143]:
# just trying to check how other models perform
# model with fourth highest cross validation score: 0.7811499591990458
lr = LogisticRegression(solver='liblinear',class_weight='balanced',C=0.75)
lr.fit(tr_fea,tr_label)
pred=svm.predict(te_fea)

In [144]:
# submitting csv into drive
submsn=te_dframe.iloc[:,0:1]
submsn['Survived']=pred
drive.mount('/drive')
submsn.to_csv('/drive/My Drive/BOOKS_Sem2/Submsn4.csv')
# got a public score of 0.7655 in kaggle 

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [145]:
# just observing the training accuracy and precision of this model
pred=lr.predict(tr_fea)
print('accuracy:',accuracy_score(tr_label, pred))
print('precision:',precision_score(tr_label, pred, average='binary'))

accuracy: 0.7845117845117845
precision: 0.7083333333333334


In [146]:
# Observations
# 1. The final model to be chosen, from cross validation scores, is SVM.
# 2. All the top4 validation_score models performed similarly on test data, according to kaggle scores.
# 3. Tough not the correct criteria for choosing a model, for the purpose of exploration, I calculated training accuracy and precision scores. 
      # a. Logistic regression was found to have better accuracy than SVM.
      # b. SVM was found to have better precision than Logistic regression.