In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw passenger records from the CSV in the working directory.
data_set = pd.read_csv('titanic.csv')

# Candidate input columns as named in the raw file. This list is reassigned
# to the encoded feature columns before any model is fit.
xs = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age',
    'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked',
]

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split,
# and therefore every score computed from it, repeatable after a kernel
# restart; without it each run draws a different partition.
train, test = train_test_split(data_set, test_size=0.2, random_state=42)

In [None]:
!pip install --upgrade Pillow
!pip install ydata_profiling
import ydata_profiling
profile = ydata_profiling.ProfileReport(data_set)
profile.to_file('summary.html')



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Fill missing values and drop the Cabin column.
#
# The fill values (mean Age, most frequent Embarked) are computed from the
# training split only and applied to both splits, so preprocessing never looks
# at the held-out rows.
#
# The frames are rebuilt and re-bound instead of calling
# `frame[col].fillna(..., inplace=True)`: that form operates on a column
# selected from the frame, which pandas deprecates and which does not update
# the parent frame when Copy-on-Write is enabled.
fill_values = {
    'Age': train['Age'].mean(),
    'Embarked': train['Embarked'].mode().iloc[0],
}

# errors='ignore' lets the cell be re-run after Cabin has already been removed.
train = train.fillna(fill_values).drop(columns='Cabin', errors='ignore')
test = test.fillna(fill_values).drop(columns='Cabin', errors='ignore')

In [None]:
# One-hot encode the categorical predictors.
categorical_cols = ['Pclass', 'Sex', 'Embarked']

# Fix the category levels of each column from the training split and apply the
# same levels to both frames. Encoding each split independently lets
# `drop_first=True` drop a different level (or produce fewer columns) whenever
# a level happens to be absent from one split, which would make the train and
# test design matrices disagree. With a shared categorical dtype, get_dummies
# emits one column per declared level, so both frames get identical dummy
# columns. A test value never seen in training becomes missing and is encoded
# as all zeros.
category_dtypes = {
    col: pd.CategoricalDtype(sorted(train[col].dropna().unique()))
    for col in categorical_cols
}

train = pd.get_dummies(train.astype(category_dtypes), columns=categorical_cols, drop_first=True)
test = pd.get_dummies(test.astype(category_dtypes), columns=categorical_cols, drop_first=True)

In [None]:
# Preview only the first rows of the encoded training frame; rendering the
# whole table adds bulk (and passenger names) to the saved notebook.
train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
63,64,0,"Skoog, Master. Harald",4.000000,3,2,347088,27.9000,0,1,1,0,1
213,214,0,"Givard, Mr. Hans Kristensen",30.000000,0,0,250646,13.0000,1,0,1,0,1
484,485,1,"Bishop, Mr. Dickinson H",25.000000,1,0,11967,91.0792,0,0,1,0,0
515,516,0,"Walker, Mr. William Anderson",47.000000,0,0,36967,34.0208,0,0,1,0,1
325,326,1,"Young, Miss. Marie Grice",36.000000,0,0,PC 17760,135.6333,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,501,0,"Calic, Mr. Petar",17.000000,0,0,315086,8.6625,0,1,1,0,1
598,599,0,"Boulos, Mr. Hanna",29.494859,0,0,2664,7.2250,0,1,1,0,0
807,808,0,"Pettersson, Miss. Ellen Natalia",18.000000,0,0,347087,7.7750,0,1,0,0,1
320,321,0,"Dennis, Mr. Samuel",22.000000,0,0,A/5 21172,7.2500,0,1,1,0,1


In [None]:
# Columns used as model inputs; 'Survived' is the label being predicted.
xs = [
    'Pclass_2', 'Pclass_3', 'Sex_male',
    'Age', 'SibSp', 'Fare',
    'Embarked_Q', 'Embarked_S',
]

# Feature matrix and label vector for each split.
xtrain, ytrain = train[xs], train['Survived']
xtest, ytest = test[xs], test['Survived']

In [None]:
# Baseline: logistic regression.
# With the default iteration cap the solver stops before converging on these
# unscaled features (scikit-learn reports a convergence warning), so the
# fitted coefficients are not the optimum. Give it a larger budget.
lr = LogisticRegression(max_iter=1000)
lr.fit(xtrain, ytrain)

# Accuracy on the data the model was fit on vs. the held-out rows.
yhat_train = lr.predict(xtrain)
yhat_test = lr.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.8258426966292135, score on test: 0.7206703910614525


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Unconstrained decision tree, as a reference point for overfitting.
# random_state fixes the tree's internal randomness so that the same tree,
# and the same scores, come out of every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xtrain, ytrain)

yhat_train = dt.predict(xtrain)
yhat_test = dt.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.9873595505617978, score on test: 0.7150837988826816


Tune `max_depth` with a cross-validated grid search to reduce overfitting.

In [None]:
# 5-fold cross-validated search over the tree depth.
# GridSearchCV comes from the notebook's top import cell. The estimator is
# seeded so the selected depth does not vary from run to run.
dt = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(dt, {'max_depth': [2, 3, 4, 5]}, cv=5)
gs.fit(xtrain, ytrain)
print(gs.best_params_)

{'max_depth': 4}


In [None]:
# Depth-limited decision tree. max_depth=4 is the value the preceding grid
# search selected when this notebook was last run; re-check it if the data or
# the split changes. random_state makes the fitted tree repeatable.
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(xtrain, ytrain)

yhat_train = dt.predict(xtrain)
yhat_test = dt.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.8581460674157303, score on test: 0.7486033519553073


In [None]:
# Random forest with default hyperparameters.
# The forest bootstraps rows and subsamples features at random; seeding it
# makes the reported scores repeatable.
rf = RandomForestClassifier(random_state=42)
rf.fit(xtrain, ytrain)

yhat_train = rf.predict(xtrain)
yhat_test = rf.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.9873595505617978, score on test: 0.7430167597765364


Tune `n_estimators` and `max_depth` with a cross-validated grid search to reduce overfitting.

In [None]:
# 5-fold cross-validated search over forest size and tree depth.
# GridSearchCV comes from the notebook's top import cell. The estimator is
# seeded so that differences between grid points reflect the hyperparameters
# rather than run-to-run randomness.
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [2, 3, 4, 5],
}
gs = GridSearchCV(rf, param_grid, cv=5)
gs.fit(xtrain, ytrain)
print(gs.best_params_)

{'max_depth': 5, 'n_estimators': 250}


In [None]:
# Random forest with the hyperparameters the preceding grid search selected
# when this notebook was last run (max_depth=5, n_estimators=250); re-check
# them if the data or the split changes. random_state makes the fit repeatable.
rf = RandomForestClassifier(max_depth=5, n_estimators=250, random_state=42)
rf.fit(xtrain, ytrain)

yhat_train = rf.predict(xtrain)
yhat_test = rf.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.8623595505617978, score on test: 0.7653631284916201


In [None]:
# 5-fold cross-validated search over the neighbourhood size, k = 1..49.
# GridSearchCV comes from the notebook's top import cell.
# NOTE(review): the features are not scaled, so distances are presumably
# dominated by the wide-ranged columns (Age, Fare); confirm this is intended.
knn = KNeighborsClassifier()
gs = GridSearchCV(knn, {'n_neighbors': list(range(1, 50))}, cv=5)
gs.fit(xtrain, ytrain)
print(gs.best_params_)

{'n_neighbors': 17}


In [None]:
# k-nearest neighbours with k = 17 (fit returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=17).fit(xtrain, ytrain)

# Predict both splits, then report accuracy for each.
yhat_train, yhat_test = knn.predict(xtrain), knn.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.7598314606741573, score on test: 0.6145251396648045


In [None]:
# Linear support-vector classifier.
# random_state fixes the solver's internal shuffling so the scores are
# repeatable, and max_iter is raised above the default budget.
# NOTE(review): the features are unscaled, which presumably slows convergence
# and explains the weak score; the larger max_iter is a stopgap. Confirm that
# no convergence warning is emitted, and consider scaling the inputs.
svm = LinearSVC(max_iter=10000, random_state=42)
svm.fit(xtrain, ytrain)

yhat_train = svm.predict(xtrain)
yhat_test = svm.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.6587078651685393, score on test: 0.6033519553072626




In [None]:
# Gaussian naive Bayes with default settings (fit returns the estimator itself).
gnb = GaussianNB().fit(xtrain, ytrain)

# Score the model on the rows it was fit on and on the held-out rows.
yhat_train, yhat_test = gnb.predict(xtrain), gnb.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.8089887640449438, score on test: 0.7094972067039106


In [None]:
# Multinomial naive Bayes with default settings.
# NOTE(review): this model is usually applied to count-like features, while
# Age and Fare are continuous; confirm it is an intended comparison.
mnb = MultinomialNB().fit(xtrain, ytrain)

# Score the model on the rows it was fit on and on the held-out rows.
yhat_train, yhat_test = mnb.predict(xtrain), mnb.predict(xtest)
print(
    f"score on train: {accuracy_score(ytrain, yhat_train)}, "
    f"score on test: {accuracy_score(ytest, yhat_test)}"
)

score on train: 0.6980337078651685, score on test: 0.664804469273743
