# Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import os
import sklearn

# Importing and Preparing the Training and Test Datasets

In [2]:
X_train = pd.read_csv(r'C:\Users\Aditya Deepak\Downloads\Projects\Titanic\train.csv')
X_test = pd.read_csv(r'C:\Users\Aditya Deepak\Downloads\Projects\Titanic\test.csv')
Gender_Data = pd.read_csv(r'C:\Users\Aditya Deepak\Downloads\Projects\Titanic\gender_submission.csv')

In [3]:
X_test['Survived'] = Gender_Data['Survived']

# Understanding the Data

In [4]:
X_train.shape

(891, 12)

In [5]:
X_test.shape

(418, 12)

In [6]:
Gender_Data.shape

(418, 2)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Survived     418 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [9]:
Gender_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [10]:
X_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
X_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Survived         0
dtype: int64

In [12]:
Gender_Data.isnull().sum()

PassengerId    0
Survived       0
dtype: int64

# Dealing with the Null Values

In [13]:
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].median())

In [14]:
X_train.drop(['Cabin'], axis=1, inplace=True)
X_test.drop(['Cabin'], axis=1, inplace=True)

In [15]:
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())

In [16]:
X_train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [17]:
X_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [18]:
X_train['Embarked'] = X_train['Embarked'].fillna('S')

In [19]:
X_train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [20]:
X_test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Survived       0
dtype: int64

# Removing the Unnecessary Columns

In [21]:
X_train.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [22]:
X_test.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [23]:
X_train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [24]:
X_test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
       'Survived'],
      dtype='object')

# Data Analysis

In [25]:
for i in X_train.columns:
    print(X_train[i].value_counts())

0    549
1    342
Name: Survived, dtype: int64
3    491
1    216
2    184
Name: Pclass, dtype: int64
male      577
female    314
Name: Sex, dtype: int64
28.00    202
24.00     30
22.00     27
18.00     26
19.00     25
        ... 
55.50      1
74.00      1
0.92       1
70.50      1
12.00      1
Name: Age, Length: 88, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
0    678
1    118
2     80
3      5
5      5
4      4
6      1
Name: Parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
50.4958     1
13.8583     1
8.4583      1
7.7250      1
7.5208      1
Name: Fare, Length: 248, dtype: int64
S    646
C    168
Q     77
Name: Embarked, dtype: int64


In [26]:
for i in X_test.columns:
    print(X_test[i].value_counts())

3    218
1    107
2     93
Name: Pclass, dtype: int64
male      266
female    152
Name: Sex, dtype: int64
27.00    98
21.00    17
24.00    17
22.00    16
30.00    15
         ..
0.83      1
22.50     1
26.50     1
40.50     1
60.50     1
Name: Age, Length: 79, dtype: int64
0    283
1    110
2     14
3      4
4      4
8      2
5      1
Name: SibSp, dtype: int64
0    324
1     52
2     33
3      3
4      2
9      2
5      1
6      1
Name: Parch, dtype: int64
7.7500     21
26.0000    19
13.0000    17
8.0500     17
7.8958     11
           ..
50.0000     1
31.6833     1
16.0000     1
53.1000     1
20.2500     1
Name: Fare, Length: 170, dtype: int64
S    270
C    102
Q     46
Name: Embarked, dtype: int64
0    266
1    152
Name: Survived, dtype: int64


In [27]:
X_train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [28]:
X_test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,29.599282,0.447368,0.392344,35.627188,0.363636
std,0.841838,12.70377,0.89676,0.981429,55.8405,0.481622
min,1.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,23.0,0.0,0.0,7.8958,0.0
50%,3.0,27.0,0.0,0.0,14.4542,0.0
75%,3.0,35.75,1.0,0.0,31.5,1.0
max,3.0,76.0,8.0,9.0,512.3292,1.0


# Separating the Dependent Variables from X_train and X_test

In [29]:
y_train = X_train['Survived']

In [30]:
X_train.drop(['Survived'], axis=1, inplace=True)

In [31]:
y_test = X_test['Survived']

In [32]:
X_test.drop(['Survived'], axis=1, inplace=True)

# Encoding Categorical Variables

In [33]:
X_train = pd.get_dummies(X_train, drop_first=True)

In [34]:
print(X_train)

     Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  Embarked_S
0         3  22.0      1      0   7.2500         1           0           1
1         1  38.0      1      0  71.2833         0           0           0
2         3  26.0      0      0   7.9250         0           0           1
3         1  35.0      1      0  53.1000         0           0           1
4         3  35.0      0      0   8.0500         1           0           1
..      ...   ...    ...    ...      ...       ...         ...         ...
886       2  27.0      0      0  13.0000         1           0           1
887       1  19.0      0      0  30.0000         0           0           1
888       3  28.0      1      2  23.4500         0           0           1
889       1  26.0      0      0  30.0000         1           0           0
890       3  32.0      0      0   7.7500         1           1           0

[891 rows x 8 columns]


In [35]:
X_test = pd.get_dummies(X_test, drop_first=True)

In [36]:
print(X_test)

     Pclass   Age  SibSp  Parch      Fare  Sex_male  Embarked_Q  Embarked_S
0         3  34.5      0      0    7.8292         1           1           0
1         3  47.0      1      0    7.0000         0           0           1
2         2  62.0      0      0    9.6875         1           1           0
3         3  27.0      0      0    8.6625         1           0           1
4         3  22.0      1      1   12.2875         0           0           1
..      ...   ...    ...    ...       ...       ...         ...         ...
413       3  27.0      0      0    8.0500         1           0           1
414       1  39.0      0      0  108.9000         0           0           0
415       3  38.5      0      0    7.2500         1           0           1
416       3  27.0      0      0    8.0500         1           0           1
417       3  27.0      1      1   22.3583         1           0           0

[418 rows x 8 columns]


# Feature Scaling

In [37]:
#final_X_train = X_train.to_numpy()

In [38]:
#final_X_test = X_test.to_numpy()

In [39]:
from sklearn.preprocessing import StandardScaler
mysc = StandardScaler()
X_train = mysc.fit_transform(X_train)
X_test = mysc.transform(X_test)

In [40]:
#from sklearn.preprocessing import StandardScaler
#mysc = StandardScaler()
#X_res[:,5:] = mysc.fit_transform(X_res[:,5:])
#X_test[:,5:] = mysc.transform(X_test[:,5:])

# Importing the Machine Learning Algorithms

In [41]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Training XGBoost Algo on Training set

In [42]:
#classifier = XGBClassifier()
#classifier.fit(X_train, y_train)

# [[211  55]
# [ 33 119]]
#              precision    recall  f1-score   support
#
#           0       0.86      0.79      0.83       266
#           1       0.68      0.78      0.73       152
#
#    accuracy                           0.79       418
#   macro avg       0.77      0.79      0.78       418
# weighted avg       0.80      0.79      0.79       418

# 0.7894736842105263

# Accuracy: 82.38 %
# Standard Deviation: 3.67 %


# Training Logistic Regression Algo on Training set

In [43]:
#classifier = LogisticRegression(random_state = 0)
#classifier.fit(X_train, y_train)

#  [[251  15]
#   [  9 143]]
#               precision    recall  f1-score   support

#            0       0.97      0.94      0.95       266
#            1       0.91      0.94      0.92       152

#     accuracy                           0.94       418
#    macro avg       0.94      0.94      0.94       418
#  weighted avg       0.94      0.94      0.94       418

# 0.9425837320574163

#  Accuracy: 79.24 %
#  Standard Deviation: 2.15 %
    
#  Best Accuracy: 79.35 %
#  Best Parameters: {'penalty': 'none'}


# Training KNN Algo on Training set

In [44]:
#classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
#classifier.fit(X_train, y_train)

# [[213  53]
#  [ 25 127]]
#               precision    recall  f1-score   support

#            0       0.89      0.80      0.85       266
#            1       0.71      0.84      0.77       152

#     accuracy                           0.81       418
#    macro avg       0.80      0.82      0.81       418
# weighted avg       0.83      0.81      0.82       418

# 0.8133971291866029

# Accuracy: 80.92 %
# Standard Deviation: 3.47 %
    

# Best Accuracy: 81.94 %
# Best Parameters: {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}

# Training Decision Trees Algo on Training set¶

In [45]:
#classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
#classifier.fit(X_train, y_train)

# [[206  60]
#  [ 38 114]]
#               precision    recall  f1-score   support

#            0       0.84      0.77      0.81       266
#            1       0.66      0.75      0.70       152

#     accuracy                           0.77       418
#    macro avg       0.75      0.76      0.75       418
# weighted avg       0.78      0.77      0.77       418

# 0.7655502392344498

# Accuracy: 77.56 %
# Standard Deviation: 4.70 %
    
# Best Accuracy: 78.79 %
# Best Parameters: {'criterion': 'gini'}

# Training Naive Bayes Algo on Training set¶

In [46]:
#classifier = GaussianNB()
#classifier.fit(X_train, y_train)

# [[238  28]
#  [  6 146]]
#               precision    recall  f1-score   support

#            0       0.98      0.89      0.93       266
#            1       0.84      0.96      0.90       152

#     accuracy                           0.92       418
#    macro avg       0.91      0.93      0.91       418
# weighted avg       0.93      0.92      0.92       418

# 0.9186602870813397

# Accuracy: 78.12 %
# Standard Deviation: 1.81 %

# Training SVM Algo on Training set

In [47]:
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

# [[256  10]
#  [ 33 119]]
#               precision    recall  f1-score   support

#            0       0.89      0.96      0.92       266
#            1       0.92      0.78      0.85       152

#     accuracy                           0.90       418
#    macro avg       0.90      0.87      0.88       418
# weighted avg       0.90      0.90      0.90       418

# 0.8971291866028708

# Accuracy: 82.49 %
# Standard Deviation: 3.89 %
    
# Best Accuracy: 83.17 %
# Best Parameters: {'gamma': 0.2, 'kernel': 'rbf'}

# Training Random Forest Algo on Training set¶

In [48]:
#classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
#classifier.fit(X_train, y_train)

# [[233  33]
#  [ 39 113]]
#               precision    recall  f1-score   support

#            0       0.86      0.88      0.87       266
#            1       0.77      0.74      0.76       152

#     accuracy                           0.83       418
#    macro avg       0.82      0.81      0.81       418
# weighted avg       0.83      0.83      0.83       418

# 0.8277511961722488

# Accuracy: 80.48 %
# Standard Deviation: 3.97 %
    
# Best Accuracy: 80.93 %
# Best Parameters: {'criterion': 'entropy', 'n_estimators': 90}

# Confusion Matrix - Accuracy Score

In [49]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix, precision_score, recall_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print(cr)
accuracy_score(y_test, y_pred)

[[256  10]
 [ 33 119]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       266
           1       0.92      0.78      0.85       152

    accuracy                           0.90       418
   macro avg       0.90      0.87      0.88       418
weighted avg       0.90      0.90      0.90       418



0.8971291866028708

In [50]:
print(*y_pred, sep = '\n')

0
0
0
0
0
0
1
0
1
0
0
0
1
0
1
1
0
0
0
1
0
1
1
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
1
0
0
0
1
1
0
0
0
0
0
1
0
0
0
1
0
1
1
0
0
0
1
0
0
0
1
0
0
1
0
1
1
0
0
0
0
0
1
0
1
1
0
0
1
0
0
0
1
0
0
0
1
0
0
0
1
0
0
0
0
0
0
1
1
1
1
0
0
1
0
1
1
0
1
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
1
1
1
1
0
0
0
0
0
1
0
0
0
0
0
0
1
1
0
1
1
0
0
1
0
1
0
1
0
0
0
0
0
0
0
1
0
1
0
0
0
1
1
0
1
0
0
1
0
1
0
0
0
0
1
0
0
1
0
1
0
1
0
1
0
1
1
0
1
0
0
0
1
0
0
0
0
0
0
1
1
1
1
0
0
0
0
1
0
1
1
1
0
0
0
0
0
0
0
1
0
0
0
1
1
0
0
0
0
0
0
0
0
1
1
0
1
0
0
0
0
0
1
1
1
1
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
1
1
0
1
0
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
1
0
1
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
1
0
1
0
1
0
1
1
0
0
0
1
0
1
0
0
0
0
1
1
0
1
0
0
1
1
0
0
1
0
0
1
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
1
0
0
1
0
1
0
0
1
0
1
0
0
0
0
0
1
1
1
1
0
0
1
0
0
0


# K-Fold Cross Validation

In [51]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 82.49 %
Standard Deviation: 3.89 %


# Grid Search

In [52]:
from sklearn.model_selection import GridSearchCV
parameters = [{'kernel': ['rbf', 'linear','sigmoid','callable'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 83.17 %
Best Parameters: {'gamma': 0.2, 'kernel': 'rbf'}
