In [1]:
# Data Analysis Libraries
import csv
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Machine Learning Libraries
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data_raw = pd.read_csv('titanic/train.csv')
data_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Checking if there is a NaN value preset or not.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
data_raw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# We got to know that there are few columsn which contains NAN values, so we need to replace them which proper 
# and staisfying values
data_clean = data_raw
#Age
data_clean['Age'] = data_clean['Age'].fillna(data_clean['Age'].mean())
normalized_df=(data_clean['Age']-data_clean['Age'].min())/(data_clean['Age'].max()-data_clean['Age'].min())
data_clean['Age'] = normalized_df
data_clean['Age'].unique()

array([0.27117366, 0.4722292 , 0.32143755, 0.43453129, 0.36792055,
       0.67328474, 0.01985423, 0.33400352, 0.17064589, 0.04498618,
       0.72354863, 0.24604172, 0.48479517, 0.68585072, 0.3842674 ,
       0.42196532, 0.18321186, 0.34656949, 0.09525006, 0.23347575,
       0.49736115, 0.8240764 , 0.52249309, 0.25860769, 0.22090978,
       0.03242021, 0.08268409, 0.61045489, 0.35913546, 0.81151043,
       0.35285248, 0.05755215, 0.13294798, 0.560191  , 0.2083438 ,
       0.39683338, 0.19577783, 0.30887158, 0.00515205, 0.37170143,
       0.40939935, 0.28373963, 0.2963056 , 0.57275697, 0.7361146 ,
       0.88690626, 0.45966323, 0.58532295, 0.17692888, 0.88062327,
       0.40311636, 0.14551395, 0.10781603, 0.45338025, 0.63558683,
       0.6921337 , 0.50364413, 0.54762503, 0.00728826, 0.76124654,
       0.69841669, 0.62302086, 0.44709726, 0.56647399, 0.2523247 ,
       0.77381252, 0.50992712, 0.6481528 , 0.78637849, 0.29002262,
       0.00628299, 0.53505906, 0.74868057, 0.12038201, 0.79894

In [6]:
# Fare
normalized_df_f=(data_clean['Fare']-data_clean['Fare'].min())/(data_clean['Fare'].max()-data_clean['Fare'].min())
data_clean['Fare'] = normalized_df_f

In [7]:
# Cabin
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) == 0:
            return substring

le = LabelEncoder()
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G']
# Forward Fill to propagate the previous value forward
data_clean['Cabin'] = data_clean['Cabin'].fillna(method='ffill')
# Backward Fill to propagate the next value forward (if in case there is no forward value to be filled)
data_clean['Cabin'] = data_clean['Cabin'].fillna(method='bfill')
data_clean['Deck']= data_clean['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
data_clean['Deck'].value_counts()


C    260
B    201
E    147
D    126
A     77
F     55
G     24
T      1
Name: Deck, dtype: int64

In [8]:
# Embarked
# Forward Fill to propagate the previous value forward
data_clean['Embarked'] = data_clean['Embarked'].fillna(method='ffill')
# Backward Fill to propagate the next value forward (if in case there is no forward value to be filled)
data_clean['Embarked'] = data_clean['Embarked'].fillna(method='bfill')

data_raw['Embarked'].value_counts()

S    644
C    169
Q     78
Name: Embarked, dtype: int64

In [9]:
# Converting into categorial form
data_clean['Sex_cat']= data_clean['Sex'].astype('category').cat.codes
data_clean['Deck_cat']= data_clean['Deck'].astype('category').cat.codes
data_clean['Embarked_cat']= data_clean['Embarked'].astype('category').cat.codes

In [10]:
# Adding new features
data_clean['Family'] = data_clean['SibSp'] + data_clean['Parch'] + 1
alone_or_family = []
for data in data_clean['Family']:
    if data == 1:
        alone_or_family.append(0)
    else:
        alone_or_family.append(1)
data_clean['Is_alone'] = alone_or_family

In [11]:
#cleaned_data
# Not conisdering Fare and Ticket as it is irrelevant for the person survival factor
# X = data_clean[['Pclass','Sex_cat','Age','SibSp','Parch','Deck_cat','Embarked_cat', 'Is_alone', 'Family']]
X = data_clean[['Pclass','Sex_cat','Age','Deck_cat','Embarked_cat', 'Is_alone', 'Family']]
Y = data_clean[['Survived']]

In [12]:
Y['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [13]:
X.head()

Unnamed: 0,Pclass,Sex_cat,Age,Deck_cat,Embarked_cat,Is_alone,Family
0,3,1,0.271174,2,2,1,2
1,1,0,0.472229,2,0,1,2
2,3,0,0.321438,2,2,0,1
3,1,0,0.434531,2,2,1,2
4,3,1,0.434531,2,2,0,1


In [31]:
type(Y)

pandas.core.frame.DataFrame

In [30]:
# Model - Linear Regression
model = LinearRegression()
scores = []
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for i, (train, test) in enumerate(kfold.split(X, Y)):
    model.fit(X.iloc[train,:], Y.iloc[train,:])
    score = model.score(X.iloc[test,:], Y.iloc[test,:])
    scores.append(score)
print(scores)

[0.44196517491481296, 0.34604336875007435, 0.4802682275209212, 0.28450047022616387, 0.3965669702707104]


In [15]:
# Working on test data set provided by the kaggle
test_data_raw = pd.read_csv('titanic/test.csv')
test_data_clean = test_data_raw

In [16]:
test_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [17]:
# Pre-Processing as done for the training data

#AGE
test_data_clean['Age'] = test_data_clean['Age'].fillna(test_data_clean['Age'].mean())
normalized_df=(test_data_clean['Age']-test_data_clean['Age'].min())/(test_data_clean['Age'].max()-test_data_clean['Age'].min())
test_data_clean['Age'] = normalized_df

# Fare
normalized_df_fare=(data_clean['Fare']-data_clean['Fare'].min())/(data_clean['Fare'].max()-data_clean['Fare'].min())
data_clean['Fare'] = normalized_df_fare

#CABIN
# Forward Fill to propagate the previous value forward
test_data_clean['Cabin'] = test_data_clean['Cabin'].fillna(method='ffill')
# Backward Fill to propagate the next value forward (if in case there is no forward value to be filled)
test_data_clean['Cabin'] = test_data_clean['Cabin'].fillna(method='bfill')
test_data_clean['Deck']= test_data_clean['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
test_data_clean['Deck'].value_counts()

# EMBARKED
# Forward Fill to propagate the previous value forward
test_data_clean['Embarked'] = test_data_clean['Embarked'].fillna(method='ffill')
# Backward Fill to propagate the next value forward (if in case there is no forward value to be filled)
test_data_clean['Embarked'] = test_data_clean['Embarked'].fillna(method='bfill')

# Adding new features
test_data_clean['Family'] = test_data_clean['SibSp'] + test_data_clean['Parch'] + 1
alone_or_family_test = []
for data in test_data_clean['Family']:
    if data == 1:
        alone_or_family_test.append(0)
    else:
        alone_or_family_test.append(1)
test_data_clean['Is_alone'] = alone_or_family_test

# Converting into categorial form
test_data_clean['Sex_cat']= test_data_clean['Sex'].astype('category').cat.codes
test_data_clean['Deck_cat']= test_data_clean['Deck'].astype('category').cat.codes
test_data_clean['Embarked_cat']= test_data_clean['Embarked'].astype('category').cat.codes


# X_test = test_data_clean[['Pclass','Sex_cat','Age','SibSp','Parch','Deck_cat','Embarked_cat','Is_alone','Family']]
X_test = test_data_clean[['Pclass','Sex_cat','Age','Deck_cat','Embarked_cat','Is_alone','Family']]
X_test.shape

(418, 7)

In [18]:
output = model.predict(X_test)
predicted_output = []
for i in output:
    if list(i)[0] > 0.5:
        predicted_output.append(1)
    else:
        predicted_output.append(0)
passenger_id_test = test_data_raw['PassengerId'].tolist()

In [19]:
type(passenger_id_test)

list

In [20]:
# Model - SVC
svc = SVC()
svc.fit(X, Y)
Y_pred_svc = svc.predict(X_test)
acc_svc = round(svc.score(X, Y) * 100, 2)
acc_svc

  y = column_or_1d(y, warn=True)


81.71

In [21]:
# Model - KNN 
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X, Y)
Y_pred_knn = knn.predict(X_test)
acc_knn = round(knn.score(X, Y) * 100, 2)
acc_knn

  This is separate from the ipykernel package so we can avoid doing imports until


87.32

In [22]:
# Model - GNB
gnb = GaussianNB()
gnb.fit(X, Y)
Y_pred_gnb = gnb.predict(X_test)
acc_gnb = round(gnb.score(X, Y) * 100, 2)
acc_gnb

  y = column_or_1d(y, warn=True)


78.9

In [23]:
# Model - Perceptron
perceptron = Perceptron()
perceptron.fit(X, Y)
Y_pred_percep = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X, Y) * 100, 2)
acc_perceptron

  y = column_or_1d(y, warn=True)


73.74

In [24]:
# Model - Linear SVC
svc = LinearSVC()
svc.fit(X, Y)
Y_pred_svc = svc.predict(X_test)
acc_linear_svc = round(svc.score(X, Y) * 100, 2)
acc_linear_svc

  y = column_or_1d(y, warn=True)


80.7

In [25]:
# Model SGD
sgd = SGDClassifier()
sgd.fit(X, Y)
Y_pred_sgd = sgd.predict(X_test)
acc_sgd = round(sgd.score(X, Y) * 100, 2)
acc_sgd

  y = column_or_1d(y, warn=True)


78.0

In [26]:
# Model _ Decission Tree
dt = DecisionTreeClassifier()
dt.fit(X, Y)
Y_pred_dt = dt.predict(X_test)
acc_decision_tree = round(dt.score(X, Y) * 100, 2)
acc_decision_tree

96.41

In [27]:
# Model Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X, Y)
Y_pred_rf = rf.predict(X_test)
acc_rf = round(rf.score(X, Y) * 100, 2)
acc_rf

  This is separate from the ipykernel package so we can avoid doing imports until


96.41

In [28]:
with open('gender_submission.csv', 'w') as outcsv:
    writer = csv.writer(outcsv)
    writer.writerow(["PassengerId", "Survived"])
    writer.writerows(zip(passenger_id_test, Y_pred_svc))                 
