In [1]:
import csv
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

%matplotlib inline


In [2]:
data_raw = pd.read_csv('titanic/train.csv')
data_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Inspect the distinct values of each column to check whether any NaN values are present.
data_raw['PassengerId'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [4]:
data_raw['Survived'].unique()

array([0, 1])

In [5]:
data_raw['Pclass'].unique()

array([3, 1, 2])

In [6]:
data_raw['Sex'].unique()

array(['male', 'female'], dtype=object)

In [7]:
# NaN value present
data_raw['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [8]:
data_raw['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [9]:
data_raw['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6])

In [10]:
data_raw['Fare'].unique()

array([  7.25  ,  71.2833,   7.925 ,  53.1   ,   8.05  ,   8.4583,
        51.8625,  21.075 ,  11.1333,  30.0708,  16.7   ,  26.55  ,
        31.275 ,   7.8542,  16.    ,  29.125 ,  13.    ,  18.    ,
         7.225 ,  26.    ,   8.0292,  35.5   ,  31.3875, 263.    ,
         7.8792,   7.8958,  27.7208, 146.5208,   7.75  ,  10.5   ,
        82.1708,  52.    ,   7.2292,  11.2417,   9.475 ,  21.    ,
        41.5792,  15.5   ,  21.6792,  17.8   ,  39.6875,   7.8   ,
        76.7292,  61.9792,  27.75  ,  46.9   ,  80.    ,  83.475 ,
        27.9   ,  15.2458,   8.1583,   8.6625,  73.5   ,  14.4542,
        56.4958,   7.65  ,  29.    ,  12.475 ,   9.    ,   9.5   ,
         7.7875,  47.1   ,  15.85  ,  34.375 ,  61.175 ,  20.575 ,
        34.6542,  63.3583,  23.    ,  77.2875,   8.6542,   7.775 ,
        24.15  ,   9.825 ,  14.4583, 247.5208,   7.1417,  22.3583,
         6.975 ,   7.05  ,  14.5   ,  15.0458,  26.2833,   9.2167,
        79.2   ,   6.75  ,  11.5   ,  36.75  ,   7.7958,  12.5

In [11]:
# NaN value present
data_raw['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [12]:
# NaN value present
data_raw['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [13]:
# Several columns contain NaN values, so they are replaced with sensible
# substitutes before modelling.
# Work on an independent copy: a plain `data_clean = data_raw` only aliases the
# same DataFrame, so every cleaning step would silently overwrite the raw data.
data_clean = data_raw.copy()
# Age: fill missing values with the column mean, then min-max scale the column.
data_clean['Age'] = data_clean['Age'].fillna(data_clean['Age'].mean())
normalized_df=(data_clean['Age']-data_clean['Age'].min())/(data_clean['Age'].max()-data_clean['Age'].min())
data_clean['Age'] = normalized_df
data_clean['Age'].unique()

array([0.27117366, 0.4722292 , 0.32143755, 0.43453129, 0.36792055,
       0.67328474, 0.01985423, 0.33400352, 0.17064589, 0.04498618,
       0.72354863, 0.24604172, 0.48479517, 0.68585072, 0.3842674 ,
       0.42196532, 0.18321186, 0.34656949, 0.09525006, 0.23347575,
       0.49736115, 0.8240764 , 0.52249309, 0.25860769, 0.22090978,
       0.03242021, 0.08268409, 0.61045489, 0.35913546, 0.81151043,
       0.35285248, 0.05755215, 0.13294798, 0.560191  , 0.2083438 ,
       0.39683338, 0.19577783, 0.30887158, 0.00515205, 0.37170143,
       0.40939935, 0.28373963, 0.2963056 , 0.57275697, 0.7361146 ,
       0.88690626, 0.45966323, 0.58532295, 0.17692888, 0.88062327,
       0.40311636, 0.14551395, 0.10781603, 0.45338025, 0.63558683,
       0.6921337 , 0.50364413, 0.54762503, 0.00728826, 0.76124654,
       0.69841669, 0.62302086, 0.44709726, 0.56647399, 0.2523247 ,
       0.77381252, 0.50992712, 0.6481528 , 0.78637849, 0.29002262,
       0.00628299, 0.53505906, 0.74868057, 0.12038201, 0.79894

In [14]:
# Fare: min-max scale the column using its own minimum and maximum.
fare_low = data_clean['Fare'].min()
fare_span = data_clean['Fare'].max() - fare_low
normalized_df_f = (data_clean['Fare'] - fare_low) / fare_span
data_clean['Fare'] = normalized_df_f

In [15]:
# Cabin
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that ``big_string`` starts with.

    Despite the name, only a match at position 0 counts (a prefix match).
    Candidates are tried in the order given; ``None`` is returned when none of
    them is a prefix of ``big_string``.
    """
    return next(
        (candidate for candidate in substrings if big_string.startswith(candidate)),
        None,
    )

# NOTE: `le` is not referenced by any other cell visible in this notebook.
le = LabelEncoder()
# Deck letters tried, in this order, as the prefix of a cabin identifier.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G']
# Impute missing cabins: carry the previous known value forward, then
# back-fill any leading rows that have no earlier value to copy from.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` form.
data_clean['Cabin'] = data_clean['Cabin'].ffill().bfill()
# Deck is the entry of cabin_list that the cabin string starts with.
data_clean['Deck']= data_clean['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
data_clean['Deck'].value_counts()


C    260
B    201
E    147
D    126
A     77
F     55
G     24
T      1
Name: Deck, dtype: int64

In [16]:
# Embarked
# Impute missing values: carry the previous known value forward, then
# back-fill any leading rows that have no earlier value to copy from.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` form.
data_clean['Embarked'] = data_clean['Embarked'].ffill().bfill()

# Inspect the cleaned column itself rather than the raw frame.
data_clean['Embarked'].value_counts()

S    644
C    169
Q     78
Name: Embarked, dtype: int64

In [17]:
# Encode each categorical column as integer category codes, stored in a new
# '<column>_cat' column next to the original.
for column in ('Sex', 'Deck', 'Embarked'):
    data_clean[column + '_cat'] = data_clean[column].astype('category').cat.codes

In [18]:
# Engineered features
# Family = SibSp + Parch + 1 (the +1 presumably counts the passenger).
data_clean['Family'] = data_clean['SibSp'] + data_clean['Parch'] + 1
# NOTE: despite its name, Is_alone is 0 when Family == 1 and 1 otherwise.
# The test-set preprocessing cell uses the same encoding, so the two must be
# changed together if this is ever flipped.
alone_or_family = [0 if family_size == 1 else 1 for family_size in data_clean['Family']]
data_clean['Is_alone'] = alone_or_family

In [19]:
# Feature matrix and target taken from the cleaned data.
# Fare and Ticket are left out: they are treated as irrelevant to a passenger's survival.
feature_columns = ['Pclass','Sex_cat','Age','SibSp','Parch','Deck_cat','Embarked_cat', 'Is_alone', 'Family']
# Alternative feature set without SibSp/Parch:
# ['Pclass','Sex_cat','Age','Deck_cat','Embarked_cat', 'Is_alone', 'Family']
X = data_clean[feature_columns]
# Double brackets keep Y a one-column DataFrame rather than a Series.
Y = data_clean[['Survived']]

In [20]:
Y['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [21]:
X.head()

Unnamed: 0,Pclass,Sex_cat,Age,SibSp,Parch,Deck_cat,Embarked_cat,Is_alone,Family
0,3,1,0.271174,1,0,2,2,1,2
1,1,0,0.472229,1,0,2,0,1,2
2,3,0,0.321438,0,0,2,2,0,1
3,1,0,0.434531,1,0,2,2,1,2
4,3,1,0.434531,0,0,2,2,0,1


In [22]:
# 5-fold cross-validation of a linear regression on the 0/1 Survived target.
# Each entry of `scores` is the held-out fold's LinearRegression.score (R^2).
model = LinearRegression()
scores = []
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for train, test in kfold.split(X, Y):
    model.fit(X.iloc[train, :], Y.iloc[train, :])
    scores.append(model.score(X.iloc[test, :], Y.iloc[test, :]))
print(scores)

# Refit on the complete training set. Without this, `model` would keep the
# parameters of the last fold only (fit on 4/5 of the rows), and that partial
# fit is what later cells would use for prediction.
model.fit(X, Y)

[0.44307529313835026, 0.3466936034017244, 0.48088457087229597, 0.2858687393155517, 0.3915439122511488]


In [23]:
# Apply the training-set preprocessing to the test set provided by Kaggle.

test_data_raw = pd.read_csv('titanic/test.csv')
# Independent copy so the raw test frame is not overwritten by the cleaning below.
test_data_clean = test_data_raw.copy()

# The model was fit on features scaled with TRAINING statistics, so the test
# set must be transformed with those same statistics. The in-memory training
# columns have already been rescaled, so the raw values are re-read from disk.
train_reference = pd.read_csv('titanic/train.csv', usecols=['Age', 'Fare'])

# AGE: fill gaps with the training mean, then scale with the training min/max.
age_min, age_max = train_reference['Age'].min(), train_reference['Age'].max()
normalized_df = (
    test_data_clean['Age'].fillna(train_reference['Age'].mean()) - age_min
) / (age_max - age_min)
test_data_clean['Age'] = normalized_df

# FARE: scale the TEST frame (the previous version rescaled data_clean here by
# mistake, leaving the test Fare untouched). Missing fares are left as NaN;
# Fare is not part of the selected features below.
fare_min, fare_max = train_reference['Fare'].min(), train_reference['Fare'].max()
normalized_df_fare = (test_data_clean['Fare'] - fare_min) / (fare_max - fare_min)
test_data_clean['Fare'] = normalized_df_fare

# CABIN: forward-fill, then back-fill leading gaps; Deck is the entry of
# cabin_list that the cabin string starts with.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` form.
test_data_clean['Cabin'] = test_data_clean['Cabin'].ffill().bfill()
test_data_clean['Deck']= test_data_clean['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

# EMBARKED: same forward-fill / back-fill imputation.
test_data_clean['Embarked'] = test_data_clean['Embarked'].ffill().bfill()

# Engineered features, using the same encoding as the training cell:
# Is_alone is 0 when Family == 1 and 1 otherwise.
test_data_clean['Family'] = test_data_clean['SibSp'] + test_data_clean['Parch'] + 1
alone_or_family_test = [0 if family_size == 1 else 1 for family_size in test_data_clean['Family']]
test_data_clean['Is_alone'] = alone_or_family_test

# Category codes: reuse the categories observed in the training frame so a
# given label always maps to the same integer in both sets. A label that never
# occurred in training is encoded as -1.
for column in ('Sex', 'Deck', 'Embarked'):
    train_categories = data_clean[column].astype('category').cat.categories
    test_data_clean[column + '_cat'] = pd.Categorical(
        test_data_clean[column], categories=train_categories
    ).codes


X_test = test_data_clean[['Pclass','Sex_cat','Age','SibSp','Parch','Deck_cat','Embarked_cat','Is_alone','Family']]
# Alternative feature set without SibSp/Parch:
# ['Pclass','Sex_cat','Age','Deck_cat','Embarked_cat','Is_alone','Family']
X_test.shape

(418, 9)

In [24]:
# Turn the regression outputs into 0/1 labels using a 0.5 cut-off.
output = model.predict(X_test)
# Each element of `output` is indexed at position 0 to obtain its predicted value.
predicted_output = [1 if row[0] > 0.5 else 0 for row in output]
passenger_id_test = test_data_raw['PassengerId'].tolist()

In [25]:
type(passenger_id_test)

list

In [26]:
# Write the submission file: a header row, then one (PassengerId, Survived) row per passenger.
# newline='' is required by the csv module; without it the writer's own '\r\n'
# line endings get translated again and blank rows appear on Windows.
with open('gender_submission.csv', 'w', newline='') as outcsv:
    writer = csv.writer(outcsv)
    writer.writerow(["PassengerId", "Survived"])
    writer.writerows(zip(passenger_id_test, predicted_output))
