In [342]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [343]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import tensorflow as tf

In [344]:
df = pd.read_csv('/content/drive/My Drive/train.csv')
df_test = pd.read_csv('/content/drive/My Drive/test.csv')

In [345]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [346]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [347]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [348]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [349]:
df = df.drop(['PassengerId', 'Ticket', 'Cabin'], axis = 'columns')
df_test = df_test.drop(['PassengerId', 'Ticket', 'Cabin'], axis = 'columns')

In [350]:
fare = pd.concat([df['Fare'], df_test['Fare']])
df_test['Fare'].fillna(fare.mean(), inplace = True)

In [351]:
age = pd.concat([df['Age'], df_test['Age']])

df['Age'].fillna(age.mean(), inplace = True)
df_test['Age'].fillna(age.mean(), inplace = True)

combine = [df, df_test]

for dataset in combine:
  dataset.loc[dataset['Age'] < 10, 'Age'] = 0
  dataset.loc[(dataset['Age'] >= 10) & (dataset['Age'] < 20), 'Age'] = 1
  dataset.loc[(dataset['Age'] >= 20) & (dataset['Age'] < 30), 'Age'] = 2
  dataset.loc[(dataset['Age'] >= 30) & (dataset['Age'] < 40), 'Age'] = 3
  dataset.loc[(dataset['Age'] >= 40) & (dataset['Age'] < 50), 'Age'] = 4
  dataset.loc[(dataset['Age'] >= 50) & (dataset['Age'] < 60), 'Age'] = 5
  dataset.loc[(dataset['Age'] >= 60) & (dataset['Age'] < 70), 'Age'] = 6
  dataset.loc[(dataset['Age'] >= 70) & (dataset['Age'] < 80), 'Age'] = 7
  dataset.loc[(dataset['Age'] >= 80) & (dataset['Age'] < 90), 'Age'] = 8
  dataset.loc[dataset['Age'] >= 90, 'Age'] = 9

In [352]:
combine = [df, df_test]

for dataset in combine:
  dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.', expand = False)

for dataset in combine:
  dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
  dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
  dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
  dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

title = pd.concat([df['Title'], df_test['Title']])
title_ohe = pd.get_dummies(title)
title_ohe_train = title_ohe[:891]
title_ohe_test = title_ohe[891:]

df = pd.concat([df, title_ohe_train], axis = 1)
df_test = pd.concat([df_test, title_ohe_test], axis = 1)

df.drop('Name', axis = 'columns', inplace = True)
df_test.drop('Name', axis = 'columns', inplace = True)
df.drop('Title', axis = 'columns', inplace = True)
df_test.drop('Title', axis = 'columns', inplace = True)

In [353]:
df.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Master,Miss,Mr,Mrs,Rare
886,0,2,male,2.0,0,0,13.0,S,0,0,0,0,1
887,1,1,female,1.0,0,0,30.0,S,0,1,0,0,0
888,0,3,female,2.0,1,2,23.45,S,0,1,0,0,0
889,1,1,male,2.0,0,0,30.0,C,0,0,1,0,0
890,0,3,male,3.0,0,0,7.75,Q,0,0,1,0,0


In [354]:
df['Embarked'].value_counts(ascending = True)

Q     77
C    168
S    644
Name: Embarked, dtype: int64

In [355]:
df['Embarked'].fillna('S', inplace = True)
df_test['Embarked'].fillna('S', inplace = True)

In [356]:
embarked_ohe_train = pd.get_dummies(df['Embarked'])
embarked_ohe_test = pd.get_dummies(df_test['Embarked'])

df = pd.concat([df, embarked_ohe_train], axis = 'columns')
df_test = pd.concat([df_test, embarked_ohe_test], axis = 'columns')

df.drop('Embarked', axis = 'columns', inplace = True)
df_test.drop('Embarked', axis = 'columns', inplace = True)

In [357]:
df.replace({'Sex' : {'male' : 0, 'female' : 1}}, inplace = True)
df_test.replace({'Sex' : {'male' : 0, 'female' : 1}}, inplace = True)

In [358]:
df['Family'] = df['SibSp'] + df['Parch']
df_test['Family'] = df_test['SibSp'] + df_test['Parch']
df.drop(['SibSp', 'Parch'], axis = 'columns', inplace = True)
df_test.drop(['SibSp', 'Parch'], axis = 'columns', inplace = True)

In [359]:
df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Master,Miss,Mr,Mrs,Rare,C,Q,S,Family
Survived,1.0,-0.338481,0.543351,-0.050972,0.257307,0.085221,0.335636,-0.549199,0.341994,-0.012054,0.16824,0.00365,-0.149683,0.016639
Pclass,-0.338481,1.0,-0.1319,-0.33961,-0.5495,0.082081,-0.010261,0.142698,-0.153758,-0.187174,-0.243292,0.221009,0.074053,0.065997
Sex,0.543351,-0.1319,1.0,-0.074794,0.182333,-0.159934,0.693916,-0.867334,0.550146,-0.075638,0.082853,0.074115,-0.119224,0.200988
Age,-0.050972,-0.33961,-0.074794,1.0,0.099775,-0.339213,-0.232425,0.158386,0.159131,0.194853,0.030254,-0.063081,0.013193,-0.23425
Fare,0.257307,-0.5495,0.182333,0.099775,1.0,0.010908,0.119518,-0.183766,0.107259,0.016275,0.269335,-0.117216,-0.162184,0.217138
Master,0.085221,0.082081,-0.159934,-0.339213,0.010908,1.0,-0.110981,-0.254903,-0.087987,-0.035291,-0.035225,0.010478,0.024264,0.372472
Miss,0.335636,-0.010261,0.693916,-0.232425,0.119518,-0.110981,1.0,-0.601857,-0.207749,-0.083327,0.036204,0.167531,-0.137144,0.1075
Mr,-0.549199,0.142698,-0.867334,0.158386,-0.183766,-0.254903,-0.601857,1.0,-0.47716,-0.191387,-0.072567,-0.078338,0.11287,-0.338014
Mrs,0.341994,-0.153758,0.550146,0.159131,0.107259,-0.087987,-0.207749,-0.47716,1.0,-0.066063,0.067872,-0.090432,-0.00255,0.153842
Rare,-0.012054,-0.187174,-0.075638,0.194853,0.016275,-0.035291,-0.083327,-0.191387,-0.066063,1.0,0.030095,0.000311,-0.026561,-0.047426


In [360]:
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Master,Miss,Mr,Mrs,Rare,C,Q,S,Family
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,2.420875,32.204208,0.044893,0.207632,0.580247,0.141414,0.025814,0.188552,0.08642,0.725028,0.904602
std,0.486592,0.836071,0.47799,1.356289,49.693429,0.207186,0.40584,0.493796,0.348644,0.158668,0.391372,0.281141,0.446751,1.613459
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,2.0,7.9104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,0.0,2.0,14.4542,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,3.0,1.0,3.0,31.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,1.0,3.0,1.0,8.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0


In [361]:
df_test.describe()

Unnamed: 0,Pclass,Sex,Age,Fare,Master,Miss,Mr,Mrs,Rare,C,Q,S,Family
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.363636,2.476077,35.62161,0.050239,0.188995,0.574163,0.172249,0.014354,0.244019,0.110048,0.645933,0.839713
std,0.841838,0.481622,1.301411,55.840617,0.2187,0.391974,0.495062,0.378049,0.119088,0.430019,0.313324,0.478803,1.519072
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,2.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,0.0,2.0,14.4542,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,1.0,3.0,31.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,3.0,1.0,7.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0


In [362]:
df['Pclass'] = df['Pclass']/3
df_test['Pclass'] = df_test['Pclass']/3

df['Age'] = df['Age']/8
df_test['Age'] = df_test['Age']/8

df['Fare'] = df['Fare']/512.329200
df_test['Fare'] = df_test['Fare']/512.329200

df['Family'] = df['Family']/10
df_test['Family'] = df_test['Family']/10

In [363]:
X = df.drop(['Survived'], axis = 'columns')
y = df['Survived']

In [364]:
X_train, X_test = X[0:600], X[600:890]
y_train, y_test = y[0:600], y[600:890]

In [365]:
class Net(tf.keras.Model):
  def __init__(self):
    super(Net, self).__init__()
    self.f1 = tf.keras.layers.Flatten(
        input_dim = 13
    )

    self.f2 = tf.keras.layers.Dense(
        units = 300,
        activation = 'relu'
    )

    self.f3 = tf.keras.layers.Dense(
        units = 200,
        activation = 'relu'
    )

    self.f4 = tf.keras.layers.Dense(
        units = 100,
        activation = 'relu'
    )

    self.f5 = tf.keras.layers.Dense(
        units = 1,
        activation = 'sigmoid'
    )

  def call(self, x, training = None):
    x = self.f1(x)
    x = self.f2(x)
    x = self.f3(x)
    x = self.f4(x)
    y = self.f5(x)

    return y

model = Net()

In [366]:
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
    )

In [367]:
history = model.fit(X_train, y_train,
                    batch_size = 100,
                    epochs = 20,
                    verbose = 1,
                    validation_data = (X_test, y_test)
                    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [368]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose = 2)

print('\nTest accuracy:', test_acc)

10/10 - 0s - loss: 0.3820 - accuracy: 0.8483 - 53ms/epoch - 5ms/step

Test accuracy: 0.8482758402824402


In [372]:
prediction = model.predict(df_test)
prediction = np.array(prediction)



In [377]:
df_new = pd.read_csv('/content/drive/My Drive/test.csv')

df_new['Survived'] = prediction
df_new.loc[df_new['Survived'] < 0.5, 'Survived'] = 0
df_new.loc[~(df_new['Survived'] < 0.5), 'Survived'] = 1

submission = pd.DataFrame({'PassengerId': df_new['PassengerId'], 'Survived': df_new['Survived']})

In [379]:
from google.colab import files

submission.to_csv('submission.csv', index = False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>