In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_style("darkgrid")

In [2]:
# Importing the dataset
df = pd.read_csv("dataset/titanic_survivor.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
print(df.shape)

(418, 12)


In [4]:
# Drop unnecessary columns

df=df.drop(["Name","Ticket"],axis=1)

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,0,3,male,34.5,0,0,7.8292,,Q
1,893,1,3,female,47.0,1,0,7.0,,S
2,894,0,2,male,62.0,0,0,9.6875,,Q
3,895,0,3,male,27.0,0,0,8.6625,,S
4,896,1,3,female,22.0,1,1,12.2875,,S


In [6]:
df.shape

(418, 10)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Fare         417 non-null    float64
 8   Cabin        91 non-null     object 
 9   Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 32.8+ KB


In [8]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [9]:
# Dropping cabin because it holds many null values.

df.drop("Cabin",axis=1,inplace=True)

In [10]:
df["Age"].median()

27.0

In [11]:
df["Age"] = df["Age"].fillna(df["Age"].median())

In [12]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64

In [13]:
df=df.dropna()

In [14]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [15]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,0,3,male,34.5,0,0,7.8292,Q
1,893,1,3,female,47.0,1,0,7.0,S
2,894,0,2,male,62.0,0,0,9.6875,Q
3,895,0,3,male,27.0,0,0,8.6625,S
4,896,1,3,female,22.0,1,1,12.2875,S


In [16]:
df["Survived_or_Not"]=df["Survived"]

In [17]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived_or_Not
0,892,0,3,male,34.5,0,0,7.8292,Q,0
1,893,1,3,female,47.0,1,0,7.0,S,1
2,894,0,2,male,62.0,0,0,9.6875,Q,0
3,895,0,3,male,27.0,0,0,8.6625,S,0
4,896,1,3,female,22.0,1,1,12.2875,S,1


In [18]:
df["Survived_or_Not"].replace({0:"Not Survived",1:"Survived"},inplace=True)

In [19]:
df["Survived_or_Not"].head(2)

0    Not Survived
1        Survived
Name: Survived_or_Not, dtype: object

In [20]:
df["Survived_or_Not"].value_counts()

Not Survived    265
Survived        152
Name: Survived_or_Not, dtype: int64

In [21]:
#Starting AINN

In [22]:
df["Survived_or_Not"].replace({"Not Survived":0,"Survived":1},inplace=True)

In [23]:
df["Sex"].replace({"male":1,"female":0},inplace=True)

In [24]:
df["Embarked"].replace({"Q":0,"S":1,"C":2},inplace=True)

In [25]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived_or_Not
0,892,0,3,1,34.5,0,0,7.8292,0,0
1,893,1,3,0,47.0,1,0,7.0,1,1
2,894,0,2,1,62.0,0,0,9.6875,0,0
3,895,0,3,1,27.0,0,0,8.6625,1,0
4,896,1,3,0,22.0,1,1,12.2875,1,1


In [26]:
df["Survived_or_Not"].value_counts()

0    265
1    152
Name: Survived_or_Not, dtype: int64

In [27]:
# Dividing dependent and independent variables

X=df.drop("Survived_or_Not",axis=1)
y=df["Survived_or_Not"]

In [28]:
#Why "Survived_or_Not" is here..??
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived_or_Not
0,892,0,3,1,34.5,0,0,7.8292,0,0
1,893,1,3,0,47.0,1,0,7.0,1,1
2,894,0,2,1,62.0,0,0,9.6875,0,0
3,895,0,3,1,27.0,0,0,8.6625,1,0
4,896,1,3,0,22.0,1,1,12.2875,1,1


In [29]:
# Sampling

from imblearn.over_sampling import SMOTE 
from collections import Counter

sampler = SMOTE()
X_res, y_res = sampler.fit_resample(X, y)

print("SMOTE {}".format(Counter(y_res)))

SMOTE Counter({0: 265, 1: 265})


In [30]:
# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [31]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [33]:
#Create ANN

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(activation="relu", input_dim=9, units=6, kernel_initializer="uniform"))

# Adding the second hidden layer
classifier.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))

# Adding the output layer
classifier.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))

# Compiling the ANN
classifier.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 10, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5942389240>

In [34]:
# Part 3 - Making the predictions and evaluating the model

# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)



In [35]:
y_pred

array([[ True],
       [False],
       [ True],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [ True],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [ True],
       [