In [1]:
import pandas as pd

<h1>Load Titanic Dataset</h1>

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


<h1>Data Cleaning</h1>

In [3]:
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis='columns', inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


<h1>Split into Input, Target</h1>

In [4]:
inputs = df.iloc[:, :-1]
target = df.iloc[:, -1]

<h1>Dummies</h1>

In [5]:
sex_dummy = pd.get_dummies(df.Sex)
sex_dummy.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [6]:
inputs = pd.concat([df, sex_dummy], axis='columns')
inputs.drop(['Sex'], axis='columns', inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,Survived,female,male
0,3,22.0,7.25,0,0,1
1,1,38.0,71.2833,1,1,0
2,3,26.0,7.925,1,1,0
3,1,35.0,53.1,1,1,0
4,3,35.0,8.05,0,0,1


<h1>Preprocessing</h1>

In [7]:
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [8]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())

In [9]:
inputs.columns[inputs.isna().any()]

Index([], dtype='object')

<h1>Naive Bayes (Gaussian NB)</h1>

In [10]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

<h1>Train, Test Split</h1>

In [11]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

scores = []

for train_index, test_index in skf.split(inputs, target):
    x_train, x_test = inputs.iloc[train_index], inputs.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    gnb.fit(x_train, y_train)
    
    scores.append(gnb.score(x_test, y_test))

In [12]:
scores

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]