In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [4]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [5]:
inputs = df.drop('Survived',axis='columns')
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [6]:
target = df.Survived
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [7]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [8]:
X = pd.concat([inputs,dummies],axis='columns')
X.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


In [9]:
X.drop('Sex',axis='columns',inplace=True)
X.head(3)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0


In [10]:
X.columns[X.isna().any()]

Index(['Age'], dtype='object')

In [11]:
X.isnull().sum()

Pclass      0
Age       177
Fare        0
female      0
male        0
dtype: int64

In [12]:
X.Age = X.Age.fillna(X.Age.mean())

In [13]:
X.isnull().sum()

Pclass    0
Age       0
Fare      0
female    0
male      0
dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,target,test_size=0.2)

In [16]:
len(X_train)

712

In [17]:
len(X_test)

179

In [18]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [19]:
gnb.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [20]:
gnb.score(X_test,y_test)

0.8044692737430168

In [21]:
gnb.predict(X_test)

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1], dtype=int64)

In [22]:
gnb.predict_proba(X_test)[:10]

array([[9.04171062e-01, 9.58289381e-02],
       [5.06795967e-02, 9.49320403e-01],
       [9.54014988e-01, 4.59850125e-02],
       [5.41825021e-05, 9.99945817e-01],
       [8.96580480e-01, 1.03419520e-01],
       [9.87905742e-01, 1.20942576e-02],
       [9.89937547e-01, 1.00624528e-02],
       [9.89376909e-01, 1.06230911e-02],
       [8.77756184e-02, 9.12224382e-01],
       [5.58331698e-03, 9.94416683e-01]])

### Cross Val Score

In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
gnb_score = cross_val_score(GaussianNB(),X,target,cv=5)
gnb_score

array([0.79329609, 0.80337079, 0.78089888, 0.74719101, 0.79775281])

In [25]:
np.average(gnb_score)

0.7845019145063085