## Import libraries and collect data

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
df = pd.read_csv("data/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


## Clean data

In [31]:
df.drop(["PassengerId", "Name", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"], axis="columns", inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [32]:
dummies = pd.get_dummies(df["Sex"])

df = pd.concat([df,dummies], axis="columns")
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived,female,male
0,3,male,22.0,7.25,0,0,1
1,1,female,38.0,71.2833,1,1,0
2,3,female,26.0,7.925,1,1,0
3,1,female,35.0,53.1,1,1,0
4,3,male,35.0,8.05,0,0,1


In [33]:
df.drop(["Sex", "female"], axis="columns", inplace=True)
df.head()

Unnamed: 0,Pclass,Age,Fare,Survived,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [34]:
df.columns[df.isna().any()]

Index(['Age'], dtype='object')

In [35]:
df["Age"][:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [36]:
df["Age"] = df["Age"].fillna(df["Age"].mean())
df.columns[df.isna().any()]

Index([], dtype='object')

## Create model

In [37]:
X = df.drop("Survived", axis="columns")
Y = df["Survived"]
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2)

In [38]:
len(X_train)

712

In [39]:
len(X_test)

179

In [40]:
mymodel = GaussianNB()
mymodel.fit(X_train, Y_train)

GaussianNB()

## Evaluate model

In [41]:
mymodel.score(X_test, Y_test)

0.7486033519553073

In [42]:
Y_test[:10]

196    0
539    1
571    1
817    0
169    0
804    1
684    0
445    1
750    1
174    0
Name: Survived, dtype: int64

In [44]:
mymodel.predict(X_test[:10])

array([0, 1, 1, 0, 0, 0, 0, 1, 1, 0], dtype=int64)

In [45]:
mymodel.predict_proba(X_test[:10])

array([[0.97122251, 0.02877749],
       [0.04250937, 0.95749063],
       [0.04302818, 0.95697182],
       [0.92384793, 0.07615207],
       [0.94375935, 0.05624065],
       [0.97023403, 0.02976597],
       [0.90847174, 0.09152826],
       [0.25884503, 0.74115497],
       [0.14467632, 0.85532368],
       [0.75879473, 0.24120527]])