# Gaussian Titanic Naive Bayes

In [1]:
# import libraries

import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB   #import Gaussian Bayes modeling function
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_excel("titanic.xlsx")
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
df.info()

#missing values mostly for body category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [6]:
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [8]:
modeldf = df[["pclass", "survived", "sex", "age", "sibsp", "parch", "fare"]]

#not including "body" column because so many missing values (and directly correlated with survival)

modeldf.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,female,29.0,0,0,211.3375
1,1,1,male,0.9167,1,2,151.55
2,1,0,female,2.0,1,2,151.55
3,1,0,male,30.0,1,2,151.55
4,1,0,female,25.0,1,2,151.55


In [80]:
#transform gender column to binary values (0,1)
modeldf['sex'] = modeldf['sex'].map({'female': 0, 'male': 1})
modeldf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  modeldf['sex'] = modeldf['sex'].map({'female': 0, 'male': 1}).copy()


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,,29.0,0,0,211.3375
1,1,1,,0.9167,1,2,151.55
2,1,0,,2.0,1,2,151.55
3,1,0,,30.0,1,2,151.55
4,1,0,,25.0,1,2,151.55


In [18]:
modeldf.corr()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
pclass,1.0,-0.319979,0.144105,-0.411086,0.047746,0.017685,-0.565255
survived,-0.319979,1.0,-0.537719,-0.053958,-0.012657,0.114091,0.249164
sex,0.144105,-0.537719,1.0,0.062236,-0.094902,-0.220864,-0.18793
age,-0.411086,-0.053958,0.062236,1.0,-0.243139,-0.150241,0.178739
sibsp,0.047746,-0.012657,-0.094902,-0.243139,1.0,0.374291,0.141184
parch,0.017685,0.114091,-0.220864,-0.150241,0.374291,1.0,0.216723
fare,-0.565255,0.249164,-0.18793,0.178739,0.141184,0.216723,1.0


In [62]:
#dataframe with predicting features
X = modeldf.drop('survived', axis=1)

#column of predictive target values
y = modeldf['survived']

In [63]:
#create training and test data
#will leave test size at default (25%)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=499)

In [64]:
#initialize Gaussian Bayes classifier
gnb = GaussianNB()

In [65]:
#train the model to learn trends
gnb.fit(X_train, y_train)

GaussianNB()

In [32]:
modeldf.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

In [17]:
modeldf.dropna(inplace = True)

modeldf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1045 entries, 0 to 1308
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1045 non-null   int64  
 1   survived  1045 non-null   int64  
 2   sex       1045 non-null   int64  
 3   age       1045 non-null   float64
 4   sibsp     1045 non-null   int64  
 5   parch     1045 non-null   int64  
 6   fare      1045 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 65.3 KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [66]:
#predictive score of the model on the training data
gnb.score(X_train, y_train)

0.7777777777777778

In [67]:
#test the model on unseen data
#score predictive values in variable
y_pred = gnb.predict(X_test)

In [68]:
#Confusion matrix shows which values model predicted correctly vs incorrectly

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Died', 'Predicted Survived'],
    index=['True Died', 'True Survived']
)

cm

Unnamed: 0,Predicted Died,Predicted Survived
True Died,129,25
True Survived,33,75


In [69]:
#frequency of passed students to failed students in the test dataset
y_test.value_counts()

0    154
1    108
Name: survived, dtype: int64

In [70]:
#predictive score of the model on the test data
gnb.score(X_test, y_test)

0.7786259541984732

# Bernoulli Titanic Naive Bayes

In [46]:
#import Bernoulli Naïve Bayes function from scikit-learn library
from sklearn.naive_bayes import BernoulliNB

In [71]:
#initialize Bernoulli Naïve Bayes function to a variable
bnb = BernoulliNB()

In [72]:
#build the model with training data
bnb.fit(X_train, y_train)

BernoulliNB()

In [73]:
#model's predictive score on the training data
bnb.score(X_train, y_train)

0.7739463601532567

In [77]:
#test the model on unseen data
#score predictive values in variable
y_pred = bnb.predict(X_test)

In [78]:
#Confusion matrix shows which values model predicted correctly vs incorrectly

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Died', 'Predicted Survived'],
    index=['True Died', 'True Survived']
)

cm

Unnamed: 0,Predicted Died,Predicted Survived
True Died,133,21
True Survived,33,75


In [79]:
#predictive score of the model on the test data
bnb.score(X_test, y_test)

0.7938931297709924