# Naive Bayes classifier (NB)

Logistic Regression과 어떻게 다른지 알아봅니다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data_file = 'data/titanic.csv'
titanic = pd.read_csv(data_file, index_col='PassengerId')
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
embark_dum = pd.get_dummies(titanic['Embarked'], prefix='port')
embark_dum.drop(embark_dum.columns[2], axis=1, inplace=True)
titanic = pd.concat([titanic, embark_dum], axis=1)
titanic.drop(['Embarked'], axis=1, inplace=True)

In [4]:
age_group = []
for i in range (0, len(titanic)):
    age = titanic.iloc[i]['Age']
    age = 'child' if age < 20 else 'adult' if age >= 20 else 'unknown'
    age_group.append(age)
    
titanic['Age_modified'] = age_group
age_dum = pd.get_dummies(titanic['Age_modified'], prefix = 'Age')
age_dum.drop(age_dum.columns[2], axis=1, inplace=True)
age_dum.sample(n = 10)
titanic = pd.concat([titanic, age_dum], axis=1)
titanic.drop(['Age', 'Age_modified'], axis=1, inplace=True)

In [6]:
pclass_dum = pd.get_dummies(titanic['Pclass'], prefix = 'pclass')
pclass_dum.drop(pclass_dum.columns[2], axis=1, inplace=True)
titanic = pd.concat([titanic, pclass_dum], axis=1)
titanic.drop(['Pclass'], axis=1, inplace=True)

In [7]:
titanic['Sex'] = titanic['Sex'].map({'female':1, 'male':0})
titanic.head(10)

Unnamed: 0_level_0,Survived,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,port_C,port_Q,Age_adult,Age_child,pclass_1,pclass_2
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,"Braund, Mr. Owen Harris",0,1,0,A/5 21171,7.25,,0,0,1,0,0,0
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,1,0,PC 17599,71.2833,C85,1,0,1,0,1,0
3,1,"Heikkinen, Miss. Laina",1,0,0,STON/O2. 3101282,7.925,,0,0,1,0,0,0
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,1,0,113803,53.1,C123,0,0,1,0,1,0
5,0,"Allen, Mr. William Henry",0,0,0,373450,8.05,,0,0,1,0,0,0
6,0,"Moran, Mr. James",0,0,0,330877,8.4583,,0,1,0,0,0,0
7,0,"McCarthy, Mr. Timothy J",0,0,0,17463,51.8625,E46,0,0,1,0,1,0
8,0,"Palsson, Master. Gosta Leonard",0,3,1,349909,21.075,,0,0,0,1,0,0
9,1,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,0,2,347742,11.1333,,0,0,1,0,0,0
10,1,"Nasser, Mrs. Nicholas (Adele Achem)",1,1,0,237736,30.0708,,1,0,0,1,0,1


# Category 와 Continuous 분할

NB은 피쳐 형태에 따라 모형이 다릅니다. Category 변수는 CategoricalNB, 연속형 변수는 GaussianNB를 사용합니다.

In [12]:
all_features = ['SibSp','Parch','Fare','Sex', 'port_C', 'port_Q', 'Age_adult', 'Age_child', 'pclass_1', 'pclass_2']
cont_features = ['SibSp','Parch','Fare']
cat_features = ['Sex', 'port_C', 'port_Q', 'Age_adult', 'Age_child', 'pclass_1', 'pclass_2']
X = titanic[all_features]
y = titanic.Survived

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [18]:
X_cont_train = X_train[cont_features]
X_cont_test = X_test[cont_features]
X_cat_train = X_train[cat_features]
X_cat_test = X_test[cat_features]

X_cont_train.head()

Unnamed: 0_level_0,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
661,2,0,133.65
853,1,1,15.2458
704,0,0,7.7417
887,0,0,13.0
526,0,0,7.75


## Naive Bayes classifier

In [31]:
# Because all variables in iris dataset are numerical, we use Gaussian naive Bayes.
from sklearn.naive_bayes import CategoricalNB, GaussianNB

gnb = GaussianNB()
gnb.fit(X_cont_train, y_train)
y_cont_prob = gnb.predict_proba(X_cont_test) #predict_proba()는 확률을 예측해 줍니다.

In [34]:
cnb = CategoricalNB()
cnb.fit(X_cat_train, y_train)
y_cat_prob = cnb.predict_proba(X_cat_test)

In [35]:
print(y_cont_prob)

[[6.80675333e-01 3.19324667e-01]
 [7.08902758e-01 2.91097242e-01]
 [7.02712363e-01 2.97287637e-01]
 [6.89297547e-01 3.10702453e-01]
 [7.33632452e-01 2.66367548e-01]
 [7.05735717e-01 2.94264283e-01]
 [7.14011793e-01 2.85988207e-01]
 [7.31547427e-01 2.68452573e-01]
 [6.60352433e-01 3.39647567e-01]
 [4.24794169e-01 5.75205831e-01]
 [8.73991071e-01 1.26008929e-01]
 [7.09697690e-01 2.90302310e-01]
 [1.89411366e-01 8.10588634e-01]
 [7.09871893e-01 2.90128107e-01]
 [7.00842434e-01 2.99157566e-01]
 [9.12601370e-01 8.73986304e-02]
 [4.76311465e-03 9.95236885e-01]
 [6.18000564e-01 3.81999436e-01]
 [4.37996009e-01 5.62003991e-01]
 [7.31547427e-01 2.68452573e-01]
 [6.12159216e-01 3.87840784e-01]
 [5.83887305e-01 4.16112695e-01]
 [7.13552486e-01 2.86447514e-01]
 [9.99950989e-01 4.90106418e-05]
 [9.99943308e-01 5.66915569e-05]
 [7.33331814e-01 2.66668186e-01]
 [7.09697690e-01 2.90302310e-01]
 [1.02825450e-01 8.97174550e-01]
 [7.09697690e-01 2.90302310e-01]
 [7.09529634e-01 2.90470366e-01]
 [9.938571

In [36]:
print(y_cat_prob)

[[0.3091181  0.6908819 ]
 [0.80096594 0.19903406]
 [0.48637142 0.51362858]
 [0.67200391 0.32799609]
 [0.78318465 0.21681535]
 [0.84618914 0.15381086]
 [0.31772922 0.68227078]
 [0.31772922 0.68227078]
 [0.17135765 0.82864235]
 [0.39816099 0.60183901]
 [0.3091181  0.6908819 ]
 [0.39816099 0.60183901]
 [0.14779715 0.85220285]
 [0.88656382 0.11343618]
 [0.07420824 0.92579176]
 [0.17135765 0.82864235]
 [0.07420824 0.92579176]
 [0.69535651 0.30464349]
 [0.39816099 0.60183901]
 [0.84618914 0.15381086]
 [0.42430791 0.57569209]
 [0.23951524 0.76048476]
 [0.84618914 0.15381086]
 [0.81997162 0.18002838]
 [0.84090817 0.15909183]
 [0.88656382 0.11343618]
 [0.88656382 0.11343618]
 [0.05142258 0.94857742]
 [0.84090817 0.15909183]
 [0.88239281 0.11760719]
 [0.3091181  0.6908819 ]
 [0.31772922 0.68227078]
 [0.84618914 0.15381086]
 [0.88656382 0.11343618]
 [0.78816626 0.21183374]
 [0.70955227 0.29044773]
 [0.88656382 0.11343618]
 [0.88656382 0.11343618]
 [0.31772922 0.68227078]
 [0.48637142 0.51362858]


# 추정의 합

NB은 상호 독립을 가정합니다. 두 사건이 독립일 때, 두 사건이 같이 발생할 확률은 각 사건의 확률의 곱으로 표현합니다.

In [37]:
y_pred_prob = y_cont_prob * y_cat_prob

In [40]:
y_pred = np.argmax(y_pred_prob, axis=1)
print(y_pred)

[1 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 1
 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1
 0]


In [41]:
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
cm = metrics.confusion_matrix(y_test, y_pred)

In [42]:
print(accuracy)
print(cm)

0.7623318385650224
[[128  11]
 [ 42  42]]
