# Load Data

In [None]:
import pandas as pd

In [4]:
# Load the training split. The path is relative, so it is resolved against the
# kernel's current working directory.
training_data = pd.read_excel('training_data.xlsx')

In [5]:
# Load the held-out test split. The path is relative, so it is resolved against
# the kernel's current working directory.
testing_data = pd.read_excel('testing_data.xlsx')

In [6]:
# Inspect column dtypes and non-null counts: a quick check for missing values
# and for columns that were read with an unexpected type.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32950 non-null  int64  
 1   job             32950 non-null  int64  
 2   marital         32950 non-null  int64  
 3   education       32950 non-null  int64  
 4   default         32950 non-null  int64  
 5   housing         32950 non-null  int64  
 6   loan            32950 non-null  int64  
 7   contact         32950 non-null  int64  
 8   month           32950 non-null  int64  
 9   day_of_week     32950 non-null  int64  
 10  duration        32950 non-null  int64  
 11  campaign        32950 non-null  int64  
 12  pdays           32950 non-null  int64  
 13  previous        32950 non-null  int64  
 14  poutcome        32950 non-null  int64  
 15  emp.var.rate    32950 non-null  float64
 16  cons.price.idx  32950 non-null  float64
 17  cons.conf.idx   32950 non-null 

In [7]:
# Preview the first five rows to sanity-check the loaded values.
training_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,42,7,1,0,0,0,0,0,3,1,...,3,999,0,1,1.4,93.918,-42.7,4.962,5228.1,0
1,37,1,2,1,1,2,0,1,6,2,...,1,999,0,1,1.1,93.994,-36.4,4.855,5191.0,0
2,43,0,1,6,0,2,0,1,6,0,...,2,999,0,1,1.1,93.994,-36.4,4.864,5191.0,0
3,53,6,1,2,0,2,0,0,7,1,...,1,999,0,1,-0.1,93.2,-42.0,4.191,5195.8,0
4,33,0,1,6,0,2,0,0,1,4,...,3,999,0,1,1.4,93.444,-36.1,4.964,5228.1,0


In [18]:
# (rows, columns) of the training split.
training_data.shape

(32950, 21)

In [19]:
# (rows, columns) of the test split; the column count should match the training split.
testing_data.shape

(8238, 21)

In [29]:
# Split each frame into predictors (every column except the target `y`) and the
# target itself. `drop(columns=...)` selects purely by column name and leaves the
# rows untouched, so the result does not depend on what labels the row index holds.
xs_train = training_data.drop(columns='y')
ys_train = training_data['y']
xs_test = testing_data.drop(columns='y')
ys_test = testing_data['y']

# Bayes Modelling

## Gaussian Naive Bayes Model

In [36]:
from sklearn import naive_bayes
from sklearn import metrics

In [54]:
# Train a Gaussian naive Bayes classifier on every predictor column.
# `fit` returns the estimator itself, so construction and training are chained.
# NOTE(review): if this is the UCI Bank Marketing data, `duration` is only known
# once a call has ended and the dataset notes advise excluding it from realistic
# models -- confirm whether it should be a predictor here.
clf = naive_bayes.GaussianNB().fit(xs_train, ys_train)

# Predictions for the held-out split: hard labels, class probabilities and
# log-probabilities.
ys_pred = clf.predict(xs_test)
ys_pred_proba = clf.predict_proba(xs_test)
ys_pred_log_proba = clf.predict_log_proba(xs_test)

# Accuracy on each split. The last two lines both report test-split accuracy
# (the saved output shows identical values).
print('training data score:', clf.score(xs_train, ys_train))
print('testing data score:', clf.score(xs_test, ys_test))
print("accuracy score:", metrics.accuracy_score(ys_test, ys_pred))

training data score: 0.8474658573596359
testing data score: 0.8421947074532653
accuracy score: 0.8421947074532653


In [55]:
# Report per-class precision, recall, F1 and support for the Gaussian model on
# the test split.
# NOTE: a later cell calls `classification_report` by this bare name, so the
# import has to stay in (or before) this cell.
from sklearn.metrics import classification_report
print(classification_report(ys_test, ys_pred))

              precision    recall  f1-score   support

           0       0.94      0.87      0.91      7277
           1       0.39      0.60      0.47       961

    accuracy                           0.84      8238
   macro avg       0.66      0.74      0.69      8238
weighted avg       0.88      0.84      0.86      8238



## Multinomial Naive Bayes Model

In [56]:
# Build the reduced feature set for the multinomial model.
# MultinomialNB only accepts non-negative features, so the float-valued columns
# are removed (emp.var.rate and cons.conf.idx take negative values in the preview
# above), together with `pdays`, `duration` and the target `y`.
# NOTE(review): `age`, `campaign` and `previous` are numeric as well but are kept
# -- confirm that is intended.
# The list is defined once so both splits always end up with identical columns.
nb_excluded_columns = ['pdays', 'emp.var.rate', 'duration', 'cons.price.idx',
                       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']
xs_train2 = training_data.drop(columns=nb_excluded_columns)
xs_test2 = testing_data.drop(columns=nb_excluded_columns)

In [57]:
# Train a multinomial naive Bayes classifier on the reduced feature set.
# The arguments are spelled out explicitly: additive smoothing with alpha=1.0 and
# class priors estimated from the training labels (no fixed `class_prior`).
# NOTE(review): MultinomialNB models each feature as a count; columns such as
# `job` or `marital` appear to be integer category codes rather than counts, so
# CategoricalNB may be the more appropriate estimator -- confirm.
clf2 = naive_bayes.MultinomialNB(
    alpha=1.0,
    fit_prior=True,
    class_prior=None,
).fit(xs_train2, ys_train)

# Predictions for the held-out split: hard labels, class probabilities and
# log-probabilities.
ys_pred2 = clf2.predict(xs_test2)
ys_pred_proba2 = clf2.predict_proba(xs_test2)
ys_pred_log_proba2 = clf2.predict_log_proba(xs_test2)

# Accuracy on each split. The last two lines both report test-split accuracy
# (the saved output shows identical values).
print('training data score:', clf2.score(xs_train2, ys_train))
print('testing data score:', clf2.score(xs_test2, ys_test))
print("accuracy score:", metrics.accuracy_score(ys_test, ys_pred2))

training data score: 0.8896813353566009
testing data score: 0.8839524156348628
accuracy score: 0.8839524156348628


In [58]:
# Per-class precision, recall, F1 and support for the multinomial model on the
# test split. The function is reached through the `metrics` module imported with
# the estimators, so this cell does not rely on the separate
# `from sklearn.metrics import classification_report` in the earlier report cell.
print(metrics.classification_report(ys_test, ys_pred2))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94      7277
           1       0.51      0.12      0.19       961

    accuracy                           0.88      8238
   macro avg       0.70      0.55      0.56      8238
weighted avg       0.85      0.88      0.85      8238

