# Multinomial and Bernoulli Naive Bayes

In [2]:
import numpy as np
import pandas as pd
import sklearn

In [5]:

docs = pd.read_table('python_multinomial_&_bernouli/SMSSpamCollection', header=None, names=['Class', 'sms'])

/Users/ankur/my_project/git_upgrad/upgrad/machine_learning/chapter-2


In [7]:
docs.head()

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# number of smses/ document
len(docs)

5572

In [11]:
# counting spam and ham instance
ham_spam = docs.Class.value_counts()
ham_spam

Class
ham     4825
spam     747
Name: count, dtype: int64

In [25]:
print("spam rate is about {0}%".format(
    round((ham_spam[1]/float(ham_spam[0]+ham_spam[1]))*100), 2))

spam rate is about 13%


  round((ham_spam[1]/float(ham_spam[0]+ham_spam[1]))*100), 2))


In [27]:
#mapping labels to 0 and 1
docs['label'] = docs.Class.map({'ham': 0 , 'spam': 1})

In [28]:
docs.head()

Unnamed: 0,Class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [29]:
# we can drop the Class coloumns
docs = docs.drop('Class', axis =1)

In [30]:
docs.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [32]:
# conver to X and y
X= docs.sms
y = docs.label
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [33]:
# split into the test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

In [34]:
X_train.head()

4393                       what are your new years plans?
216     Finally the match heading towards draw as your...
4471    Lemme know when I can swing by and pick up, I'...
3889                   ok....take care.umma to you too...
5030    I'd like to tell you my deepest darkest fantas...
Name: sms, dtype: object

In [35]:
y_train.head()

4393    0
216     0
4471    0
3889    0
5030    1
Name: label, dtype: int64

In [38]:
#vectorising the sentence; removing the stop word
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')

In [39]:
vect.fit(X_train)

In [None]:
vect.vocabulary_

In [41]:
#vocab len
len(vect.vocabulary_.keys())

6904

In [42]:
#transform the train and test datasets
X_train_transformed = vect.transform(X_train)
X_test_transformed = vect.transform(X_test)

In [43]:
print(type(X_train_transformed))
print(X_train_transformed)

<class 'scipy.sparse._csr.csr_matrix'>
  (np.int32(0), np.int32(4272))	1
  (np.int32(0), np.int32(4682))	1
  (np.int32(0), np.int32(6855))	1
  (np.int32(1), np.int32(2202))	1
  (np.int32(1), np.int32(2550))	1
  (np.int32(1), np.int32(3018))	1
  (np.int32(1), np.int32(3930))	1
  (np.int32(1), np.int32(4801))	1
  (np.int32(2), np.int32(1119))	1
  (np.int32(2), np.int32(2663))	1
  (np.int32(2), np.int32(3548))	1
  (np.int32(2), np.int32(3650))	1
  (np.int32(2), np.int32(4644))	1
  (np.int32(2), np.int32(5369))	1
  (np.int32(2), np.int32(5957))	1
  (np.int32(2), np.int32(6163))	1
  (np.int32(3), np.int32(1485))	1
  (np.int32(3), np.int32(4406))	1
  (np.int32(3), np.int32(6353))	1
  (np.int32(4), np.int32(103))	1
  (np.int32(4), np.int32(222))	1
  (np.int32(4), np.int32(538))	1
  (np.int32(4), np.int32(1949))	1
  (np.int32(4), np.int32(2000))	1
  (np.int32(4), np.int32(2478))	1
  :	:
  (np.int32(3897), np.int32(3721))	1
  (np.int32(3897), np.int32(4248))	1
  (np.int32(3897), np.int32(5026))

# BUilding and Evaluting the Model

In [44]:
# training  the NB Model and making predictions

In [45]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

In [46]:
#fit
mnb.fit(X_train_transformed, y_train)

In [49]:
# predict class
y_pred_class = mnb.predict(X_test_transformed)

#predict probalilitied
y_pred_proba = mnb.predict_proba(X_test_transformed)

In [50]:
mnb

In [51]:
# Model Evaluation

In [52]:
# printing the overall accuracy
from sklearn import metrics

metrics.accuracy_score(y_test, y_pred_class)

0.986244019138756

In [60]:
# Confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)
help(metrics.confusion_matrix)

Help on function confusion_matrix in module sklearn.metrics._classification:

confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)
    Compute confusion matrix to evaluate the accuracy of a classification.
    
    By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
    is equal to the number of observations known to be in group :math:`i` and
    predicted to be in group :math:`j`.
    
    Thus in binary classification, the count of true negatives is
    :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
    :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.
    
    Read more in the :ref:`User Guide <confusion_matrix>`.
    
    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values.
    
    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.
    
    labels : array-like of shape (n_classes), default=

In [61]:
TN, FP, FN, TP = metrics.confusion_matrix(y_test, y_pred_class).ravel()

In [65]:
print(TN)
print(FP)
print(FN)
print(TP)

1434
8
15
215


In [67]:
senstitivity = TP/ float(FN + TP)
print(f"senstitivity: {senstitivity}")

senstitivity: 0.9347826086956522


In [68]:
specificity = TN/ float(FP + TN)
print(f"specificity: {specificity}")

specificity: 0.9944521497919556


In [69]:
precision = TP /float(TP + FP)
print(f"precision: {precision}")

precision: 0.9641255605381166


In [70]:
print(metrics.precision_score(y_test, y_pred_class))

0.9641255605381166


In [73]:
print(f"precision: {precision}")
print(f"precision_score: {metrics.precision_score(y_test, y_pred_class)}")
print(f"recall_score: {metrics.recall_score(y_test, y_pred_class)}")
print(f"F1 score:", metrics.f1_score(y_test, y_pred_class))

precision: 0.9641255605381166
precision_score: 0.9641255605381166
recall_score: 0.9347826086956522
F1 score: 0.9492273730684326


In [74]:
y_pred_class

array([0, 0, 0, ..., 0, 0, 0])

In [75]:
y_pred_proba

array([[9.94958999e-01, 5.04100100e-03],
       [9.99854067e-01, 1.45932813e-04],
       [9.07992380e-01, 9.20076203e-02],
       ...,
       [9.42539394e-01, 5.74606060e-02],
       [9.91184998e-01, 8.81500189e-03],
       [9.99990076e-01, 9.92429553e-06]])

In [76]:
# Creating an ROC curve

In [77]:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_proba[:,1])
roc_auc = auc(false_positive_rate, true_positive_rate)

In [78]:
print(roc_auc)

0.9919646626062835


In [79]:
# matrix of threshold, tpr, fpr
pd.DataFrame({
    'Threshold' : thresholds,
    'TPR' : true_positive_rate,
    'FPR' : false_positive_rate
})

Unnamed: 0,Threshold,TPR,FPR
0,inf,0.000000,0.000000
1,1.000000e+00,0.278261,0.000000
2,1.000000e+00,0.295652,0.000000
3,1.000000e+00,0.304348,0.000000
4,1.000000e+00,0.308696,0.000000
...,...,...,...
152,1.120707e-12,1.000000,0.958391
153,8.802916e-14,1.000000,0.968100
154,8.621377e-14,1.000000,0.969487
155,8.024988e-14,1.000000,0.971567


In [None]:
#plotting the ROC Curve
%matplotlib inline
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.title("ROC")
plt.plot(false_positive_rate, true_positive_rate)