# Naive Bayes Classification

### This notebook tests two word vectorization techniques, Count and TF-IDF, and three variations of the Naive Bayes Classification algorithm: Multinomial, Gaussian and Bernoulli Baive Bayes. We used a dataset of What'sApp messages from customers of an auto insurance company in Latin America. The messages were classified by two criteria, consulations (*consulta*) and complaints (*reclamos*).

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from nltk.corpus import stopwords

In [2]:
#nltk.download('stopwords')
stop_words = stopwords.words("spanish")

In [3]:
df = pd.read_csv('mattermost_etiquetado.csv')

### Set up our test and train data sets

In [4]:
# change class type to bianary
df.loc[df["type"]=='consulta',"type"]=0
df.loc[df["type"]=='reclamo',"type"]=1

In [5]:
# Seperate messages and class into two dataframes
df_x = df["text"]
df_y= df["type"]

In [6]:
#train-test-split the data (this is a commonly used line of code)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [7]:
#for TYPE we need our bianary values to be integers
y_train=y_train.astype('int')

# Vectorization: turn words into numbers

### Which performs better? Count Vectorizer or TF-IDF Vectorizer?

### Count Vectorizer

In [8]:
cv = CountVectorizer()

In [9]:
x_traincv=cv.fit_transform(x_train)
#x_testcv =cv.fit_transform(x_test)
x_traincv.shape

(216, 611)

In [10]:
# transform to an array
train_arrai = x_traincv.toarray()
#test_arrai  = x_testcv.toarray()
train_arrai.shape

(216, 611)

In [11]:
#feature extraction of the data
cv.get_feature_names()

['100',
 '11',
 '19',
 '1983',
 '2008',
 '2009',
 '2011',
 '351',
 '3515069733',
 '430000',
 '506973',
 '690',
 '790',
 '794',
 '80',
 '936',
 '97',
 'ab235',
 'abonaba',
 'abonar',
 'aca',
 'accidente',
 'acercar',
 'actuó',
 'acá',
 'además',
 'adherirse',
 'agradezco',
 'aguardo',
 'ahi',
 'ahora',
 'al',
 'alberto',
 'algo',
 'alguna',
 'alta',
 'andar',
 'anterior',
 'antes',
 'anulada',
 'aparece',
 'apenas',
 'aproximadamente',
 'aqui',
 'arriba',
 'as',
 'asegurado',
 'asegurar',
 'asesoro',
 'asi',
 'assistance',
 'así',
 'atención',
 'atendian',
 'atendieron',
 'atras',
 'atrás',
 'atte',
 'aumentan',
 'auto',
 'automovil',
 'autos',
 'averiguar',
 'avise',
 'aviso',
 'ayer',
 'bersion',
 'bertuccelli_edo',
 'bicicletas',
 'bien',
 'boleta',
 'bonito',
 'boton',
 'breve',
 'brindar',
 'bs',
 'bsas',
 'buen',
 'buena',
 'buenas',
 'buenisimo',
 'bueno',
 'buenos',
 'bustos',
 'cabrera',
 'camioneta',
 'cancelar',
 'carol',
 'casa',
 'categoria',
 'cba',
 'cbu',
 'celular',
 'c

In [12]:
cv.inverse_transform(train_arrai[0])

[array(['averiguar', 'quería', 'seguro', 'un'], dtype='<U15')]

### Tfidf Vectorizer

In [13]:
cv1 = TfidfVectorizer(min_df=1,stop_words=stop_words)

In [14]:
# for making our language based training data use "fit_transform()"
x_traincv1=cv1.fit_transform(x_train)
x_traincv1.shape

(216, 531)

In [15]:
arrai1 = x_traincv1.toarray()
arrai1.shape

(216, 531)

In [16]:
#feature extraction of the data
cv1.get_feature_names()

['100',
 '11',
 '19',
 '1983',
 '2008',
 '2009',
 '2011',
 '351',
 '3515069733',
 '430000',
 '506973',
 '690',
 '790',
 '794',
 '80',
 '936',
 '97',
 'ab235',
 'abonaba',
 'abonar',
 'aca',
 'accidente',
 'acercar',
 'actuó',
 'acá',
 'además',
 'adherirse',
 'agradezco',
 'aguardo',
 'ahi',
 'ahora',
 'alberto',
 'alguna',
 'alta',
 'andar',
 'anterior',
 'anulada',
 'aparece',
 'apenas',
 'aproximadamente',
 'aqui',
 'arriba',
 'as',
 'asegurado',
 'asegurar',
 'asesoro',
 'asi',
 'assistance',
 'así',
 'atención',
 'atendian',
 'atendieron',
 'atras',
 'atrás',
 'atte',
 'aumentan',
 'auto',
 'automovil',
 'autos',
 'averiguar',
 'avise',
 'aviso',
 'ayer',
 'bersion',
 'bertuccelli_edo',
 'bicicletas',
 'bien',
 'boleta',
 'bonito',
 'boton',
 'breve',
 'brindar',
 'bs',
 'bsas',
 'buen',
 'buena',
 'buenas',
 'buenisimo',
 'bueno',
 'buenos',
 'bustos',
 'cabrera',
 'camioneta',
 'cancelar',
 'carol',
 'casa',
 'categoria',
 'cba',
 'cbu',
 'celular',
 'centro',
 'cerca',
 'chiqui

In [17]:
cv1.inverse_transform(arrai1[0])

[array(['averiguar', 'quería', 'seguro'], dtype='<U15')]

In [18]:
x_train.iloc[0]

'Quería averiguar x un seguro'

### Let's test the two vectorizations with the NB classification algo

### Count vectorizer w/ NB Classifier

In [19]:
mnb=MultinomialNB()

In [20]:
# for applying the Naive Bayes algorithm use "fit()"
mnb.fit(train_arrai, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# test data use "transform()"
x_testcv = cv.transform(x_test)
test_arrai  = x_testcv.toarray()

In [22]:
#predictions
pred = mnb.predict(test_arrai)
pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [23]:
actual=np.array(y_test)
actual

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [24]:
count=0
for i in range (len(pred)):
    if pred[i]==actual[i]:
        count=count+1
print("We have",count,"correct predictions out of",len(pred),".")
print("Total",(count/len(pred))*100,"accuracy using Count Vectorizer.")

We have 49 correct predictions out of 55 .
Total 89.0909090909091 accuracy using Count Vectorizer.


### TFIDF Vectorizer w/ NB classification

In [25]:
mnb.fit(x_traincv1, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
#x_testcv1 = cv1.transform(x_test)

# test data use "transform()"
x_testcv1 = cv1.transform(x_test)
test_arrai1  = x_testcv1.toarray()

In [27]:
pred1 = mnb.predict(test_arrai1)
pred1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [28]:
actual1=np.array(y_test)
actual1

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [29]:
count1=0
for i in range (len(pred1)):
    if pred1[i]==actual1[i]:
        count1=count1+1
print("We have",count1,"correct predictions out of",len(pred1))
print("Total",(count1/len(pred1))*100,"accuracy using Tfidf Vectorizer")

We have 47 correct predictions out of 55
Total 85.45454545454545 accuracy using Tfidf Vectorizer


### We have our answer: The Naive Bayes classifier performed better with the *Count Vectorizer* (89 percent accuracy) than with the TF-IDF Vectorizer (85 percent accuracy). As out data set grows, the vectorizer we want to use may change as well. 

### Now that we know Count Vectorizer performs better, let's try a couple variations of the NB algo

### Gaussian Naive Bayes

In [30]:
gnb=GaussianNB()

In [31]:
# for applying the Naive Bayes algorithm use "fit()"
gnb.fit(train_arrai, y_train)
# test data use "transform()"
x_testcv = cv.transform(x_test)

In [32]:
#predictions
pred = gnb.predict(test_arrai)
pred

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [33]:
actual=np.array(y_test)
actual

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [34]:
count1=0
for i in range (len(pred1)):
    if pred1[i]==actual1[i]:
        count1=count1+1
print("We have",count1,"correct predictions out of",len(pred1))
print("Total",(count1/len(pred1))*100,"accuracy using Gaussian NB")

We have 47 correct predictions out of 55
Total 85.45454545454545 accuracy using Gaussian NB


### Bernoulli Naive Bayes

In [35]:
bnb=BernoulliNB()

In [36]:
# for applying the Naive Bayes algorithm use "fit()"
bnb.fit(train_arrai, y_train)
# test data use "transform()"
x_testcv = cv.transform(x_test)

In [37]:
#predictions
pred = bnb.predict(test_arrai)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [38]:
actual=np.array(y_test)
actual

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [39]:
count1=0
for i in range (len(pred1)):
    if pred1[i]==actual1[i]:
        count1=count1+1
print("We have",count1,"correct predictions out of",len(pred1))
print("Total",(count1/len(pred1))*100,"accuracy using Bernoulli NB")

We have 47 correct predictions out of 55
Total 85.45454545454545 accuracy using Bernoulli NB


### Conclusion: All perform pretty similarly. 