## Building a classifier

In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords

In [3]:
nltk.download('stopwords')
stop_words = stopwords.words("spanish")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brandonjanes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv('mattermost_etiquetado.csv')

In [5]:
# change type to bianary
df.loc[df["type"]=='consulta',"type"]=0
df.loc[df["type"]=='reclamo',"type"]=1

In [6]:
df_x = df["text"]
df_y= df["type"]

In [7]:
#split the data (this is a commonly used line of code)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [8]:
#for TYPE we need our bianary values to be integers
y_train=y_train.astype('int')

### Count Vectorizer

In [9]:
cv = CountVectorizer()

In [10]:
x_traincv=cv.fit_transform(x_train)

In [11]:
arrai=x_traincv.toarray()
arrai

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
cv.get_feature_names()

['100',
 '11',
 '19',
 '1983',
 '2008',
 '2009',
 '2011',
 '351',
 '3515069733',
 '430000',
 '506973',
 '690',
 '790',
 '794',
 '80',
 '936',
 '97',
 'ab235',
 'abonaba',
 'abonar',
 'aca',
 'accidente',
 'acercar',
 'actuó',
 'acá',
 'además',
 'adherirse',
 'agradezco',
 'aguardo',
 'ahi',
 'ahora',
 'al',
 'alberto',
 'algo',
 'alguna',
 'alta',
 'andar',
 'anterior',
 'antes',
 'anulada',
 'aparece',
 'apenas',
 'aproximadamente',
 'aqui',
 'arriba',
 'as',
 'asegurado',
 'asegurar',
 'asesoro',
 'asi',
 'assistance',
 'así',
 'atención',
 'atendian',
 'atendieron',
 'atras',
 'atrás',
 'atte',
 'aumentan',
 'auto',
 'automovil',
 'autos',
 'averiguar',
 'avise',
 'aviso',
 'ayer',
 'bersion',
 'bertuccelli_edo',
 'bicicletas',
 'bien',
 'boleta',
 'bonito',
 'boton',
 'breve',
 'brindar',
 'bs',
 'bsas',
 'buen',
 'buena',
 'buenas',
 'buenisimo',
 'bueno',
 'buenos',
 'bustos',
 'cabrera',
 'camioneta',
 'cancelar',
 'carol',
 'casa',
 'categoria',
 'cba',
 'cbu',
 'celular',
 'c

In [13]:
cv.inverse_transform(arrai[0])

[array(['averiguar', 'quería', 'seguro', 'un'], dtype='<U15')]

In [14]:
x_train.iloc[0]

'Quería averiguar x un seguro'

### Tfidf Vectorizer

In [15]:
cv1 = TfidfVectorizer(min_df=1,stop_words=stop_words)

In [16]:
# for making our language based training data use "fit_transform()"
x_traincv1=cv1.fit_transform(x_train)

In [17]:
arrai1 = x_traincv1.toarray()
arrai1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
#feature extraction of the data
cv1.get_feature_names()

['100',
 '11',
 '19',
 '1983',
 '2008',
 '2009',
 '2011',
 '351',
 '3515069733',
 '430000',
 '506973',
 '690',
 '790',
 '794',
 '80',
 '936',
 '97',
 'ab235',
 'abonaba',
 'abonar',
 'aca',
 'accidente',
 'acercar',
 'actuó',
 'acá',
 'además',
 'adherirse',
 'agradezco',
 'aguardo',
 'ahi',
 'ahora',
 'alberto',
 'alguna',
 'alta',
 'andar',
 'anterior',
 'anulada',
 'aparece',
 'apenas',
 'aproximadamente',
 'aqui',
 'arriba',
 'as',
 'asegurado',
 'asegurar',
 'asesoro',
 'asi',
 'assistance',
 'así',
 'atención',
 'atendian',
 'atendieron',
 'atras',
 'atrás',
 'atte',
 'aumentan',
 'auto',
 'automovil',
 'autos',
 'averiguar',
 'avise',
 'aviso',
 'ayer',
 'bersion',
 'bertuccelli_edo',
 'bicicletas',
 'bien',
 'boleta',
 'bonito',
 'boton',
 'breve',
 'brindar',
 'bs',
 'bsas',
 'buen',
 'buena',
 'buenas',
 'buenisimo',
 'bueno',
 'buenos',
 'bustos',
 'cabrera',
 'camioneta',
 'cancelar',
 'carol',
 'casa',
 'categoria',
 'cba',
 'cbu',
 'celular',
 'centro',
 'cerca',
 'chiqui

In [19]:
cv1.inverse_transform(arrai1[0])

[array(['averiguar', 'quería', 'seguro'], dtype='<U15')]

In [20]:
x_train.iloc[0]

'Quería averiguar x un seguro'

### Naive Bayes Classifier with Count Vectorizer

In [21]:
mnb=MultinomialNB()

In [22]:
# for applying the Naive Bayes algorithm use "fit()"
mnb.fit(x_traincv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# test data use "transform()"
x_testcv = cv.transform(x_test)

In [24]:
#predictions
pred = mnb.predict(x_testcv)
pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [25]:
actual=np.array(y_test)
actual

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [28]:
count=0
for i in range (len(pred)):
    if pred[i]==actual[i]:
        count=count+1
print("We have",count,"correct predictions out of",len(pred),".")
print("Total",(count/len(pred))*100,"accuracy using Count Vectorizer.")

We have 49 correct predictions out of 55 .
Total 89.0909090909091 accuracy using Count Vectorizer.


### Now let's try NB classification with TFIDF Vectorizer

In [29]:
mnb.fit(x_traincv1, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
x_testcv1 = cv1.transform(x_test)

In [31]:
pred1 = mnb.predict(x_testcv1)
pred1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
actual1=np.array(y_test)
actual1

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [33]:
count1=0
for i in range (len(pred1)):
    if pred1[i]==actual1[i]:
        count1=count1+1
print("we have",count1,"correct predictions out of",len(pred1))
print("Total",(count1/len(pred1))*100,"accuracy using Tfidf Vectorizer")

we have 47 correct predictions out of 55
Total 85.45454545454545 accuracy using Tfidf Vectorizer


### The Naive Bayes classifier performed better with the Count Vectorizer than with the TF-IDF Vectorizer. As out data set grows, the vectorizer we want to use may change as well. 