## This text classification program uses a probabilistic machine learning algorithm, Naive Bayes, to guess whether a What'sApp message is a "consulta" (consulation) or a "reclamo" (complaint).

In [25]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords

In [26]:
nltk.download('stopwords')
stop_words = stopwords.words("spanish")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brandonjanes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
#read dataset from csv
df = pd.read_csv('mattermost_etiquetado.csv')

In [28]:
# change type to bianary
df.loc[df["type"]=='consulta',"type"]=0
df.loc[df["type"]=='reclamo',"type"]=1

In [29]:
df_x = df["text"]
df_y = df["type"]

In [30]:
#average of 'reclamos' versus 'consultas'
reclamo_total = df_y.sum()
total = df.type.count()
average = reclamo_total / total
print(average*100,"percent of the messages are 'reclamos'.")

12.177121771217712 percent of the messages are 'reclamos'.


##### WARNING: If the percent of 'reclamos' is very low, this will cloud the accuracy of our classification accuracy calculator. For example, if the 'reclamo' percent is 10 percent, a dumb classifier, which predicts 'consulta' every time, would be 90 percent accurate. 

In [31]:
#split the data (this is a commonly used line of code)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [10]:
#for TYPE we need our bianary values to be integers
y_train=y_train.astype('int')

### Count Vectorizer

In [11]:
cv = CountVectorizer()

In [12]:
x_traincv=cv.fit_transform(x_train)
x_traincv

<216x611 sparse matrix of type '<class 'numpy.int64'>'
	with 1895 stored elements in Compressed Sparse Row format>

In [13]:
arrai=x_traincv.toarray()
arrai

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Multinomial Naive Bayes Classifier with Count Vectorizer

In [14]:
# for applying the Naive Bayes algorithm use "fit()" - this is the line where we are training our model
mnb_trained = MultinomialNB().fit(x_traincv, y_train)

In [15]:
# test data use "transform()"
x_testcv = cv.transform(x_test)

In [16]:
#predictions of (0=consulta, 1=reclamo)
pred = mnb_trained.predict(x_testcv)
pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [19]:
#original data (0=consulta, 1=reclamo)
actual=np.array(y_test)
actual

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [20]:
# preparing our results for a tabular visualization
messg = []
for msg in x_test:
    messg.append(msg)
predictions = []
for x in pred:
    predictions.append(x)
original = []
for i in y_test:
    original.append(i)

In [21]:
# this is the Original, hand labeled data
dataf = pd.DataFrame({'ORIGINAL_TYPE' : original,
                      'ML_PREDICT' : predictions,
                     'TEXT' : messg})
dataf

Unnamed: 0,ML_PREDICT,ORIGINAL_TYPE,TEXT
0,0,0,Y en SanCor?
1,0,0,gracias
2,0,0,Me podes confirmar si voy a poderlo hacer?
3,0,0,Quería consultar sobre un seguro de accidente ...
4,0,0,Tengo un auto Ford falcon modelo 1983 y quería...
5,0,0,Me llamo Natalia
6,1,0,pense q seria 690 el valor de la cuota
7,0,0,Toco el boton donde dice otros medios de pagos...
8,0,1,Gente necesitamos ayuda
9,0,0,"y asi como yo estoy interviniendo, también pod..."


In [24]:
count=0
for i in range (len(pred)):
    if pred[i]==actual[i]:
        count=count+1
print("We have",count,"correct predictions out of",len(pred),".")
print("That's",(count/len(pred))*100,"accuracy")

We have 49 correct predictions out of 55 .
That's 89.0909090909091 accuracy
