In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import re 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

In [3]:
#load data
df = pd.read_csv("adara_corpus.csv")

In [38]:
#nltk.download('stopwords')
stop_words = stopwords.words("spanish")

In [5]:
#pre-processing (for later)
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [6]:
df.head()

Unnamed: 0,CreateAt,Hashtags,type,Mensaje,UserId,Name_2,UserName
0,2019-05-13T13:00:10.000Z,,m,Soy de Córdoba capital no se de donde son ustedes,7kut6fgex3du7xt7s38h43shqw,adara,whatsapp
1,2019-05-13T13:00:12.000Z,#consulta_campaña,m,Quiero un turno de la Campaña Gratuita,7kut6fgex3du7xt7s38h43shqw,adara,whatsapp
2,2019-05-13T16:36:13.000Z,#consulta_campaña,r,Gracias por comunicarse con Adara!\n\nLe infor...,ij6w4x8n9fbpxqxh79gj8474by,adara,adara
3,2019-05-20T15:35:29.000Z,,r,¡Buenas tardes! Nos comunicamos nuevamente par...,ij6w4x8n9fbpxqxh79gj8474by,adara,adara
4,2019-05-20T16:03:02.000Z,,m,Hola. si. me encantaria,7kut6fgex3du7xt7s38h43shqw,adara,whatsapp


In [7]:
Counter(df["Hashtags"])

Counter({'#cambiar_turno': 3,
         '#consulta': 2,
         '#consulta_campaña': 207,
         '#dermatologia': 47,
         '#deteccion_lunares': 1,
         '#dr_falcon': 22,
         '#dr_forni': 1,
         '#dr_lamoratta': 31,
         '#dr_morales ': 18,
         '#dra_elorza': 6,
         '#dra_forni': 5,
         '#dra_pozzi': 2,
         '#encuesta': 19,
         '#escleroterapia': 5,
         '#estetica': 15,
         '#flebologia': 6,
         '#flebologia ': 156,
         '#gordura': 1,
         '#lic_porchietto': 1,
         '#lic_vazquez': 2,
         '#lic_vazquez #dr_lamoratta': 1,
         '#no_confirmado': 6,
         '#nutricion': 5,
         '#otro_servicio': 3,
         '#pendiente_respuesta': 41,
         '#queja': 2,
         '#turno': 14,
         '#turno_cancelado': 72,
         '#turno_confirmado': 185,
         '#turno_facebook': 6,
         '#turno_reprogramado': 72,
         nan: 2035})

## deal with empty hashtag blocks

In [8]:
df['Hashtags'].head(10)

0                  NaN
1    #consulta_campaña
2    #consulta_campaña
3                  NaN
4                  NaN
5                  NaN
6                  NaN
7         #dr_morales 
8                  NaN
9                  NaN
Name: Hashtags, dtype: object

In [9]:
# rename all empty hashtag blocks
df['Hashtags'] = df['Hashtags'].fillna('blank')

In [10]:
le = LabelEncoder()

In [11]:
dfle = df.drop(['CreateAt', 'type', 'UserId', 'Name_2', 'UserName'], axis=1)
dfle.head(5)

Unnamed: 0,Hashtags,Mensaje
0,blank,Soy de Córdoba capital no se de donde son ustedes
1,#consulta_campaña,Quiero un turno de la Campaña Gratuita
2,#consulta_campaña,Gracias por comunicarse con Adara!\n\nLe infor...
3,blank,¡Buenas tardes! Nos comunicamos nuevamente par...
4,blank,Hola. si. me encantaria


## Label Encoder: working with the dependent variable (Y)

In [12]:
dfle.Hashtags = le.fit_transform(dfle.Hashtags)
dfle.head(5)

Unnamed: 0,Hashtags,Mensaje
0,31,Soy de Córdoba capital no se de donde son ustedes
1,2,Quiero un turno de la Campaña Gratuita
2,2,Gracias por comunicarse con Adara!\n\nLe infor...
3,31,¡Buenas tardes! Nos comunicamos nuevamente par...
4,31,Hola. si. me encantaria


In [13]:
Y = dfle['Hashtags'].values
Y

array([31,  2,  2, ..., 31, 31,  3])

##  Now let's work with the independent variable (X) 

In [14]:
# do some cleaning of the text. all lower case
X = []
for i in range(dfle.shape[0]):
    strng = str(dfle.iloc[i][1])
    X.append(clean_str(strng))

## TRAIN, TEST, SPLIT

In [15]:
#train test split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)

## Vectorize: turn the messages into numbers

In [16]:
cv = CountVectorizer()

In [17]:
x_traincv=cv.fit_transform(x_train)
#x_testcv =cv.fit_transform(x_test)
x_traincv.shape

(2094, 1664)

In [18]:
# transform to an array
train_arrai = x_traincv.toarray()
#test_arrai  = x_testcv.toarray()
train_arrai.shape

(2094, 1664)

In [19]:
#cv.get_feature_names()

In [20]:
cv.inverse_transform(train_arrai[0])

[array(['con', 'contamos', 'de', 'digitdigit', 'digitdigitdigitdigit',
        'disponibilidad', 'dr', 'días', 'lamoratta', 'los', 'morales',
        'para', 'turno'], dtype='<U120')]

## Naive Bayes classification

In [21]:
mnb=MultinomialNB()

In [22]:
###### for applying the Naive Bayes algorithm use "fit()"
mnb.fit(train_arrai, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# test data use "transform()"
x_testcv = cv.transform(x_test)
test_arrai  = x_testcv.toarray()

In [24]:
#predictions
pred = mnb.predict(test_arrai)
pred

array([31, 28, 31,  2, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
        2, 31, 31, 28, 31, 16, 31,  2, 31,  2, 28, 31, 31, 31, 31, 16, 31,
       28, 31, 31, 31, 31, 31, 31, 16, 31, 31, 31, 31, 28, 28, 31, 31, 31,
       31, 27, 31, 31,  2, 31, 31, 31, 31,  2, 16, 31,  2,  2,  2, 31, 31,
       24, 31,  2, 31, 31, 31,  2, 31,  2, 31, 31, 31, 31, 31, 31, 31, 31,
        2, 31, 31,  2, 31, 31, 31, 31, 31, 31, 28, 16, 31, 28, 27, 28,  3,
       31, 31, 31, 27, 31, 31,  2, 31, 30, 28, 16, 31, 31, 31, 31, 31, 28,
       31, 31,  2, 30, 31, 31, 31,  2, 31, 31, 16, 28, 31, 31, 31, 31, 31,
       31,  3, 31, 31, 31, 27, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
       28, 31, 31, 28, 16, 31, 31, 28, 31, 31, 31, 28, 31, 31, 31, 31, 31,
       31, 31, 31, 31,  2, 16, 31, 31, 31, 31, 16,  2, 24, 31, 31, 31, 31,
       31,  2, 30, 31, 31, 16, 31, 12, 27, 31, 31, 31,  2, 31, 31, 31, 31,
       31, 31,  2, 28, 31, 31, 31,  2,  2, 31, 28, 28, 31, 31, 16, 16, 31,
       31, 31, 31, 31, 31

In [25]:
actual=np.array(y_test)
actual

array([31, 28, 31,  2, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
       31, 31, 28, 28, 31, 16, 31,  2,  7, 31, 31, 31, 31, 31, 31, 16, 16,
       28, 31, 31, 31, 30, 31, 31, 16, 31, 28, 31, 31, 31, 31, 31, 31, 31,
       31, 31, 31,  7, 31, 31, 31, 31, 31,  3, 16, 31, 31,  2, 16, 31, 31,
       24,  2, 31, 31, 31,  2, 31, 31,  2, 31, 31, 31, 31, 28, 31, 31, 31,
        2, 31, 31,  2, 31, 31, 27, 31, 31, 16, 31, 31, 31, 31, 27, 31,  3,
       31, 31, 31, 31, 31, 31,  2, 31, 30, 31, 16,  2, 31, 24, 31,  7, 28,
        9, 31,  2, 31, 28, 31, 31, 31, 31, 31, 16, 31, 31, 13, 28, 31, 31,
        7,  3, 31, 31, 28, 31, 31, 31, 31, 31, 31, 31, 28, 28, 31, 31, 16,
       31,  8, 31, 31, 16, 31, 31, 28, 31, 31, 31, 28, 31, 28, 30, 31, 31,
       31, 31, 31, 31, 31, 16,  7, 31, 31, 16, 16, 22, 24, 31, 31, 31, 31,
       31,  2, 30, 31, 31, 31, 31, 12, 31, 31, 31, 31,  2, 31, 31, 31, 31,
       31, 31, 31, 31, 31, 31, 31, 31, 16, 31, 28, 28, 31, 31, 16, 31, 31,
       31, 31, 31, 10, 31

In [26]:
count=0
for i in range (len(pred)):
    if pred[i]==actual[i]:
        count=count+1
print("We have",count,"correct predictions out of",len(pred),".")
print("Total",(count/len(pred))*100,"accuracy using Count Vectorizer.")

We have 661 correct predictions out of 898 .
Total 73.60801781737194 accuracy using Count Vectorizer.


In [27]:
# preparing our results for a tabular visualization
messg = []
for msg in x_test:
    messg.append(msg)
predictions = []
for x in le.inverse_transform(pred):
    predictions.append(x)
original = []
for i in le.inverse_transform(y_test):
    original.append(i)

In [28]:
# this is the Original, hand labeled data
dataf = pd.DataFrame({'ORIGINAL_TYPE' : original,
                      'ML_PREDICT' : predictions,
                     'TEXT' : messg})
dataf

Unnamed: 0,ML_PREDICT,ORIGINAL_TYPE,TEXT
0,blank,blank,leidys rojas dni digitdigit.digitdigitdigit.di...
1,#turno_confirmado,#turno_confirmado,"muchas gracias, su turno ha sido confirmado.so..."
2,blank,blank,hola puedo pedir un turno de la campaña
3,#consulta_campaña,#consulta_campaña,gracias por comunicarse con adara!le informamo...
4,blank,blank,verinia tupayachi dni digitdigitdigitdigitdigi...
5,blank,blank,me decis la dirección
6,blank,blank,"correcto, puede solicitar el turno personalmente."
7,blank,blank,podría ser digitdigit/digitdigit/digitdigitdig...
8,blank,blank,trabajo de digitdigit a digitdigit
9,blank,blank,cuando me podrán otorgar un turno????


In [53]:
# if you want to experiment, try writing something in the blank
mess = ["<type your message here>"]
message = cv.transform(mess)
messages = message.toarray()
predicted = mnb.predict(messages)
le.inverse_transform(predicted)

array(['#flebologia '], dtype=object)