# Laboratorio #2 Text Mining
## Álvaro Andrés Esquivel Gómez

In [139]:
import pandas as pd
import spacy
import spacy.cli 
import re
import numpy as np

In [2]:
#Utilizando el diccionario en lenguje español
#spacy.cli.download('es_core_news_sm')

nlp = spacy.load('es_core_news_sm')

In [3]:
spamDB = pd.read_csv('es_spam.csv', sep = r',', names=['label', 'message'])
spamDB = spamDB.iloc[1:]
spamDB.head()

Unnamed: 0,label,message
1,ham,"Ir hasta el punto de jurong, loco .. Disponibl..."
2,ham,lar bien ... Bromas WIF u oni ...
3,spam,Entrada libre en una imagen de obsequio 2 wkly...
4,ham,T Dun decir hor tan temprano ... t r ya contin...
5,ham,"Nah no creo que vaya a USF, que vive por aquí,..."


In [4]:
spamDB.shape

(5572, 2)

In [5]:
corpus = spamDB.message

In [6]:
spamDB.label.value_counts()/len(spamDB)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [7]:
#Dividir lo tipos de correos
ham = spamDB[spamDB['label'] == 'ham']
spam = spamDB[spamDB['label'] == 'spam']

ham.shape, spam.shape

((4825, 2), (747, 2))

In [8]:
#Tomar una muestra de ham de tamano del doble de correos de spam
ham = ham.sample(2*spam.shape[0])
ham.shape, spam.shape

((1494, 2), (747, 2))

In [9]:
dataset = ham.append(spam, ignore_index=True) 
dataset.shape

(2241, 2)

In [10]:
dataset

Unnamed: 0,label,message
0,ham,Jaja ... ¿Dónde tiene el peso tan rápido perde...
1,ham,Gracias cariño. Que tengas un gran día.
2,ham,Crucify es c no lo es. Que habérmelo dicho antes.
3,ham,Oh oh ... Wasted ... Den muz chiong en SAT N s...
4,ham,No estoy del todo contento con lo que dice o hace
...,...,...
2236,spam,Quiero sexo explícito en 30 segundos? Anillo 0...
2237,spam,PREGUNTADO SI 3Mobile 0870 canales de conversa...
2238,spam,Tenía su contrato móvil 11 mnths? Última Motor...
2239,spam,"RECORDATORIO DE O2: Para obtener 2,50 libras e..."


## Normalizacion

In [11]:
corpus = dataset.message
corpus

0       Jaja ... ¿Dónde tiene el peso tan rápido perde...
1                 Gracias cariño. Que tengas un gran día.
2       Crucify es c no lo es. Que habérmelo dicho antes.
3       Oh oh ... Wasted ... Den muz chiong en SAT N s...
4       No estoy del todo contento con lo que dice o hace
                              ...                        
2236    Quiero sexo explícito en 30 segundos? Anillo 0...
2237    PREGUNTADO SI 3Mobile 0870 canales de conversa...
2238    Tenía su contrato móvil 11 mnths? Última Motor...
2239    RECORDATORIO DE O2: Para obtener 2,50 libras e...
2240    Esta es la segunda vez que hemos intentado 2 d...
Name: message, Length: 2241, dtype: object

In [79]:
#Deja los string que cumplan las caraccteristicas de una expresion regular
#Normalizacion elemento por elemento

def Normalizacion(corpus):
    newCorpus = []
    for doc in corpus:
        #Deja las cosas que no cumpla con la expre regular
        newCorpus.append(re.sub(r'[^a-zA-Z\s{1}áéíóúüñÑÁÉÍÓÚÜ]', '', str(doc)).lower().strip().rstrip('\n').strip('\r\n'))

    return newCorpus

In [13]:
corpus = Normalizacion(corpus)
corpus

['jaja  dónde tiene el peso tan rápido perder thk muz ir  al mes den tiene efecto  gee después nos dirigimos puesto aust bk correo peso',
 'gracias cariño que tengas un gran día',
 'crucify es c no lo es que habérmelo dicho antes',
 'oh oh  wasted  den muz chiong en sat n sol liao',
 'no estoy del todo contento con lo que dice o hace',
 'audrie pésimo autocorrección',
 'no puedo recoger el teléfono en este momento pls enviar un mensaje',
 'estoy en clase recibiste mi mensaje',
 'qué estaba buscando',
 'yo sé que no oredi cerca  ú ventilador v ma',
 'e administrador construir allí podría b un poco antes  voy a llamar u cuando estoy llegando',
 'gracias por entender he estado tratando de decir que azora',
 'no sé pero estoy violando tipos en el póker',
 'me pregunto si u reunión del ed noche tmr ge',
 'carlosll estar aquí en un minuto si usted todavía necesita para comprar',
 'cómo u doin niña  la esperanza u son bien cada vez que el teléfono llamada que ure está apagado extraño u obtene

## Tokenizacion

In [14]:
def tokenizacion(corpus):
    documents = []
    for doc in corpus:
        documents.append(nlp(doc))

    return documents

In [15]:
corpus = tokenizacion(corpus)
corpus

[jaja  dónde tiene el peso tan rápido perder thk muz ir  al mes den tiene efecto  gee después nos dirigimos puesto aust bk correo peso,
 gracias cariño que tengas un gran día,
 crucify es c no lo es que habérmelo dicho antes,
 oh oh  wasted  den muz chiong en sat n sol liao,
 no estoy del todo contento con lo que dice o hace,
 audrie pésimo autocorrección,
 no puedo recoger el teléfono en este momento pls enviar un mensaje,
 estoy en clase recibiste mi mensaje,
 qué estaba buscando,
 yo sé que no oredi cerca  ú ventilador v ma,
 e administrador construir allí podría b un poco antes  voy a llamar u cuando estoy llegando,
 gracias por entender he estado tratando de decir que azora,
 no sé pero estoy violando tipos en el póker,
 me pregunto si u reunión del ed noche tmr ge,
 carlosll estar aquí en un minuto si usted todavía necesita para comprar,
 cómo u doin niña  la esperanza u son bien cada vez que el teléfono llamada que ure está apagado extraño u obtener en contacto,
 así es que me a

In [17]:
copiacorpus = corpus

In [None]:
corpus = copiacorpus

## Remover Stopwords

In [18]:
def removeStops(corpus):
    documen = []
    #Sacar todo lo que no son stopword y juntar el string completo para lematizar despues
    for doc in corpus:
        s = "" #resultado sin stopword
        for token in doc:
              if (token.is_stop == False):
                s = s + token.text + " "
        documen.append(s.strip())
    return documen

In [19]:
corpus = removeStops(corpus)
corpus

['jaja   peso rápido perder thk muz   mes den efecto   gee dirigimos puesto aust bk correo peso',
 'gracias cariño tengas',
 'crucify c habérmelo',
 'oh oh   wasted   den muz chiong sat n sol liao',
 'contento o',
 'audrie pésimo autocorrección',
 'recoger teléfono pls enviar mensaje',
 'clase recibiste mensaje',
 'buscando',
 'oredi   ú ventilador v ma',
 'e administrador construir b   a llamar u llegando',
 'gracias entender tratando azora',
 'violando tipos póker',
 'pregunto u reunión ed noche tmr ge',
 'carlosll minuto necesita comprar',
 'u doin niña   esperanza u teléfono llamada ure apagado extraño u obtener contacto',
 'alegro g desperdicia noche applebees',
 'regalos   estás tratando tiro precipicio o',
 'wan u   dun   y u dun ah   wat u comiendo',
 'enviado webadres nómina',
 'tratando llegar a éxito',
 'c película juz mah decisión minuto juz   lar tot   interés',
 'wewa 1 iriver   1 mb',
 'celebrar ny familia',
 'k enviado',
 'batería baja nena',
 'kkhow negocio',
 'er hola

## Stemming y Lematización

In [20]:
def stemmingLemmating(corpus):
    documents = tokenizacion(corpus)
    newDocs = []
    for doc in documents:
        s = ""
        for token in doc:
            s = s + token.lemma_ + " "
        newDocs.append(s.strip())
    return newDocs

In [21]:
corpus = stemmingLemmating(corpus)
corpus

['jaja    pesar rápido perder thk muz    mes dar efecto    gee dirigir poner aust bk correar pesar',
 'gracia cariño tener',
 'crucify c habérmelo',
 'oh oh    wasted    dar muz chiong sat n sol liao',
 'contentar o',
 'audrie malo autocorrección',
 'recoger teléfono pls enviar mensaje',
 'clase recibir mensaje',
 'buscar',
 'oredi    ú ventilador v ma',
 'e administrador construir b    a llamar u llegar',
 'gracia entender tratar azorar',
 'violar tipo póker',
 'preguntar u reunión ed noche tmr ge',
 'carlosll minutar necesitar comprar',
 'u doin niño    esperanzar u teléfono llamar ure apagar extrañar u obtener contactar',
 'alegrar gramo desperdiciar noche applebees',
 'regalo    estar tratar tirar precipicio o',
 'wan u    dun    y u dun ah    wat u comer',
 'enviar webadres nómina',
 'tratar llegar a éxito',
 'c película juz mah decisión minutar juz    lar tot    interés',
 'wewa 1 iriver    1 mb',
 'celebrar ny familia',
 'k enviar',
 'batería bajo nene',
 'kkhow negociar',
 'er 

## Medida TF Term Freq
Razon entre cuantas veces uaparece cada palabra en cada documento, dividido el total de palabras en documento j.

Identificar las palabras que tienen relevancia entre cada documento, determina la palabra mas utilizada, por lo que es la mas importante en el documento. El termino que parece mas ponderado. 

In [22]:
def listToString(s):
    strX = ""
    for palabra in s:
        strX = strX + palabra + " "
    return strX

In [23]:
strCorpus = listToString(corpus)
strCorpus

'jaja    pesar rápido perder thk muz    mes dar efecto    gee dirigir poner aust bk correar pesar gracia cariño tener crucify c habérmelo oh oh    wasted    dar muz chiong sat n sol liao contentar o audrie malo autocorrección recoger teléfono pls enviar mensaje clase recibir mensaje buscar oredi    ú ventilador v ma e administrador construir b    a llamar u llegar gracia entender tratar azorar violar tipo póker preguntar u reunión ed noche tmr ge carlosll minutar necesitar comprar u doin niño    esperanzar u teléfono llamar ure apagar extrañar u obtener contactar alegrar gramo desperdiciar noche applebees regalo    estar tratar tirar precipicio o wan u    dun    y u dun ah    wat u comer enviar webadres nómina tratar llegar a éxito c película juz mah decisión minutar juz    lar tot    interés wewa 1 iriver    1 mb celebrar ny familia k enviar batería bajo nene kkhow negociar er hola didnt plan ai cojear lentamente a casar seguir aa y escapar colgar hey doc pls querer camiseta esposar f

In [24]:
strCorpus = strCorpus.strip().split(' ')
strCorpus

['jaja',
 '',
 '',
 '',
 'pesar',
 'rápido',
 'perder',
 'thk',
 'muz',
 '',
 '',
 '',
 'mes',
 'dar',
 'efecto',
 '',
 '',
 '',
 'gee',
 'dirigir',
 'poner',
 'aust',
 'bk',
 'correar',
 'pesar',
 'gracia',
 'cariño',
 'tener',
 'crucify',
 'c',
 'habérmelo',
 'oh',
 'oh',
 '',
 '',
 '',
 'wasted',
 '',
 '',
 '',
 'dar',
 'muz',
 'chiong',
 'sat',
 'n',
 'sol',
 'liao',
 'contentar',
 'o',
 'audrie',
 'malo',
 'autocorrección',
 'recoger',
 'teléfono',
 'pls',
 'enviar',
 'mensaje',
 'clase',
 'recibir',
 'mensaje',
 'buscar',
 'oredi',
 '',
 '',
 '',
 'ú',
 'ventilador',
 'v',
 'ma',
 'e',
 'administrador',
 'construir',
 'b',
 '',
 '',
 '',
 'a',
 'llamar',
 'u',
 'llegar',
 'gracia',
 'entender',
 'tratar',
 'azorar',
 'violar',
 'tipo',
 'póker',
 'preguntar',
 'u',
 'reunión',
 'ed',
 'noche',
 'tmr',
 'ge',
 'carlosll',
 'minutar',
 'necesitar',
 'comprar',
 'u',
 'doin',
 'niño',
 '',
 '',
 '',
 'esperanzar',
 'u',
 'teléfono',
 'llamar',
 'ure',
 'apagar',
 'extrañar',
 'u',
 

In [25]:
setCorpus = set(strCorpus)

In [26]:
corpusCols = list(setCorpus) #Indices de las columnas
corpusRows = range(0, len(corpus))

In [27]:
def generateEmptyTF(cols, rows):
    outDF = pd.DataFrame(index = rows, columns=cols)
    outDF = outDF.fillna(0)
    return outDF

In [28]:
tfCorpus = generateEmptyTF(corpusCols, corpusRows)
tfCorpus

Unnamed: 0,Unnamed: 1,hol,maravilloso,btnacional,felicidad,babe,gal,adecuar,animar,lograr,...,didntgive,definitivo,crazy,blake,bello,adjudicar,fijar,dave,triste,ubicación
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2239,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
def calculoTF(corpus, df):
    corpus = tokenizacion(corpus)
    for index, doc in enumerate(corpus):
        bagOfWordsLen = len(doc)
        for word in doc:
            try:
                colIndex = list(df.columns).index(word.text)
                df.iloc[index, colIndex] = df.iloc[index, colIndex] + 1
            except:
                pass
        df.iloc[index, :] = df.iloc[index, :] / bagOfWordsLen
    return df

In [31]:
tfMatrix = calculoTF(corpus, tfCorpus)
tfMatrix

Unnamed: 0,Unnamed: 1,hol,maravilloso,btnacional,felicidad,babe,gal,adecuar,animar,lograr,...,didntgive,definitivo,crazy,blake,bello,adjudicar,fijar,dave,triste,ubicación
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## IDF
log(cantidad de documentos / cantidad documentos que contienen termino i)
El resultado indica el peso de elementos pocos comunes que desceriben algo particular. 

TDF: Que tan importante es para un documento en particular esa especifica palabra.

IDF: Que tan importante es esa palabra respecto a todos los documentos. 

In [34]:
def calculoIDF(df):
    N = df.shape[0]
    valX = (N / df.astype(bool).sum(axis = 0))
    idfValues = pd.Series(np.log(valX))
    return idfValues

In [35]:
corpusIDF = calculoIDF(tfMatrix)
corpusIDF

               5.517453
hol            5.412092
maravilloso    5.229771
btnacional     5.316782
felicidad      4.188317
                 ...   
adjudicar      5.149728
fijar          4.188317
dave           5.316782
triste         5.149728
ubicación      5.412092
Length: 4468, dtype: float64

## Calculo TF-IDF 
Multiplicacion de los IDF x su correspondiente de cada termino

In [36]:
tfIdfCorpus = tfMatrix.mul(corpusIDF, axis=1)
tfIdfCorpus = tfIdfCorpus.fillna(0)
tfIdfCorpus

Unnamed: 0,Unnamed: 1,hol,maravilloso,btnacional,felicidad,babe,gal,adecuar,animar,lograr,...,didntgive,definitivo,crazy,blake,bello,adjudicar,fijar,dave,triste,ubicación
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfIdfCorpus, dataset['label'], 
                                                   test_size = 0.3, random_state= 0, shuffle = True)

In [39]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1568, 4468), (673, 4468), (1568,), (673,))

In [40]:
def encode(x):
    if (x=="ham"):
        return 0
    else:
        return 1

In [41]:
y_train = list(map(encode, y_train))
y_train[0:10]

[0, 0, 0, 1, 1, 1, 1, 0, 0, 0]

In [42]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=15, random_state=0)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=15, random_state=0)

In [43]:
y_preds_rfc = rfc.predict(X_test)
y_preds_rfc

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

## Función de predicción

In [132]:
##Procesamiento de texto

def procesamiento(texto):
    str_corpus = texto
    corpus = dataset.message
    serie = pd.Series(str_corpus)
    corpus = corpus.append(serie, ignore_index = True)
    corpus = Normalizacion(corpus)
    corpus = tokenizacion(corpus)
    corpus = removeStops(corpus)
    corpus = stemmingLemmating(corpus)
    strCorpus = listToString(corpus)
    strCorpus = strCorpus.strip().split(' ')
    setCorpus = set(strCorpus)
    corpusCols = list(setCorpus) #Indices de las columnas
    corpusRows = range(0, len(corpus))
    tfCorpus = generateEmptyTF(corpusCols, corpusRows)
    tfMatrix = calculoTF(corpus, tfCorpus)
    corpusIDF = calculoIDF(tfMatrix)
    tfIdfCorpus = tfMatrix.mul(corpusIDF, axis=1)
    tfIdfCorpus = tfIdfCorpus.fillna(0)
    
    a_predecir = tfIdfCorpus.iloc[[-1]]
    tfIdfCorpus = tfIdfCorpus[:-1]
    
    return a_predecir, tfIdfCorpus
    

In [133]:
def predic(texto):
    
    a_predecir, tfIdfCorpus = procesamiento(texto)
    
    X_train, X_test, y_train, y_test = train_test_split(tfIdfCorpus, dataset['label'], 
                                                   test_size = 0.3, random_state= 0, shuffle = True)
    
    y_train = list(map(encode, y_train))
    rfc = RandomForestClassifier(max_depth=15, random_state=0)
    rfc.fit(X_train, y_train)
    prediccion = rfc.predict(a_predecir)
    if(int(prediccion[0]) == 1):
        return "spam"
    else:
        return "ham"

In [134]:
predic("Este correo es solo una prueba para saber su clasificaccion")

'ham'

## Problema #2

In [135]:
#importando datos de reviews
revDB = pd.read_csv('imdb_rev.txt', sep='\t', names=['rev', 'class'])
revDB.head()

Unnamed: 0,rev,class
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [142]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [146]:
revDB['rev']

0      A very, very, very slow-moving, aimless movie ...
1      Not sure who was more lost - the flat characte...
2      Attempting artiness with black & white and cle...
3           Very little music or anything to speak of.  
4      The best scene in the movie was when Gerardo i...
                             ...                        
743    I just got bored watching Jessice Lange take h...
744    Unfortunately, any virtue in this film's produ...
745                     In a word, it is embarrassing.  
746                                 Exceptionally bad!  
747    All in all its an insult to one's intelligence...
Name: rev, Length: 748, dtype: object

In [147]:
#Normalizacion, Tokenizacion, quitar stop words en ingles usando sklearn
#Usando CountVectorizer para convertir los documentos de texto en una matriz de tokens
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(revDB['rev'])

In [148]:
#Dividiendo el dataset 70-30
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, revDB['class'], test_size=0.3, random_state=1)

In [158]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Utilizando el algoritmo de Naive Bayes como clasificador
clf = MultinomialNB().fit(X_train, y_train)
prediccion = clf.predict(X_test)


print("Accuracy Score del modelo:", round(metrics.accuracy_score(y_test, prediccion)*100, 2))

Accuracy Score del modelo: 76.89
