## Detección de Spam - Laboratorio 02

Sara Zavala  18893   
Alexa Bravo  18831  

In [1]:
#Librerias que se van a utilizar. 
import pandas as pd
import numpy as np
import  re

import neattext as nt
import unicodedata
import nltk
from sklearn import metrics, model_selection, tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Parte 1 – Ingeniería de características

#### Exploración de datos 

In [2]:
# Load the first dataset and discard the leftover index column in one chain.
df1 = pd.read_csv('completeSpamAssassin.csv').drop(columns=["Unnamed: 0"])
df1.head()

Unnamed: 0,Body,Label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1


In [3]:
# Load the second dataset and discard both leftover index columns in one chain.
df2 = pd.read_csv('enronSpamSubset.csv').drop(
    columns=["Unnamed: 0", "Unnamed: 0.1"]
)
df2.head()

Unnamed: 0,Body,Label
0,Subject: stock promo mover : cwtd\n * * * urge...,1
1,Subject: are you listed in major search engine...,1
2,"Subject: important information thu , 30 jun 20...",1
3,Subject: = ? utf - 8 ? q ? bask your life with...,1
4,"Subject: "" bidstogo "" is places to go , things...",1


In [4]:
# Stack both datasets into a single frame.
# ignore_index=True rebuilds a unique 0..n-1 index. Without it the row labels
# of df1 and df2 overlap (both start at 0), which makes label-based selection
# and index-aligned assignment on dfTotal ambiguous.
dfTotal = pd.concat([df1, df2], ignore_index=True)
dfTotal.head()

Unnamed: 0,Body,Label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1


#### Preprocesamiento

In [5]:
# Work on the e-mail text column; the cleaned result is written back to dfTotal later.
body = dfTotal["Body"]

In [6]:
# Convert all text to lowercase.
body = body.str.lower()
body.head()

0    \nsave up to 70% on life insurance.\nwhy spend...
1    1) fight the risk of cancer!\nhttp://www.adcli...
2    1) fight the risk of cancer!\nhttp://www.adcli...
3    ##############################################...
4    i thought you might like these:\n1) slim down ...
Name: Body, dtype: object

In [7]:
# Strip leading and trailing whitespace (interior whitespace is left untouched).
body = body.str.strip()
body.head()

0    save up to 70% on life insurance.\nwhy spend m...
1    1) fight the risk of cancer!\nhttp://www.adcli...
2    1) fight the risk of cancer!\nhttp://www.adcli...
3    ##############################################...
4    i thought you might like these:\n1) slim down ...
Name: Body, dtype: object

In [8]:
def QuitarAcentos(texto):
    """Return ``texto`` as an ASCII-only string with accents removed.

    The value is converted with ``str()``, decomposed with NFKD so accented
    letters split into a base letter plus combining marks, and then every
    character that cannot be encoded as ASCII is dropped.
    """
    descompuesto = unicodedata.normalize('NFKD', str(texto))
    solo_ascii = descompuesto.encode('ascii', 'ignore')
    return solo_ascii.decode('utf-8', 'ignore')

In [9]:
# Remove accents from every message body.
body = body.apply(QuitarAcentos)
body.head()

0    save up to 70% on life insurance.\nwhy spend m...
1    1) fight the risk of cancer!\nhttp://www.adcli...
2    1) fight the risk of cancer!\nhttp://www.adcli...
3    ##############################################...
4    i thought you might like these:\n1) slim down ...
Name: Body, dtype: object

In [10]:
def limpiar(texto, removerDigitos = False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        texto: String to clean.
        removerDigitos: When True, digits are removed as well.

    Returns:
        The cleaned string.
    """
    patron = r'[^a-zA-Z\s]' if removerDigitos else r'[^a-zA-Z0-9\s]'
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing them positionally capped the number of
    # replacements at 258 per string instead of setting IGNORECASE/ASCII.
    return re.sub(patron, '', texto, flags=re.I | re.A)

In [11]:
# Remove special characters from every message body (digits are kept by default).
body = body.apply(limpiar)
body.head()

0    save up to 70 on life insurance\nwhy spend mor...
1    1 fight the risk of cancer\nhttpwwwadclickwspc...
2    1 fight the risk of cancer\nhttpwwwadclickwspc...
3    \n                                            ...
4    i thought you might like these\n1 slim down  g...
Name: Body, dtype: object

In [12]:
#Quitamos las contractions.
#import contractions
#body = body.apply(contractions.fix)

In [13]:
# Download the NLTK stop-word corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
# English stop-word list; normDoc uses it to filter tokens.
stop_words = nltk.corpus.stopwords.words('english')

In [15]:
# Write the cleaned text back into the combined DataFrame.
dfTotal["Body"] = body
dfTotal.head()

Unnamed: 0,Body,Label
0,save up to 70 on life insurance\nwhy spend mor...,1
1,1 fight the risk of cancer\nhttpwwwadclickwspc...,1
2,1 fight the risk of cancer\nhttpwwwadclickwspc...,1
3,\n ...,1
4,i thought you might like these\n1 slim down g...,1


#### Representación de Texto

In [16]:
# Download the NLTK 'punkt' package before tokenizing with nltk.word_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
def normDoc(doc):
    """Tokenize a document and drop stop words.

    Args:
        doc: Text of a single document.

    Returns:
        The tokens that are not in the module-level ``stop_words``
        collection, joined by single spaces.
    """
    # Membership tests against a set are constant time; scanning the
    # stop-word list for every token was needlessly linear.
    stop_set = set(stop_words)
    kept = [token for token in nltk.word_tokenize(doc) if token not in stop_set]
    return ' '.join(kept)

In [18]:
# Tokenize each body and drop stop words, overwriting the column.
dfTotal['Body'] = dfTotal['Body'].apply(normDoc)
dfTotal.head()

Unnamed: 0,Body,Label
0,save 70 life insurance spend tolife quote savi...,1
1,1 fight risk cancer httpwwwadclickwspcfmo315sp...,1
2,1 fight risk cancer httpwwwadclickwspcfmo315sp...,1
3,adult club offers free membership instant acce...,1
4,thought might like 1 slim guaranteed lose 1012...,1


In [19]:
# Bag-of-words model: count term occurrences per document and densify the
# result in a single chained expression.
cv = CountVectorizer(min_df=0.1, max_df=0.9)
cv_matrix = cv.fit_transform(dfTotal['Body']).toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 2, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [20]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    vocabulario = cv.get_feature_names_out()
else:
    vocabulario = cv.get_feature_names()
cv_df = pd.DataFrame(cv_matrix, columns=vocabulario)
cv_df.head()

Unnamed: 0,10,2000,2002,also,best,business,cc,click,com,company,...,thanks,time,today,us,use,want,way,well,work,would
0,0,0,0,0,3,0,0,2,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,3,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [21]:
# Bag of n-grams model (unigrams and bigrams).
bv = CountVectorizer(ngram_range=(1,2), min_df=0.1, max_df=0.9)
bv_matrix = bv.fit_transform(dfTotal['Body'])
bv_matrix = bv_matrix.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(bv, "get_feature_names_out"):
    vocabulario = bv.get_feature_names_out()
else:
    vocabulario = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocabulario)

Unnamed: 0,10,2000,2002,also,best,business,cc,click,com,company,...,thanks,time,today,us,use,want,way,well,work,would
0,0,0,0,0,3,0,0,2,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,3,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16041,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16042,5,0,0,0,0,0,5,0,0,0,...,2,5,0,3,0,0,0,0,0,0
16043,1,2,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
16044,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [22]:
# TF-IDF model.
tv = TfidfVectorizer(min_df=0.1, max_df=0.9, use_idf=True)
tv_matrix = tv.fit_transform(dfTotal['Body'])
tv_matrix = tv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tv, "get_feature_names_out"):
    vocabulario = tv.get_feature_names_out()
else:
    vocabulario = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocabulario)

Unnamed: 0,10,2000,2002,also,best,business,cc,click,com,company,...,thanks,time,today,us,use,want,way,well,work,would
0,0.00,0.00,0.0,0.00,0.5,0.0,0.00,0.34,0.00,0.00,...,0.00,0.0,0.0,0.15,0.00,0.00,0.0,0.0,0.00,0.00
1,0.00,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,...,0.00,0.0,0.0,0.00,0.32,0.00,0.0,0.0,0.00,0.00
2,0.00,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,...,0.00,0.0,0.0,0.00,0.43,0.00,0.0,0.0,0.00,0.00
3,0.00,0.00,0.0,0.00,0.2,0.0,0.00,0.07,0.00,0.00,...,0.00,0.0,0.0,0.06,0.00,0.00,0.0,0.0,0.00,0.00
4,0.00,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,...,0.00,0.0,0.0,0.00,0.33,0.00,0.0,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16041,0.56,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,...,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.00,0.00
16042,0.35,0.00,0.0,0.00,0.0,0.0,0.38,0.00,0.00,0.00,...,0.14,0.3,0.0,0.19,0.00,0.00,0.0,0.0,0.00,0.00
16043,0.21,0.48,0.0,0.00,0.0,0.0,0.23,0.00,0.00,0.00,...,0.00,0.0,0.0,0.00,0.00,0.22,0.0,0.0,0.23,0.00
16044,0.00,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.00,...,0.54,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.00,0.00


### Parte 2 – Implementación del Modelo 

#### Separación de Datos

In [24]:
# Bag-of-words features: hold out 30% of the rows for testing.
(bog_train, bog_test,
 bog_target_train, bog_target_test) = model_selection.train_test_split(
    cv_df,
    dfTotal['Label'],
    test_size=0.3,
    random_state=31,
)

In [26]:
# TF-IDF features: split the dense matrix directly, with the same
# test size and seed as the other splits.
(tfid_train, tfid_test,
 tfid_target_train, tfid_target_test) = model_selection.train_test_split(
    tv_matrix,
    dfTotal['Label'],
    test_size=0.3,
    random_state=31,
)

In [28]:
# N-gram features: last of the three splits, with the same test size and seed.
(ngram_train, ngram_test,
 ngram_target_train, ngram_target_test) = model_selection.train_test_split(
    bv_matrix,
    dfTotal['Label'],
    test_size=0.3,
    random_state=31,
)

#### Implementación

In [None]:
#N-GRAMS


#### Conclusión 

##### ¿Qué representación numérica produjo el mejor resultado? 