In [20]:
# Data
import numpy as np
import pandas as pd

# NLP
import re

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [3]:
data = pd.read_csv('spam.csv', encoding='latin-1')

In [4]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
data.drop([data.columns[col] for col in [2, 3, 4]], axis=1, inplace=True)

In [7]:
encoder = LabelEncoder()

data['v1'] = encoder.fit_transform(data['v1'])
class_mappings = {index: label for index, label in enumerate(encoder.classes_)}

In [8]:
class_mappings

{0: 'ham', 1: 'spam'}

In [22]:
# Aquí limpiamos los datos, los tokenizamos y los regresamos como una lista de palabras
def processEmail(contents):
    ps = PorterStemmer()
    
    contents = contents.lower()
    contents = re.sub(r'<[^<>]+>', ' ', contents)
    contents = re.sub(r'[0-9]+', 'number', contents)
    contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', contents)
    contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', contents)
    contents = re.sub(r'[$]+', 'dollar', contents)
    
    words = word_tokenize(contents)
    
    for i in range(len(words)):
        words[i] = re.sub(r'[^a-zA-Z0-9]', '', words[i])
        words[i] = ps.stem(words[i])
        
    words = [word for word in words if len(word) >= 1]
    
    return words

In [23]:
# Aquí recibimos la lista de las palabras usadas en todos los emails 
# y creamos un vocabulario con las palabras más frecuentes
def getVocabulary(emails, vocab_length):
    vocabulary = dict()
    
    for i in range(len(emails)):
        emails[i] = processEmail(emails[i])
        for word in emails[i]:
            if word in vocabulary.keys():
                vocabulary[word] += 1
            else:
                vocabulary[word] = 1
                
    vocabulary = sorted(vocabulary.items(), key=lambda x: x[1], reverse=True)
    vocabulary = list(map(lambda x: x[0], vocabulary[0:vocab_length]))
    vocabulary = {index: word for index, word in enumerate(vocabulary)}
    
    return vocabulary

In [24]:
# Get a dictionary key given a value
# Esto es para el modelo de Machine Learning
def getKey(dictionary, val):
    for key, value in dictionary.items():
        if value == val:
            return key

In [25]:
# Get the indices of vocab words used in a given email
# Esto es para el modelo de Machine Learning
def getIndices(email, vocabulary):
    word_indices = set()
    
    for word in email:
        if word in vocabulary.values():
            word_indices.add(getKey(vocabulary, word))
    
    return word_indices

In [26]:
# Esto es para el modelo de Machine Learning
def getFeatureVector(word_indices, vocab_length):
    feature_vec = np.zeros(vocab_length)
    
    for i in word_indices:
        feature_vec[i] = 1
        
    return feature_vec

In [27]:
vocab_length = 2000

In [35]:
import nltk
nltk.download('punkt', download_dir='C:\\nltk_data')

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [36]:
vocabulary = getVocabulary(data['v2'].to_list(), vocab_length)

emails = data['v2'].to_list()
emails = list(map(lambda x: processEmail(x), emails))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\eduar/nltk_data'
    - 'c:\\Users\\eduar\\anaconda3\\envs\\FraudDetection\\nltk_data'
    - 'c:\\Users\\eduar\\anaconda3\\envs\\FraudDetection\\share\\nltk_data'
    - 'c:\\Users\\eduar\\anaconda3\\envs\\FraudDetection\\lib\\nltk_data'
    - 'C:\\Users\\eduar\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [19]:
vocabulary

{0: 'i',
 1: 'number',
 2: 'to',
 3: 'you',
 4: 'a',
 5: 'the',
 6: 'u',
 7: 'and',
 8: 'it',
 9: 'is',
 10: 'in',
 11: 'me',
 12: 'my',
 13: 'for',
 14: 'your',
 15: 'call',
 16: 'have',
 17: 'do',
 18: 'that',
 19: 'of',
 20: 's',
 21: 'on',
 22: 'are',
 23: 'now',
 24: 'so',
 25: 'go',
 26: 'get',
 27: 'not',
 28: 'but',
 29: 'be',
 30: 'or',
 31: 'm',
 32: 'can',
 33: 'at',
 34: 'we',
 35: 'will',
 36: 'if',
 37: 'ur',
 38: 'with',
 39: 'nt',
 40: 'just',
 41: 'no',
 42: 'thi',
 43: 'how',
 44: 'gt',
 45: 'lt',
 46: 'up',
 47: 'what',
 48: 'come',
 49: 'when',
 50: 'ok',
 51: 'from',
 52: 'free',
 53: 'know',
 54: 'all',
 55: 'out',
 56: 'like',
 57: 'got',
 58: 'love',
 59: 'day',
 60: 'time',
 61: 'wa',
 62: 'want',
 63: 'good',
 64: 'then',
 65: 'll',
 66: 'there',
 67: 'he',
 68: 'text',
 69: 'am',
 70: 'onli',
 71: 'send',
 72: 'hi',
 73: 'need',
 74: 'one',
 75: 'txt',
 76: 'as',
 77: 'today',
 78: 'see',
 79: 'by',
 80: 'take',
 81: 'think',
 82: 'about',
 83: 'she',
 84: 'd

### Esto es para el entrenamiento de un modelo de Machine Learning 

In [15]:
X = list(map(lambda x: getFeatureVector(getIndices(x, vocabulary), vocab_length), emails))
X = pd.DataFrame(np.array(X).astype(np.int16))

In [None]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,1,0,1,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
