In [36]:
import math
import os
from zipfile import ZipFile

In [37]:
!git clone https://github.com/pachocamacho1990/datasets

fatal: destination path 'datasets' already exists and is not an empty directory.


In [44]:
ZipFile('/content/datasets/email/plaintext/corpus1.zip', 'r').extractall()

In [45]:
data = []
clases = []

In [47]:
#leyendo spam
for file in os.listdir('/content/corpus1/spam'):
  with open('/content/corpus1/spam/'+file, encoding='latin-1') as f:
    data.append(f.read())
    clases.append('spam')

In [48]:
#leyendo ham
for file in os.listdir('/content/corpus1/ham'):
  with open('/content/corpus1/ham/'+file, encoding='latin-1') as f:
    data.append(f.read())
    clases.append('ham')

In [49]:
len(clases), len(data)

(5172, 5172)

In [50]:
#usando spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [51]:
print([t.text for t in tokenizer(data[0])])

['Subject:', 'a', 'more', 'radiant', 'you', '\n', 'act', 'now', '.', 'get', 'your', '\n', 'free', 'trial', 'offer', 'of', 'the', 'revelotionary', '\n', 'new', 'anti', '-', 'wrinkle', '\n', 'complex', 'by', 'derma', 'radiant', 'of', 'beverly', 'hills', '.', '\n', '2', '-', 'step', 'miracle', ':', '\n', 'anti', '-', 'wrinkle', 'complext', '\n', 'ageless', 'eyest', '\n', '.', 'reduce', 'fine', 'lines', 'and', 'deep', 'wrinkles', 'by', '98', '%', '.', '\n', '.', 'increase', 'collagen', 'synthesis', 'by', '350', '%', '.', '\n', '.', 'remove', 'dark', 'circles', 'and', 'puffiness', 'under', 'the', 'eyes', '.', '\n', 'dermaradiant', '\n', '468', 'north', 'camden', 'drive', '2', 'nd', 'floor', '\n', 'beverly', 'hills', ',', 'ca', '90210', '\n', '1', '-', '800', '-', '859', '-', '3265', '\n', 'cs', '@', 'dermaradiant', '.', 'com', '\n', 'www', '.', 'dermaradiant', '.', 'com', '\n', 'if', 'you', 'would', 'rather', 'not', 'receive', 'further', 'emails', 'from', 'derma', 'radiant', 'please', 'go',

### Clase principal para el algoritmo

Recuerda que la clase más probable viene dada por (en espacio de cómputo logarítmico): 


$$\hat{c} = {\arg \max}_{(c)}\log{P(c)}
 +\sum_{i=1}^n
\log{ P(f_i \vert c)}
$$

Donde, para evitar casos atípicos, usaremos el suavizado de Laplace así:

$$
P(f_i \vert c) = \frac{C(f_i, c)+1}{C(c) + \vert V \vert}
$$

siendo $\vert V \vert$ la longitud del vocabulario de nuestro conjunto de entrenamiento. 


In [120]:
import numpy as np

class NaiveBayesClassifier():

  nlp = English()
  tokenizer = Tokenizer(nlp.vocab)


  def tokenize(self, doc):
    return  [t.text.lower() for t in tokenizer(doc)]

  def word_counts(self, words):
    wordCount = {}
    for w in words:
      if w in wordCount.keys():
        wordCount[w] +=1
      else:
        wordCount[w] = 1

    return wordCount

  def fit(self, data, clases):
    n = len(data)
    self.unique_clases = set(clases)
    self.vocab = set()
    self.classCount = {} #C(c)
    self.log_classPriorProb = {} #P(c)
    self.wordConditionalCounts = {} #C(w|c)
    #conteos de clases
    for c in clases:
      if c in self.classCount.keys():
        self.classCount[c] +=1
      else:
        self.classCount[c] =1

      #claculo de P(c)
    for c in self.classCount.keys():
      self.log_classPriorProb[c] = math.log(self.classCount[c]/n)
      self.wordConditionalCounts[c] = {}
      #calculo de probabilidades condicionales P(w|c)

    for text, c in zip(data, clases):
      counts = self.word_counts(self.tokenize(text))
      for word , count in counts.items():
        if word not in self.vocab:
          self.vocab.add(word)
        if word not in self.wordConditionalCounts[c]:
          self.wordConditionalCounts[c][word] = 0.0
        self.wordConditionalCounts[c][word] += count
    
  def predict(self, data):
    results = []
    for text in data:
      words = set(self.tokenize(text))
      scoreProb = {}
      for word in words: 
        if word not in self.vocab: continue #ignoramos palabras nuevas
        #suavizado Laplaciano para P(w|c)
        for c in self.unique_clases:
          log_wordClassProb = math.log(
              (self.wordConditionalCounts[c].get(word, 0.0)+1)/(self.classCount[c]+len(self.vocab)))
          scoreProb[c] = scoreProb.get(c, self.log_classPriorProb[c]) + log_wordClassProb
      arg_maxprob = np.argmax(np.array(list(scoreProb.values())))
      results.append(list(scoreProb.keys())[arg_maxprob])
    return results

In [132]:
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [122]:
data_train, data_test, clases_train, clases_test = train_test_split(data,clases, test_size=0.10, random_state=42)

In [123]:
classifier = NaiveBayesClassifier()

In [127]:
classifier.fit(data_train,clases_train)

In [128]:
clases_predict = classifier.predict(data_test)

In [130]:
accuracy_score(clases_test, clases_predict)

0.862934362934363

In [133]:
precision_score(clases_test, clases_predict, average=None, zero_division=1)

array([0.83640553, 1.        ])

In [134]:
recall_score(clases_test, clases_predict, average=None, zero_division=1)

array([1.        , 0.54193548])