<a href="https://colab.research.google.com/github/YakiVS/Proyectos-pr-cticos---Machine-Learning/blob/main/Detecci%C3%B3n_de_SPAN_Regresi%C3%B3n_Log%C3%ADstica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Funciones para limpiar los correos electrónicos y obtener un formato limpio y estructurado

In [127]:
# Clase que facilita el procesamiento de correos electrónicos que contienen código HTML
from html.parser import HTMLParser

In [128]:
class MLStripper(HTMLParser):
  """HTML parser that discards the markup and keeps only the text nodes.

  Feed it HTML with ``feed()`` and read the collected text with
  ``get_data()``.
  """

  def __init__(self):
    # Let the base class build all of its internal parser state instead of
    # relying on reset() alone. With convert_charrefs enabled, character
    # references such as "&amp;" reach handle_data() already converted.
    super().__init__(convert_charrefs=True)
    self.strict = False  # legacy attribute, preserved for backward compatibility
    self.fed = []  # text fragments, in the order they were encountered

  def handle_data(self, d):
    """Store one run of text found between tags."""
    self.fed.append(d)

  def get_data(self):
    """Return all the text collected so far as a single string."""
    return ''.join(self.fed)


In [129]:
# En esta función se eliminan los Tags HTML de los correos electrónicos
def strip_tags(html):
  """Return the text content of ``html`` with every tag removed.

  Args:
    html: Markup to clean, as a string.

  Returns:
    The concatenation of the text found between the tags.
  """
  s = MLStripper()
  s.feed(html)
  # feed() may hold back trailing text while it waits for more input (for
  # example after an unterminated "&"); close() forces that buffered text
  # to be processed so it is not silently lost.
  s.close()
  return s.get_data()

In [None]:
# Example: removing the tags from an HTML fragment, keeping only its text
t = '<tr><td align="left"><a href="../../issues/51/16.html#article">Phrack World News</a></td>'
strip_tags(t)

# Proceso de Stemming

In [131]:
import email
import string
import nltk

class Parser:
  """Turn a raw e-mail file into stemmed tokens with stop words removed."""

  def __init__(self):
    self.stemmer = nltk.PorterStemmer()
    # A set keeps the stop-word membership test in tokenize() cheap.
    self.stopwords = set(nltk.corpus.stopwords.words('english'))
    # Single characters that tokenize() deletes from the text.
    self.punctuation = list(string.punctuation)

  def parse(self, email_path):
    """Parse the e-mail stored at ``email_path``.

    The file is opened with ``errors='ignore'``, so undecodable bytes are
    dropped instead of raising.

    Returns:
      The dict built by ``get_email_content``, or ``None`` when the parsed
      message object evaluates as false.
    """
    with open(email_path, errors='ignore') as e:
      msg = email.message_from_file(e)
    return None if not msg else self.get_email_content(msg)

  def get_email_content(self, msg):
    """Extract the tokenized subject, tokenized body and content type.

    Returns:
      A dict with the keys ``"subject"`` (list of tokens, empty when the
      message has no subject), ``"body"`` (list of tokens) and
      ``"content_type"`` (string reported by the message).
    """
    content_type = msg.get_content_type()
    subject = self.tokenize(msg['Subject']) if msg['Subject'] else []
    body = self.get_email_body(msg.get_payload(), content_type)

    # Returning the content of the email
    return {
        "subject": subject,
        "body": body,
        "content_type": content_type
    }

  def get_email_body(self, payload, content_type):
    """Extract the body tokens of a message or message part.

    A list payload (multipart message) is walked recursively and the tokens
    of every part are concatenated. A string payload is tokenized only when
    its content type is ``text/plain`` or ``text/html`` (tags are stripped
    first for HTML). Anything else yields an empty list.
    """
    if isinstance(payload, list):
      body = []
      for part in payload:
        body += self.get_email_body(part.get_payload(),
                                    part.get_content_type())
      return body
    if isinstance(payload, str):
      if content_type == 'text/plain':
        return self.tokenize(payload)
      if content_type == 'text/html':
        return self.tokenize(strip_tags(payload))
    return []

  def tokenize(self, text):
    """Split ``text`` into stemmed tokens, dropping stop words.

    Punctuation characters are deleted (not replaced by a space), tabs and
    newlines become spaces, and the text is then split on single spaces.
    Stop words are filtered before stemming, with an exact-match test.
    """
    # One translation pass replaces the chain of str.replace() calls.
    table = dict.fromkeys(map(ord, self.punctuation))
    table[ord("\t")] = " "
    table[ord("\n")] = " "
    text = text.translate(table)
    tokens = filter(None, text.split(" "))  # drop empty strings

    # Stemming of the tokens
    return [self.stemmer.stem(w) for w in tokens if w not in self.stopwords]

# Lectura de un correo en formato raw

In [None]:
# Read the raw e-mail; the context manager closes the file handle once the
# content has been loaded.
with open("/content/data/inmail.1") as raw_mail_file:
    inmail = raw_mail_file.read()
print(inmail)

# Parsing del correo electrónico

In [None]:
import nltk
# Download the NLTK stop-word corpus that Parser.__init__ loads.
nltk.download('stopwords')


In [None]:
# Parse one raw e-mail into its tokenized subject, tokenized body and
# content type.
p = Parser()
p.parse("/content/data/inmail.1")

# Lectura del índice

In [None]:
# NOTE(review): this cell is meant to show the index, yet it reads the e-mail
# file "inmail.1"; the index used elsewhere is "/content/data/index" — confirm
# which path is intended.
# The context manager closes the file handle after the lines are read.
with open("/content/data/inmail.1") as index_file:
    index = index_file.readlines()
index

In [136]:
def parse_index(path_to_index, n_elements):
  """Read up to ``n_elements`` entries from the index file.

  Each line is split on ``'../'``: the text before it is the label (kept
  exactly as written, including any trailing space) and the text after it is
  the e-mail path.

  Args:
    path_to_index: Path of the index file.
    n_elements: Maximum number of lines to read. Fewer entries are returned
      when the file is shorter.

  Returns:
    A list of ``{'label': ..., 'email_path': ...}`` dicts, in file order.

  Raises:
    IndexError: If one of the lines read does not contain ``'../'``.
  """
  ret_indexes = []
  # Stream the file so only the requested lines are read, and make sure the
  # handle is closed afterwards.
  with open(path_to_index) as index_file:
    for i, line in enumerate(index_file):
      if i >= n_elements:
        break
      mail = line.split('../')
      label = mail[0]
      # Strip only the line terminator, so a final line without a trailing
      # newline keeps the last character of its path.
      path = mail[1].rstrip('\n')
      ret_indexes.append({'label': label, 'email_path': path})
  return ret_indexes

In [137]:
def parse_email(index):
  """Parse the e-mail referenced by one index entry.

  Args:
    index: Mapping with the keys ``'email_path'`` and ``'label'``.

  Returns:
    A ``(parsed_mail, label)`` tuple, where ``parsed_mail`` is whatever
    ``Parser.parse`` returns for the entry's path.
  """
  parsed_mail = Parser().parse(index['email_path'])
  label = index['label']
  return parsed_mail, label

In [None]:
# Load the first 10 index entries (label and e-mail path) and display them.
indexes = parse_index("/content/data/index",10)
indexes

# Procesamiento de todo el conjunto de datos

In [139]:
# Load the index (labels and e-mail paths) into memory; only the first entry
# is requested here.
index=parse_index('/content/data/index',1)

In [None]:
# Read the first e-mail
import os

# The context manager closes the file handle once the content is loaded.
with open(index[0]["email_path"]) as first_mail_file:
    first_mail_raw = first_mail_file.read()
first_mail_raw

In [None]:
# Parse the first e-mail and show its label and parsed content
mail, label = parse_email(index[0])
print("El correo es:", label)
print(mail)

# Aplicación del vector CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Prepare the e-mail as a single text string. Subject and body tokens are
# joined in one pass so the last subject token and the first body token are
# separated by a space instead of being fused into one word.
prep_email = [" ".join(mail['subject'] + mail['body'])]

vectorizer = CountVectorizer()
# NOTE: fit() learns the vocabulary and returns the vectorizer itself, so X
# is not a feature matrix at this point.
X = vectorizer.fit(prep_email)

print("Email:", prep_email, "\n")
print("Características de entrada:", vectorizer.get_feature_names_out())

In [None]:
# Encode the prepared e-mail with the fitted vectorizer and print the
# resulting matrix as a dense array.
X = vectorizer.transform(prep_email)
print("\nValues:\n", X.toarray())

# Aplicación de OneHotEncoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Each token (subject first, then body) becomes its own one-column sample.
prep_email = [[w] for w in mail['subject'] + mail['body']]

enc = OneHotEncoder(handle_unknown='ignore')
X = enc.fit_transform(prep_email)

# Print the learned feature names and the encoded tokens as a dense array.
print("Features:\n", enc.get_feature_names_out())
print("\nValues:\n", X.toarray())

In [145]:
def create_prep_dataset(index_path, n_elements):
    """Build the text documents and labels for the indexed e-mails.

    Args:
        index_path: Path of the index file, passed to ``parse_index``.
        n_elements: Number of index entries to request from ``parse_index``.

    Returns:
        A tuple ``(X, y)``: ``X`` is a list with one space-joined token string
        per e-mail and ``y`` the matching list of labels. E-mails that fail
        to parse are skipped, so both lists may be shorter than
        ``n_elements``.
    """
    X = []
    y = []
    indexes = parse_index(index_path, n_elements)
    for i, entry in enumerate(indexes):
        print("\rParsing email: {0}".format(i+1), end='')
        try:
            mail, label = parse_email(entry)
            # Join subject and body tokens in one pass so the last subject
            # token is not fused with the first body token.
            text = " ".join(mail['subject'] + mail['body'])
        except Exception:
            # Best effort: skip e-mails that cannot be read or parsed, while
            # still letting KeyboardInterrupt/SystemExit propagate.
            continue
        X.append(text)
        y.append(label)
    return X, y

# Entrenamiento del algoritmo

In [None]:
# Read only a subset of 100 e-mails
X_train, y_train = create_prep_dataset("/content/data/index",100)
X_train

In [None]:
import numpy as np
# Count how many training e-mails carry each distinct label.
np.unique(y_train, return_counts=True)

## Aplicamos vectorización a los datos

In [148]:
# Fit a fresh vectorizer on the training documents and replace X_train with
# the resulting document-term matrix.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)

In [None]:
# Print the training matrix as a dense array and the number of features.
print(X_train.toarray())
print("\nFeatures:", len(vectorizer.get_feature_names_out()))

In [None]:
import pandas as pd

# Show the training matrix with one column per vocabulary term. The feature
# names are passed directly (not wrapped in a list) so the columns form a
# flat index rather than a one-level MultiIndex.
pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Display the training labels.
y_train

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier (default settings) on the
# vectorized training e-mails.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predicción


In [None]:
# Read 150 e-mails from the dataset and keep only those after the first 100
# These e-mails were not used to train the algorithm
# The index path matches the one used for training; the previous
# "datasets/trec07p/full/index" is not used anywhere else in this notebook.
# NOTE: create_prep_dataset skips e-mails that fail to parse, so X may hold
# fewer than 150 items.
X, y = create_prep_dataset("/content/data/index", 150)
X_test = X[100:]
y_test = y[100:]

In [None]:
# Encode the test e-mails with the vectorizer fitted earlier
X_test = vectorizer.transform(X_test)

In [None]:
# Predict the label of each test e-mail
y_pred = clf.predict(X_test)
y_pred

In [None]:
# Print the predicted labels next to the true labels for comparison.
print("Predicción:\n", y_pred)
print("\nEtiquetas reales:\n", y_test)

In [None]:
# Evaluate the results
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the true test labels, 3 decimals.
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))

# Incrementando el conjunto de datos

In [None]:
# Read 12000 e-mails
# The index path matches the one used in the rest of the notebook; the
# previous "datasets/trec07p/full/index" is not used anywhere else here.
X, y = create_prep_dataset("/content/data/index", 12000)

In [None]:
# Use the first 10000 e-mails to train the algorithm and the remaining ones
# for testing
X_train, y_train = X[:10000], y[:10000]
X_test, y_test = X[10000:], y[10000:]

In [None]:
# Fit a fresh vectorizer on the larger training set and replace X_train with
# its document-term matrix.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)

In [None]:
# Train a new logistic regression classifier on the larger training set.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Encode the test e-mails with the vectorizer fitted on the training set.
X_test = vectorizer.transform(X_test)

In [None]:
# Predict the label of each test e-mail.
y_pred = clf.predict(X_test)

In [None]:
# Accuracy of the predictions against the true test labels, 3 decimals.
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))