In [1]:
# Esta clase facilita el preprocesamiento de correos electrónicos que poseen 
# código HTML
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict=False
        self.convert_charrefs=False
        self.fed=[]

    def handle_data(self,d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)

In [2]:
# Esta función se encarga de elimar los tags HTML que se 
# encuentren en el texto del correo electrónico
def strip_tags(html):
    s=MLStripper()
    s.feed(html)
    return s.get_data()

In [3]:
import email
import string 
import nltk
from nltk.stem import PorterStemmer

class parser:
    def __init__(self) -> None:
        self.stemmer =PorterStemmer()
        self.stopwords=set(nltk.corpus.stopwords.words('english'))
        self.punctuation =list(string.punctuation)

    def parse(self,email_path):
        with open(email_path,errors='ignore') as e:
            msg=email.message_from_file(e)
            return None if not msg else self.get_email_content(msg)
        
    def get_email_content(self,msg):
        subject=self.tokenize(msg['Subject']) if msg['Subject'] else []
        body= self.get_email_body(msg.get_payload(),msg.get_content_type())
        content_type=msg.get_content_type()
        return {"Subject":subject,"Body":body,"content_type":content_type}
    
    def get_email_body(self,payload,content_type):
        body=[]
        if type(payload) is str and content_type=="text/plain":
            return self.tokenize(payload)
        elif type(payload) is str and content_type=="text/html":
            return self.tokenize(strip_tags(payload))
        elif type(payload) is list:
            for p in payload:
                body+=self.get_email_body(p.get_payload(),p.get_content_type())
        return body
    
    def tokenize(self,text):
        for c in self.punctuation:
            text = text.replace(c,"")
        text =text.replace("\t"," ")
        text =text.replace("\n"," ")
        tokens = list(filter(None,text.split(" ")))
        return [self.stemmer.stem(w) for w in tokens if w not in self.stopwords]

In [4]:
inmail=open("C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\Correos\\Coursera1.eml").read()
print(inmail)

Received: from MW4P221MB0975.NAMP221.PROD.OUTLOOK.COM (2603:10b6:303:207::9)
 by IA2P221MB1374.NAMP221.PROD.OUTLOOK.COM with HTTPS; Mon, 19 Aug 2024
 22:05:28 +0000
Received: from PH7PR13CA0011.namprd13.prod.outlook.com (2603:10b6:510:174::26)
 by MW4P221MB0975.NAMP221.PROD.OUTLOOK.COM (2603:10b6:303:207::9) with
 Microsoft SMTP Server (version=TLS1_2,
 cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.7875.22; Mon, 19 Aug
 2024 22:05:25 +0000
Received: from SN1PEPF00036F3E.namprd05.prod.outlook.com
 (2603:10b6:510:174:cafe::49) by PH7PR13CA0011.outlook.office365.com
 (2603:10b6:510:174::26) with Microsoft SMTP Server (version=TLS1_2,
 cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.7897.13 via Frontend
 Transport; Mon, 19 Aug 2024 22:05:25 +0000
Authentication-Results: spf=pass (sender IP is 192.174.83.11)
 smtp.mailfrom=t.mail.coursera.org; dkim=pass (signature was verified)
 header.d=t.mail.coursera.org;dmarc=pass action=none
 header.from=t.mail.coursera.org;compauth=pas

In [5]:
p=parser()
p.parse("C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\Correos\\Coursera1.eml")

{'Subject': ['utf8brmvsawnpdgfjaw9uzxmuimkhvhugq2vydglmawnhzg8gzxn0w6eg',
  'utf8bbglzdg8h'],
 'Body': ['welcom',
  'coursera',
  '0d0ahttpseventingcourseraorgredirectsig',
  'nedeyjrzxkioijlbwfpbc5saw5rlm9wzw4ilcj2ywx1zsi6eyj1cmwioijodhrwczovl3d3dy5',
  'jb3vyc2vyys5vcmcdxrtx21lzgl1bt1lbwfpbcz1dg1fc291cmnlpw90agvyjnv0bv9jyw1wyw',
  'lnbj1jb3vyc2vdb21wbgv0aw9ufllfakd3zg8wrwuyae5ssndzbxzon3cilcj0cmfja2luzyi6',
  'yj1c2vyswqioje1njg4ntm2ncwidxnlckvtywlsijoiew9lbhjtmtdaag90bwfpbc5jb20ilcju',
  'b3rpzmljyxrpb25uexblijoidmvyawzpzwrfy2vydglmawnhdguuy29uz3jhdhmilcjjyw1wywl',
  'nbii6im9uzgvtyw5klnzlcmlmawvkq2vydglmawnhdguudmvyawzpzwrfy2vydglmawnhdgvfy2',
  'ftcgfpz24ilcjjyw1wywlnbklkijoiy291cnnlq29tcgxldglvbn5zrwphd2rvmevlmmhoukp3w',
  'w12add3iiwibglua3mioltdfx0sinvzzxjjzci6mtu2odg1mzy0fqyr1qwxjhrel6kln3nbvpx',
  'kvrw3zt2p0c83sdwi0sla0d0ac2a1felicitaciones0d0atu',
  'certificadoest',
  'c3a1',
  'listo0d0ahttpseventingcourseraorgredirectsignedeyjrzxkioij',
  'lbwfpbc5saw5rlm9wzw4ilcj2ywx1zs

In [6]:
index = open("C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\full\\index2").readlines()
index

['spam ../Correos/Elmejorhosting.eml\n',
 'spam ../Correos/Encuentra las mejores ofertas.eml\n',
 'spam ../Correos/Exclusive 8 Ball Pool.eml\n',
 'spam ../Correos/Joel, tienes una semana para ganar.eml\n',
 'spam ../Correos/Y si los agregas al carrito.eml\n',
 'ham ../Correos/Coursera1.eml\n',
 'ham ../Correos/Coursera2.eml\n',
 'ham ../Correos/Coursera3.eml\n',
 'ham ../Correos/Coursera4.eml\n',
 'ham ../Correos/Coursera5.eml\n',
 'ham ../Correos/Coursera6.eml\n',
 'ham ../Correos/Coursera7.eml\n',
 'ham ../Correos/Coursera8.eml\n',
 'ham ../Correos/Coursera9.eml\n',
 'ham ../Correos/Coursera10.eml\n',
 'ham ../Correos/Coursera11.eml\n',
 'ham ../Correos/Coursera12.eml\n',
 'ham ../Correos/Coursera13.eml\n',
 'ham ../Correos/Coursera14.eml\n',
 'ham ../Correos/Coursera15.eml\t']

In [7]:
import os
DATASET_PATH= "C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p"

def parse_index(path_to_index,n_elements):
    ret_indexes=[]
    index = open (path_to_index).readlines()
    for i in range(n_elements):
        mail=index[i].split(' ../')
        label =mail[0]
        path=mail[1][:-1]
        ret_indexes.append({
            "label":label,
            "email_path":os.path.join(DATASET_PATH,path)
        })
    return ret_indexes

In [8]:
def parse_email(index):
    p=parser()
    pemail=p.parse(index["email_path"])
    return pemail,index["label"]

In [9]:
indexes=parse_index("C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\full\\index2",20)
indexes

[{'label': 'spam',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Elmejorhosting.eml'},
 {'label': 'spam',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Encuentra las mejores ofertas.eml'},
 {'label': 'spam',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Exclusive 8 Ball Pool.eml'},
 {'label': 'spam',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Joel, tienes una semana para ganar.eml'},
 {'label': 'spam',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Y si los agregas al carrito.eml'},
 {'label': 'ham',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Coursera1.eml'},
 {'label': 'ham',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Coursera2.eml'},
 {'label': 'ham',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\P

In [10]:
index=parse_index("C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\full\\index2",1)
index

[{'label': 'spam',
  'email_path': 'C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\Correos/Elmejorhosting.eml'}]

In [11]:
# Leemos el primer correo
import os

open(index[0]["email_path"]).read()

'Delivered-To: munozjavier541@gmail.com\nReceived: by 2002:a05:7208:9028:b0:8e:6d6d:f117 with SMTP id j40csp6230rbd;\n        Tue, 24 Sep 2024 14:07:50 -0700 (PDT)\nX-Google-Smtp-Source: AGHT+IGxt04qLPfmpAzcrNi6Iv2zD+WcNwf7EyKZTGYWDF1LcWICiRaGqP8GKotsK7n+7h9G1VlP\nX-Received: by 2002:a05:6000:1244:b0:374:c287:2afe with SMTP id ffacd0b85a97d-37cc24ca69cmr382483f8f.56.1727212070359;\n        Tue, 24 Sep 2024 14:07:50 -0700 (PDT)\nARC-Seal: i=1; a=rsa-sha256; t=1727212070; cv=none;\n        d=google.com; s=arc-20240605;\n        b=BfhArNXy4LqboOh4niYmpZKUs/extLhOH9sUmf0Kp4t1WGifBO1PhCY790YT+7Jd6O\n         MVTzaFrKuJyT3p63GQJhcJz/1dofEYdtzgAECtyxdceyVCALmReCcTknMbCPunedcFWq\n         lhpY0gYO20MI3m0DM3UM39squJ/d6gVyiCPJQZ7i3SZWOCaQbsbgbccnTDItA5nD4yiC\n         hzhTDRy1ngnjg20hnrEG8bmndiphfYfXnoqLkGaxd1ADSLqebzQB7nzmqWXspUQUaStL\n         ZgHFu1e2tvZNcago501kkYp09dkTHbnVLgwGHF2Uw/7NFCCiL7v/V/3aEMDw6RFp6/S8\n         24gQ==\nARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=go

In [12]:
# Parseamos el primer correo
mail, label = parse_email(index[0])
print("El correo es:", label)
print(mail)

El correo es: spam
{'Subject': ['elmejorhostingonlin', 'free', 'host', 'php', '82', 'upgrad'], 'Body': ['dear', 'valu', 'client', 'greet', 'elmejorhostingonlin', 'ifastnet', 'provid', 'server', 'network', 'elmejorhostingonlin', 'great', 'news', 'we', 'upgrad', 'entir', 'free', 'host', 'network', 'use', 'latest', 'stabl', 'php', 'version', '82', 'you', 'run', 'latest', 'advanc', 'php', 'script', 'free', 'platform', 'with', 'expir', 'date', 'free', 'host', 'life', 'add', 'domain', 'name', 'use', 'softaculi', 'script', 'instal', 'build', 'perfect', 'websit', 'free', 'elmejorhostingonlin', 'not', 'updat', 'php', 'v82', 'also', 'expand', 'server', 'cluster', 'ad', 'huge', 'amount', 'extra', 'cpu', 'ram', 'ssd', 'storag', 'capac', 'thi', 'allow', 'us', 'increas', 'ram', 'cpu', 'power', 'given', 'everi', 'free', 'host', 'websit', 'make', 'faster', 'even', 'capabl', 'our', 'free', 'host', 'awesom', 'premium', 'upgrad', 'plan', 'includ', 'free', 'top', 'level', 'domain', 'free', 'ssl', 'smtpima

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
prep_email=[" ".join(mail['Subject'])+ " ".join(mail['Body'])]

vectorizer=CountVectorizer()
X=vectorizer.fit(prep_email)

print("email: ", prep_email,"\n")
print("entradas: ",vectorizer.get_feature_names_out())

email:  ['elmejorhostingonlin free host php 82 upgraddear valu client greet elmejorhostingonlin ifastnet provid server network elmejorhostingonlin great news we upgrad entir free host network use latest stabl php version 82 you run latest advanc php script free platform with expir date free host life add domain name use softaculi script instal build perfect websit free elmejorhostingonlin not updat php v82 also expand server cluster ad huge amount extra cpu ram ssd storag capac thi allow us increas ram cpu power given everi free host websit make faster even capabl our free host awesom premium upgrad plan includ free top level domain free ssl smtpimap email unlimit disk space 100 extra perfect place host site outgrow super free host servic want we give special discount coupon get 25 web premium host plan ifastnetcom coupon code alwaysgettingbett as usual premium host plan includ free top level domain get new site onlin without complic dont miss chanc get 25 host free domain ifastnetcom 

In [14]:
X=vectorizer.transform(prep_email)
print("\nValues\n",X.toarray())


Values
 [[ 1  2  2  1  1  1  1  1  2  1  1  1  1  1  1  1  1  1  1  1  1  3  2  1
   1  1  4  1  4  1  1  1  1  1  1  2  1  1 12  3  1  1  1  1  1  1 10  1
   1  1  1  2  2  1  1  2  2  1  1  1  1  2  2  1  1  1  1  1  2  4  1  3
   1  1  3  1  2  1  1  2  3  1  2  1  1  1  1  1  1  1  1  2  1  1  2  1
   1  1  2  1  1  1  3  1  1  1  1  1  1  2  2  2  1  1  1]]


In [15]:
from sklearn.preprocessing import OneHotEncoder

prep_email = [[w] for w in mail['Subject'] + mail['Body']]

enc = OneHotEncoder(handle_unknown='ignore')
X = enc.fit_transform(prep_email)

print("Features:\n", enc.get_feature_names_out())
print("\nValues:\n", X.toarray())

Features:
 ['x0_100' 'x0_25' 'x0_82' 'x0_ad' 'x0_add' 'x0_advanc' 'x0_allow'
 'x0_also' 'x0_alwaysgettingbett' 'x0_amount' 'x0_as' 'x0_awesom'
 'x0_browser' 'x0_build' 'x0_capabl' 'x0_capac' 'x0_chanc' 'x0_client'
 'x0_cluster' 'x0_code' 'x0_complic' 'x0_coupon' 'x0_cpu' 'x0_date'
 'x0_dear' 'x0_discount' 'x0_disk' 'x0_domain' 'x0_dont'
 'x0_elmejorhostingonlin' 'x0_email' 'x0_entir' 'x0_even' 'x0_everi'
 'x0_expand' 'x0_expir' 'x0_extra' 'x0_fast' 'x0_faster' 'x0_free'
 'x0_get' 'x0_give' 'x0_given' 'x0_great' 'x0_greet' 'x0_happi'
 'x0_holiday' 'x0_host'
 'x0_httpbyethostcomunsubscribephpidc0d6d8ec0fd38fe4ba15bc37f560793bmunozjavier541gmailcom'
 'x0_httpsifastnetcom' 'x0_huge' 'x0_ifastnet' 'x0_ifastnetcom'
 'x0_includ' 'x0_increas' 'x0_instal' 'x0_latest' 'x0_level' 'x0_life'
 'x0_make' 'x0_miss' 'x0_name' 'x0_network' 'x0_new' 'x0_news' 'x0_not'
 'x0_onlin' 'x0_our' 'x0_outgrow' 'x0_perfect' 'x0_php' 'x0_place'
 'x0_plan' 'x0_platform' 'x0_power' 'x0_premium' 'x0_provid' 'x0_ram'
 

In [16]:
def create_prep_dataset(index_path,n_elements):
    X=[]
    y=[]
    indexes = parse_index(index_path,n_elements)
    for i in range(n_elements):
        print("\rParsing email:{0}".format(i+1),end="")
        mail,label =parse_email(indexes[i])
        X.append(" ".join(mail["Subject"])+" ".join(mail["Body"]))
        y.append(label)
    return X,y

In [17]:
# Leemos únicamente un subconjunto de 100 correos electrónicos
X_train,y_train=create_prep_dataset("C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\trec07p\\trec07p\\full\\index",100)
X_train

Parsing email:100

['gener ciali brand qualitido feel pressur perform rise occas tri viagra anxieti thing past back old self',
 'typo debianreadmhi ive updat gulu i check mirror it seem littl typo debianreadm file exampl httpgulususherbrookecadebianreadm ftpftpfrdebianorgdebianreadm test lenni access releas diststest the current test develop snapshot name etch packag test unstabl pass autom test propog releas etch replac lenni like readmehtml yan morin consult en logiciel libr yanmorinsavoirfairelinuxcom 5149941556 to unsubscrib email debianmirrorsrequestlistsdebianorg subject unsubscrib troubl contact listmasterlistsdebianorg',
 'authent viagramega authenticv i a g r a discount pricec i a l i s discount pricedo miss it click httpwwwmoujsjkhchumcom authent viagra mega authenticv i a g r a discount pricec i a l i s discount pricedo miss it click',
 'nice talk yahey billi realli fun go night talk said felt insecur manhood i notic toilet quit small area worri websit i tell secret weapon extra 3 inch trust g

In [18]:
vectorizer=CountVectorizer()
X_train=vectorizer.fit_transform(X_train)

In [19]:
print(X_train.toarray())
print("\nFeatures",len(vectorizer.get_feature_names_out()))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Features 4842


In [20]:
import pandas as pd
pd.DataFrame(X_train.toarray(),columns=[vectorizer.get_feature_names_out()])

Unnamed: 0,0000,000000,00085,002,003,00450,009,01,01000u,0107,...,ӧanz,ӭѯ,ԡšݡ淶,լһʽ,չҵϣ,سŵþʊʊݾѯ,ڶҵţ,㶫иï26,饻jwk,쵼ã
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
y_train

['spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam']

In [22]:
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression()
clf.fit(X_train,y_train)

In [23]:
X,y=create_prep_dataset("C:\\Users\\YoelR\\Desktop\\IA2\\Practica2\\full\\index2",20)
X_test=X
y_test=y


Parsing email:20

In [24]:
X_test=vectorizer.transform(X_test)

In [25]:
y_pred=clf.predict(X_test)
y_pred

array(['spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham',
       'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam',
       'spam', 'spam', 'spam', 'ham'], dtype='<U4')

In [26]:
from sklearn.metrics import accuracy_score

print('Precisión: {:.3f}'.format(accuracy_score(y_test,y_pred)))

Precisión: 0.350
