In [34]:
import os
import urllib.request 
import tarfile

DOWNLOAD_ROOT = "https://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets" , "spam") #It will create a path datasets/spam

#Fetched the emails
def fetch_spam_data(spam_url = SPAM_URL , spam_path = SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for file_name , urls in (("ham.tar.bz2" , HAM_URL) , ("spam.tar.bz2" , SPAM_URL)):
        path = os.path.join(spam_path , file_name) #It will create a path datasets/spam/file_name 
        if not os.path.isfile(path):
            urllib.request.urlretrieve( urls , path) #This will download the file and then store it in the path
        tar_bz2_file = tarfile.open(path) #The downloaded fie will be in the format of the .tar so we need tarfile
        tar_bz2_file.extractall(spam_path) #Then it will extract the file
        tar_bz2_file.close() 

        

In [35]:
fetch_spam_data()

In [36]:
#Loaded the email
HAM_DIR = os.path.join(SPAM_PATH , "easy_ham") #datasets\spam\easy_ham
SPAM_DIR = os.path.join(SPAM_PATH , "spam") #'datasets\\spam\\spam'

ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [37]:
import email
import email.policy 

def load_email(is_spam , filename , spam_path = SPAM_PATH): 
    directory = "spam" if is_spam else "easy_ham" #esley chahi kun email read bhanera select garcha

    with open  (os.path.join(spam_path , directory , filename) , "rb") as f:   # open .. as f allows to read the email and close when its done as rb-> read in bianry
        return email.parser.BytesParser(policy = email.policy.default).parse(f)


In [39]:
ham_emails = [load_email(is_spam=False , filename=name) for name in ham_filenames ]
spam_emails = [load_email(is_spam = True , filename=name) for name in spam_filenames]

In [40]:
print(ham_emails[:5])

[<email.message.EmailMessage object at 0x000001F8883ECFE0>, <email.message.EmailMessage object at 0x000001F8883EC320>, <email.message.EmailMessage object at 0x000001F8883EC4A0>, <email.message.EmailMessage object at 0x000001F8883EC980>, <email.message.EmailMessage object at 0x000001F8883EC710>]


In [41]:
def get_email_structure(email):
    if isinstance(email , str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(",".join([get_email_structure(sub_email) for sub_email in payload]))
    else:
        return email.get_content_type()

In [42]:
from collections import Counter

def structures_counter(emails):
    structures = Counter() 
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1  
    return structures  

In [43]:
structures_counter(ham_emails).most_common()
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain,text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain,image/jpeg)', 3),
 ('multipart(text/html,application/octet-stream)', 2),
 ('multipart(text/plain,application/octet-stream)', 1),
 ('multipart(text/html,text/plain)', 1),
 ('multipart(multipart(text/html),application/octet-stream,image/jpeg)', 1),
 ('multipart(multipart(text/plain,text/html),image/gif)', 1),
 ('multipart/alternative', 1)]

In [44]:
import numpy as np
from sklearn.model_selection import train_test_split

X = ham_emails + spam_emails 
Y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

In [45]:
x_train , y_train , x_test , y_test = train_test_split(X , Y , test_size= 0.2 , random_state= 42)

In [None]:
from bs4 import BeautifulSoup #esley chahi strimg accept garcha hai 
import re
from html import unescape

def html_to_plain_text(html):  #This accepts only string

    soup = BeautifulSoup(html , 'lxml')

    if soup.head: #Esley Head remove garcha
        soup.head.decompose()


    for a in soup.find_all("a"): #This replaces all the <a> tags with Hyperlink
        a.replace_with(" HYPERLINK ")


    for tags in soup.find_all():
        if tags.name == "a":
            continue
        if tags.name == "head":
            continue
        if tags.string:
            continue
        tags.unwrap()

    text = soup.get_text()

    return unescape(text.strip())


In [50]:
def email_to_text(msg):
    # If HTML part exists, extract it
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/html":
            return part.get_payload(decode=True).decode(errors="ignore")
        if content_type == "text/plain":
            text = part.get_payload(decode=True)
            if text:
                return text.decode(errors="ignore")
    return ""


In [55]:
sample_spam_email = spam_emails[0]
print(sample_spam_email.get_content().strip())
print(type(sample_spam_email))

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META content="text/html; charset=windows-1252" http-equiv=Content-Type>
<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>
<BODY><!-- Inserted by Calypso -->
<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none 
style="COLOR: black; DISPLAY: none" width="100%">
  <TBODY>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TD></TR>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT 
color=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Why Spend More Than You Have To?
<CENTER><FONT color=#ff0000 face="Copp

In [None]:
raw = email_to_text(sample_spam_email)
print(type(raw))
print(honda = html_to_plain_text(raw))

<class 'str'>
Save up to 70% on Life Insurance.
Why Spend More Than You Have To?

Life Quote Savings










Ensuring your 
      family's financial security is very important. Life Quote Savings makes 
      buying life insurance simple and affordable. We Provide FREE Access to The 
      Very Best Companies and The Lowest Rates.





Life Quote Savings is FAST, EASY and 
            SAVES you money! Let us help you get started with the best values in 
            the country on new coverage. You can SAVE hundreds or even thousands 
            of dollars by requesting a FREE quote from Lifequote Savings. Our 
            service will take you less than 5 minutes to complete. Shop and 
            compare. SAVE up to 70% on all types of Life insurance! 



 HYPERLINK 

Protecting your family is the best investment you'll ever 
          make!








If you are in receipt of this email 
      in error and/or wish to be removed from our list,  HYPERLINK  AND TYPE REMOVE. If you 
    