In [91]:
import os
import urllib.request 
import tarfile

DOWNLOAD_ROOT = "https://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets" , "spam") #It will create a path datasets/spam

# Fetch the emails: download and extract the ham and spam archives
def fetch_spam_data(spam_url = SPAM_URL , spam_path = SPAM_PATH , ham_url = HAM_URL):
    """Download the ham and spam archives and extract them into ``spam_path``.

    Parameters
    ----------
    spam_url : str
        URL of the spam ``.tar.bz2`` archive.
    spam_path : str
        Directory that receives both the downloaded archives and their
        extracted contents. It is created if it does not exist.
    ham_url : str
        URL of the ham ``.tar.bz2`` archive.

    An archive that is already present at its target path is not downloaded
    again, but it is extracted on every call.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(spam_path, exist_ok=True)

    # Use the function arguments (not the module constants) so that callers
    # passing a custom URL actually get that URL downloaded.
    for file_name, archive_url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, file_name)  # <spam_path>/<file_name>
        if not os.path.isfile(path):
            urllib.request.urlretrieve(archive_url, path)

        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            # The archive is downloaded content: where the running Python
            # provides extraction filters, use the "data" filter so members
            # cannot be written outside ``spam_path``.
            if hasattr(tarfile, "data_filter"):
                tar_bz2_file.extractall(spam_path, filter="data")
            else:
                tar_bz2_file.extractall(spam_path)

        

In [92]:
# Download (unless the archives are already on disk) and extract both corpora
# using the default URLs and the default target directory.
fetch_spam_data()

In [93]:
# List the email files to load from the extracted ham and spam directories
# Directories that hold the extracted messages.
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")  # datasets/spam/easy_ham
SPAM_DIR = os.path.join(SPAM_PATH, "spam")     # datasets/spam/spam

# Keep only entries whose name is longer than 20 characters, in sorted order.
ham_filenames = sorted(entry for entry in os.listdir(HAM_DIR) if len(entry) > 20)
spam_filenames = sorted(entry for entry in os.listdir(SPAM_DIR) if len(entry) > 20)

In [94]:
import email
import email.policy 

def load_email(is_spam , filename , spam_path = SPAM_PATH):
    """Parse one message file from the corpus.

    Parameters
    ----------
    is_spam : bool
        When true the file is read from the ``spam`` sub-directory, otherwise
        from ``easy_ham``.
    filename : str
        Name of the message file inside that sub-directory.
    spam_path : str
        Root directory containing the ``spam`` and ``easy_ham`` directories.

    Returns
    -------
    The message object produced by ``BytesParser`` with the default policy.
    """
    # Import the parser explicitly: this cell only imports ``email`` and
    # ``email.policy``, so ``email.parser`` is not guaranteed to be loaded as
    # an attribute of the ``email`` package.
    from email.parser import BytesParser

    directory = "spam" if is_spam else "easy_ham"  # selects which corpus to read from

    # Binary mode: the parser handles the message's own encoding; the ``with``
    # block closes the file once parsing is done.
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)


In [95]:
# Parse every listed file into a message object: ham first, then spam.
ham_emails = [load_email(False, ham_name) for ham_name in ham_filenames]
spam_emails = [load_email(True, spam_name) for spam_name in spam_filenames]

In [96]:
# Peek at the first five parsed ham messages (prints their default object reprs).
print(ham_emails[:5])

[<email.message.EmailMessage object at 0x0000022C85B14B00>, <email.message.EmailMessage object at 0x0000022C85B14D10>, <email.message.EmailMessage object at 0x0000022C85B14E60>, <email.message.EmailMessage object at 0x0000022C85B15C70>, <email.message.EmailMessage object at 0x0000022C85B15580>]


In [97]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list of
    sub-parts yields ``"multipart(<part>,<part>,...)"``, built recursively from
    each sub-part. Any other message yields its own content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf message: report its own content type.
        return email.get_content_type()

    # Container message: describe each sub-part and join the descriptions.
    nested = ",".join(get_email_structure(part) for part in parts)
    return "multipart({})".format(nested)
    

In [98]:
from collections import Counter

def structures_counter(emails):
    """Count how many messages share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    every message in ``emails``.
    """
    return Counter(get_email_structure(message) for message in emails)

In [99]:
# NOTE: a notebook cell only displays the value of its last expression, so the
# ham counts below are computed but not shown; only the spam counts appear in
# the cell output.
structures_counter(ham_emails).most_common()
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain,text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain,image/jpeg)', 3),
 ('multipart(text/html,application/octet-stream)', 2),
 ('multipart(text/plain,application/octet-stream)', 1),
 ('multipart(text/html,text/plain)', 1),
 ('multipart(multipart(text/html),application/octet-stream,image/jpeg)', 1),
 ('multipart(multipart(text/plain,text/html),image/gif)', 1),
 ('multipart/alternative', 1)]

In [100]:
import numpy as np
from sklearn.model_selection import train_test_split

# Features: all ham messages followed by all spam messages.
X = ham_emails + spam_emails
# Labels aligned with X: 0 for each ham message, 1 for each spam message.
Y = np.array([0 for _ in ham_emails] + [1 for _ in spam_emails])

In [101]:
# Hold out 20% of the messages as a test set; the fixed random_state makes the
# split reproducible across runs.
x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size= 0.2 , random_state= 42)

In [102]:
from bs4 import BeautifulSoup #esley chahi string accept garcha hai 
from html import unescape

def html_to_plain_text(html):  # the call site in this notebook passes the HTML as a string
    """Convert HTML markup to plain text.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser, the
    ``<head>`` element is dropped, every ``<a>`` element is replaced by the
    literal text `` HYPERLINK ``, and the remaining text content is returned
    with surrounding whitespace stripped.
    """

    soup = BeautifulSoup(html , 'lxml')

    if soup.head: # Remove the <head> element together with everything inside it
        soup.head.decompose()


    for a in soup.find_all("a"): # Replace each <a> tag with the text " HYPERLINK "
        a.replace_with(" HYPERLINK ")


    # Walk every tag in the document. <a> and <head> tags, and tags whose
    # ``.string`` is set, are left in place; every other tag is unwrapped,
    # i.e. replaced by its own children.
    for tags in soup.find_all():
        if tags.name == "a":
            continue
        if tags.name == "head":
            continue
        if tags.string:
            continue
        tags.unwrap()

    text = soup.get_text()

    # NOTE(review): BeautifulSoup normally decodes character references while
    # parsing, so this unescape() may be redundant or decode twice - confirm.
    return unescape(text.strip())


In [103]:
def email_to_text(msg):
    """Return the text of the first usable text part of a message.

    The parts of ``msg`` are visited in ``walk()`` order. The first
    ``text/html`` part encountered is returned as-is (still HTML); a
    ``text/plain`` part is returned only if it is non-empty, otherwise the
    search continues. Payload bytes are decoded as UTF-8, silently dropping
    bytes that cannot be decoded.

    Returns an empty string when no suitable part is found.
    """
    for part in msg.walk():  # iterates over the message and all of its sub-parts
        content_type = part.get_content_type()

        if content_type == "text/html":
            payload = part.get_payload(decode=True)
            # get_payload(decode=True) may yield None when there is no body.
            return payload.decode(errors="ignore") if payload else ""

        if content_type == "text/plain":
            # Fixed keyword typo (``deocde``), which raised TypeError on every
            # plain-text part.
            payload = part.get_payload(decode=True)
            if payload:
                return payload.decode(errors="ignore")
    return ""

In [115]:
# Training messages that are labelled spam (1) and consist of a single
# text/html part.
sample_spam_email = [
    message
    for message, label in zip(x_train, y_train)
    if label == 1 and get_email_structure(message) == 'text/html'
]
# Pick one of them (index 7) as the example used in the next cell.
sample_spam_email_1 = sample_spam_email[7]

In [116]:
# Extract the sample message's body, then print it after HTML-to-text conversion.
raw = email_to_text(sample_spam_email_1)
print(html_to_plain_text(raw))

OTC

 Newsletter
Discover Tomorrow's Winners 

For Immediate Release

Cal-Bay (Stock Symbol: CBYI)
Watch for analyst "Strong Buy Recommendations" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.
Put CBYI on your watch list, acquire a position TODAY.

REASONS TO INVEST IN CBYI

A profitable company and is on track to beat ALL earnings estimates!

One of the FASTEST growing distributors in environmental & safety equipment instruments.

Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.

RAPIDLY GROWING INDUSTRY
Industry revenues exceed $900 million, estimates indicate that there could be as much as $2

In [11]:
import urlextract
import nltk
# Module-level helpers: a URL extractor and a Porter stemmer, each created once.
url = urlextract.URLExtract()
stemmer = nltk.PorterStemmer()


In [12]:
from sklearn.base import BaseEstimator , TransformerMixin

class email_to_word_count(BaseEstimator , TransformerMixin):
    """Transformer that turns each email into a ``Counter`` of its words.

    Parameters
    ----------
    strip_headers : bool
        Stored on the instance; not consulted by ``transform``.
    lower_case : bool
        Lower-case the text before any other processing.
    url_replace : bool
        Replace every URL found by the module-level ``url`` extractor with the
        token ``URL``.
    stemming : bool
        Merge the counts of words that share a stem according to the
        module-level ``stemmer``.
    replace_numbers : bool
        Replace every whitespace-separated token that ``float()`` accepts with
        the token ``NUMBER``.
    """

    def __init__(self , strip_headers = True , lower_case = True ,
                 url_replace = True , stemming = True ,  replace_numbers = True ):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.url_replace = url_replace
        self.stemming = stemming
        self.replace_numbers = replace_numbers

    def fit( self  , x , y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self , x , y=None):
        """Return an array holding one word-count ``Counter`` per email in ``x``."""
        X_transformed = []
        # Iterate over the argument, not the notebook-level ``X``.
        for message in x:
            text = email_to_text(message) or ""
            if self.lower_case:
                text = text.lower()

            # Local names deliberately differ from ``url`` so that the
            # module-level extractor is not shadowed inside this method.
            if self.url_replace and url is not None:
                # Longest first, so a URL that is a prefix of another one does
                # not clobber the longer match.
                found_urls = sorted(set(url.find_urls(text)), key=len, reverse=True)
                for found_url in found_urls:
                    text = text.replace(found_url, " URL ")

            if self.replace_numbers:
                tokens = []
                for word in text.split():
                    try:
                        float(word)
                    except ValueError:
                        tokens.append(word)
                    else:
                        tokens.append("NUMBER")
                # Re-join with spaces so the tokens stay separate words.
                text = " ".join(tokens)

            counter = Counter(text.split())

            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word , count in counter.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                counter = stemmed_word_counts

            X_transformed.append(counter)
        return np.array(X_transformed)




