In [56]:
import os
import urllib.request 
import tarfile

# Base URL of the public SpamAssassin corpus and the two archives used below.
DOWNLOAD_ROOT = "https://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets" , "spam") # Builds the relative path datasets/spam (only the string; no directory is created here)

# Download and unpack the ham and spam archives
def fetch_spam_data(spam_url = SPAM_URL , spam_path = SPAM_PATH , ham_url = HAM_URL):
    """Download the ham and spam archives (if missing) and extract them.

    Parameters
    ----------
    spam_url : str
        URL of the spam archive. It is now actually used for the download;
        previously the argument was accepted but the global SPAM_URL was
        always fetched.
    spam_path : str
        Directory that receives both the downloaded archives and their
        extracted contents. Created if it does not exist.
    ham_url : str
        URL of the ham archive.

    Notes
    -----
    An archive that already exists at its target path is not downloaded
    again, but it is re-extracted on every call.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(spam_path, exist_ok=True)

    for file_name, archive_url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, file_name)  # e.g. datasets/spam/ham.tar.bz2
        if not os.path.isfile(path):
            urllib.request.urlretrieve(archive_url, path)

        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as archive:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: refuse members that
                # would escape spam_path or create special files.
                archive.extractall(spam_path, filter="data")
            else:
                # Older interpreters without extraction filters.
                archive.extractall(spam_path)

        

In [57]:
# Fetch both archives with the default URLs and extract them under SPAM_PATH.
fetch_spam_data()

In [58]:
# Locate the two extracted corpus folders and list the message files in each.
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")   # datasets/spam/easy_ham
SPAM_DIR = os.path.join(SPAM_PATH, "spam")      # datasets/spam/spam

# Only directory entries whose name is longer than 20 characters are kept
# (presumably this drops non-message helper files -- TODO confirm).
ham_filenames = sorted(entry for entry in os.listdir(HAM_DIR) if len(entry) > 20)
spam_filenames = sorted(entry for entry in os.listdir(SPAM_DIR) if len(entry) > 20)

In [59]:
import email
import email.policy 

def load_email(is_spam , filename , spam_path = SPAM_PATH):
    """Parse one corpus file into an email message object.

    Parameters
    ----------
    is_spam : bool
        Selects the sub-directory to read from: "spam" when true,
        "easy_ham" otherwise.
    filename : str
        Name of the message file inside that sub-directory.
    spam_path : str
        Root directory that contains the "spam" and "easy_ham" folders.

    Returns
    -------
    The message parsed by ``BytesParser`` with ``email.policy.default``.
    """
    # Import the parser explicitly: `import email` / `import email.policy`
    # do not guarantee that the `email.parser` submodule is loaded, so the
    # previous `email.parser.BytesParser` lookup only worked when some other
    # import had already pulled it in.
    from email.parser import BytesParser

    # Pick the folder that matches the requested class.
    directory = "spam" if is_spam else "easy_ham"

    # Binary mode: the parser works on raw bytes and handles decoding itself.
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)


In [60]:
# Parse every listed file; each list holds one parsed message per file name.
ham_emails = [load_email(is_spam=False , filename=name) for name in ham_filenames ]
spam_emails = [load_email(is_spam = True , filename=name) for name in spam_filenames]

In [61]:
# Quick sanity check: show the first five parsed ham messages.
print(ham_emails[:5])

[<email.message.EmailMessage object at 0x000001A07767E570>, <email.message.EmailMessage object at 0x000001A07767C110>, <email.message.EmailMessage object at 0x000001A07767CD70>, <email.message.EmailMessage object at 0x000001A07767EC00>, <email.message.EmailMessage object at 0x000001A07767F770>]


In [62]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a compact string.

    A string argument is returned unchanged. A message whose payload is a
    list of sub-messages is rendered as ``multipart(<child>,<child>,...)``
    with each child described recursively. Any other message is described
    by its content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf message (or a container whose payload is not a list).
        return email.get_content_type()

    nested = ",".join(get_email_structure(part) for part in parts)
    return "multipart({})".format(nested)
    

In [63]:
from collections import Counter

def structures_counter(emails):
    """Count how many of the given emails share each structure string.

    Returns a ``Counter`` keyed by the result of ``get_email_structure``.
    """
    return Counter(get_email_structure(message) for message in emails)

In [64]:
# NOTE(review): a notebook cell only displays its last expression, so the ham
# summary on the first line is computed and discarded; only the spam summary
# is shown.
structures_counter(ham_emails).most_common()
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain,text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain,image/jpeg)', 3),
 ('multipart(text/html,application/octet-stream)', 2),
 ('multipart(text/plain,application/octet-stream)', 1),
 ('multipart(text/html,text/plain)', 1),
 ('multipart(multipart(text/html),application/octet-stream,image/jpeg)', 1),
 ('multipart(multipart(text/plain,text/html),image/gif)', 1),
 ('multipart/alternative', 1)]

In [65]:
import numpy as np
from sklearn.model_selection import train_test_split

# Features: all ham messages followed by all spam messages.
X = ham_emails + spam_emails
# Labels aligned with X: 0 for each ham message, 1 for each spam message.
Y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

In [66]:
# Hold out 20% of the messages for testing; the fixed random_state makes the split repeatable.
x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size= 0.2 , random_state= 42)

In [67]:
from bs4 import BeautifulSoup #esley chahi string accept garcha hai 
from html import unescape

def html_to_plain_text(html):  # Accepts an HTML string
    """Convert an HTML document to plain text.

    The ``<head>`` element is dropped, every ``<a>`` element is replaced by
    the literal text " HYPERLINK ", and the remaining text is returned with
    surrounding whitespace stripped and HTML entities unescaped.

    Parameters
    ----------
    html : str
        HTML markup to convert.

    Returns
    -------
    str
        The text content of the document.
    """
    soup = BeautifulSoup(html , 'lxml')

    # Remove the <head> element together with everything inside it.
    if soup.head:
        soup.head.decompose()

    # Replace each link with a placeholder token.
    for anchor in soup.find_all("a"):
        anchor.replace_with(" HYPERLINK ")

    # get_text() already returns only the text nodes, so the former loop that
    # unwrapped the remaining tags could not change the result; its `a` and
    # `head` checks were unreachable because those tags are already gone.
    text = soup.get_text()

    return unescape(text.strip())


In [74]:
def email_to_text_complete(msg):
    """Gets ALL text content from email, not just first part

    Walks every part of ``msg`` and collects the decoded payload of each
    ``text/plain`` and ``text/html`` part.

    Each payload is decoded with the charset declared on its part (UTF-8
    when none is declared or the declared name is unknown), so non-UTF-8
    mails keep their non-ASCII characters. Bytes that are invalid in the
    chosen charset are dropped.

    Parameters
    ----------
    msg : email.message.Message
        Parsed email, single-part or multipart.

    Returns
    -------
    str
        The decoded text parts joined by a single space; "" if there are none.
    """
    all_text = []

    for part in msg.walk():
        if part.get_content_type() not in ("text/html", "text/plain"):
            continue

        # decode=True undoes base64 / quoted-printable and yields bytes.
        payload = part.get_payload(decode=True)
        if not payload:
            continue

        charset = part.get_content_charset() or "utf-8"
        try:
            all_text.append(payload.decode(charset, errors="ignore"))
        except LookupError:
            # Unknown or misspelled charset label: fall back to UTF-8.
            all_text.append(payload.decode("utf-8", errors="ignore"))

    return " ".join(all_text)  # Combine ALL parts

In [75]:
# Training messages labelled spam (1) whose structure is a single text/html part.
sample_spam_email = [email for email , label in zip(x_train , y_train) 
                     if label == 1 and get_email_structure(email) == 'text/html']
# Index 7 looks like an arbitrary pick for the demo below -- TODO confirm.
sample_spam_email_1 = sample_spam_email[7] 

In [76]:
# Extract the sample's text parts, then strip the HTML markup for display.
raw = email_to_text_complete(sample_spam_email_1)
print(html_to_plain_text(raw))

OTC

 Newsletter
Discover Tomorrow's Winners 

For Immediate Release

Cal-Bay (Stock Symbol: CBYI)
Watch for analyst "Strong Buy Recommendations" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.
Put CBYI on your watch list, acquire a position TODAY.

REASONS TO INVEST IN CBYI

A profitable company and is on track to beat ALL earnings estimates!

One of the FASTEST growing distributors in environmental & safety equipment instruments.

Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.

RAPIDLY GROWING INDUSTRY
Industry revenues exceed $900 million, estimates indicate that there could be as much as $2

In [77]:
import urlextract
import nltk
# Module-level helpers read as globals by email_to_word_count.transform:
# `url` finds URLs in text, `stemmer` reduces words to their stems.
url = urlextract.URLExtract()
stemmer = nltk.PorterStemmer()


In [78]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter
import numpy as np

class email_to_word_count(BaseEstimator, TransformerMixin):
    """Transformer that turns parsed emails into per-email word counts.

    For each email the text parts are extracted with
    ``email_to_text_complete`` and then, depending on the flags, lower-cased,
    URL-replaced, number-replaced, split on whitespace, counted and stemmed.

    Parameters
    ----------
    strip_headers : bool
        Stored on the instance but not read by ``transform``.
    lower_case : bool
        Lower-case the text before any other processing.
    url_replace : bool
        Replace every URL found by the module-level ``url`` extractor with
        the token " url ".
    stemming : bool
        Merge the counts of words that share a stem, using the module-level
        ``stemmer``.
    replace_numbers : bool
        Replace whitespace-delimited tokens that are numeric literals with
        the token "NUMBER".
    """

    def __init__(self, strip_headers=True, lower_case=True, 
                 url_replace=True, stemming=True, replace_numbers=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.url_replace = url_replace
        self.stemming = stemming 
        self.replace_numbers = replace_numbers
        
    def fit(self, x, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    @staticmethod
    def _is_number(token):
        """Return True when ``token`` is a numeric literal containing a digit."""
        # float() also parses "nan", "inf" and "infinity" (any case, optional
        # sign). Those are ordinary words in an email, so require a digit.
        if not any(ch.isdigit() for ch in token):
            return False
        try:
            float(token)
        except ValueError:
            return False
        return True

    def transform(self, x, y=None):
        """Return an array holding one ``Counter`` of words per email in ``x``."""
        X_transformed = []
        for message in x: 
            # Fall back to a single space so later steps always see a string.
            text = email_to_text_complete(message) or " "
            if self.lower_case:
                text = text.lower()
            if self.url_replace and url is not None:
                # Longest first, so a URL that is a substring of a longer one
                # cannot break the longer one before it is replaced.
                for found_url in sorted(set(url.find_urls(text)), key=len, reverse=True):
                    text = text.replace(found_url, " url ")
            if self.replace_numbers:
                text = " ".join(
                    "NUMBER" if self._is_number(token) else token
                    for token in text.split()
                )

            word_counts = Counter(text.split())  
            
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()  
                for word, count in word_counts.items():
                    stemmed_word_counts[stemmer.stem(word)] += count
                word_counts = stemmed_word_counts

            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [87]:
# Demo on the first five messages of X (ham, since X lists ham first).
x_demo = X[:5]
x_hero = email_to_word_count().fit_transform(x_demo)
print(x_hero)


[Counter({'the': 15, 'pick': 9, 'number': 6, '-lbrace': 6, 'of': 5, '...': 5, '-rbrace': 5, 'i': 4, 'is': 4, '-list': 4, 'thi': 3, '+inbox': 3, '-subject': 3, 'ftp': 3, '-sequenc': 3, '18:19:04': 3, 'command': 3, 'delta$': 3, 'from': 3, '18:19:03': 2, '4852-4852': 2, 'mercuri': 2, 'hit': 2, "that'": 2, 'come': 2, 'version': 2, 'use': 2, 'on': 2, 'url': 2, 'and': 2, 'one': 2, 'date:': 1, 'wed,': 1, 'aug': 1, '10:54:46': 1, 'from:': 1, 'chri': 1, 'garrigu': 1, '<cwg-dated-1030377287.06fa6d@deepeddy.com>': 1, 'message-id:': 1, '<1029945287.4797.tmda@deepeddy.vircio.com>': 1, '|': 1, "can't": 1, 'reproduc': 1, 'error.': 1, 'for': 1, 'me': 1, 'it': 1, 'veri': 1, 'repeatable...': 1, '(like': 1, 'everi': 1, 'time,': 1, 'without': 1, 'fail).': 1, 'debug': 1, 'log': 1, 'happen': 1, 'pick_it': 1, '{exec': 1, '-rbrace}': 1, '{4852-4852': 1, 'mercury}': 1, 'exec': 1, 'ftoc_pickmsg': 1, '{{1': 1, 'hit}}': 1, 'mark': 1, 'tkerror:': 1, 'syntax': 1, 'error': 1, 'in': 1, 'express': 1, '"int': 1, 'note,

In [88]:
from scipy.sparse import csr_matrix

class word_count_to_vector(BaseEstimator, TransformerMixin):
    """Transformer that maps per-email word counts to a sparse count matrix.

    ``fit`` selects the ``vocabulary_size`` most frequent words and assigns
    them column indices starting at 1. ``transform`` emits one row per email
    with ``vocabulary_size + 1`` columns; words outside the vocabulary are
    skipped, so column 0 receives no entries.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, x, y=None):
        """Build ``vocabulary_`` from the word counts in ``x``; returns ``self``."""
        capped_totals = Counter()
        for counts in x:
            for token, occurrences in counts.items():
                # A single email contributes at most 10 to a word's total.
                capped_totals[token] += min(occurrences, 10)

        top_words = capped_totals.most_common()[:self.vocabulary_size]
        self.most_common = top_words
        self.vocabulary_ = {
            token: position
            for position, (token, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, x, y=None):
        """Return a CSR matrix of shape ``(len(x), vocabulary_size + 1)``."""
        row_indices, col_indices, values = [], [], []

        for row, counts in enumerate(x):
            for token, occurrences in counts.items():
                column = self.vocabulary_.get(token)
                if column is None:
                    # Word is not in the fitted vocabulary.
                    continue
                row_indices.append(row)
                col_indices.append(column)
                values.append(occurrences)

        matrix_shape = (len(x), self.vocabulary_size + 1)
        return csr_matrix((values, (row_indices, col_indices)), shape=matrix_shape)

In [91]:
# Demo: vectorise the five word-count Counters with a 10-word vocabulary.
hola = word_count_to_vector(vocabulary_size=10)
x_hero_vector = hola.fit_transform(x_hero)
x_hero_vector 


<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 46 stored elements and shape (5, 11)>

In [92]:
# Dense view of the demo matrix: one row per email, one column per vocabulary index.
x_hero_vector.toarray()

array([[ 0, 15,  5,  1,  2,  6,  1,  2,  4,  1,  9],
       [ 0,  5,  3,  2,  3,  4,  3,  2,  2,  1,  0],
       [ 0, 16,  5, 10, 10,  2,  4,  2,  1,  1,  0],
       [ 0,  9,  7,  3,  4,  5,  3,  2,  1,  6,  0],
       [ 0,  4,  5,  7,  2,  1,  3,  3,  2,  1,  0]])

In [93]:
# Word -> column index mapping learned by fit (indices start at 1).
hola.vocabulary_

{'the': 1,
 'of': 2,
 'to': 3,
 'and': 4,
 'number': 5,
 'a': 6,
 'url': 7,
 'is': 8,
 'that': 9,
 'pick': 10}

In [96]:
from sklearn.pipeline import Pipeline

# The pipeline instance gets its own name. Assigning it to `Pipeline` rebound
# the imported class to an instance, so running this cell a second time
# raised "TypeError: 'Pipeline' object is not callable".
preprocess_pipeline = Pipeline([
    ("email_to_word_count" , email_to_word_count()),
    ("word_count_to_vector" , word_count_to_vector(vocabulary_size= 1000))
])

# Fit the vocabulary on the training set only, then reuse it for the test set.
x_train_transformed = preprocess_pipeline.fit_transform(x_train)
x_test_transformed = preprocess_pipeline.transform(x_test)

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score , recall_score

# Train a logistic-regression classifier on the vectorised training emails.
log_train = LogisticRegression(solver = 'liblinear' , random_state=42 , max_iter=1000)
log_train.fit(x_train_transformed , y_train)

# Predict on the held-out test emails.
y_prediction = log_train.predict(x_test_transformed)

# Report precision and recall on the test set as percentages.
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_prediction)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_prediction)))



Precision: 95.74%
Recall: 94.74%
