## AML 2304 Lab 3 - Group E

In [1]:
import os
import pandas as pd
import email
import re
import nltk
import contractions
from email import policy
from email.header import decode_header
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.metrics import edit_distance, jaccard_distance
from collections import Counter

nltk.download('wordnet')
pd.set_option('max_colwidth', 100)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aurad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Read file contents from Dataset main folder. Extract the `SUBJECT`, `TO`, `FROM` and `EMAIL BODY` from the email, then add the data in dataframe with the correct label: `spam` or `ham`

In [2]:
def get_email_body(msg):
    email_body = ""

    if "multipart" in msg.get_content_type():

        for part in msg.get_payload(decode=False):
            if not isinstance(part, str):
                email_part = str(part.get_payload(decode=False)).strip()
                email_body = email_body + email_part
    else:
        email_body = str(msg.get_payload(decode=False)).strip()

    return email_body


try:
    data_raw_df = pd.DataFrame(columns=['subject', 'to', 'from', 'email_body', 'label'])

    labels = ['ham', 'spam']
    for idx in range(len(labels)):

        spam_dir = "Dataset/" + labels[idx] + "/"
        # Read all files from the dataset folder
        for file in os.listdir(spam_dir)[:]:
            with open(spam_dir + file, "r", encoding="latin-1") as f:
                # Parse the email messages
                msg = email.message_from_string(f.read(), policy=policy.default)

                subject = str(decode_header(msg.get('subject'))[0][0]) if msg.get('subject') is not None else None
                email_body = get_email_body(msg)

                # Add label to your record: 0 = ham, 1 = spam
                new_data = [subject, msg.get('to'), msg.get('from'), email_body, idx]
                data_raw_df.loc[len(data_raw_df)] = new_data

    display(data_raw_df.head(5))

except Exception as err:
    print(f"Error: {err} - filename: {file}")

Unnamed: 0,subject,to,from,email_body,label
0,Re: New Sequences Window,Chris Garrigues <cwg-dated-1030314468.7c7c85@DeepEddy.Com>,Robert Elz <kre@munnari.OZ.AU>,"Date: Tue, 20 Aug 2002 17:27:47 -0500\n From: Chris Garrigues <cwg-exmh@DeepEdd...",0
1,Re: New Sequences Window,Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>,Robert Elz <kre@munnari.OZ.AU>,"Date: Wed, 21 Aug 2002 10:54:46 -0500\n From: Chris Garrigues <cwg-dated-103037...",0
2,Personal Finance: Resolutions You Can Keep,mkettler@home.com,The Motley Fool <Fool@motleyfool.com>,======================== THE MOTLEY FOOL ========================\n PERSO...,0
3,Re: New Sequences Window,Valdis.Kletnieks@vt.edu,Chris Garrigues <cwg-exmh@DeepEddy.Com>,"> From: Valdis.Kletnieks@vt.edu\n> Date: Wed, 21 Aug 2002 02:36:56 -0400\n>\n> --==_Exmh_77858...",0
4,[zzzzteana] RE: Alexander,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoogroups.com>",Steve Burt <Steve_Burt@cursor-system.com>,"Martin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n lim...",0


## Exploratory Data Analysis (incl. cleaning the data)

In [9]:
# Check if there are null values
print(data_raw_df.isnull().sum())

subject       0
to            0
from          0
email_body    0
label         0
dtype: int64


In [10]:
# Replace all None data with empty string
data_raw_df = data_raw_df.fillna('')
data_raw_df.isnull().sum()

subject       0
to            0
from          0
email_body    0
label         0
dtype: int64

In [11]:
# Check the total records for each column grouped by label
data_raw_df.groupby('label').agg('count')

Unnamed: 0_level_0,subject,to,from,email_body
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6701,6701,6701,6701
1,1896,1896,1896,1896


In [None]:
# This variable will contain the cleaned data
data_cleaned_df = data_raw_df.copy()
# Convert to lower case characters
data_cleaned_df = data_cleaned_df.applymap(lambda x: x.lower() if type(x) == str else x)

data_cleaned_df.head(5)

Unnamed: 0,subject,to,from,email_body,label
0,re: new sequences window,Chris Garrigues <cwg-dated-1030314468.7c7c85@DeepEddy.Com>,Robert Elz <kre@munnari.OZ.AU>,"date: tue, 20 aug 2002 17:27:47 -0500\n from: chris garrigues <cwg-exmh@deepedd...",0
1,re: new sequences window,Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>,Robert Elz <kre@munnari.OZ.AU>,"date: wed, 21 aug 2002 10:54:46 -0500\n from: chris garrigues <cwg-dated-103037...",0
2,personal finance: resolutions you can keep,mkettler@home.com,The Motley Fool <Fool@motleyfool.com>,======================== the motley fool ========================\n perso...,0
3,re: new sequences window,Valdis.Kletnieks@vt.edu,Chris Garrigues <cwg-exmh@DeepEddy.Com>,"> from: valdis.kletnieks@vt.edu\n> date: wed, 21 aug 2002 02:36:56 -0400\n>\n> --==_exmh_77858...",0
4,[zzzzteana] re: alexander,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoogroups.com>",Steve Burt <Steve_Burt@cursor-system.com>,"martin a posted:\ntassos papadopoulos, the greek sculptor behind the plan, judged that the\n lim...",0


Extract mail server and domain name from `TO` and `FROM`

In [None]:
def extract_mail_server_and_domain(email):
    pattern = r'@([\w.]+\.[a-zA-Z]{2,})'
    match = re.search(pattern, email)
    return match.group(1) if match else None
    

data_cleaned_df['to'] = data_cleaned_df['to'].apply(extract_mail_server_and_domain)
data_cleaned_df['from'] = data_cleaned_df['from'].apply(extract_mail_server_and_domain)
data_cleaned_df[['to', 'from']].head(5)

Unnamed: 0,to,from
0,DeepEddy.Com,munnari.OZ.AU
1,DeepEddy.Com,munnari.OZ.AU
2,home.com,motleyfool.com
3,vt.edu,DeepEddy.Com
4,yahoogroups.com,


For `SUBJECT` and `EMAIL_BODY`, the following will be applied to clean the data
- Remove email addresses and URLs
- Remove HTML tags and scripting variable names
- Replace all whitespaces with space
- Handling contractions
- Remove special characters and numbers
- Tokenization, stopwords removal and lemmatization
- Handling typos and mispellings (?)
- Remove words that are not in dictionary (?)

In [None]:
pd.set_option('max_colwidth', None)


# Remove email addresses
pattern_email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_email, '', x))

# Remove URLs
pattern_url = r'https?://(?:www\.)?[\w\.-]+(?:\.[a-zA-Z]{2,})+(?:/[-\w\.,/]*)*(?:\?[\w\%&=]*)?'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_url, '', x))

# Handle contractions
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : contractions.fix(x))

# Remove HTML tags
pattern_html = r'<[^>]+>'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_html, '', x))
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : x.replace('&nbsp;', ' '))

# Remove scripting variable names, e.g. $msg
pattern_scripting = r'\$\w+\s*'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_scripting, '', x))


# Remove all special characters and numbers
pattern_special_num = r'(?<![a-zA-Z])-(?![a-zA-Z])|[^a-zA-Z\s-]+'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_special_num, ' ', x))

# Replace all whitespaces (new lines, tabs, etc.) to space
pattern_whitespace = r'\s+'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_whitespace, ' ', x))

display(data_cleaned_df.head(2))
display(data_cleaned_df.tail(2))

Unnamed: 0,subject,to,from,email_body,label
0,re new sequences window,DeepEddy.Com,munnari.OZ.AU,date tue aug from chris garrigues message-id i am hoping that all people with no additional sequences will notice are purely cosmetic changes well first when exmh the latest one with your changes starts i get cannot read flist totalcount unseen no such element in array while executing if totalcount unseen-sequence flaginner spool iconspool labelup else flaginner down icondown labeldown procedure flag msgseen line invoked from within flag msgseen procedure msgseen line invoked from within msgseen procedure msgshow line invoked from within msgshow procedure msgchange line invoked from within msgchange show invoked from within time list msgchange procedure msg change line invoked from within msg change id procedure msg show line invoked from within msg show cur eval body line invoked from within eval procedure folderchange line invoked from within folderchange inbox msg show cur invoked from within time list folderchange procedure folder change line invoked from within folder change folder procedure exmh line invoked from within exmh after script which is probably related to my not having an unseen sequence anywhere certainly not in inbox i read all of my outstanding mail before i tried this new exmh second i have been used to having a key binding which was to msg markunseen which does not seem to exist any more and i am not sure what i should replace that with there is obviously a way as the sequences menu does this the mark unseen menu entry in the message more menu is still wanting that function as well for those who have other sequences defined the window will widen to display the other sequences any chance of having that lengthen instead i like all my exmh stuff in nice columns fits the display better that is i use the detached folder list one column the main exmh window takes up full screen top to bottom but less than half the width etc i have space for more sequences in the unseen window as long as they remain once nice narrow window best would be if the sequences could be ordered by some preference then ones which did not fit would just fall off the bottom and not be shown i would also prefer it if that window had no unusual background colouring just one constant colour i have been running the unseen window with background black on a root window that is all black with no borders or other decorations but made sticky the appearance is just like the folders with unseen messages and their counts are written into the root window because it is sticky this small display follows me around and do i can see when new mail needs processing i also find that i tend to have a bunch of sequences that only ever occur in one folder some i had forgotten i ever created so in addition to the sequences to always show and sequences to never show a preference to only show sequences that occur in more than one folder would be useful and then have the sequences that occor only in the folder i am visiting appear in the list when that folder is current this is just to keep the list size somewhat manageable while remaining productive i quite often use a sequence to remember a particular message in a folder the name is used only there and only for one message it gives me a handle on the message which remains as the folder is packed sorted etc i have not updated my exmh for some time now so i am not sure if this next one is new or just new since but the sequences menu on the bar with new flist search only contains unseen and urgent it would be useful if it contained all of the sequences that the folder happens to have defined a new sequence entry would also be useful to mark the message with a sequence name that did not previously exist which can be done now using search and the pick interface but is clumsy that way actually you once could now when i try this entering a sequence name in the pick box and a single message number or a range n-n in the list of messages and no pick attributes at all i now get syntax error in expression int hit- while executing expr int procedure ftoc findmsg line invoked from within ftoc findmsg procedure ftoc findmsgs line invoked from within ftoc findmsgs procedure ftoc pickmsgs line invoked from within ftoc pickmsgs ids addtosel procedure pickinner line invoked from within pickinner exec pick inbox -list -sequence mercury uplevel body line invoked from within uplevel procedure busycursorinner line invoked from within busycursorinner procedure busycursorhack line invoked from within busycursorhack cursor arm line invoked from within switch style icon busyicon cursorall busycursor cursor busycursorhack default eval procedure busy line invoked from within busy pickinner procedure pick it line invoked from within pick it invoked from within pick but pick invoke uplevel body line invoked from within uplevel list invoke procedure tkbuttonup line invoked from within tkbuttonup pick but pick command bound to event it has been ages since i did this last though i tried adding a subject to pick on easy as i know what is in the message which made no difference looks as if something is now saying hit when before it did not or similar i have also changed the ftoc colorization as discussed briefly on the list a week or so ago any chance of making the current message a little brighter background just to make it stand out a fraction more than it does maybe this is more apparent to me than many as i use very small fonts everywhere the background of the ftoc line is not very wide hope this helps kre exmh-workers mailing list,0
1,re new sequences window,DeepEddy.Com,munnari.OZ.AU,date wed aug from chris garrigues message-id i cannot reproduce this error for me it is very repeatable like every time without fail this is the debug log of the pick happening pick it exec pick inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace -sequence mercury exec pick inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace -sequence mercury ftoc pickmsgs hit marking hits tkerror syntax error in expression int note if i run the pick command by hand delta pick inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace -sequence mercury hit that is where the hit comes from obviously the version of nmh i am using is delta pick -version pick nmh- compiled on fuchsia cs mu oz au at sun mar ict and the relevant part of my mh profile delta mhparam pick -seq sel -list since the pick command works the sequence actually both of them the one that is explicit on the command line from the search popup and the one that comes from mh profile do get created kre ps this is still using the version of the code form a day ago i have not been able to reach the cvs repository today local routing issue i think exmh-workers mailing list,0


Unnamed: 0,subject,to,from,email_body,label
8595,cannabis difference,webnote.net,dialix.oz.au,mid-summer customer appreciation sale to express our appreciation to all of our loyal customers we are offering the following products for for a limited time only ragga dagga stoney mahoney sweet vjestika aqueous kathmandu along with a free oz package of capillaris herba we have three new renegade botanicals from exotic botanical resources the ethnobotanical herbalists who brought the herba supplementals kathmandu temple kiff personal choice pipe-smoking products substances to the common market announcing temple ragga dagga tm aqueous kathmandu tm stoney mahoney tm temple ragga dagga we are finally able to offer for your sensitive responsive personal choice smoking enjoyment the temple ragga dagga tm pipe-smoking substance supplemental product introduced after three years of research and development temple is personal choice legal smoking indulgence redefined thanks to recent dramatic technological advances in the laboratorial processes for the extraction of alkaloid and glycocide supplements from botanicals herbas plant matter we are now able to offer in more cultivated enhanced viripotent substantiated format what had actually already been the most significant lawful personal choice smoking substance available on the planet temple ragga dagga tm is the sweet sweet evolution of all of that a x more viripotent herba supplement than its predecessors temple happier happy smoking indeed a depressive regressive supplemental mood-enhancer more sophisticated uplifting poised than illegal smoking substances no regulation no illegality no failed drug tests inhibits stress anxiety inspires conteplativeness creativity ehances the sexual experience generates more restful sleep lucid dreaming a significant herba botanical supplement in the battles against drug alcohol dependence easily ignited stoked smokes sweetly absolutely legal non-invasive no downside lingers for a good goodly while possesses many fine ganja virtues with none of the negatives just a little snippet pinch goes a long long way just or draws of your pipe a traditional hand herb-pipe is included with each package of temple ragga dagga temple ragga dagga tm is an exclusive botanical herba proprietary nepalese-formulated sensitive responsive pipe-smoking stoking substance and it is undoubtedly the most prestigious legal offering of its sort on the planet so smokin stokin potent is this cutting edge formulation that we have even been able to establish a very happy clientele market base within the hard-core stoner arena and have made positive happy smoking differences in many many lives absolutely legal marvelously potent a one-of-a-kind proprietary amalgamation comprised of extreme high-ratio concentrated extracts which are derived from various common and uncommon sensitive responsive herbas primarily cultivated within and imported from the southern and eastern hemispheres temple ragga dagga tm high-ratio factored botanical extractions are master-crafted into solid jiggets bars which are structurally reminiscent of what one might find in the happiness coffee and tea houses of nepal kathmandu or amsterdam and in many aspects possesses a more collected and more focused less-scattered ambience ingredients temple smoking substances and temple ragga dagga tm have always been and will always remain exclusive exotic botanical resources house smoking substance specialties temple ragga dagga tm is both a euphonious celebratory and relaxing calming pipe-smoking substance that offers both physical and cerebral significators temple ragga dagga tm is a proprietary prescribed botanical amalgamation which includes the following synergistically synesthesia conglomerated core-refined ratio-enhanced herbas botanicals resins essences flower-tops and oils in extreme ratio extractment ranging from to to to viripotent concentrations drachasha chavana prash trikatu black seed herb hybrid flowering turnera diffusa capillaris herba angelica root wild dagga mature leaf matter haritaki shatavari labdunum neroli unicorn root papaver rhoes dendrobian stems calea zacalechichi buddings rue amla salvia divinorum crocus sativa lotus and gokshura cuttings please note temple ragga dagga tm is an absolutely legal herba botanical personal choice pipe-smoking substantiality product no included botanical factor therein is regulated by law or considered to be harmful by regulatory agencies there is no tobacco in temple ragga dagga tm there is certainly no cannabis marijuana in temple ragga dagga tm and although we are not age-governed by law temple ragga dagga tm is intended exclusively for sophisticated adult usage subsequently it is our mandatory ethical policy that temple ragga dagga tm may not be sold offered or given to any person that has not attained at least twenty-one years of age all things in their time as well temple ragga dagga tm is not intended for use during work or while driving it should not be enjoyed during pregnancy nor is it intended to supercede physician s care in any regard aqueous kathmandu for those who choose not to smoke we have something for you introducing aqueous kathmandu happy drops tm aqueous kathmandu tm sensitive responsive happiness drops temple quantum variety indeed a happiness brew from the kathmandu a k a secret fire from a toke-smoke point of view if you know what i mean and be groovin the scene who du the kathmandu now everybody can du the kathmandu that is if you are years of age or older aqueous kathmandu is engaged to holistically inspire and instill sensitive responsive happiness and mellowness without the detriment of carcinogenic inhalation smoking aqueous kathmandu is absolutely legal and does not contain any controlled considered to be harmful or regulated herbs or cannabis marijuana factors as smoking has become so socially taboo over the years and as so many people have asked us for a liquid product we have long strived to bring a quantum-factored concentrated liquid product to fruition this has been no easy task for a variety of botanical and technological reasons finally we are able to say that this task has been accomplished a sensitive herbal botanical awakening if you will as we have introduced and brought to market aqueous kathmandu tm happiness drops temple quantum variety a k a secret fire secret fire indeed this liquid innovation affords us four factors within this particular botanical equation that were previously not realized the ability to engage a more substantial high-ratio concentrated application of particular botanical factors that are utilized in our personal-choice sensitive responsive pipe-smoking product the ability to extract high-ratio concentrates of certain botanical factors that are not applicable to a smoking commodity as they cannot be extracted into dry format please note all botanical factors included in all any of our products are absolutely legal in the instance of aqueous kathmandu we are simply able to enjoy the advantages of specific legal herbas that just are not dry concentrate-applicable a most notable assimilation factor being that liquid is more easily and therefore more generally metabolistically assimilated and as such more absolute in that herba botanical applications via the administrative vehicle of smoke possess a much wider variance in just how efficiently that smoke is integrated metabolistically psychologically and perspectively therein we have created a unique uniqueness between a smoke and a non-smoke sort of like the camaraderie between indicas and sativas if you will different aspects of the same botanical genus arrangement each impeccable singularly and at this juncture we will also mention to those of you who are not so anti-smoking something that aqueous kathmandu tm happiness drops and the more traditional temple ragga dagga tm personal-choice pipe-smoking substance indeed establish an enchanting camaraderie please refer to introductory offers no need to smoke no carcinogenic factors absolutely legal no prescription required does not include cannibis or any tobacco variety marvelously potent remarkably substantial inspires contemplativeness and creativity adjusts attitude mood enhancement relaxes stress anxiety better than kava kava st john s wort etc similar to but variant from ragga dagga certainly safer than pharmaceuticals many fine ganja virtues with none of the negatives better sleeping better dreaming non-invasive no downside an exquisite compliment with temple ragga dagga tm see intro offers quite simply it is a superior product contents aqueous kathmandu is a unique botanical substantiality it is offered and marketed as such undisputedly it achieves distinctive accolade of its own merit aqueous kathmandu is absolutely legal and does not contain any controlled or regulated or harmful herbs or cannabis factors however it is our mandatory ethical policy that aqueous kathmandu not be offered to individuals who have not yet attained at least years of age all things in their time please note as well that aqueous kathmandu is not intended for usage during work or while driving and as is true of all substance and indulgence this product should not be enjoyed during pregnancy this proprietary formulation does include the following quantum-ratio core-extracted refined botanicals in an alcohol base as a preservative albizzia flower-tops drachsha chavana prash lactuca virosa hybrid flowering turnera diffusa wild dagga capillaris herba angelica root zizyphi spinosae buplerum hybrid valeriana officinalis root albizzia flower-tops mature polygonum vine calea zacatechichi crocus sativa flower-tops leonorus sibricus buds cinnabaris margarita herba biotae orientalis salviae miltiorrhizae usage instructions shake well mix drops with juice or water best on empty stomach ambiance lasts about two hours or so not intended for use during pregnancy or while working or driving keep out of reach of children stoney mahoney in regard to the introduction of stoney mahoney tm personal choice smoking substantiality product we have strived for years to develop a consummate sensitive responsive loose-leaf smoking product amsterdam cup style so to speak with acquired knowledge via the development of such notable personal choice smokeables as wizard smoke tm dream smoke tm dragon smoke tm vajroli mudra tm stashish tm shemhamphorash tm yerba lena yesca tm weed tm kathmandu temple kiff tm which for the most part have been proprietary formulations of which the rights have been sold to other less aggressive less developmental companies we arrived at the lofty personal choice smoking status of our premier ragga dagga tm solid pipe-smoking product after all of this we had come to the technological and philosophical point of view that we could not conjure a loose-leaf formulation that would provide effectuality that would be more significant than that of our much-heralded ragga dagga tm solid-format pipe smoking product mostly we were right about this as this new stoney mahoney agenda is not so much significant as in better but signature significant as in uniqueness in that we chanced upon a new world class botanical source for the rare true-brid variety of artemisia absinthum flowering tops only which at one time in history was distilled into absinthe a greenish aniseed flavored liqueur which heralded certain euphoric and narcotic attributes which ultimately resulted in it being declared illegal as well we have finally perfected our exclusive kiffening technique which inspires and establishes alkaloid and glycoside potentiation enhancement on a molecular level hence we introduce although supply is limited this highly potentiated stoney mahoney tm smoking product loose-leaf personal choice doesn t get any sweeter than this and for all of you ragga dagga fans please note that indeed it isn t so much that stoney mahoney loose-leaf is more viripotent than the ragga dagga solid-format moreover it is that stoney mahoney is effective in a much different way than ragga dagga sort of like apples and oranges like panamanian and jamaican like indica and sativa if you will within our test marketing even traditional kind bud aficionados have magnanimously acclaimed this stoney mahoney jester variety most folks say that ragga dagga motivates them in an up sativa sort of way and that stoney mahoney mellows them out or giggles them out in a soma indica sort of way if stoney mahoney has any shortcoming at all it is quite simply that due to the extreme kiffening of this product it may be to the uninitiated practitioner a harsh draw it is for this reason that in addition to the inclusion of a standard herb pipe and package of rolling papers we also include a brass personal water-pipe hookah as is true for many high-minded folks a water-pipe filled with chilled water or wine will temperament the demonstrativeness of stoney mahoney s draw indica oriented definitely happy smoking mood enhancer smokeable brewable mood food good-bye stress anxiety restlessness sleep deep good-bye funky dreams hornier than horny goatweed superlative mixer with ragga dagga and or aqueous kathmandu roll it or bowl it or brew it just a pinch goes a long long way possesses many fine ganja virtues with none of the negatives non-invasive absolutely legal rolling papers herb pipe personal brass water pipe hookah are included attention attention we have by popular demand and in the interest of good economics created an introductory offer that features both ragga dagga and stoney mahoney products and also still another introductory offer which includes ragga dagga stoney mahoney and for you non-smokers our aqueous kathmandu happiness drops these are pretty special and to mention furthermore for non-smokers stoney mahoney jester variety loose-leaf product is also a brewable delight a tea for thee please note stoney mahoney jester variety tm is an absolutely legal herba botanical personal choice loose-leaf substantiality smoking product no included botanical factor therein is regulated by law or considered to be harmful by regulatory agencies there is no tobacco in stoney mahoney jester variety tm there is certainly no cannabis marijuana in stoney mahoney jester variety tm and although we are not age-governed by law stoney mahoney jester variety tm is intended exclusively for sophisticated adult usage subsequently it is our mandatory ethical policy that stoney mahoney jester variety tm may not be sold offered or given to any person that has not attained at least twenty-one years of age as well stoney mahoney jester variety tm is not intended for use during work or while driving it should not be enjoyed during pregnancy nor is it intended to supercede physician s care in any regard stoney mahoney jester variety tm factored in an absinthium labyrinthine configuration is an exclusive kiffened loose-leaf primo modino sensitive responsive smoking and or brewing herba which may be depending upon preference rolled or bowled or brewed as is an herbalist s way three or four draws of smoke should be inhaled and retained for the non-smoker it is most appropriate to engage this herba as a potentiated tea brew steep approximately one tea-spoon of stoney mahoney herba per one cup of water for ten minutes or so strain when smoking stoney mahoney please draw gently as stoney mahoney is a most preeminent sensitive responsive smoke and may be considered to be a harsh draw to the uninitiated practitioner water-pipe is included with each package of stoney mahoney jester variety stoney mahoney contains and is kiffened with the following exotica botanicals and botanical extracts essences dutch lactuca virosa bulgarian artemisia absinthum flowering-tops only yucatan turnera aphrodisaca chinese valeriana ceae jamaican verbena officinalis spanish peumus boldo and european flowering-tops only sarothamnus scoparius stoney mahoney does not include any tobacco or any cannabis factors stoney mahoney does indeed achieve distinction upon its own merit we offer other fine herbal botanical products including the following sweet vjestika aphrodisia drops tm an erotic aphrodisia sexual intensifier enhancer liquid amalgamated extract for men and women seventh heaven prosaka tablets tm a botanical alternative to pharmaceutical medications for calm balance serenity and joyful living seventh heaven gentle ferocity tablets tm a most efficacious non-caffeine non-ephedrine non-mahuang botanical energizer and cutting-edge appetite suppressant extreme martial arts botanical remedies equivalence tablets dragon wing remedy spray pain management that works to alleviate pain even for arthritis and fibromyalgia sufferers cockle doodle doo tm penile restorative renewal enhancement souffle an exclusive proprietary blendage created to dramatically emphatically aggrandize enhance age-inhibit penile skin quality and vascular composure soothes refreshes provides a youthful glow go sensitivity hairricane tm an extreme high-ratio dry botanical extract herba dietary hair supplement for men women pricing information temple ragga dagga tm one oz jigget bar one oz jigget bar aqueous kathmandu happy drops tm one oz bottle usages two oz bottles stoney mahoney tm jester variety loose leaf one oz package two oz packages each oz package of stoney mahoney includes pkg of clipped-rolling papers a traditional herb pipe a brass personal water-pipe hookah sorry but due to a limited supply of stoney mahoney jester variety only a maximum purchase of two oz packages per customer is allowed per order sweet vjestika aphrodisia drops tm one oz bottle two oz bottles seventh heaven prosaka tm one tablet tin three tablet tins six tablet tins seventh heaven gentle ferocity tm one tablet jar equivalence tablets each bottle contains mg tablets -pack tablets -pack tablets save -pack tablets save -pack tablets save dragon wing spray remedy each spray bottle contains liquid oz -pack oz bottles -pack oz bottles save -pack oz bottles save -pack oz bottles save dynamic duo introductory offers -pack equivalence tabs -pack dragon wing save -pack equivalence tabs -pack dragon wing save -pack equivalence tabs -pack dragon wing save -pack equivalence tabs -pack dragon wing save cockle doodle doo tm souffle one oz jar two oz jars six oz jars hairricane tm pack bags of capsules each pack bags of capsules each save pack bags of capsules each save alpha introductory offer one oz pkg of stoney mahoney one oz jigget bar of temple ragga dagga price reg save beta introductory offer one oz pkg of stoney mahoney one oz pkg of temple ragga dagga one oz bottle of aqueous kathmandu happiness drops price reg save intro offer a one oz jigget bar of temple ragga dagga one tin tablets of seventh heaven prosaka one jar tablets of seventh heaven gentle ferocity price reg save intro offer b one oz jigget bar of temple ragga dagga one jar tablets of seventh heaven gentle ferocity price reg save intro offer c one oz jigget bar of temple ragga dagga one tin tablets of seventh heaven prosaka price reg save intro offer d one oz jigget bar of temple ragga dagga one oz bottle of sweet vjestika aphrodisia drops price reg save intro offer e one oz jigget bar of temple ragga dagga one oz bottle of sweet vjestika aphrodisia drops one tin tablets of seventh heaven prosaka one jar tablets of gentle ferocity price reg save intro offer f one oz jar of cockle doodle doo souffle one oz jigget bar of temple ragga dagga one oz bottle of sweet vjestika love drops regular price yours for only save intro offer h one oz bottle of aqueous kathmandu one oz bar jigget of temple ragga dagga price reg save intro offer i one oz bottle of aqueous kathmandu one oz bar jigget of temple ragga dagga one oz bottle of sweet vjestika price reg save intro offer j one oz bottle of aqueous kathmandu one oz bar jigget of temple ragga dagga one oz bottle of sweet vjestika one tablet jar of gentle ferocity price reg save intro offer k one oz bottle of aqueous kathmandu one oz bottle of sweet vjestika price reg save ordering information for your convenience you can call us direct with your orders or questions call monday saturday am to pm mountain time for all domestic orders add shipping handling shipped you s priority mail add for international orders to remove your address from our list click on the following link and send a blank email mailto subject remove,1
8596,ilug wilson kamela,linux.ie,netscape.net,attn sir madan strictly confidential i am pleased to introduce myself to you my name is mr wilson kamela a native of south africa and a senior employee of mines and natural resources department currently on a trainning course in holland for few months i am writing this letter to request your assistance in order to redeem an investment with the south african mining corporation the said investment now valued at million dollars fifteen million five hundred thousand dollars only was purchased by lucio harper and contracted out to the south african mining corporation in now recognised as mines and natural resources department this redeemable investment interest has now matured since march last year since march last year several attempts have been made to contact lucio harper without success and there is no way to contact any of his close relatives in whose favour the investment cash value can be paid since we have access to all lucio harper s information we can claim this money with the help of my partners with the south african mines and natural resources department all we have to do is to file claim using you as lucio harper s relative i will like to assure you that there is absolutely nothing to worry about because it is perfectly safe with no risk involved please ensure to keep this matter strictly confidential my partner will file a claim for this money on your behalf from the southafrican mining corporation when the claim is approved you as the beneficiary will be paid of the total amouth since this money can be paid directly into any bank account of your choice you have responsibility to ensure that my partner and ireceive of the total amouth while the balance will be set aside for any unforseen expenses in the because of transfering this money i will appreciate if you can give your assurance and guarantee that our share will be well secured please for the sake of confidentiality reach me on my e-mail address please let me know if this proposal is acceptable to you kindly reach me immediately with any of the stated contact addresses so that better clearifications relating to the transaction will be explained to you truly yours wilson kamela irish linux users group for un subscription information list maintainer,1


In [None]:
# Tokenize the text and remove stopwords
stop_words = set(stopwords.words('english'))
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : [word for word in word_tokenize(x) if word not in stop_words])

display(data_cleaned_df.head(2))
display(data_cleaned_df.tail(2))

Unnamed: 0,subject,to,from,email_body,label
0,"[new, sequences, window]",DeepEddy.Com,munnari.OZ.AU,"[date, tue, aug, chris, garrigues, message-id, hoping, people, additional, sequences, notice, purely, cosmetic, changes, well, first, exmh, latest, one, changes, starts, get, read, flist, totalcount, unseen, element, array, executing, totalcount, unseen-sequence, flaginner, spool, iconspool, labelup, else, flaginner, icondown, labeldown, procedure, flag, msgseen, line, invoked, within, flag, msgseen, procedure, msgseen, line, invoked, within, msgseen, procedure, msgshow, line, invoked, within, msgshow, procedure, msgchange, line, invoked, within, msgchange, show, invoked, within, time, list, msgchange, procedure, msg, change, line, invoked, within, msg, change, id, procedure, msg, show, line, invoked, within, msg, show, cur, eval, body, line, invoked, within, eval, procedure, folderchange, line, invoked, within, ...]",0
1,"[new, sequences, window]",DeepEddy.Com,munnari.OZ.AU,"[date, wed, aug, chris, garrigues, message-id, reproduce, error, repeatable, like, every, time, without, fail, debug, log, pick, happening, pick, exec, pick, inbox, -list, -lbrace, -lbrace, -subject, ftp, -rbrace, -rbrace, -sequence, mercury, exec, pick, inbox, -list, -lbrace, -lbrace, -subject, ftp, -rbrace, -rbrace, -sequence, mercury, ftoc, pickmsgs, hit, marking, hits, tkerror, syntax, error, expression, int, note, run, pick, command, hand, delta, pick, inbox, -list, -lbrace, -lbrace, -subject, ftp, -rbrace, -rbrace, -sequence, mercury, hit, hit, comes, obviously, version, nmh, using, delta, pick, -version, pick, nmh-, compiled, fuchsia, cs, mu, oz, au, sun, mar, ict, relevant, part, mh, profile, delta, mhparam, pick, -seq, sel, ...]",0


Unnamed: 0,subject,to,from,email_body,label
8595,"[cannabis, difference]",webnote.net,dialix.oz.au,"[mid-summer, customer, appreciation, sale, express, appreciation, loyal, customers, offering, following, products, limited, time, ragga, dagga, stoney, mahoney, sweet, vjestika, aqueous, kathmandu, along, free, oz, package, capillaris, herba, three, new, renegade, botanicals, exotic, botanical, resources, ethnobotanical, herbalists, brought, herba, supplementals, kathmandu, temple, kiff, personal, choice, pipe-smoking, products, substances, common, market, announcing, temple, ragga, dagga, tm, aqueous, kathmandu, tm, stoney, mahoney, tm, temple, ragga, dagga, finally, able, offer, sensitive, responsive, personal, choice, smoking, enjoyment, temple, ragga, dagga, tm, pipe-smoking, substance, supplemental, product, introduced, three, years, research, development, temple, personal, choice, legal, smoking, indulgence, redefined, thanks, recent, dramatic, technological, advances, laboratorial, processes, extraction, ...]",1
8596,"[ilug, wilson, kamela]",linux.ie,netscape.net,"[attn, sir, madan, strictly, confidential, pleased, introduce, name, mr, wilson, kamela, native, south, africa, senior, employee, mines, natural, resources, department, currently, trainning, course, holland, months, writing, letter, request, assistance, order, redeem, investment, south, african, mining, corporation, said, investment, valued, million, dollars, fifteen, million, five, hundred, thousand, dollars, purchased, lucio, harper, contracted, south, african, mining, corporation, recognised, mines, natural, resources, department, redeemable, investment, interest, matured, since, march, last, year, since, march, last, year, several, attempts, made, contact, lucio, harper, without, success, way, contact, close, relatives, whose, favour, investment, cash, value, paid, since, access, lucio, harper, information, claim, money, help, partners, south, ...]",1


In [None]:
# Handling typos and mispellings
english_vocab = set(w.lower() for w in words.words())


def lemmatize_word(word, pos=None):
    lemmatizer = WordNetLemmatizer()
    if pos is None:
        return lemmatizer.lemmatize(word)        
    
    return lemmatizer.lemmatize(word, pos=pos)


def suggest_words(word, threshold=3):
    suggested_words = [e_word for e_word in english_vocab if edit_distance(word, e_word) <= threshold]
    return suggested_words


def is_combined_word(word_list, index):
    combined_word = word_list[index] + word_list[index+1]
    lemmatized_word = lemmatize_word(combined_word, 'v')

    # Check if the combined word is in the dictionary and not a stopword
    if lemmatized_word in english_vocab and lemmatized_word not in stop_words:
        word_list[index] = lemmatize_word(combined_word)
        word_list.pop(index+1)
        return True

    return False


def clean_words(word_list):
    word_list_size = len(word_list)-1
    idx = 0
    
    while idx < word_list_size:
        # Remove trailing and leading dash(es)
        cleaned_word = re.sub(r'^-+|-+$', '', word_list[idx])

        # Check if any of two consecutive words (current and after) aren't in dictionary and 
        # see if putting them together might work
        if any(word not in english_vocab for word in word_list[idx:idx+2]):
            # Check if two words can be combined
            is_valid_word = is_combined_word(word_list, idx)
            if is_valid_word:
                word_list_size = word_list_size - 1
            else:
                # Remove one-character word from the list
                if len(cleaned_word) == 1:
                    word_list.pop(idx)    
                    word_list_size = word_list_size - 1
                else:
                    # suggested_words = suggest_words(cleaned_word)
                    # TODO: Improve this section of the code
                    # For now, we only take words that have error distance of 3
                    # if len(suggested_words) == 0:
                    #     word_list.pop(idx)    
                    #     word_list_size = word_list_size - 1
                    # else:
                    word_list[idx] = lemmatize_word(cleaned_word)
        else:
            word_list[idx] = lemmatize_word(cleaned_word)
            
        idx = idx + 1

    return word_list


data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : clean_words(x))
data_cleaned_df[['subject', 'email_body']]

Unnamed: 0,subject,email_body
0,"[new, sequence, window]","[date, tue, aug, chris, garrigues, message-id, hoping, people, additional, sequence, notice, purely, cosmetic, change, well, first, exmh, latest, one, change, start, get, read, flist, totalcount, unseen, element, array, executing, totalcount, unseen-sequence, flaginner, spool, iconspool, labelup, else, flaginner, icondown, labeldown, procedure, flag, msgseen, line, invoked, within, flag, msgseen, procedure, msgseen, line, invoked, within, msgseen, procedure, msgshow, line, invoked, within, msgshow, procedure, msgchange, line, invoked, within, msgchange, show, invoked, within, time, list, msgchange, procedure, msg, change, line, invoked, within, msg, change, id, procedure, msg, show, line, invoked, within, msg, show, cur, eval, body, line, invoked, within, eval, procedure, folderchange, line, invoked, within, ...]"
1,"[new, sequence, window]","[date, wed, aug, chris, garrigues, message-id, reproduce, error, repeatable, like, every, time, without, fail, debug, log, pick, happening, pick, exec, pick, inbox, list, lbrace, lbrace, subject, ftp, rbrace, rbrace, sequence, mercury, exec, pick, inbox, list, lbrace, lbrace, subject, ftp, rbrace, rbrace, sequence, mercury, ftoc, pickmsgs, hit, marking, hit, tkerror, syntax, error, expression, int, note, run, pick, command, hand, delta, pick, inbox, list, lbrace, lbrace, subject, ftp, rbrace, rbrace, sequence, mercury, hit, hit, come, obviously, version, nmh, using, delta, pick, version, pick, nmh, compiled, fuchsia, c, mu, oz, au, sun, mar, ict, relevant, part, mh, profile, delta, mhparam, pick, seq, sel, ...]"
2,"[personal, finance, resolution, keep]","[motley, fool, personal, finance, wednesday, january, issue, ask, fool, stop, solicitation, money, resolution, keep, spotlight, saving, market, community, tip, least, one, thing, differently, sponsored, datek, datek, online, built, trade, proprietary, auto, routing, technology, commission, online, equity, trade, second, execution, commitment, apply, datek, account, ask, fool, q, tired, getting, many, credit, card, offer, mail, already, card, want, mail, offering, new, one, stop, three, main, credit, bureau, united, state, agreed, someone, contact, one, asks, removed, junk, mail, er, direct, mail, list, telemarketing, phone, list, honor, request, full, answer, money, resolution, keep, let, u, face, new, year, resolution, mostly, nightmare, hang, head, like, ...]"
3,"[new, sequence, window]","[date, wed, aug, exmh, content-type, text, plain, charset, us-ascii, tue, aug, edt, said, ever, tried, get, mh, pseq, sequence, suspect, everybod, looking, big, box, unseen, pseq, might, want, add, pseq, hide, default, list, intended, added, sequence, never, show, list, take, effect, till, stopped, restarted, exmh, added, pseq, hit, save, preference, take, effect, till, restarted, one, point, worked, fine, check, see, stopped, working, chris, chris, garrigues, cwg, vircio, congress, suite, austin, tx, world, war, iii, wrong-doers, v, evil-doers, begin, pgp, signature, version, gnupg, v, gnu, linux, comment, exmh, version, id, dbqe, gk, b, h, iuirasiyaj, zejm, tiqdd, mqu, lbapzzpasgcccjtl, bdfb, wkmtagwylf, ...]"
4,"[zzzzteana, alexander]","[martin, posted, tasso, papadopoulos, greek, sculptor, behind, plan, judged, limestone, mount, kerdylio, mile, east, salonika, far, mount, athos, monastic, community, ideal, patriotic, sculpture, well, alexander, granite, feature, ft, high, ft, wide, museum, restored, amphitheatre, car, park, admiring, crowd, planned, mountain, limestone, granite, limestone, weather, pretty, fast, yahoo, group, sponsor, dvd, free, p, join, unsubscribe, group, send, email, use, yahoo, group, subject]"
...,...,...
8592,"[busy, home, study, make, sense]","[want, bos, train, self-paced, home, study, thousand, people, people, like, able, work, home, thanks, convenience, distance, learning, course, pcdi, bonus, tuition, discount, enroll, pcdi, offer, popular, career, training, program, including, medical, transcription, medical, billing, electrician, interior, decorating, child, day, care, private, investigation, high, school, diploma, start, left, th, th, grade, associate, degree, program, take, education, earn, nationally, accredited, degree, paralegal, study, criminal, justice, early, childhood, education, accounting, health, care, management, small, business, management, human, resource, management, click, order, free, career, information, kit, enjoy, tuition, saving, enroll, one, nationally, accredited, course, free, home, study, career, information, kit, pcdi, nationally, accredited, home, ...]"
8593,"[preferred, non-smoker, rate, smokers]","[preferred, non-smoker, doctor, ordered, case, study, male, face, good, health, cigarette, day, issued, preferred, non-smoker, case, study, female, face, good, health, social, cigarette, smoker, issued, preferred, non-smoker, case, study, male, face, good, health, cigar, month, issued, preferred, best, non-smoker, case, study, male, face, private, pilot, smoke, cigar, daily, issued, preferred, non-smoker, without, aviation, flat, extra, click, provide, detail, tobacco, case, call, doctor, case, detail, cured, agent, tough, case, please, fill, form, information, name, e-mail, phone, city, state, tennessee, brokerage, agency, want, anyone, receive, mailing, wish, receive, professional, communication, sent, insurance, professional, removed, mailing, list, reply, message, instead, go, legal, notice, ...]"
8594,"[get, free, hit, per, day, website]","[dear, subscriber, could, show, way, get, visitor, day, web, site, noting, cost, money, minute, day, time, would, interested, click, link, copy, paste, browser, information, promise, disguised, solicitation, money, need, outlay, single, solitary, dollar, pound, punt, rand, mark, euro, step, lesson, plan, work, marketing, strategy, out-performs, anything, seen, register, free, learn, combine, synergistic, potential, free, program, click, link, copy, paste, browser, see, proof, main, site, receives, visitor, day, visitor, averaging, page, view, received, million, visitor, since, began, people, worldwide, joined, program, month, operating, act, today, click, link, copy, paste, browser, join, success, best, wish, mike, p, step, instruction, course, absolutely, free, ...]"
8595,"[cannabis, difference]","[mid-summer, customer, appreciation, sale, express, appreciation, loyal, customer, offering, following, product, limited, time, ragga, dagga, stoney, mahoney, sweet, vjestika, aqueous, kathmandu, along, free, oz, package, capillaris, herba, three, new, renegade, botanical, exotic, botanical, resource, ethnobotanical, herbalist, brought, herba, supplementals, kathmandu, temple, kiff, personal, choice, pipe-smoking, product, substance, common, market, announcing, temple, ragga, dagga, tm, aqueous, kathmandu, tm, stoney, mahoney, tm, temple, ragga, dagga, finally, able, offer, sensitive, responsive, personal, choice, smoking, enjoyment, temple, ragga, dagga, tm, pipe-smoking, substance, supplemental, product, introduced, three, year, research, development, temple, personal, choice, legal, smoking, indulgence, redefined, thanks, recent, dramatic, technological, advance, laboratorial, process, extraction, ...]"


## Building Model (uncleaned data)

Note: Make sure to run 3 iterations of tuning and explain your rationale for the tuning approaches used each iteration

In [None]:
# Split independent variables and dependent variable
X_raw = data_raw_df.drop(['label'], axis=1)
y_raw = data_raw_df['label']

X_raw.shape, y_raw.shape

((8597, 4), (8597,))

### Model Evaluation

Note: Evaluate both test and train data. Make sure there are enough data points in the test set (>500) for Confusion Matrix, AUC etc.

## Building Model (cleaned data)

Note: Make sure to run 3 iterations of tuning and explain your rationale for the tuning approaches used each iteration

In [None]:
# Split independent variables and dependent variable
X_cleaned = data_cleaned_df.drop(['label'], axis=1)
y_cleaned = data_cleaned_df['label']

X_cleaned.shape, y_cleaned.shape

((8597, 4), (8597,))

### Model Evaluation

Note: Evaludate both test and train data. Make sure there are enough data points in the test set (>500) for Confusion Matrix, AUC etc.

## Conclusion

Note: Include what else could be done to tune the model and how it would have helped (w/ some numbers)

## References
- https://coderzcolumn.com/tutorials/python/email-how-to-represent-an-email-message-in-python