The objective of this notebook is to perform the initial preprocessing of the dataset in order to create a baseline model.

## Libraries

In [1]:
# Load IPython's autoreload extension and set it to mode 2 so imported
# modules are reloaded before each cell execution.
%load_ext autoreload
%autoreload 2

In [2]:
import re
import warnings
from string import punctuation

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch every NLTK resource this notebook reads *before* first use, so the
# notebook survives "Restart Kernel -> Run All" on a machine with an empty
# nltk_data directory. The stopwords corpus was previously never downloaded,
# which raises LookupError on a fresh environment.
# NOTE(review): recent NLTK releases load the tokenizer models from the
# 'punkt_tab' package instead of 'punkt' - confirm against the pinned version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# English stopwords as a set for O(1) membership checks.
stop_words = set(stopwords.words('english'))

# NOTE(review): this silences *all* warnings for the rest of the session,
# including pandas deprecation warnings.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /home/maldu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/maldu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Read the raw (bronze) training set; the bare name on the last line displays it.
raw_train_path = "../../data/bronze/raw_train.csv"
df = pd.read_csv(raw_train_path)
df

Unnamed: 0,email,label
0,get up to escapenumber escapenumber emergency ...,spam
1,dear customer the pharmacy you shop at got too...,spam
2,please find below the gtv project status updat...,ham
3,fuller harder\nerectionsthe same as the one fr...,spam
4,thinking of breathing new life into your busin...,spam
...,...,...
75936,we are grateful to all our devoted customers a...,spam
75937,You have registered Sinco as Payee. Log in at ...,ham
75938,seize the opportunity escapenumber anatrim esc...,spam
75939,hi all attached is a patch that minimally enab...,ham


## Change data types

In [4]:
# Encode the string label as an integer target: ham -> 0, spam -> 1.
# Any value not present in the mapping becomes NaN.
label_codes = {"ham": 0, "spam": 1}
df['label'] = df['label'].map(label_codes)

## Drop duplicated rows

In [5]:
# Keep the first occurrence of each fully duplicated row (original index labels are preserved).
df = df.drop_duplicates()

## Tokenizer

In [6]:
# Build a new frame whose 'email' column holds the NLTK word tokens of each message;
# `df` itself is left untouched.
email_tokens = df['email'].apply(word_tokenize)
df_tokenized = df.assign(email=email_tokens)
df_tokenized['email']

0        [get, up, to, escapenumber, escapenumber, emer...
1        [dear, customer, the, pharmacy, you, shop, at,...
2        [please, find, below, the, gtv, project, statu...
3        [fuller, harder, erectionsthe, same, as, the, ...
4        [thinking, of, breathing, new, life, into, you...
                               ...                        
75936    [we, are, grateful, to, all, our, devoted, cus...
75937    [You, have, registered, Sinco, as, Payee, ., L...
75938    [seize, the, opportunity, escapenumber, anatri...
75939    [hi, all, attached, is, a, patch, that, minima...
75940    [hi, i, am, working, on, conjoint, analysis, h...
Name: email, Length: 75606, dtype: object

## Clean text function

For more details on how the special characters were chosen, see `special_chars_analysis.ipynb`.

In [None]:
# Symbols that are spelled out as words instead of being stripped as punctuation.
# They are replaced with plain string replacement, NOT as regex patterns:
# as a regex, "$" is the end-of-string anchor and would append "dollar" to
# every token.
_SPECIAL_REPLACEMENTS = {
    "£": "pound",
    "$": "dollar",
    "€": "euro",
    "%": "percentage",
    "♣": "clover",
    "®": "registered trademark",
    "©": "copyright",
    "☺": "emoji",
    "™": "trademark",
}

# Chat abbreviations and their expansions.
# NOTE(review): several keys are ordinary English words or common acronyms
# ("time", "kiss", "stats", "atm", "faq", "u", "ic"); every occurrence of them
# is rewritten to the chat meaning. Confirm this is intended for e-mail text.
_CHAT_WORDS = {
    "afaik": "As Far As I Know",
    "afk": "Away From Keyboard",
    "asap": "As Soon As Possible",
    "atk": "At The Keyboard",
    "atm": "At The Moment",
    "a3": "Anytime, Anywhere, Anyplace",
    "bak": "Back At Keyboard",
    "bbl": "Be Back Later",
    "bbs": "Be Back Soon",
    "bfn": "Bye For Now",
    "b4n": "Bye For Now",
    "brb": "Be Right Back",
    "brt": "Be Right There",
    "btw": "By The Way",
    "b4": "Before",
    "cu": "See You",
    "cul8r": "See You Later",
    "cya": "See You",
    "faq": "Frequently Asked Questions",
    "fc": "Fingers Crossed",
    "fwiw": "For What It's Worth",
    "fyi": "For Your Information",
    "gal": "Get A Life",
    "gg": "Good Game",
    "gn": "Good Night",
    "gmta": "Great Minds Think Alike",
    "gr8": "Great!",
    "g9": "Genius",
    "ic": "I See",
    "icq": "I Seek you (also a chat program)",
    "ilu": "ILU: I Love You",
    "imho": "In My Honest/Humble Opinion",
    "imo": "In My Opinion",
    "iow": "In Other Words",
    "irl": "In Real Life",
    "kiss": "Keep It Simple, Stupid",
    "ldr": "Long Distance Relationship",
    "lmao": "Laugh My A.. Off",
    "lol": "Laughing Out Loud",
    "ltns": "Long Time No See",
    "l8r": "Later",
    "mte": "My Thoughts Exactly",
    "m8": "Mate",
    "nrn": "No Reply Necessary",
    "oic": "Oh I See",
    "pita": "Pain In The A..",
    "prt": "Party",
    "prw": "Parents Are Watching",
    "qpsa?": "Que Pasa?",
    "rofl": "Rolling On The Floor Laughing",
    "roflol": "Rolling On The Floor Laughing Out Loud",
    "rotflmao": "Rolling On The Floor Laughing My A.. Off",
    "sk8": "Skate",
    "stats": "Your sex and age",
    "asl": "Age, Sex, Location",
    "thx": "Thank You",
    "ttfn": "Ta-Ta For Now!",
    "ttyl": "Talk To You Later",
    "u": "You",
    "u2": "You Too",
    "u4e": "Yours For Ever",
    "wb": "Welcome Back",
    "wtf": "What The F...",
    "wtg": "Way To Go!",
    "wuf": "Where Are You From?",
    "w8": "Wait...",
    "7k": "Sick:-D Laugher",
    "tfw": "That feeling when",
    "mfw": "My face when",
    "mrw": "My reaction when",
    "ifyp": "I feel your pain",
    "tntl": "Trying not to laugh",
    "jk": "Just kidding",
    "idc": "I don't care",
    "ily": "I love you",
    "imu": "I miss you",
    "adih": "Another day in hell",
    "zzz": "Sleeping, bored, tired",
    "wywh": "Wish you were here",
    "time": "Tears in my eyes",
    "bae": "Before anyone else",
    "fimh": "Forever in my heart",
    "bsaaw": "Big smile and a wink",
    "bwl": "Bursting with laughter",
    "bff": "Best friends forever",
    "csl": "Can't stop laughing",
}

# Lower-cased expansions, so expanding an abbreviation never reintroduces
# uppercase text after the token has been lower-cased.
_CHAT_EXPANSIONS = {abbr: full_form.lower() for abbr, full_form in _CHAT_WORDS.items()}

# One alternation over all abbreviations: a single pass per token, and an
# expansion can never be re-expanded by a later abbreviation.
_CHAT_WORD_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(re.escape(abbr) for abbr in sorted(_CHAT_WORDS, key=len, reverse=True))
    + r")\b"
)

_EMOTICON_PATTERN = re.compile(r"""
    [:;=Xx]            # eyes
    [-~]?              # optional nose
    [\)\]\(\[dDpP/]    # mouth
    """, re.VERBOSE)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def _clean_token(token):
    """Normalise a single token and return the resulting words (possibly none)."""
    # Only a token that is an emoticon in its entirety becomes "emoji".
    # Substituting inside tokens would corrupt ordinary words such as
    # "expect" ("xp" looks like an emoticon) into "eemojiect".
    if _EMOTICON_PATTERN.fullmatch(token):
        return ['emoji']

    token = token.lower()

    # Remove markup, links and addresses first, while their punctuation is intact.
    token = re.sub(r'<[^<>]+>', ' ', token)
    token = re.sub(r'http\S+|www\.\S+', '', token)
    token = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', token)

    # Spell out meaningful symbols; the padding keeps the word from being
    # glued to its neighbours (e.g. "£5" -> "pound 5").
    for symbol, word in _SPECIAL_REPLACEMENTS.items():
        token = token.replace(symbol, f" {word} ")

    # Expand chat abbreviations before digits are rewritten, because keys
    # such as "gr8" or "b4" contain digits.
    token = _CHAT_WORD_PATTERN.sub(lambda match: _CHAT_EXPANSIONS[match.group(0)], token)

    token = re.sub(r'[0-9]+', 'number', token)
    token = token.translate(_PUNCTUATION_TABLE)
    token = re.sub(r'[^a-zA-Z\s]', '', token)

    # split() collapses whitespace runs and yields nothing for an empty string.
    return token.split()


def clean_text(tokens):
    """Normalise a list of word tokens for the baseline spam model.

    Each input token is cleaned independently, in order:
    whole-token emoticons become ``"emoji"``; the token is lower-cased;
    HTML tags and URLs are removed; e-mail addresses become ``"emailaddr"``;
    currency/trademark symbols are spelled out; chat abbreviations are
    expanded; digit runs become ``"number"``; remaining punctuation and
    non-letter characters are dropped.

    Previously the replacement dictionaries were iterated in the *inner*
    loop of a list comprehension, which emitted one copy of every token per
    dictionary entry (hundreds of copies per token) and applied only one
    replacement to each copy. Every replacement is now applied to the same
    token.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        Lower-case, letters-only words in their original order. A token that
        expands to several words (e.g. ``"asap"``) contributes several
        entries; a token that cleans to nothing (e.g. ``"."``) contributes
        none.
    """
    cleaned = []
    for token in tokens:
        cleaned.extend(_clean_token(token))
    return cleaned



# Overwrite the token lists with their cleaned counterparts.
cleaned_emails = df_tokenized['email'].apply(clean_text)
df_tokenized['email'] = cleaned_emails


In [12]:
# df_tokenized.to_csv("../data/silver/df_cleantext_v0.csv", index= False)

## Stopwords

In [None]:
def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in the module-level ``stop_words`` set.

    Order and original casing of the surviving tokens are preserved.
    """
    kept = []
    for candidate in tokens:
        if candidate.lower() in stop_words:
            continue
        kept.append(candidate)
    return kept

# New frame with an extra 'message_clean' column holding the stopword-free tokens.
stopword_free = df_tokenized['email'].apply(remove_stopwords)
df_text_clean = df_tokenized.assign(message_clean=stopword_free)
df_text_clean['message_clean']

## Lemmatizer

In [None]:
def lemmatize_text(tokens):
    """Lemmatize each token with WordNet and return them joined by single spaces.

    Note that the result is one string, not a list of tokens.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(map(wordnet.lemmatize, tokens))

# Add the lemmatized message (a single space-joined string per row) and display the frame.
lemmatized_messages = df_text_clean['message_clean'].apply(lemmatize_text)
df_text_clean['message_lemmatized'] = lemmatized_messages
df_text_clean

In [None]:
# Persist the processed frame. The previous version wrote `df`, which only
# holds the raw e-mails plus the encoded label and has no cleaned or
# lemmatized columns, so the "lemmatized" file contained unprocessed text.
# NOTE(review): the input is read from "../../data/bronze/..." while this
# writes under "../data/silver/..." - confirm the relative path is intended.
# NOTE(review): 'email' and 'message_clean' hold Python lists, which CSV
# stores as their string representation.
df_text_clean.to_csv("../data/silver/df_lemmatized_v0.csv", index= False)
df_text_clean

In [None]:
# Count missing values per column of `df`.
df.isnull().sum()