The objective of this notebook is to perform the initial preprocessing on the dataset  in order to create a baseline model

## Libraries

In [None]:
import pandas as pd 
import re
from string import punctuation
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to /home/maldu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/maldu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/maldu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("../data/bronze/spam.csv")


## Change data types

In [3]:
df['Category'] = df['Category'].map({"ham": 0, "spam": 1}).astype(int)
df['Message'] = df['Message'].astype(str)

## Drop duplicated rows

In [4]:
df.drop_duplicates(inplace=True)

In [None]:
split train to test before anything

## Clean text function

For more info visit the special_chars_analysis.ipynb

In [5]:
def clean_text(text):
    special_replacements = {
        r"£": "pound",
        r"\$": "dollar",
        r"\€": "euro",
        r"%": "percentage", 
        r"ì": "i",
        r"ü": "you",
        }
    
    emoticon_pattern = re.compile(r"""
    [:;=Xx]           
    [-~]?             
    [\)\]\(\[dDpP/]   
    """, re.VERBOSE)
    
    for pattern, replacement in special_replacements.items():
        text = re.sub(pattern, replacement, text)
    text = re.sub(emoticon_pattern, 'emoji', text)
    text = text.lower()
    text = re.sub('<[^<>]+>', ' ', text)
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub('[0-9]+', 'number', text)
    text = re.sub('[^\s]+@[^\s]+', 'emailaddr', text)
    text = text.translate(str.maketrans('', '', punctuation))
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_cleaned = df.copy()
df_cleaned['Message']=df_cleaned['Message'].apply(clean_text)

In [12]:
df_cleaned.to_csv("../data/silver/df_cleantext_v0.csv", index= False)

## Tokenizer

In [7]:
df_tokenized = df_cleaned.copy()
df_tokenized['Message'] = df_tokenized['Message'].apply(word_tokenize)
df_tokenized['Message']

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, number, a, wkly, comp, to, w...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, dont, think, he, goes, to, usf, he, l...
                              ...                        
5567    [this, is, the, numbernd, time, we, have, trie...
5568       [will, you, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                     [rofl, its, true, to, its, name]
Name: Message, Length: 5157, dtype: object

## Stopwords

In [8]:
def remove_stopwords(tokens):
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

df_text_clean = df_tokenized.copy()
df_text_clean['message_clean'] = df_text_clean['Message'].apply(remove_stopwords)
df_text_clean['message_clean']

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry number wkly comp win fa cup final t...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5567    numbernd time tried number contact u u poundnu...
5568                            b going esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       rofl true name
Name: message_clean, Length: 5157, dtype: object

## Lemmatizer

In [9]:
def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens) 

df_text_clean['message_lemmatized'] = df_text_clean['message_clean'].apply(lemmatize_text)
df_text_clean

Unnamed: 0,Category,Message,message_clean,message_lemmatized
0,0,"[go, until, jurong, point, crazy, available, o...",go jurong point crazy available bugis n great ...,go jurong point crazy available bugis n great ...
1,0,"[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni,ok lar joking wif u oni
2,1,"[free, entry, in, number, a, wkly, comp, to, w...",free entry number wkly comp win fa cup final t...,free entry number wkly comp win fa cup final t...
3,0,"[u, dun, say, so, early, hor, u, c, already, t...",u dun say early hor u c already say,u dun say early hor u c already say
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l...",nah dont think goes usf lives around though,nah dont think go usf life around though
...,...,...,...,...
5567,1,"[this, is, the, numbernd, time, we, have, trie...",numbernd time tried number contact u u poundnu...,numbernd time tried number contact u u poundnu...
5568,0,"[will, you, b, going, to, esplanade, fr, home]",b going esplanade fr home,b going esplanade fr home
5569,0,"[pity, was, in, mood, for, that, soany, other,...",pity mood soany suggestions,pity mood soany suggestion
5570,0,"[the, guy, did, some, bitching, but, i, acted,...",guy bitching acted like id interested buying s...,guy bitching acted like id interested buying s...


In [10]:
df.to_csv("../data/silver/df_lemmatized_v0.csv", index= False)
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [11]:
df.isna().sum()

Category    0
Message     0
dtype: int64