In [1]:
from sklearn.feature_extraction.text import CountVectorizer # this will apply count vectorization to the text
import re # regex
import pandas as pd
import numpy as np
from joblib import dump

# Load the dataset saved by the EDA phase: message text, label, and the engineered count features
Spam_df = pd.read_csv("02_Spam_EDA_Phase.csv")

In [2]:
Spam_df.head()

Unnamed: 0,Text,Result,num_chars,num_words,num_spec_chars,num_digits,num_Uppercase_Words,num_URLS,num_Emails,num_Sus_Words
0,"Go until jurong point, crazy.. Available only ...",ham,111,20,9,0,0,0,0,0
1,Ok lar... Joking wif u oni...,ham,29,6,6,0,0,0,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,155,27,6,25,4,0,0,2
3,U dun say so early hor... U c already then say...,ham,49,11,6,0,2,0,0,0
4,"Nah I don't think he goes to usf, he lives aro...",ham,61,13,2,0,1,0,0,0


In [3]:
# lets get the easy part out of the way and convert result to '1' for spam and '0' for ham
def Transform_Target_Spam(text):
    """Encode a spam/ham label as an integer target.

    The comparison is case-insensitive and ignores leading/trailing
    whitespace, so values such as ``'Spam'`` or ``' ham '`` are recognised.

    Parameters
    ----------
    text : str
        Raw label value.

    Returns
    -------
    int or None
        ``1`` for ``'spam'``, ``0`` for ``'ham'``, and ``None`` for any other
        string (which pandas will surface as a missing value in the column).

    Raises
    ------
    AttributeError
        If ``text`` is not a string, e.g. when the column has already been
        encoded to integers.
    """
    # Normalise once so stray whitespace does not silently produce None.
    label = text.strip().lower()
    if label == 'spam':
        return 1
    if label == 'ham':
        return 0
    # Unrecognised labels are returned as None explicitly rather than by
    # falling off the end of the function.
    return None

# Encode the label column in place. Not re-runnable: once the values are integers,
# Transform_Target_Spam fails because it calls string methods on them.
Spam_df['Result'] = Spam_df['Result'].apply(Transform_Target_Spam)

In [4]:
Spam_df.head()

Unnamed: 0,Text,Result,num_chars,num_words,num_spec_chars,num_digits,num_Uppercase_Words,num_URLS,num_Emails,num_Sus_Words
0,"Go until jurong point, crazy.. Available only ...",0,111,20,9,0,0,0,0,0
1,Ok lar... Joking wif u oni...,0,29,6,6,0,0,0,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,155,27,6,25,4,0,0,2
3,U dun say so early hor... U c already then say...,0,49,11,6,0,2,0,0,0
4,"Nah I don't think he goes to usf, he lives aro...",0,61,13,2,0,1,0,0,0


<h3> Text Feature Removal</h3>

Every aspect of the text that we made a feature for during the EDA phase needs to be removed before applying CountVectorizer for vectorization. This way we won't have redundant signals that will confuse the model during the modeling phase.  

As such, we will create functions that lowercase the text (removing the uppercase-word signal) and strip out digits, special characters, URLs, emails, and suspicious words.

In [5]:
def clean_text(text):
    """Normalise a raw message before vectorization.

    Steps, in order: lowercase, remove HTML tags, remove URLs, remove email
    addresses, drop every character that is not a lowercase letter or
    whitespace, then trim leading/trailing whitespace.

    HTML tags are removed *before* URLs. The URL pattern consumes every
    non-space character after ``http``/``www``, so for markup such as
    ``<a href="http://x.com">`` it would otherwise swallow the closing ``>``
    and leave ``<a href=`` behind to be turned into stray tokens.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text. Internal runs of whitespace are not collapsed.
        Non-string input (e.g. ``None`` or ``NaN``) yields an empty string.
    """
    # Missing messages are treated as empty text instead of raising on .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags (must precede URL removal)
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs ("http\S+" already covers https)
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r"[^a-z\s]", '', text)  # Remove punctuation/numbers
    return text.strip()
    
# Overwrite the raw message text with its cleaned form
Spam_df['Text'] = Spam_df['Text'].apply(clean_text)


In [6]:
Spam_df.head()

Unnamed: 0,Text,Result,num_chars,num_words,num_spec_chars,num_digits,num_Uppercase_Words,num_URLS,num_Emails,num_Sus_Words
0,go until jurong point crazy available only in ...,0,111,20,9,0,0,0,0,0
1,ok lar joking wif u oni,0,29,6,6,0,0,0,0,0
2,free entry in a wkly comp to win fa cup final...,1,155,27,6,25,4,0,0,2
3,u dun say so early hor u c already then say,0,49,11,6,0,2,0,0,0
4,nah i dont think he goes to usf he lives aroun...,0,61,13,2,0,1,0,0,0


In [7]:
# Let's check a piece of text that contained uppercase words, special characters, digits, etc. to confirm that our function worked
Spam_df['Text'][2]

'free entry in  a wkly comp to win fa cup final tkts st may  text fa to  to receive entry questionstd txt ratetcs apply overs'

In [8]:
# Let's make a function to remove the suspicious words as well
# Suspicious words and phrases to strip from the text (duplicates removed).
_SUS_WORDS = (
    "free", "win", "winner", "winnings", "cash", "bonus",
    "prize", "reward", "offer", "exclusive", "guarantee",
    "100% free", "earn", "earn money", "income", "double your",
    "get paid", "giveaway", "cheap", "lowest price", "urgent", "act now",
    "immediately", "don't miss", "limited time",
    "important", "as seen on", "last chance", "once in a lifetime",
    "apply now", "instant access", "final notice", "only today",
    "confidential", "risk-free", "no obligation", "guaranteed",
    "safe", "access now", "click below", "click here", "this isn’t spam",
    "unsubscribe", "why pay more", "credit card", "urgent response needed",
    "act immediately", "no cost", "viagra", "cialis", "pills", "pharmacy",
    "meds", "enhancement", "lose weight", "no prescription", "miracle",
    "adult", "xxx", "nude", "satisfaction guaranteed",
    "bitch", "asshole", "shit", "fuck", "bastard", "dick", "crap",
    "slut", "whore", "damn", "piss", "motherfucker", "cunt", "fag",
)


def _build_sus_pattern(phrases):
    """Compile one whole-word regex that matches any of ``phrases``."""
    variants = set()
    for phrase in phrases:
        variants.add(phrase)
        # Text that has already had punctuation and digits stripped can never
        # contain "don't miss" or "risk-free" verbatim, so also match the
        # letters-only form ("dont miss", "riskfree").
        letters_only = " ".join(re.sub(r"[^a-z\s]", "", phrase).split())
        if letters_only:
            variants.add(letters_only)
    # Longest first: regex alternation takes the first alternative that fits,
    # so "earn money" must be tried before "earn" or only part of the phrase
    # would be removed.
    ordered = sorted(variants, key=lambda p: (-len(p), p))
    return re.compile(r'\b(?:' + '|'.join(re.escape(p) for p in ordered) + r')\b')


# Built once at definition time instead of on every call.
_SUS_PATTERN = _build_sus_pattern(_SUS_WORDS)


def Sus_Removal(text):
    """Remove suspicious words and phrases from ``text``.

    Matching is case-sensitive and restricted to whole words/phrases, so
    "win" is removed but "winning" is left alone. Multi-word phrases are
    removed as a unit. Each match is replaced with an empty string, so the
    surrounding whitespace is left in place.

    Parameters
    ----------
    text : str
        Text to filter; expected to be lowercase already.

    Returns
    -------
    str
        ``text`` with every suspicious word/phrase removed.
    """
    return _SUS_PATTERN.sub('', text)
            
    

In [9]:
# Strip the suspicious words/phrases from the already-cleaned text (overwrites the column again)
Spam_df['Text'] = Spam_df['Text'].apply(Sus_Removal)

In [10]:
Spam_df['Text'][2]

' entry in  a wkly comp to  fa cup final tkts st may  text fa to  to receive entry questionstd txt ratetcs apply overs'

<h3>Vectorization of text</h3>

Since all of my features are based on frequency counts, I will use scikit-learn's `CountVectorizer` class to vectorize the text before proceeding to the modeling phase.

In [11]:
# Bag-of-words vectorizer: drops scikit-learn's built-in English stop words and
# caps the vocabulary at the 5000 most frequent terms in the corpus
Vectorizer = CountVectorizer(stop_words='english',max_features=5000)

In [12]:
# Learn the vocabulary from the cleaned messages and build the document-term count matrix in one step
X_text = Vectorizer.fit_transform(Spam_df['Text'])

In [13]:
# Let's turn the sparse matrix from CountVectorizer into a dataframe
# (.toarray() materializes the full dense matrix, one column per vocabulary term)
X_text_df = pd.DataFrame(X_text.toarray(), columns=Vectorizer.get_feature_names_out())

In [14]:
X_text_df.head()

Unnamed: 0,aah,aathilove,aathiwhere,abi,ability,abiola,abj,able,abt,abta,...,zebra,zed,zeros,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Let's combine back with the original dataframe.
# Both indexes are reset so the two frames line up row-by-row when joined column-wise (axis=1).
# Not idempotent: re-running this cell appends the vocabulary columns a second time.
Spam_df = pd.concat([Spam_df.reset_index(drop=True), X_text_df.reset_index(drop=True)], axis=1)

In [16]:
# Drop the text column now that it has been vectorized; errors='ignore' keeps
# this from raising if the column is already gone (e.g. when the cell is re-run)
Spam_df = Spam_df.drop(columns=['Text'], errors='ignore')

# Save the training column names for future alignment
# NOTE(review): only 'Text' is dropped above, so this list still contains the
# target column 'Result' — confirm the consumer excludes it before aligning
# inference features.
training_columns = list(Spam_df.columns)
dump(training_columns, "training_columns.joblib")

['training_columns.joblib']

In [17]:
Spam_df.head()

Unnamed: 0,Result,num_chars,num_words,num_spec_chars,num_digits,num_Uppercase_Words,num_URLS,num_Emails,num_Sus_Words,aah,...,zebra,zed,zeros,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk
0,0,111,20,9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,29,6,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,155,27,6,25,4,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,49,11,6,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,61,13,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<h2> Save the dataset for Modeling phase</h2>

In [19]:
# Write the label, engineered count features, and bag-of-words columns to CSV for the modeling phase
Spam_df.to_csv('03_Spam_Preprocess_Phase.csv', index=False)  # index=False avoids saving the index as a column

In [20]:
# Persist the fitted vectorizer (including its learned vocabulary) to disk
dump(Vectorizer,'Vectorizer.joblib')

['Vectorizer.joblib']