# SMS Spam Detection

In [1]:
#Importing pandas for data analysis and data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')
df.drop(['Unnamed: 2', 'Unnamed: 3','Unnamed: 4'], axis=1,inplace=True)
df.rename(columns = {'v1':'label','v2':'SMS'},inplace=True)

FileNotFoundError: [Errno 2] No such file or directory: 'spam.csv'

In [3]:
#Checking first 10 values
df.head(10)

Unnamed: 0,label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
#Checking last 10 values
df.tail(10)

Unnamed: 0,label,SMS
5562,ham,Ok lor... Sony ericsson salesman... I ask shuh...
5563,ham,Ard 6 like dat lor.
5564,ham,Why don't you wait 'til at least wednesday to ...
5565,ham,Huh y lei...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [5]:
#Checking rows and columns(m*n)
df.shape

(5572, 2)

In [6]:
#Checking column names
df.columns

Index(['label', 'SMS'], dtype='object')

In [7]:
#Checking for nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


<b>As we see there are no null values

In [8]:
#Checking for nulls
df.isnull().sum()

label    0
SMS      0
dtype: int64

In [9]:
#Row-wise all values are displayed
df.values

array([['ham',
        'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'],
       ['ham', 'Ok lar... Joking wif u oni...'],
       ['spam',
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
       ...,
       ['ham',
        'Pity, * was in mood for that. So...any other suggestions?'],
       ['ham',
        "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"],
       ['ham', 'Rofl. Its true to its name']], dtype=object)

In [10]:
#Libraries needed for NLP
import nltk
import re

#Stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('russian'))

#Stemmers
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

#Lemmatizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to D:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to D:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
#Store it in a variable
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [12]:
df['label'] = df['label'].map( {'spam': 1, 'ham': 0})

In [13]:
#Cleanup

# Replace email address with 'emailaddress'
df['SMS'] = df['SMS'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress')

# Replace urls with 'webaddress'
df['SMS'] = df['SMS'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

# Replace money symbol with 'money-symbol'
df['SMS'] = df['SMS'].str.replace(r'£|\$', 'money-symbol')

# Replace 10 digit phone number with 'phone-number'
df['SMS'] = df['SMS'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phone-number')

# Replace normal number with 'number'
df['SMS'] = df['SMS'].str.replace(r'\d+(\.\d+)?', 'number')

# remove punctuation
df['SMS'] = df['SMS'].str.replace(r'[^\w\d\s]', ' ')

# remove whitespace between terms with single space
df['SMS'] = df['SMS'].str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
df['SMS'] = df['SMS'].str.replace(r'^\s+|\s*?$', ' ')

# change words to lower case
df['SMS'] = df['SMS'].str.lower()

In [14]:
#Applying English Stopwords
df['SMS'] = df['SMS'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [15]:
#Applying snowball stemmer
snowball = nltk.SnowballStemmer("russian")
df['SMS'] = df['SMS'].apply(lambda x: ' '.join(snowball.stem(term) for term in x.split()))

<b> Using snowball I got better results compared to Porter Stemmer and Wordnet Lemmatizer

In [16]:
#Applying Porter Stemmer
#df['SMS'] = df['SMS'].apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [17]:
#Applying Wordnet Lemmatizer
#df['SMS'] = df['SMS'].apply(lambda x: ' '.join(lemmatizer.lemmatize(term) for term in x.split()))

In [18]:
#Creating a bag-of-words model
df_final = df['SMS']
from nltk.tokenize import word_tokenize

bow = []
for sms in df_final:
    words = word_tokenize(sms)
    for w in words:
        bow.append(w)

In [19]:
#Implementing TF|IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_model = TfidfVectorizer()
tfidf_vec=tfidf_model.fit_transform(df_final)
tfidf_data=pd.DataFrame(tfidf_vec.toarray())

In [20]:
tfidf_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6496,6497,6498,6499,6500,6501,6502,6503,6504,6505
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Defining train and test
df_train = tfidf_data.iloc[:4457]
df_test = tfidf_data.iloc[4457:]

target = df['label']
df_train['label'] = target

Y = df_train['label']
X = df_train.drop('label',axis=1)

In [22]:
#Checking predictors
X.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6496,6497,6498,6499,6500,6501,6502,6503,6504,6505
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
#Checking target
Y.head(10)

0    0
1    0
2    1
3    0
4    0
5    1
6    0
7    0
8    1
9    1
Name: label, dtype: int64

In [24]:
# Splitting training data into train and test
from sklearn import model_selection
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,Y,test_size=.25, random_state=42)

In [25]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.2)
classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.2)

In [26]:
#Using this class to make my print statement look bold
class color:
    BOLD = '\033[1m'

In [27]:
#Predicting and calculating f1 score
from sklearn.metrics import f1_score
prediction = classifier.predict(X_test)

f1 = f1_score(prediction, y_test)
f1_round = round(f1,2)

print(color.BOLD + 'F1 score is:',f1_round)

[1mF1 score is: 0.95


In [28]:
import lightgbm as lgb

def train_and_test(model, model_name):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    f2 = f1_score(pred, y_test)
    print(f'F1 score is: ',f2)
    
for depth in (1,2,3,4,5,6,7,8,9,10,11):
    lgbmodel = lgb.LGBMClassifier(max_depth=depth, n_estimators=200,alpha=0.2,num_leaves=74)
    print(f"Max Depth {depth}")
    train_and_test(lgbmodel, "Light GBM")
    print("____________________________________")

Max Depth 1
F1 score is:  0.8951048951048951
____________________________________
Max Depth 2
F1 score is:  0.9220338983050846
____________________________________
Max Depth 3
F1 score is:  0.9261744966442953
____________________________________
Max Depth 4
F1 score is:  0.9235880398671097
____________________________________
Max Depth 5
F1 score is:  0.9235880398671097
____________________________________
Max Depth 6
F1 score is:  0.9342105263157894
____________________________________
Max Depth 7
F1 score is:  0.9205298013245035
____________________________________
Max Depth 8
F1 score is:  0.9271523178807948
____________________________________
Max Depth 9
F1 score is:  0.9266666666666667
____________________________________
Max Depth 10
F1 score is:  0.9235880398671097
____________________________________
Max Depth 11
F1 score is:  0.9302325581395349
____________________________________


### Multinomial Naive Bayes has higher F1 score of 0.95 compared to LightGBM which has 0.93

In [29]:
#Exporting model using joblib library
import joblib
joblib.dump(classifier,"SMS_Detection_Model.pkl")

['SMS_Detection_Model.pkl']

In [30]:
#Exporting TF|IDF using joblib library
joblib.dump(tfidf_model,"TF_IDF.pkl")

['TF_IDF.pkl']