In [1]:
"""
importing libraries
"""
import pandas as pd
import numpy as np
import nltk

In [2]:
"""
importing dataset
"""
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
"""
As we can see the datasets consists of unnecessary columns
we are going to remove those columns
"""
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1) # axis = 1 -> signifies columns
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
"""
renaming column names : v1 -> label and v2 -> message 
"""
data.rename(columns={"v1": "label", "v2": "message"}, inplace=True)
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
"""
Handeling categorical data
"""
data = pd.get_dummies(data, columns=['label'])
data.head()

Unnamed: 0,message,label_ham,label_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


In [6]:
data['label_ham'].value_counts()

1    4825
0     747
Name: label_ham, dtype: int64

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   message     5572 non-null   object
 1   label_ham   5572 non-null   uint8 
 2   label_spam  5572 non-null   uint8 
dtypes: object(1), uint8(2)
memory usage: 54.5+ KB


In [8]:
"""
Introducing message length in dataset
"""
data['count'] = 0
for i in np.arange(0, len(data.message)):
    data.loc[i, 'count'] = len(data.loc[i, 'message'])
data.head()

Unnamed: 0,message,label_ham,label_spam,count
0,"Go until jurong point, crazy.. Available only ...",1,0,111
1,Ok lar... Joking wif u oni...,1,0,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1,155
3,U dun say so early hor... U c already then say...,1,0,49
4,"Nah I don't think he goes to usf, he lives aro...",1,0,61


In [13]:
"""
Show data distributions
"""
data.describe()

Unnamed: 0,label_ham,label_spam,count
count,5572.0,5572.0,5572.0
mean,0.865937,0.134063,80.118808
std,0.340751,0.340751,59.690841
min,0.0,0.0,2.0
25%,1.0,0.0,36.0
50%,1.0,0.0,61.0
75%,1.0,0.0,121.0
max,1.0,1.0,910.0


In [9]:
"""
Preparing Word Vector
"""
corpus = []

In [10]:
"""
Preparing Message
"""
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import re

In [11]:
ps = PorterStemmer()

In [12]:
for i in range(0, len(data.message)):
    # regular expressions
    msg = data['message'][i]
    # find email address and replace with emailaddress
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddress', msg)
    # find urls and replace with httpaddress
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddress', msg)
    # find money symbols and replace with moneysymbol
    msg = re.sub('£|\$', 'moneysymbol', msg)
    # find phone numbers and replace with phonenumber
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumber', msg)
    # find numbers and replace with number
    msg = re.sub('\d+(\.\d+)?', 'number', msg)
    # find punctuations and replace with empty string
    msg = re.sub('[^\w\d\s]', ' ', msg)
    # change into lower case
    msg = msg.lower()
    msg = msg.split()
    # stemming and stopwords removal
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    # join message
    msg = ' '.join(msg)
    corpus.append(msg)

In [13]:
"""
Prepare vector for each message
"""
cv = CountVectorizer()
data_input = cv.fit_transform(corpus).toarray()
"""
first data for input
"""
data_input[0]

In [16]:
"""
Applying classification
    input -> prepared sparse matrix/vector for each message
    output -> label i.e. spam or ham
"""
data_output = data['label_ham']
data_output.value_counts()

1    4825
0     747
Name: label_ham, dtype: int64

In [18]:
"""
Splitting data for training and testing
"""
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(data_input, data_output, test_size=0.2, random_state=0)

In [26]:
"""
Preparing ML models
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [25]:
"""
Training using Naive Bayes
"""
model_nb = GaussianNB()
model_nb.fit(train_x, train_y)

GaussianNB()

In [27]:
"""
Training using Decision Tree
"""
model_dt = DecisionTreeClassifier()
model_dt.fit(train_x, train_y)

DecisionTreeClassifier()

In [29]:
"""
Training using Random Forest
"""
model_rf = RandomForestClassifier(n_estimators=100)
model_rf.fit(train_x, train_y)

RandomForestClassifier()

In [30]:
"""
Prediction
"""
prediction_nb = model_nb.predict(test_x)
prediction_dt = model_dt.predict(test_x)
prediction_rf = model_rf.predict(test_x)

In [31]:
"""
Result Naive Bayes Classifier
"""
print(f"Accuracy from Naive Bayes Classifier is : {accuracy_score(test_y, prediction_nb)}")
print(f"Classification report from Naive Bayes Classifier is : \n {classification_report(test_y, prediction_nb)}")

Accuracy from Naive Bayes Classifier is : 0.863677130044843
Classification report from Naive Bayes Classifier is : 
               precision    recall  f1-score   support

           0       0.53      0.85      0.65       166
           1       0.97      0.87      0.92       949

    accuracy                           0.86      1115
   macro avg       0.75      0.86      0.78      1115
weighted avg       0.90      0.86      0.88      1115



In [32]:
"""
Result Decision Tree Classifier
"""
print(f"Accuracy from Decision Tree Classifier is : {accuracy_score(test_y, prediction_dt)}")
print(f"Classification report from Decision Tree Classifier is : \n {classification_report(test_y, prediction_dt)}")

Accuracy from Decision Tree Classifier is : 0.9704035874439462
Classification report from Decision Tree Classifier is : 
               precision    recall  f1-score   support

           0       0.91      0.89      0.90       166
           1       0.98      0.99      0.98       949

    accuracy                           0.97      1115
   macro avg       0.95      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [33]:
"""
Result Random Forest Classifier
"""
print(f"Accuracy from Random Forest Classifier is : {accuracy_score(test_y, prediction_rf)}")
print(f"Classification report from Random Forest Classifier is : \n {classification_report(test_y, prediction_rf)}")

Accuracy from Random Forest Classifier is : 0.9748878923766816
Classification report from Random Forest Classifier is : 
               precision    recall  f1-score   support

           0       1.00      0.83      0.91       166
           1       0.97      1.00      0.99       949

    accuracy                           0.97      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.97      0.97      1115

