In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('spam.csv')

In [5]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
df.isna().sum()

Category    0
Message     0
dtype: int64

In [10]:
df['Result'] = df['Category'].replace({'ham' : 0, 'spam' : 1})

In [11]:
df.head()

Unnamed: 0,Category,Message,Result
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [16]:
def perform_nlp(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_words)
    return filtered_text

In [17]:
df['Message'] = df['Message'].apply(perform_nlp)

In [18]:
df.head()

Unnamed: 0,Category,Message,Result
0,ham,"go jurong point , crazy .. available bugis n g...",0
1,ham,ok lar ... joking wif u oni ...,0
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,1
3,ham,u dun say early hor ... u c already say ...,0
4,ham,"nah n't think goes usf , lives around though",0


In [20]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [21]:
X = df.Message
y = df.Result

In [24]:
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.2, random_state = 2022)

In [26]:
clf = Pipeline([
    ('Vectorizer' , TfidfVectorizer()),
    ('Classifier' , MultinomialNB())
])

In [27]:
clf.fit(X_test, y_test)

In [28]:
clf.score(X_train, y_train)

0.9650224215246637

In [29]:
pred = clf.predict(X_test)

In [30]:
classification = classification_report(y_test, pred)
print(classification)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3879
           1       1.00      0.86      0.93       578

    accuracy                           0.98      4457
   macro avg       0.99      0.93      0.96      4457
weighted avg       0.98      0.98      0.98      4457



In [31]:
confusion = confusion_matrix(y_test, pred)
print(confusion)

[[3879    0]
 [  79  499]]


# The dataset is imbalance now we oversample it and then try to predict it

In [36]:
df_class_0 = df[df['Result'] == 0]

In [38]:
df_class_1 = df[df['Result'] == 1]

In [39]:
df_class_0.shape

(4825, 3)

In [40]:
df_class_1.shape

(747, 3)

In [42]:
df_class_1_over = df_class_1.sample(4825, replace = True)

In [43]:
df_class_1_over.shape

(4825, 3)

In [45]:
df2 = pd.concat([df_class_0, df_class_1_over], axis = 0)

In [46]:
df2.head()

Unnamed: 0,Category,Message,Result
0,ham,"go jurong point , crazy .. available bugis n g...",0
1,ham,ok lar ... joking wif u oni ...,0
3,ham,u dun say early hor ... u c already say ...,0
4,ham,"nah n't think goes usf , lives around though",0
6,ham,even brother like speak . treat like aids pate...,0


In [47]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9650 entries, 0 to 3758
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  9650 non-null   object
 1   Message   9650 non-null   object
 2   Result    9650 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 301.6+ KB


In [48]:
A = df2.Message
b = df2.Result

In [49]:
A_train, A_test, b_train, b_test = train_test_split(A, b, test_size = 0.2, random_state = 2022)

In [50]:
clf.fit(A_train, b_train)

In [51]:
clf.score(A_test, b_test)

0.9818652849740933

In [52]:
pred = clf.predict(A_test)

In [53]:
report = classification_report(b_test, pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       954
           1       0.98      0.99      0.98       976

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



In [55]:
conf = confusion_matrix(b_test, pred)
print(conf)

[[932  22]
 [ 13 963]]


### Now we notice there is no bias in model bcz we balance the dataset and now the precision, recall and f1-score are  at same level