In [269]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [270]:
import nltk
stopwords = nltk.download('stopwords')
#we will skip this step due to technical issues

[nltk_data] Error loading stopwords: <urlopen error Tunnel connection
[nltk_data]     failed: 407 AuthorizedOnly>


In [271]:
spam_df = pd.read_csv('spam.csv')

In [272]:
spam_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [273]:
spam_df.shape

(5572, 2)

In [274]:
spam_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [275]:
spam_df.isnull().sum()

Category    0
Message     0
dtype: int64

In [276]:
spam_df.fillna('', inplace = True) #convert null values with null strings if present

In [277]:
#Remove stopwords

Label Encoding

In [278]:
spam_df.replace({'Category': {'ham':0, 'spam':1}}, inplace = True)

In [279]:
spam_df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [280]:
ps = PorterStemmer()

In [281]:
print(ps.stem('batman'))
print(ps.stem('bat'))
print(ps.stem('batting')) #bat and batting comes from same word 'bat' but batman and batwoman have diff meanings
print(ps.stem('batwoman'))

batman
bat
bat
batwoman


Porter Stemming

In [282]:
def stemming(content):
    content = re.sub('^[A-Za-z]','', content)
    content_stemmed = content.lower()
    content_stemmed = content_stemmed.split()
    ps = PorterStemmer()
    content_stemmed = [ps.stem(words) for words in content_stemmed]
    content_stemmed = ' '.join(content_stemmed)
    return content_stemmed

In [283]:
spam_df['Message'] = spam_df['Message'].apply(stemming)

In [284]:
print(spam_df['Message'])

0       o until jurong point, crazy.. avail onli in bu...
1                              k lar... joke wif u oni...
2       ree entri in 2 a wkli comp to win fa cup final...
3         dun say so earli hor... u c alreadi then say...
4       ah i don't think he goe to usf, he live around...
                              ...                        
5567    hi is the 2nd time we have tri 2 contact u. u ...
5568                      ill ü b go to esplanad fr home?
5569    ity, * wa in mood for that. so...ani other sug...
5570    he guy did some bitch but i act like i'd be in...
5571                              ofl. it true to it name
Name: Message, Length: 5572, dtype: object


In [285]:
X = spam_df['Message']
Y = spam_df['Category']

In [335]:
Y = spam_df['Category'].astype(int)

In [337]:
spam_df['Category'].value_counts()

0    4825
1     747
Name: Category, dtype: int64

In [338]:
sample_1 = spam_df[spam_df['Category'] ==0].sample(n = 747)
sample_2 = spam_df[spam_df['Category'] ==1].sample(n = 747)

In [339]:
new_spam_df = pd.concat((sample_1, sample_2))

In [340]:
new_spam_df['Category'].value_counts()

0    747
1    747
Name: Category, dtype: int64

In [341]:
new_spam_df['Message'].shape

(1494,)

In [342]:
X = new_spam_df['Message']
Y = new_spam_df['Category'].astype(int)

Tf-Idf Vectorizer

In [343]:
vectorizer = TfidfVectorizer(min_df =1, stop_words = 'english', lowercase= 'True')

In [351]:
X = vectorizer.fit_transform(X)

In [352]:
X.shape

(1494, 4298)

In [353]:
Y.shape

(1494,)

In [354]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 2, stratify = Y)

In [355]:
print(X.shape, X_train.shape, X_test.shape)

(1494, 4298) (1195, 4298) (299, 4298)


In [356]:
model = LogisticRegression()

In [357]:
model.fit(X_train, Y_train)

LogisticRegression()

In [358]:
spam_train_predict = model.predict(X_train)
spam_train_accuracy = accuracy_score(spam_train_predict, Y_train)
print('Accuracy of training data is: ',spam_train_accuracy )

Accuracy of training data is:  0.9815899581589959


In [359]:
spam_test_predict = model.predict(X_test)
spam_test_accuracy = accuracy_score(spam_test_predict, Y_test)
print('Accuracy of testing data is: ',spam_test_accuracy )

Accuracy of testing data is:  0.959866220735786
