### بسم الله الرحمن الرحيم

In [181]:
import re
import nltk
import numpy as np
import pandas as pd

import nltk
nltk.download('punct')
nltk.download('omw-1.4')

import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Error loading punct: Package 'punct' not found in index
[nltk_data] Downloading package omw-1.4 to C:\Users\Abdalla
[nltk_data]     Ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Data Pre-Processing

In [182]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [183]:
data_frame = pd.read_csv(r'data_spam.csv', encoding='ISO-8859-1')


In [184]:
# display (rows, colmuns)
data_frame.shape

(5572, 5)

In [185]:
# show information about dataset
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [186]:
# change columns name
data_frame.rename(columns={'v1': 'label', 'v2':'text'}, inplace=True)
data_frame = data_frame[['label', 'text']]

In [187]:
data_frame['label'].describe()

count     5572
unique       2
top        ham
freq      4825
Name: label, dtype: object

In [188]:
# display description of text column
data_frame['text'].describe()

count                       5572
unique                      5169
top       Sorry, I'll call later
freq                          30
Name: text, dtype: object

In [189]:
# show top 5 rows
data_frame.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Data Cleaning

In [190]:
# counting the number of missing values in the dataset
data_frame.isnull().sum()

label    0
text     0
dtype: int64

In [191]:
# replacing the null values with empty string
data_frame = data_frame.fillna('')

In [192]:
# counting the number of duplicated values in the data_frame
data_frame.duplicated().sum()

403

In [193]:
# remove duplicated values in the data_frame
data_frame = data_frame.drop_duplicates()

In [194]:
# replace "spam" with 0 & "ham" with 1
data_frame['label'].replace({'spam': 0, 'ham': 1}, inplace=True)

In [195]:
data_frame.head()

Unnamed: 0,label,text
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [196]:
def removing_punctuations(content):
    content = content.lower()
    return content.translate(str.maketrans("","",string.punctuation))

In [197]:
data_frame['unpunctuated_text'] = data_frame['text'].apply(removing_punctuations)

In [198]:
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [199]:
data_frame['tokenized_words'] = data_frame['unpunctuated_text'].apply(word_tokenize)

In [200]:
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,tokenized_words
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [201]:
stopwords_list = set(stopwords.words('english'))
print(stopwords_list)

{'am', 'me', 'are', 'them', 'our', "didn't", 'after', 'how', 'aren', 'have', 'again', 'i', 'has', 'yours', 'below', 'was', 'wasn', 'herself', 'you', 'my', 'an', 'very', "she's", 'up', 'be', 'in', 'during', "you've", 'hasn', 'off', 'both', 'above', "mustn't", 'then', 'ain', 'needn', 'does', 'against', 'all', "that'll", 'some', 'or', "don't", 'as', 's', 'same', 'now', 'yourself', 'to', 'that', 'a', 'but', 'haven', 'your', 'her', 'nor', "you're", 'where', 'which', 'more', 'here', 'what', 'wouldn', "it's", 'himself', 'should', 'from', 'between', 'will', 'no', "shouldn't", 'down', 'ma', "shan't", 'their', 'had', 'him', "wouldn't", 't', 'not', "haven't", 'they', 'further', 'most', 'when', 'shouldn', 'with', 'before', 'by', 'hers', 'into', "needn't", 'other', 'at', 'mightn', 'did', 'whom', 'because', 'about', 'themselves', 'there', 'this', 'she', 'm', 'isn', 'any', 'who', 'don', "you'd", 'over', 'couldn', 'ourselves', 'can', 'the', 'he', "isn't", 'weren', 'were', 'being', 'it', "couldn't", 'o

In [202]:
port_stem = PorterStemmer()

In [203]:
def stemming(content):
    stemmed_content = content #content.lower()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords_list]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [204]:
data_frame['stemmed_text'] = data_frame['tokenized_words'].apply(stemming)

In [205]:
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,tokenized_words,stemmed_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...",go jurong point crazi avail bugi n great world...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",ok lar joke wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...",free entri 2 wkli comp win fa cup final tkt 21...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",u dun say earli hor u c alreadi say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...",nah dont think goe usf live around though


In [206]:
lemmatizer = WordNetLemmatizer()

In [207]:
# lemetization function
def lemetize(content):
    lemetized_content = content #.split()
    lemetized_content = [lemmatizer.lemmatize(word) for word in lemetized_content]
    lemetized_content = ' '.join(lemetized_content)
    return lemetized_content

In [208]:
# excute lemtization function on data
data_frame['lemetized_text'] = data_frame['tokenized_words'].apply(lemetize)

In [209]:
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,tokenized_words,stemmed_text,lemetized_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...",go jurong point crazi avail bugi n great world...,go until jurong point crazy available only in ...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",ok lar joke wif u oni,ok lar joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...",free entri 2 wkli comp win fa cup final tkt 21...,free entry in 2 a wkly comp to win fa cup fina...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",u dun say earli hor u c alreadi say,u dun say so early hor u c already then say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...",nah dont think goe usf live around though,nah i dont think he go to usf he life around h...


In [210]:
X = data_frame['lemetized_text'].values # input data
Y = data_frame['label'].values # result

In [211]:
bow = CountVectorizer(stop_words='english')
bow.fit(X)

In [212]:
print(bow.get_feature_names_out())

['008704050406' '0089my' '0121' ... 'ûïharry' 'ûò' 'ûówell']


In [213]:
X = bow.transform(X)

In [214]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Model

In [215]:
# spliting the dataset to (80%) training data & (20%) test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

#### Logistic Regression

In [216]:
model = LogisticRegression()

In [217]:
model.fit(X_train, Y_train)

In [218]:

# accuracy score on the training data
X_train_prediction = model.predict(X_train)
precision = accuracy_score(X_train_prediction, Y_train)
recall = recall_score(X_train_prediction, Y_train)
f1score = f1_score(X_train_prediction, Y_train)


In [219]:
print('Accuracy score of the training data : ', precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Accuracy score of the training data :  0.9939540507859734
Recall :  0.9931280923584387
f1socre :  0.9965521996965936


In [220]:

# accuracy score on the test data
X_test_prediction = model.predict(X_test)
precision = accuracy_score(X_test_prediction, Y_test)
recall = recall_score(X_test_prediction, Y_test)
f1score = f1_score(X_test_prediction, Y_test)


In [221]:
print('Accuracy score of the test data : ', precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Accuracy score of the test data :  0.9758220502901354
Recall :  0.9751082251082251
f1socre :  0.9863163656267104


#### SVM

In [222]:
from sklearn import svm

In [223]:
svm = svm.SVC()

In [224]:
svm.fit(X_train, Y_train)

In [225]:
# accuracy score on the training data
X_train_prediction = svm.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [226]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9949214026602177


In [227]:
# accuracy score on the test data
X_test_prediction = svm.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)


In [228]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9671179883945842


In [229]:
precision = precision_score(Y_test, X_test_prediction)
recall = recall_score(Y_test, X_test_prediction)
f1score = f1_score(Y_test, X_test_prediction)
print("Precision : ",precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Precision :  0.9647058823529412
Recall :  0.9988925802879292
f1socre :  0.9815016322089228


#### Decision Tree

In [230]:
from sklearn.tree import DecisionTreeClassifier

In [231]:
clf = DecisionTreeClassifier()

In [232]:
clf.fit(X_train, Y_train)

In [233]:
# accuracy score on the training data
X_train_prediction = clf.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [234]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [235]:
# accuracy score on the test data
X_test_prediction = clf.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)


In [236]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9613152804642167


In [237]:
precision = precision_score(Y_test, X_test_prediction)
recall = recall_score(Y_test, X_test_prediction)
f1score = f1_score(Y_test, X_test_prediction)
print("Precision : ",precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Precision :  0.9705561613958561
Recall :  0.9856035437430787
f1socre :  0.9780219780219781
