In [75]:
import  pandas as pd
import neattext as nt
import neattext.functions as nfx

In [76]:
# Load the data

data = pd.read_csv("spam.csv", encoding='latin1')
data.rename(columns={'v1': 'Type','v2':"Emails"}, inplace=True)
data.drop(data.columns[-3:], axis=1, inplace=True)
data


Unnamed: 0,Type,Emails
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [77]:
# to check the duplication of emails

duplicate_emails = data[data["Emails"].duplicated()]["Emails"]
print(f"\nThere are {len(duplicate_emails)} duplicate emails.")
print("Duplicate Emails:")
duplicate_emails


There are 403 duplicate emails.
Duplicate Emails:


102     As per your request 'Melle Melle (Oru Minnamin...
153     As per your request 'Melle Melle (Oru Minnamin...
206     As I entered my cabin my PA said, '' Happy B'd...
222                                Sorry, I'll call later
325                      No calls..messages..missed calls
                              ...                        
5524    You are awarded a SiPix Digital Camera! call 0...
5535    I know you are thinkin malaria. But relax, chi...
5539                           Just sleeping..and surfing
5553                          Hahaha..use your brain dear
5558                               Sorry, I'll call later
Name: Emails, Length: 403, dtype: object

In [78]:
 # to drop the duplicate_emails

data = data.drop_duplicates(subset=["Emails"], keep="first")
data

Unnamed: 0,Type,Emails
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [79]:
# convert the emails into lower letters

data.loc[:, 'Emails'] = data['Emails'].str.lower()
data


Unnamed: 0,Type,Emails
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ì_ b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...


In [80]:
# stop words in every row

print("\nStop Words in every Row")
# stopWords = data["Emails"].apply(lambda x:nt.TextFrame(x).count_stopwords())
stopWords = data["Emails"].apply(lambda x:nt.TextExtractor(x).extract_stopwords())
print(stopWords)


Stop Words in every Row
0                            [go, until, only, in, there]
1                                                      []
2                                [in, a, to, may, to, to]
3                                [say, so, already, then]
4            [i, don't, he, to, he, around, here, though]
                              ...                        
5567    [this, is, the, we, have, have, won, the, is, ...
5568                                           [will, to]
5569                                [was, in, for, other]
5570    [the, did, some, but, i, be, in, something, el...
5571                                 [its, to, its, name]
Name: Emails, Length: 5169, dtype: object


In [81]:
# we re going to remove te stop words

data.loc[:,'Processed Emails'] = data["Emails"].apply(nfx.remove_stopwords)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,'Processed Emails'] = data["Emails"].apply(nfx.remove_stopwords)


Unnamed: 0,Type,Emails,Processed Emails
0,ham,"go until jurong point, crazy.. available only ...","jurong point, crazy.. available bugis n great ..."
1,ham,ok lar... joking wif u oni...,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say so early hor... u c already then say...,u dun early hor... u c say...
4,ham,"nah i don't think he goes to usf, he lives aro...","nah think goes usf, lives"
...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u. u å£750 pound priz...
5568,ham,will ì_ b going to esplanade fr home?,ì_ b going esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s...","pity, * mood that. so...any suggestions?"
5570,ham,the guy did some bitching but i acted like i'd...,guy bitching acted like i'd interested buying ...


In [82]:
# convert the type of email into labels as (0,1)

data.loc[:,'Label'] = data['Type'].map({'ham': 0, 'spam': 1})
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,'Label'] = data['Type'].map({'ham': 0, 'spam': 1})


Unnamed: 0,Type,Emails,Processed Emails,Label
0,ham,"go until jurong point, crazy.. available only ...","jurong point, crazy.. available bugis n great ...",0
1,ham,ok lar... joking wif u oni...,ok lar... joking wif u oni...,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,1
3,ham,u dun say so early hor... u c already then say...,u dun early hor... u c say...,0
4,ham,"nah i don't think he goes to usf, he lives aro...","nah think goes usf, lives",0
...,...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u. u å£750 pound priz...,1
5568,ham,will ì_ b going to esplanade fr home?,ì_ b going esplanade fr home?,0
5569,ham,"pity, * was in mood for that. so...any other s...","pity, * mood that. so...any suggestions?",0
5570,ham,the guy did some bitching but i acted like i'd...,guy bitching acted like i'd interested buying ...,0


In [83]:
# length of email in each row
length_of_emails = [len(email) for email in data["Processed Emails"]]

# Find the maximum length among all sentences
max_length = max(length_of_emails)

# Print the maximum review length
print("Maximum email length:", max_length)


# Find the index of the review with the maximum length
index_of_max_length = length_of_emails.index(max_length)

# print the index of sentence whose
print(index_of_max_length)


Maximum email length: 540
1528


In [84]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


max_length = max_length
trunc_type='post'
oov_tok = "<OOV>"

# Initialize the Tokenizer class
tokenizer = Tokenizer(oov_token="<OOV>", lower=False)

# Generate the word index dictionary
tokenizer.fit_on_texts(data["Processed Emails"])

# Generate and pad the sequences
sequences = tokenizer.texts_to_sequences(data["Processed Emails"])
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)


index = index_of_max_length
print(f'email: {data["Processed Emails"].iloc[index]}')
print(f'padded sequence: {padded[index]}')

email: girl happy? difficult girls happy. u need be... 1. friend 2. companion 3. lover 4. chef . . . &lt;#&gt; . good listener &lt;#&gt; . organizer &lt;#&gt; . good boyfriend &lt;#&gt; . clean &lt;#&gt; . sympathetic &lt;#&gt; . athletic &lt;#&gt; . warm . . . &lt;#&gt; . courageous &lt;#&gt; . determined &lt;#&gt; . true &lt;#&gt; . dependable &lt;#&gt; . intelligent . . . &lt;#&gt; . psychologist &lt;#&gt; . pest exterminator &lt;#&gt; . psychiatrist &lt;#&gt; . healer . . &lt;#&gt; . stylist &lt;#&gt; . driver . . aaniye pudunga venaam..
padded sequence: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    

In [85]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(padded)

In [86]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,data["Label"],test_size = 0.20, random_state = 42)

In [87]:
from sklearn.naive_bayes import GaussianNB

In [88]:
gnb = GaussianNB()

In [89]:
gnb.fit(X_train, Y_train)

In [90]:
GNB_clf_model_Predictions = gnb.predict(X_test)
print(GNB_clf_model_Predictions)

[1 0 1 ... 1 1 1]


In [91]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [92]:
print("\n-------------> Gaussian_Naive_Bays Classification REPORT <-------------\n")
print(classification_report(Y_test, GNB_clf_model_Predictions))


-------------> Gaussian_Naive_Bays Classification REPORT <-------------

              precision    recall  f1-score   support

           0       1.00      0.01      0.01       889
           1       0.14      1.00      0.25       145

    accuracy                           0.15      1034
   macro avg       0.57      0.50      0.13      1034
weighted avg       0.88      0.15      0.05      1034



In [93]:
GNB_clf_accuracy = accuracy_score(Y_test,GNB_clf_model_Predictions)
print(f"Random_Forest_Classifier Model Accuracy is:{int(GNB_clf_accuracy*100)}%.")

Random_Forest_Classifier Model Accuracy is:14%.


## Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier

In [95]:
rfc = RandomForestClassifier()

In [96]:
rfc.fit(X_train, Y_train)

In [97]:
RF_clf_model_Predictions = rfc.predict(X_test)

In [98]:
print(RF_clf_model_Predictions)

[0 0 0 ... 0 0 0]


In [99]:
print("\n-------------> Random_Forest_Classifier Classification REPORT <-------------\n")
print(classification_report(Y_test, RF_clf_model_Predictions))


-------------> Random_Forest_Classifier Classification REPORT <-------------

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       889
           1       0.79      0.54      0.64       145

    accuracy                           0.92      1034
   macro avg       0.86      0.76      0.80      1034
weighted avg       0.91      0.92      0.91      1034



In [100]:
RF_clf_accuracy = accuracy_score(Y_test,RF_clf_model_Predictions)
print(f"Random_Forest_Classifier Model Accuracy is:{int(RF_clf_accuracy*100)}%.")

Random_Forest_Classifier Model Accuracy is:91%.


## Logistic Regression

In [101]:
from sklearn.linear_model import LogisticRegression

In [102]:
lg = LogisticRegression()

In [103]:
lg.fit(X_train, Y_train)

In [104]:
LG_clf_model_Predictions = lg.predict(X_test)
print(LG_clf_model_Predictions)

[0 0 0 ... 0 0 0]


In [105]:
print("\n-------------> Logistic_Regression_Classifier Classification REPORT <-------------\n")
print(classification_report(Y_test, LG_clf_model_Predictions))


-------------> Logistic_Regression_Classifier Classification REPORT <-------------

              precision    recall  f1-score   support

           0       0.88      0.97      0.92       889
           1       0.47      0.16      0.24       145

    accuracy                           0.86      1034
   macro avg       0.67      0.56      0.58      1034
weighted avg       0.82      0.86      0.83      1034



In [106]:
LG_clf_accuracy = accuracy_score(Y_test,LG_clf_model_Predicitions)
print(f"Logistic_Regression_Classifier Model Accuracy is:{int(LG_clf_accuracy*100)}%.")

Logistic_Regression_Classifier Model Accuracy is:85%.
