In [983]:
import pandas as pd
import re
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import spacy


In [1016]:
df = pd.read_csv("spam_utf8.csv")
df.drop(['Unnamed: 2', 'Unnamed: 3','Unnamed: 4'], axis=1, inplace=True)

df

Unnamed: 0,label,Text
0,notspam,"Go until jurong point, crazy.. Available only ..."
1,notspam,"Go until jurong point, crazy.. Available only ..."
2,notspam,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,notspam,
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,notspam,Will Ì_ b going to esplanade fr home?
5571,notspam,"Pity, * was in mood for that. So...any other s..."
5572,notspam,The guy did some bitching but I acted like i'd...


In [1017]:
#  Number of spam vs non spam emails
num_spam = df[df["label"]=="spam"].shape[0]
num_notspam = df[df["label"]=="notspam"].shape[0]

print(num_spam)
print(num_notspam)

747
4827


In [1018]:
# Missing values count in dataset
missing_values = df["Text"].isnull().sum()
print(missing_values)

2


In [1019]:
# number of duplicate values in text column
num_duplicates = df["Text"].duplicated().sum()

print(num_duplicates)

406


In [1020]:
# average length of email
avg_email_length = df['Text'].str.len().mean()
avg_email_length

80.1340631730079

In [1021]:
df.drop_duplicates(subset=['Text'], keep='first', inplace=True)
df['Text'] = df['Text'].astype(str)

df.head()

Unnamed: 0,label,Text
0,notspam,"Go until jurong point, crazy.. Available only ..."
2,notspam,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,notspam,
5,notspam,"Nah I don't think he goes to usf, he lives aro..."


In [1022]:
df['processed_text'] = df['Text'].str.lower()
df['processed_text'] = df['processed_text'].astype(str)

df.head()

Unnamed: 0,label,Text,processed_text
0,notspam,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
2,notspam,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
4,notspam,,
5,notspam,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."


In [1023]:
#preprocessing

# remove punctuation
df['processed_text'] = df['processed_text'].str.replace(r'[^\w\s]', '', regex=True)

#remove special characters
df['processed_text'] = df['processed_text'].str.replace(r'[^A-Za-z0-9 ]+', '', regex=True)

#remove numbers
df['processed_text'] = df['processed_text'].str.replace(r'\d+', '', regex=True)
df.tail()

Unnamed: 0,label,Text,processed_text
5569,spam,This is the 2nd time we have tried 2 contact u...,this is the nd time we have tried contact u u...
5570,notspam,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home
5571,notspam,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...
5572,notspam,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...
5573,notspam,Rofl. Its true to its name,rofl its true to its name


In [1024]:
# tokenize words
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

df['processed_text'] = df['processed_text'].astype(str)
df['processed_text'] = df['processed_text'].apply(word_tokenize)

df.head(50)

[nltk_data] Downloading package punkt_tab to /Users/ahad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,label,Text,processed_text
0,notspam,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
2,notspam,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
4,notspam,,[nan]
5,notspam,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."
6,spam,FreeMsg Hey there darling it's been 3 week's n...,"[freemsg, hey, there, darling, its, been, week..."
7,notspam,Even my brother is not like to speak with me. ...,"[even, my, brother, is, not, like, to, speak, ..."
8,notspam,As per your request 'Melle Melle (Oru Minnamin...,"[as, per, your, request, melle, melle, oru, mi..."
9,spam,WINNER!! As a valued network customer you have...,"[winner, as, a, valued, network, customer, you..."
10,spam,Had your mobile 11 months or more? U R entitle...,"[had, your, mobile, months, or, more, u, r, en..."


In [1025]:
# stopwords removal
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df['processed_text'] = df['processed_text'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words])

df.head()

[nltk_data] Downloading package stopwords to /Users/ahad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,Text,processed_text
0,notspam,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
2,notspam,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
4,notspam,,[nan]
5,notspam,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [None]:
#lemmatization

nlp = spacy.load("en_core_web_sm")

df["processed_text"] = df["processed_text"].apply(
    lambda tokens: [token.lemma_ for token in nlp(" ".join(tokens))]  
)

In [1028]:
df["processed_text"] = df["processed_text"].apply(lambda tokens: " ".join(tokens))

#Generate N-grams (Unigrams and Bigrams)
vectorizer = CountVectorizer(ngram_range=(1, 2))  # Generate unigrams and bigrams
ngrams_matrix = vectorizer.fit_transform(df["processed_text"])
ngrams_df = pd.DataFrame(ngrams_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [1029]:
ngrams_df

Unnamed: 0,aa,aa exhaust,aah,aah bless,aah cuddle,aah speak,aaniye,aaniye pudunga,aaooooright,aaooooright work,...,zogtorius,zogtorius ve,zoom,zoom cine,zouk,zouk nichols,zs,zs subscription,zyada,zyada kisi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5163,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1030]:
X = ngrams_df.values   
y = df['label']             

In [1031]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,   
    random_state=42 
)


In [1048]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(
    penalty='l1',                     
    C = 0.2, 
    solver='liblinear',   
    random_state=42      
)


In [1049]:
logreg.fit(X_train, y_train)


In [1050]:
y_pred = logreg.predict(X_test)


In [1051]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9642


In [1052]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

     notspam       0.97      0.99      0.98       915
        spam       0.94      0.74      0.83       119

    accuracy                           0.96      1034
   macro avg       0.95      0.87      0.90      1034
weighted avg       0.96      0.96      0.96      1034



In [1053]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Confusion Matrix:
 [[909   6]
 [ 31  88]]


In [1045]:
stopwords = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")
def custom_preprocess(text):
    text = text.lower() 
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    text = re.sub(r'\d+', '', text)
    
    tokens = text.split()
    
    doc = nlp(" ".join(tokens))
    
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    return " ".join(tokens)



In [1054]:
new_text = "Congratulations! You have won a free trip to Japan!"
new_text = custom_preprocess(new_text) 

print(new_text)

congratulation win free trip japan


In [1055]:
X_new = vectorizer.transform([new_text])

prediction = logreg.predict(X_new)

print(prediction)  


['spam']
