In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import re
import nltk
from contractions import contractions_dict
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("emails.csv")

In [3]:
def strip_titles(text):
    if "Subject: re :" in text:
        return text[13:]
    elif "Subject: news :" in text:
        return text[15:]
    else:
        return text[8:]

In [4]:
data['text'] = data['text'].apply(lambda x: strip_titles(x))

In [5]:
data['text'] = data['text'].apply(lambda x: word_tokenize(x))

In [6]:
def normalize_tokens(list_of_tokens):
    return map(lambda x: x.lower(),list_of_tokens)

In [7]:
data['text'] = data['text'].apply(lambda x: normalize_tokens(x))

In [8]:
data['text'] = data['text'].apply(lambda x: list(x))

In [9]:
def contracted_word_expansion(token):
    if token in contractions_dict.keys():
        return contractions_dict[token]
    else:
        return token

In [10]:
def contractions_expansion(list_of_tokens):
    return map(contracted_word_expansion,list_of_tokens)

In [11]:
data['text'] = data['text'].apply(lambda x: contractions_expansion(x))

In [12]:
data['text'] = data['text'].apply(lambda x: list(x))

In [13]:
regex = r'^@[a-zA-z0-9]|^#[a-zA-Z0-9]|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+'

In [14]:
def waste_word_or_not(token):
    return re.search(regex,token)

In [15]:
def filter_waste_words(list_of_tokens):
    return filterfalse(waste_word_or_not,list_of_tokens)

In [16]:
data['text'] = data['text'].apply(lambda x: filter_waste_words(x))

In [17]:
data['text'] = data['text'].apply(lambda x: list(x))

In [18]:
def split(list_of_tokens):
    return map(lambda x: re.split(regex,x)[0],list_of_tokens)

In [19]:
data['text'] = data['text'].apply(lambda x: split(x))

In [20]:
data['text'] = data['text'].apply(lambda x: list(x))

In [21]:
en_stop_words = list(set(stopwords.words('english')).union(set(STOP_WORDS)))

In [22]:
def is_stopword(token):
    return not(token in en_stop_words or re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',token))

In [23]:
def stopwords_removal(list_of_tokens):
    return filter(is_stopword,list_of_tokens)

In [24]:
data['text'] = data['text'].apply(lambda x: stopwords_removal(x))

In [25]:
data['text'] = data['text'].apply(lambda x: list(x))

In [26]:
def get_wnet_pos_tag(treebank_tag):
    if treebank_tag[1].startswith('J'):
        return (treebank_tag[0],wordnet.ADJ)
    elif treebank_tag[1].startswith('V'):
        return (treebank_tag[0],wordnet.VERB)
    elif treebank_tag[1].startswith('N'):
        return (treebank_tag[0],wordnet.NOUN)
    elif treebank_tag[1].startswith('R'):
        return (treebank_tag[0],wordnet.ADV)
    else:
        (treebank_tag[0],wordnet.NOUN)

In [27]:
def get_pos_tag(list_of_tokens):
    return map(get_wnet_pos_tag,pos_tag(list_of_tokens))

In [28]:
data['text'] = data['text'].apply(lambda x: get_pos_tag(x))

In [29]:
data['text'] = data['text'].apply(lambda x: list(x))

In [30]:
lemmatizer = WordNetLemmatizer()

In [31]:
def token_lemmatization(token_pos_tuple):
    if token_pos_tuple == None:
        return ""
    else:
        return lemmatizer.lemmatize(word=token_pos_tuple[0],pos=token_pos_tuple[1])

In [32]:
def lemmatization(list_of_tokens):
    if len(list_of_tokens) > 0:
        return map(lambda x: token_lemmatization(x),list_of_tokens)

In [33]:
data['text'] = data['text'].apply(lambda x: lemmatization(x))

In [34]:
data['text'] = data['text'].apply(lambda x: list(x))

In [35]:
vocab = set()
for list_of_tokens in data['text']:
    vocab = vocab.union(set(list_of_tokens))

In [36]:
vocab = list(vocab)

In [37]:
vocab.pop(0)

''

In [38]:
vocab_dict = dict(zip(vocab,list(range(0,len(vocab)))))

In [39]:
def join_tokens(list_of_tokens):
    return " ".join(list_of_tokens)

In [40]:
data['text'] = data['text'].apply(lambda x: join_tokens(x))

In [41]:
corpus = list()
for email_text in data['text']:
    corpus.append(email_text)

In [42]:
vectorizer = TfidfVectorizer(vocabulary=vocab_dict)
tf_idf_matrix = vectorizer.fit_transform(corpus)

In [43]:
tf_idf_matrix = tf_idf_matrix.toarray()

In [44]:
df = pd.DataFrame(tf_idf_matrix)

In [45]:
df['spam'] = data['spam']

In [46]:
pca = PCA(n_components=100)

In [47]:
tf_idf_matrix_reduced = pca.fit_transform(tf_idf_matrix)
tf_idf_matrix_reduced.shape

(5728, 100)

In [48]:
df = pd.DataFrame(data=tf_idf_matrix_reduced)

In [49]:
df['spam'] = data['spam']
df.shape

(5728, 101)

In [50]:
gnb = GaussianNB()

In [51]:
X_train = df.iloc[:,0:100]
y_train = df['spam']

In [52]:
gnb.fit(X=X_train,y=y_train)

In [53]:
predicted_categories = gnb.predict(X_train)

In [54]:
print(classification_report(y_true=y_train,y_pred=predicted_categories))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92      4360
           1       0.72      0.82      0.77      1368

    accuracy                           0.88      5728
   macro avg       0.83      0.86      0.84      5728
weighted avg       0.89      0.88      0.88      5728



In [55]:
from random import randrange

In [56]:
from SMO import smo_algo

In [57]:
y_train = np.where(y_train == 1, 1, -1)

In [58]:
np.unique(y_train)

array([-1,  1])

In [59]:
X_train.shape

(5728, 100)

In [60]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.151228,0.178181,-0.234092,0.318191,0.007274,-0.006338,0.109231,0.061892,0.033337,-0.051042,...,-0.009817,0.006532,0.000633,0.000364,0.008472,0.002114,0.011359,-0.002240,-0.003400,0.000784
1,-0.077254,0.038657,0.010552,-0.038706,-0.022500,-0.014736,-0.008404,-0.018160,0.012296,0.036472,...,-0.012398,-0.006673,-0.012331,0.006794,0.003233,-0.017605,-0.005727,0.007719,0.004561,0.002232
2,-0.106518,0.065937,0.019646,0.000584,-0.012164,0.014756,0.053930,-0.017528,0.054324,-0.013720,...,-0.016684,-0.021850,-0.021478,0.025194,0.011361,-0.014650,0.002782,0.026294,0.071733,0.050520
3,-0.112580,0.050676,0.116470,0.059507,-0.054831,0.080401,-0.026957,-0.067632,-0.033101,-0.022549,...,0.001618,0.032069,0.033665,-0.028658,-0.040202,0.010293,-0.022151,-0.023867,-0.030197,-0.059284
4,-0.108220,0.084269,0.039310,-0.085947,-0.093243,-0.038087,-0.057611,0.100305,0.001455,0.015863,...,0.009377,-0.005789,-0.030297,-0.000844,-0.012622,-0.016286,-0.001659,0.002003,-0.016872,-0.015865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,0.401540,0.074827,0.020231,0.019541,-0.029358,-0.015947,0.007501,0.004625,0.003927,0.000372,...,0.027403,-0.019701,-0.011671,0.008613,-0.012867,-0.064943,0.077036,0.007240,-0.025210,-0.081564
5724,-0.047198,-0.099707,-0.082818,-0.068373,-0.093367,0.098876,0.005325,-0.043702,0.026293,-0.021770,...,-0.059248,-0.096295,-0.051853,-0.004085,0.000985,0.040050,0.079912,0.040317,0.019032,0.012607
5725,0.105271,-0.155611,-0.151001,-0.077213,-0.117468,0.189585,-0.039609,-0.066671,0.266276,-0.059878,...,-0.068468,0.010795,-0.040981,-0.009467,0.037558,-0.050063,-0.057595,0.029856,0.078313,-0.031591
5726,0.360915,0.106249,-0.033290,-0.053303,-0.053544,0.005257,0.026760,-0.050556,-0.069254,-0.030160,...,-0.041254,-0.049547,-0.060371,-0.016703,-0.054060,-0.032394,-0.021341,-0.021441,0.040905,0.092943


In [61]:
y_train.shape

(5728,)

In [62]:
from imblearn.over_sampling import SMOTE

# If X_train is a DataFrame and y_train is a Series
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [63]:
# Step 1: Convert data if needed
X_np = X_resampled.to_numpy()
y_np = y_resampled

# Step 2: Train the model
model = smo_algo(train_data_features=X_np, labels=y_np, reg_strength=1.0, tolerance=0.01)
model.smo_algo_main_loop()

# Step 3: Predict on training data
train_predictions = model.predict(X_np)  # ✅ Use the predict method

# Step 4: Evaluate accuracy
accuracy = np.mean(train_predictions == y_np)
print("Training accuracy:", accuracy)



Training accuracy: 0.5


In [64]:
print("X shape:", model.X.shape)
print("theta_hat shape:", model.theta_hat.shape)

X shape: (8720, 100)
theta_hat shape: (100,)


In [65]:
print("theta_hat:", model.theta_hat.shape)  # should be (5000,)
for i in range(3):
    print(f"Sample {i}: X[i] shape = {model.X[i].shape}")

theta_hat: (100,)
Sample 0: X[i] shape = (100,)
Sample 1: X[i] shape = (100,)
Sample 2: X[i] shape = (100,)


In [66]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)
print("Logistic Accuracy:", clf.score(X_train, y_train))

Logistic Accuracy: 0.9736382681564246


In [67]:
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))

{-1: 4360, 1: 1368}


In [68]:
unique, counts = np.unique(y_resampled, return_counts=True)
print(dict(zip(unique, counts)))

{-1: 4360, 1: 4360}


In [72]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [73]:
clf = SVC(kernel='linear', C=1.0)  # or try 'rbf' or 'poly' kernels
clf.fit(X_np, y_np)
y_pred = clf.predict(X_np)

# Classification report
print("Accuracy:", accuracy_score(y_np, y_pred))
print(classification_report(y_np, y_pred))


Accuracy: 0.9814220183486239
              precision    recall  f1-score   support

          -1       1.00      0.96      0.98      4360
           1       0.97      1.00      0.98      4360

    accuracy                           0.98      8720
   macro avg       0.98      0.98      0.98      8720
weighted avg       0.98      0.98      0.98      8720

