In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

import gensim
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv(r"./data/email_classification.csv")

In [3]:
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   179 non-null    object
 1   label   179 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB


## Cleaning the data and tokenization

In [4]:
df['email'] = df['email'].str.lower()

In [5]:
df

Unnamed: 0,email,label
0,upgrade to our premium plan for exclusive acce...,ham
1,happy holidays from our team! wishing you joy ...,ham
2,we're hiring! check out our career opportuniti...,ham
3,your amazon account has been locked. click her...,spam
4,your opinion matters! take our survey and help...,ham
...,...,...
174,we're pleased to inform you that your refund h...,ham
175,get rich quick! invest in our revolutionary ne...,spam
176,your free trial period is ending soon. upgrade...,ham
177,your order is on its way! track your shipment ...,ham


In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bachm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
df['email'] = df['email'].apply(word_tokenize)

In [8]:
df['email']

0      [upgrade, to, our, premium, plan, for, exclusi...
1      [happy, holidays, from, our, team, !, wishing,...
2      [we, 're, hiring, !, check, out, our, career, ...
3      [your, amazon, account, has, been, locked, ., ...
4      [your, opinion, matters, !, take, our, survey,...
                             ...                        
174    [we, 're, pleased, to, inform, you, that, your...
175    [get, rich, quick, !, invest, in, our, revolut...
176    [your, free, trial, period, is, ending, soon, ...
177    [your, order, is, on, its, way, !, track, your...
178    [limited-time, offer, !, get, 50, %, off, on, ...
Name: email, Length: 179, dtype: object

## Removing Stop words

In [9]:
import pandas as pd
import re

def remove_stop_words(sentence):
    stop_words  = ["the", "and", "a", "to", "of", "in", "is", "you", "for", "on",
    "with", "that", "as", "it", "be", "are", "this", "from", "or", "by",
    "your", "at", "not", "have", "was", "but", "which", "an", "if", "they"]
    #word_list=sentence.split()
    clean_sentence=' '.join([w for w in sentence if w.lower() not in stop_words])
    return(clean_sentence)

In [10]:
df['email'] = df['email'].apply(remove_stop_words)

In [11]:
df['email'].apply(word_tokenize)

0      [upgrade, our, premium, plan, exclusive, acces...
1      [happy, holidays, our, team, !, wishing, joy, ...
2      [we, 're, hiring, !, check, out, our, career, ...
3      [amazon, account, has, been, locked, ., click,...
4      [opinion, matters, !, take, our, survey, help,...
                             ...                        
174    [we, 're, pleased, inform, refund, has, been, ...
175    [get, rich, quick, !, invest, our, revolutiona...
176    [free, trial, period, ending, soon, ., upgrade...
177    [order, its, way, !, track, shipment, real-tim...
178    [limited-time, offer, !, get, 50, %, off, all,...
Name: email, Length: 179, dtype: object

In [14]:
model = Word2Vec(sentences=df['email'], vector_size=100, window=5, min_count=1, workers=4)

In [15]:
# save the model
model.save("emailWord2Vec.model")

## Converting to text embeddings

In [16]:
#loading the model
model = Word2Vec.load("emailWord2Vec.model")



<gensim.models.word2vec.Word2Vec at 0x1be2a944080>

In [41]:
df['label'].value_counts()

pd.get_dummies(df['label'], prefix='label')

df['label'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 179 entries, 0 to 178
Series name: label
Non-Null Count  Dtype 
--------------  ----- 
179 non-null    object
dtypes: object(1)
memory usage: 1.5+ KB


In [37]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#enc = OneHotEncoder(handle_unknown='ignore')
enc = LabelEncoder()

x = enc.fit_transform(df['label'])

In [60]:
from sklearn.linear_model import LogisticRegression

mm = LogisticRegression()

In [45]:
X = []
for document in df['label']:
    # Filter out words not in the model's vocabulary
    vectors = [model.wv[word] for word in document if word in model.wv]
    if vectors:  # check if there are any vectors
        mean_vector = np.mean(vectors, axis=0)
        X.append(mean_vector)
    else:
        # Handle documents that may not have any words in the model's vocabulary
        # For instance, by appending a zero vector of the same length as others
        X.append(np.zeros(model.vector_size))

# Convert the list of vectors into an array for machine learning usage
X = np.array(X)

In [46]:
X

array([[-0.08787648,  0.2448949 ,  0.10689574, ..., -0.09616518,
         0.02436404,  0.16101478],
       [-0.08787648,  0.2448949 ,  0.10689574, ..., -0.09616518,
         0.02436404,  0.16101478],
       [-0.08787648,  0.2448949 ,  0.10689574, ..., -0.09616518,
         0.02436404,  0.16101478],
       ...,
       [-0.08787648,  0.2448949 ,  0.10689574, ..., -0.09616518,
         0.02436404,  0.16101478],
       [-0.08787648,  0.2448949 ,  0.10689574, ..., -0.09616518,
         0.02436404,  0.16101478],
       [-0.08796126,  0.2495893 ,  0.11155474, ..., -0.09393063,
         0.02863941,  0.16644704]], dtype=float32)

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, x, test_size=0.2, random_state=42)
randf = RandomForestClassifier()
svc_clas = SVC()

mm.fit(X_train, y_train)
randf.fit(X_train, y_train)
svc_clas.fit(X_train, y_train)


# Naive bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred_gaus = gnb.fit(X_train, y_train).predict(X_test)

In [59]:
y_pred = mm.predict(X_test)
y_pred_rand = randf.predict(X_test)
y_pred_svc = svc_clas.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report
print("Logistic Regregression Accuracy:", accuracy_score(y_test, y_pred))
print("Random Forrest Accuracy:", accuracy_score(y_test, y_pred_rand))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svc))
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_gaus))
print("Classification Report:\n", classification_report(y_test, y_pred))

Logistic Regregression Accuracy: 0.3888888888888889
Random Forrest Accuracy: 1.0
SVM Accuracy: 0.3888888888888889
Naive Bayes Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       0.39      1.00      0.56        14
           1       0.00      0.00      0.00        22

    accuracy                           0.39        36
   macro avg       0.19      0.50      0.28        36
weighted avg       0.15      0.39      0.22        36



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
