In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
df=pd.read_csv("data/Spam_Email_Data.csv")

In [2]:
print("Sample of the data : \n")
df.head(1)

Sample of the data : 



Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB
None


In [4]:
print(df.isna().sum())

text      0
target    0
dtype: int64


In [5]:
print(df['target'].value_counts())

target
0    3900
1    1896
Name: count, dtype: int64


In [6]:
df['target'].count()

5796

In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [8]:
def preprocess_text(text):
    text = re.sub(r'\S+@\S+', '', text)  # Remove email addresses
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text= re.sub(r'<.*?>','',text) #Remove HTML tag
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    text = ' '.join(text.split()) # Split text by whitespaces and join back with single space
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    
    # Lemmatization
    #lemmatizer = WordNetLemmatizer()
    #lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    return ' '.join(stemmed_tokens)  # Join tokens back into a string

In [9]:
df['text'] = df['text'].apply(preprocess_text)

In [10]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.4, random_state=135)

In [12]:
bow_vectorizer = CountVectorizer(ngram_range=(3, 3))
tfidf_vectorizer = TfidfVectorizer()

In [13]:
X_train_counts = bow_vectorizer.fit_transform(X_train)
X_test_counts = bow_vectorizer.transform(X_test)

In [14]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [16]:
models = {
    'Logistic Regression (CountVectorizer)': LogisticRegression(C=0.5, random_state=50),
    'Random Forest (CountVectorizer)': RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, random_state=50),
    'Logistic Regression (TfidfVectorizer)': LogisticRegression(C=0.5, random_state=50),
    'Random Forest (TfidfVectorizer)': RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, random_state=50)
}


In [17]:
results=[]
for model_name, model in models.items():
    if 'CountVectorizer' in model_name:
        X_train_features = X_train_counts
        X_test_features = X_test_counts
    elif 'TfidfVectorizer' in model_name:
        X_train_features = X_train_tfidf
        X_test_features = X_test_tfidf
    
    # Train the model
    model.fit(X_train_features, y_train)
    
    # Make predictions on the training data
    y_train_pred = model.predict(X_train_features)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test_features)
    
    # Calculate evaluation metrics for both train and test
    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_test = accuracy_score(y_test, y_pred)
    precision_train = precision_score(y_train, y_train_pred)
    precision_test = precision_score(y_test, y_pred)
    recall_train = recall_score(y_train, y_train_pred)
    recall_test = recall_score(y_test, y_pred)
    f1_train = f1_score(y_train, y_train_pred)
    f1_test = f1_score(y_test, y_pred)
    
    # Store the evaluation results
    results.append({
        'Model': model_name,
        'Test Accuracy': accuracy_test,
        'Test Precision': precision_test,
        'Test Recall': recall_test,
        'Test F1-score': f1_test
    })


In [18]:
results_df = pd.DataFrame(results)
print("\nModel Evaluation Results: \n")
print(results_df)


Model Evaluation Results: 

                                   Model  Test Accuracy  Test Precision  \
0  Logistic Regression (CountVectorizer)       0.985339        0.984658   
1        Random Forest (CountVectorizer)       0.851660        0.994859   
2  Logistic Regression (TfidfVectorizer)       0.968521        0.992492   
3        Random Forest (TfidfVectorizer)       0.948254        0.988764   

   Test Recall  Test F1-score  
0     0.968450       0.976487  
1     0.530864       0.692308  
2     0.906722       0.947670  
3     0.844993       0.911243  


In [27]:
import gensim
from gensim.models import Word2Vec,Doc2Vec
from gensim.models.doc2vec import TaggedDocument


In [20]:
sentences = [row.split() for row in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)  # sg=1 for skip-gram model

In [23]:
def get_word2vec_embedding(text):
    words = text.split()
    embedding = np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv], axis=0)
    return embedding

In [24]:
X_train_w2v = np.array([get_word2vec_embedding(text) for text in X_train])
X_test_w2v = np.array([get_word2vec_embedding(text) for text in X_test])

In [28]:
tagged_data = [TaggedDocument(words=_d.split(), tags=[str(i)]) for i, _d in enumerate(X_train)]
doc2vec_model = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1, workers=4, epochs=20)


In [30]:
from transformers import BertTokenizer, BertModel
import torch

  torch.utils._pytree._register_pytree_node(


In [31]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [32]:
X_train_d2v = np.array([doc2vec_model.infer_vector(text.split()) for text in X_train])
X_test_d2v = np.array([doc2vec_model.infer_vector(text.split()) for text in X_test])

In [35]:
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:,0,:].detach().numpy().flatten() 

In [36]:
X_train_bert = np.array([get_bert_embedding(text) for text in X_train])
X_test_bert = np.array([get_bert_embedding(text) for text in X_test])

In [37]:
models = {
    'Logistic Regression (Word2Vec)': LogisticRegression(C=0.5, random_state=50),
    'Random Forest (Word2Vec)': RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, random_state=50),
    'Logistic Regression (Doc2Vec)': LogisticRegression(C=0.5, random_state=50),
    'Random Forest (Doc2Vec)': RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, random_state=50),
    'Logistic Regression (BERT)': LogisticRegression(C=0.5, random_state=50),
    'Random Forest (BERT)': RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, random_state=50)
}

In [38]:
results=[]
for model_name, model in models.items():
    if 'Word2Vec' in model_name:
        X_train_features = X_train_w2v
        X_test_features = X_test_w2v
    elif 'Doc2Vec' in model_name:
        X_train_features = X_train_d2v
        X_test_features = X_test_d2v
    elif 'BERT' in model_name:
        X_train_features = X_train_bert
        X_test_features = X_test_bert
    
    # Train the model
    model.fit(X_train_features, y_train)
    
    # Make predictions on the training data
    y_train_pred = model.predict(X_train_features)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test_features)
    
    # Calculate evaluation metrics for both train and test
    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_test = accuracy_score(y_test, y_pred)
    precision_train = precision_score(y_train, y_train_pred)
    precision_test = precision_score(y_test, y_pred)
    recall_train = recall_score(y_train, y_train_pred)
    recall_test = recall_score(y_test, y_pred)
    f1_train = f1_score(y_train, y_train_pred)
    f1_test = f1_score(y_test, y_pred)
    
    # Store the evaluation results
    results.append({
        'Model': model_name,
        'Test Accuracy': accuracy_test,
        'Test Precision': precision_test,
        'Test Recall': recall_test,
        'Test F1-score': f1_test
    })

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
results_df = pd.DataFrame(results)
print("\nModel Evaluation Results: \n")
print(results_df)


Model Evaluation Results: 

                            Model  Test Accuracy  Test Precision  Test Recall  \
0  Logistic Regression (Word2Vec)       0.977145        0.992711     0.934156   
1        Random Forest (Word2Vec)       0.981026        0.987198     0.951989   
2   Logistic Regression (Doc2Vec)       0.942216        0.977528     0.835391   
3         Random Forest (Doc2Vec)       0.912893        0.987061     0.732510   
4      Logistic Regression (BERT)       0.982751        0.980474     0.964335   
5            Random Forest (BERT)       0.960759        0.978979     0.894376   

   Test F1-score  
0       0.962544  
1       0.969274  
2       0.900888  
3       0.840945  
4       0.972337  
5       0.934767  
