<a href="https://colab.research.google.com/github/akashhhhh02/Automated-Email-Classification-Using-GenAI-Enhanced-Models/blob/main/Automated_Email_Classification_Using_GenAI_Enhanced_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Automated Email Classification Using GenAI-Enhanced Models

# Install the necessary libraries

In [None]:
!pip install -q transformers fasttext scikit-learn pandas numpy torch

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


# Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from transformers import AutoTokenizer, AutoModel
import torch

# Load the dataset and apply basic cleaning

In [None]:
# Location of the combined email CSV in the Colab runtime.
DATA_PATH = '/content/combined_data.csv'

df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [None]:
# Keep only the label/text columns, drop incomplete rows, and lower-case the
# text, all in one chain that produces a fresh frame.
df = (
    df[['label', 'text']]
    .dropna()
    .assign(text=lambda frame: frame['text'].str.lower())
)


# Convert Binary -> Multi Class labels

In [None]:
def map_category(text, label):
    """Derive a four-way category from an email's text and its binary label.

    Args:
        text: Email body; keywords are matched as plain substrings.
        label: Binary label from the dataset; a value of 1 always yields "Spam".

    Returns:
        "Spam", "Promotions", "Support", or "Personal". Promotion keywords
        are checked before support keywords, and "Personal" is the fallback.
    """
    if label == 1:
        return "Spam"

    # Ordered: the first group containing a matching keyword wins.
    keyword_groups = (
        ("Promotions", ('offer', 'sale', 'discount', 'buy', 'price')),
        ("Support", ('help', 'support', 'issue', 'error', 'problem')),
    )
    for category, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in text:
                return category
    return "Personal"

# Label every row by pairing its text with its binary label.
df['category'] = [
    map_category(text, label)
    for text, label in zip(df['text'], df['label'])
]
df['category'].value_counts()


Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Spam,684
Support,268
Personal,219
Promotions,76


# Split the dataset into training and testing datasets

In [None]:
# Hold out 20% of the emails for testing. The split is stratified on the
# category column and uses a fixed seed so it is repeatable.
features, targets = df['text'], df['category']
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42, stratify=targets
)

## Baseline model: TF-IDF model

# TF-IDF vectorization

In [None]:
# Cap the vocabulary at 5000 terms and drop English stop words.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train the classifier

In [None]:
# fit() returns the estimator itself, so construction and training can be chained.
tfidf_clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
tfidf_clf

# Evaluate the model

In [None]:
y_pred = tfidf_clf.predict(X_test_tfidf)

# zero_division=0 reports 0.0 for a class that is never predicted instead of
# raising an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

    Personal       0.70      0.52      0.60        44
  Promotions       0.00      0.00      0.00        15
        Spam       0.80      1.00      0.89       137
     Support       0.76      0.63      0.69        54

    accuracy                           0.78       250
   macro avg       0.56      0.54      0.54       250
weighted avg       0.72      0.78      0.74       250

[[ 23   0  15   6]
 [  0   0  10   5]
 [  0   0 137   0]
 [ 10   0  10  34]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## GenAI Enhancement

# Load transformer model

In [None]:
# The tokenizer and the encoder are loaded from the same checkpoint.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]



tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


# Create the embedding function and generate embeddings

In [None]:
def get_embeddings(texts, batch_size=32):
    """Encode texts into fixed-size vectors by masked mean pooling.

    Texts are tokenized and run through the module-level ``tokenizer`` and
    ``model`` in chunks of ``batch_size``. Token embeddings are averaged per
    text using the attention mask, so padding tokens do not contribute and a
    text's vector does not depend on the other texts in its batch.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A 2-D numpy array with one row per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(texts)
    if not texts:
        # Nothing to encode: return an empty matrix with the model's width.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        with torch.no_grad():
            outputs = model(**inputs)

        token_embeddings = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp so a fully masked row cannot divide by zero.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        pooled_batches.append((summed / token_counts).numpy())

    return np.concatenate(pooled_batches, axis=0)

In [None]:
# Embed the training and test splits with the same encoder.
X_train_emb, X_test_emb = (
    get_embeddings(split) for split in (X_train, X_test)
)

# Train GenAI Classifier

In [None]:
# Same classifier settings as the TF-IDF baseline, trained on the embeddings.
genai_clf = LogisticRegression(max_iter=1000).fit(X_train_emb, y_train)
genai_clf

# Evaluate GenAI Model

In [None]:
y_pred_genai = genai_clf.predict(X_test_emb)

# Build both summaries first, then print them in the same order as the baseline.
genai_report = classification_report(y_test, y_pred_genai)
genai_confusion = confusion_matrix(y_test, y_pred_genai)
print(genai_report)
print(genai_confusion)

              precision    recall  f1-score   support

    Personal       0.69      0.66      0.67        44
  Promotions       0.50      0.20      0.29        15
        Spam       0.94      0.96      0.95       137
     Support       0.71      0.81      0.76        54

    accuracy                           0.83       250
   macro avg       0.71      0.66      0.67       250
weighted avg       0.82      0.83      0.82       250

[[ 29   2   3  10]
 [  2   3   4   6]
 [  2   1 132   2]
 [  9   0   1  44]]


## Real Time Classification

In [None]:
def classify_email(email_text):
    """Predict the category of a single email.

    Args:
        email_text: Raw email text; it is lower-cased before embedding.

    Returns:
        The label predicted by ``genai_clf`` for this email.
    """
    normalized = email_text.lower()
    # The embedding helper takes a collection, so wrap the one text in a list.
    predictions = genai_clf.predict(get_embeddings([normalized]))
    return predictions[0]

classify_email("hey bro, are we meeting tomorrow evening?")

'Support'

In [None]:
# Try the classifier on a message with typical spam wording.
classify_email("buy viagra at 70% discount, limited offer")

'Spam'