# Importing Libraries

In [84]:
from transformers import AutoTokenizer, TFAutoModel,TFAutoModelForSequenceClassification
import tensorflow as tf

import pandas as pd
import numpy as np

In [122]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Getting the Data

In [44]:
email_data= pd.read_csv("email_classification.csv")
email_data.head()
print(email_data.shape)

(179, 2)


# Using LLM model to get vectors

In [92]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import numpy as np

# Load pre-trained model and tokenizer
model_name = "distilbert-base-uncased"  # You can replace this with any other model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)

# Example email sentences
emails = email_data["email"].tolist()

# Tokenize and encode the email sentences
max_length = 64  # You can adjust this as needed
inputs = tokenizer(emails, padding=True, truncation=True, max_length=max_length, return_tensors="tf")

# Pass the inputs through the model to get the embeddings
outputs = model(**inputs)

# Get the embeddings for all tokens
embeddings = outputs.last_hidden_state

# Compute average pooling to get sentence embeddings
mask = tf.cast(inputs['attention_mask'], tf.float32)
sum_embeddings = tf.reduce_sum(embeddings * tf.expand_dims(mask, -1), axis=1)
avg_embeddings = sum_embeddings / tf.reduce_sum(mask, axis=1, keepdims=True)

# Convert TensorFlow tensor to numpy array
embeddings = avg_embeddings.numpy()

# Now 'embeddings' contains the vector representations of the email sentences
print(embeddings.shape)  # Shape: (num_sentences, embedding_dim)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


(179, 768)


# Reforming the Labels

In [93]:
label_mapping = {"ham": 0, "spam": 1}

# Map the labels using the mapping
y = email_data["label"].map(label_mapping)

In [94]:
X=embeddings

In [95]:
print(X.shape)
y.shape

(179, 768)


(179,)

# Train Test Split

In [96]:
# Shuffle the indices
indices = np.arange(len(X))
np.random.shuffle(indices)

# Define the split ratio
train_ratio = 0.8  # 80% train, 20% test

# Calculate split indices
split_idx = int(len(X) * train_ratio)
train_indices = indices[:split_idx]
test_indices = indices[split_idx:]

# Split the data
X_train, X_test = tf.gather(X, train_indices), tf.gather(X, test_indices)

# Assuming you have labels y, you can split them as well
y_train, y_test = y[train_indices], y[test_indices]

# Now you have X_train, y_train, X_test, y_test for training and testing

### Getting the shapes

In [97]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(143, 768)
(36, 768)
(143,)
(36,)


# KNN

In [213]:


# Create a KNN classifier
k = 5  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Make predictions
predictions = knn_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9722222222222222


In [208]:
y_train_pred = knn_classifier.predict(X_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.9300699300699301


# SVM

In [215]:
# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear')  # You can choose different kernels like 'rbf', 'poly', etc.

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)

# Predict the labels for test set
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9722222222222222


In [196]:
y_train_pred = svm_classifier.predict(X_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 1.0


# Naive Bayes

In [195]:

# Initialize the Gaussian Naive Bayes classifier
naive_bayes_classifier = GaussianNB()

# Train the Naive Bayes classifier
naive_bayes_classifier.fit(X_train, y_train)

# Predict the labels for test set
y_pred = naive_bayes_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [194]:
y_train_pred = naive_bayes_classifier.predict(X_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.9440559440559441


# Decision Tree Classifier 

In [188]:
# Initialize the Decision Tree classifier
decision_tree_classifier = DecisionTreeClassifier()

# Train the Decision Tree classifier
decision_tree_classifier.fit(X_train, y_train)

# Predict the labels for test set
y_pred = decision_tree_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9722222222222222


In [193]:
# Predict the labels for training set
y_train_pred = decision_tree_classifier.predict(X_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 1.0
