In [None]:
%pip install transformers torch


In [11]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import tqdm


# Day 1: Simple neural network for spam vs. ham email classification
# 
# BERT (Bidirectional Encoder Representations from Transformers) is a groundbreaking model introduced by Google in 2018.
# It revolutionized the field of natural language processing (NLP) by enabling models to understand the context of words 
# in a sentence more effectively than previous models. Unlike traditional models that read text sequentially, BERT 
# processes words in relation to all the other words in a sentence, allowing it to capture nuanced meanings and 
# relationships. This bidirectional approach, combined with its transformer architecture, has led to significant 
# improvements in various NLP tasks, including sentiment analysis, question answering, and text classification. 
# For more details, refer to the original paper: https://arxiv.org/pdf/1810.04805


In [9]:
# Load the raw emails, encode the labels as integers (spam -> 1, ham -> 0)
# and expose the message body under the column name 'text'.
data = pd.read_csv("email_classification.csv")
data = data.assign(label=data['label'].map({'spam': 1, 'ham': 0}))
data = data.rename(columns={'email': 'text'})
data.head()


Unnamed: 0,text,label
0,Upgrade to our premium plan for exclusive acce...,0
1,Happy holidays from our team! Wishing you joy ...,0
2,We're hiring! Check out our career opportuniti...,0
3,Your Amazon account has been locked. Click her...,1
4,Your opinion matters! Take our survey and help...,0


In [8]:
# Show the (rows, columns) shape of the loaded DataFrame.
print(data.shape)


(179, 2)


In [15]:
class Transformation:
    """
    Wrapper around a pretrained BERT tokenizer/model pair that turns raw
    text into token-level embeddings.

    Parameters
    ----------
    model_name : str, optional
        Name (or path) of the pretrained checkpoint handed to
        ``from_pretrained`` for both the tokenizer and the model.
        Defaults to ``'bert-base-uncased'``.

    Attributes
    ----------
    model_name : checkpoint name used to load the tokenizer and the model.
    tokenizer  : the loaded ``BertTokenizer``.
    model      : the loaded ``BertModel``.
    """

    # Checkpoint used when the caller does not ask for a specific one.
    DEFAULT_MODEL_NAME = 'bert-base-uncased'

    def __init__(self, model_name=DEFAULT_MODEL_NAME):
        # Keep the name on the instance so the tokenizer and the model are
        # always loaded from the same checkpoint.
        self.model_name = model_name
        self.tokenizer, self.model = self.load_tokenizer_model()

    def load_tokenizer_model(self):
        """
        Load the BERT tokenizer and model for ``self.model_name``.

        Returns
        -------
        tuple
            ``(tokenizer, model)`` as returned by the respective
            ``from_pretrained`` calls.
        """
        tokenizer = BertTokenizer.from_pretrained(self.model_name)
        model = BertModel.from_pretrained(self.model_name)
        return tokenizer, model

    def tokenize_text(self, text):
        """
        Tokenize input text and convert it into PyTorch tensors.

        Parameters
        ----------
        text : the text to encode; passed straight to the tokenizer.

        Returns
        -------
        The tokenizer's output, requested as PyTorch tensors
        (``return_tensors='pt'``) with padding and truncation enabled.
        """
        tokens = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        return tokens

    def get_embeddings(self, tokens):
        """
        Run the BERT model on tokenized input and return its last hidden state.

        Parameters
        ----------
        tokens : mapping of keyword arguments for the model, as produced by
            ``tokenize_text``.

        Returns
        -------
        ``outputs.last_hidden_state`` of the model's forward pass.
        """
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = self.model(**tokens)
        embeddings = outputs.last_hidden_state
        return embeddings

In [5]:
# Testing the code
text = "Congratulations! You have won a free iPhone."
# No constructor arguments are needed: the default BERT checkpoint is loaded.
transformer = Transformation()

tokens = transformer.tokenize_text(text)
embeddings = transformer.get_embeddings(tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [6]:
# Inspect the tokenizer output for the sample sentence.
tokens

{'input_ids': tensor([[  101, 23156,   999,  2017,  2031,  2180,  1037,  2489, 18059,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
# Inspect the embeddings tensor returned by get_embeddings for the sample sentence.
embeddings

tensor([[[ 3.0962e-01, -3.4133e-04,  4.6005e-03,  ..., -3.2639e-01,
           2.3386e-01,  4.2773e-01],
         [ 5.4302e-01, -2.5816e-01,  2.7266e-01,  ..., -2.3062e-01,
           1.0924e+00,  2.8424e-01],
         [ 5.7392e-01, -1.2022e-01,  7.1131e-02,  ...,  2.9910e-01,
           6.9459e-01,  7.7258e-02],
         ...,
         [ 7.4757e-01,  2.9356e-01,  5.2565e-01,  ..., -1.3592e-01,
           3.0505e-01, -7.9207e-01],
         [ 7.1357e-01,  7.4151e-03, -3.9734e-01,  ...,  3.7720e-01,
          -3.2668e-01, -5.5241e-01],
         [ 6.4319e-01,  9.1430e-02, -1.2740e-01,  ...,  3.9700e-01,
          -4.5115e-01, -3.1485e-01]]])

In [19]:
transformer = Transformation()

embeddings_list = []
labels_list = []

# Encode the emails one at a time and reduce each to a single vector.
for text, label in zip(data['text'], data['label']):
    tokens = transformer.tokenize_text(text)
    embeddings = transformer.get_embeddings(tokens)
    # Average over dim=1 (mean pooling, not flattening), drop singleton
    # axes, and hand the result to NumPy.
    pooled = embeddings.mean(dim=1)
    email_vector = pooled.squeeze().numpy()
    labels_list.append(label)
    embeddings_list.append(email_vector)

# Stack the per-email vectors into one array (one row per email).
embeddings_array = np.array(embeddings_list)

# One column per embedding dimension, with the label appended as the last column.
embeddings_data = pd.DataFrame(embeddings_array).assign(label=labels_list)

# Persist the processed features without the DataFrame index.
embeddings_data.to_csv("bert_embeddings.csv", index=False)

print("✅ BERT embeddings stored successfully!")



✅ BERT embeddings stored successfully!


In [22]:
# Reload the saved embeddings from disk and preview the first rows.
data_transformed = pd.read_csv("bert_embeddings.csv")   
data_transformed.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,0.314854,-0.315803,0.482772,0.406596,0.651483,-0.514341,0.287409,0.361406,0.113588,-0.435126,...,-0.024764,-0.214943,-0.171342,0.197776,-0.088101,0.018636,0.041526,-0.015001,-0.348425,0
1,0.151626,-0.027887,0.407615,0.026858,0.002005,-0.376765,0.378584,0.799385,-0.485622,-0.581311,...,0.189612,-0.078242,-0.092983,0.178424,-0.132592,-0.271396,-0.030238,0.402832,-0.168941,0
2,0.530745,0.066995,0.319736,0.006169,0.396067,-0.457637,0.494592,0.412954,-0.056115,-0.468745,...,0.007627,-0.090966,-0.280942,-0.050126,-0.263792,-0.210276,-0.273926,0.368495,-0.379933,0
3,0.469785,-0.18799,-0.036101,-0.062842,0.632343,-0.298325,0.766535,0.998686,-0.244634,0.203218,...,-0.067121,-0.180987,-0.7038,0.086969,-0.030878,-0.199462,-0.478008,-0.176455,0.067534,1
4,0.206392,0.141666,0.001321,0.033523,0.189142,-0.165017,0.106561,0.311256,0.091679,-0.673464,...,0.270695,-0.048474,-0.252568,-0.305456,-0.062712,-0.016795,0.013259,0.275179,-0.087495,0


# Day 1:
# Conclusion: We now have a dataset of BERT embeddings that can be used to train a neural network.

# Important paper: https://arxiv.org/html/2502.02523v2 (Note: this paper has nothing to do with the code above.)
