In [None]:
%pip install transformers torch


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel

# Day 1 : Simple Neural Network for predicting spam and Ham classification

In [2]:
data=pd.read_csv("email_classification.csv")
data['label']=data['label'].map({'spam':1,'ham':0})
data.head()

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,0
1,Happy holidays from our team! Wishing you joy ...,0
2,We're hiring! Check out our career opportuniti...,0
3,Your Amazon account has been locked. Click her...,1
4,Your opinion matters! Take our survey and help...,0


In [4]:
class Transformation:
    def __init__(self, data):
        self.data = data
        self.tokenizer, self.model = self.load_tokenizer_model()

    def load_tokenizer_model(self):
        """
        Load BERT tokenizer and model.
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        return tokenizer, model

    def tokenize_text(self, text):
        """
        Tokenizes input text and converts it into PyTorch tensors.
        """
        tokens = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        return tokens

    def get_embeddings(self, tokens):
        """
        Extracts embeddings from BERT model.
        """
        with torch.no_grad():  
            outputs = self.model(**tokens)  
        embeddings = outputs.last_hidden_state
        return embeddings

In [5]:
# Testing the code
text = "Congratulations! You have won a free iPhone."
transformer = Transformation(data=None)

tokens = transformer.tokenize_text(text)
embeddings = transformer.get_embeddings(tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [6]:
tokens

{'input_ids': tensor([[  101, 23156,   999,  2017,  2031,  2180,  1037,  2489, 18059,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
embeddings

tensor([[[ 3.0962e-01, -3.4133e-04,  4.6005e-03,  ..., -3.2639e-01,
           2.3386e-01,  4.2773e-01],
         [ 5.4302e-01, -2.5816e-01,  2.7266e-01,  ..., -2.3062e-01,
           1.0924e+00,  2.8424e-01],
         [ 5.7392e-01, -1.2022e-01,  7.1131e-02,  ...,  2.9910e-01,
           6.9459e-01,  7.7258e-02],
         ...,
         [ 7.4757e-01,  2.9356e-01,  5.2565e-01,  ..., -1.3592e-01,
           3.0505e-01, -7.9207e-01],
         [ 7.1357e-01,  7.4151e-03, -3.9734e-01,  ...,  3.7720e-01,
          -3.2668e-01, -5.5241e-01],
         [ 6.4319e-01,  9.1430e-02, -1.2740e-01,  ...,  3.9700e-01,
          -4.5115e-01, -3.1485e-01]]])