STAGE 1
# DATA PREPARATION

DOWNLOAD DATASET
DATA PREPROCESSING
CREATE DATALOADERS


STAGE 2
# MODEL SETUP
INITIALIZE THE MODEL
LOAD PRETRAINED MODEL WEIGHTS OF GPT 2
MODIFY THE FINAL OUTPUT LAYERS OF MODEL FOR FINETUNING
IMPLEMENT THE EVALUATION UTILITIES


STAGE 3
# MODEL FINETUNING AND USAGE
FINETUNE MODEL
EVALUATE THE MODEL >ACCURACY , LOSS .
TEST THE MODEL ON NEW DATA

# stage 1 begning data preprocessing

In [122]:
import pandas as pd

In [123]:
df = pd.read_csv("./original_dataset/SMSSpamCollection.csv",sep= ",", header= None, names=["Label", "Text"])
df.head()

Unnamed: 0,Label,Text
0,label,text
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...


In [124]:
print("The description of the dataset \n",df.describe())
print(" -------------------------------")
print("The shape of the dataset",df.shape)
print(" -------------------------------")
print("The value counts of the dataset\n",df["Label"].value_counts())


The description of the dataset 
        Label                    Text
count   5573                    5573
unique     3                    5170
top      ham  Sorry, I'll call later
freq    4825                      30
 -------------------------------
The shape of the dataset (5573, 2)
 -------------------------------
The value counts of the dataset
 Label
ham      4825
spam      747
label       1
Name: count, dtype: int64


we need to create a balanced dataset you can see ham has 4825 samples and spam has 747 samples, its obviosly imbalanced dataset , we will create a new balanced dataset form this . We will take 747 sample from each class and create a new dataset.

In [125]:
from pandas.core.common import random_state
def create_balanced_dataset(df):
    random_state= 122
    #count the frequncy of spam and extract random samples of ham from dataset , such that both ham and spam sample count or you can say frquency is same
    num_spam = df[df['Label'] == "spam"].shape[0]
    ham_instances = df[df['Label'] == "ham"].sample(num_spam, random_state=42) #it will extract 747(num_spam) random samples of ham.
    spam_instances = df[df['Label'] == "spam"]
    balanced_df = pd.concat([ham_instances,spam_instances])
    return balanced_df


In [126]:
#main
balanced_df =  create_balanced_dataset(df)
balanced_df['Label'].value_counts()

Label
ham     747
spam    747
Name: count, dtype: int64

# saving the balanced dataset into a seprate csv file 
we will do training and validation , testing and finetuing with the help of this dataset

In [127]:
# balanced_df.to_csv("./balanced_dataset_prepared/balanced_dataset.csv", index = False)

In [128]:
balanced_df['Label'].value_counts()

Label
ham     747
spam    747
Name: count, dtype: int64

# we have balanced dataframe prepared already so we will use it and do  Label encoding on it. so that our model understand the numbers better.
NOTE: we are just making changes on dataframe and not on original balanced dataset

# Label Encoding of Labels['ham','spam']
ham -> 0
spam -> 1

In [129]:
balanced_df['Label'] = balanced_df['Label'].map({'ham':0, 'spam':1})

you can see that we have encoded the labels spam as 1 and ham as 0

In [130]:
print(balanced_df['Label'].value_counts())

Label
0    747
1    747
Name: count, dtype: int64


# We will split the dataset into train , validation and test set for this there are two strategy we can direclty use the trani_test_split from sklearn and cleverly make the train , validation and test set.
# second option we can create custom random split function to create the train , validation and test set.

train 70%,

validation 10%,

test 20%

creating a random split function which will split the dataset into train , validation and test set


In [131]:
# def random_split(df, train_frac , valid_frac):
#     df = df.sample(frac =1, random_state=42, reset_index(drop=True))
#     train_size = int(len(df) * train_frac)
#     valid_size = int(len(df) * valid_frac)
#     train_df = df.iloc[:train_size]
#     valid_df = df.iloc[train_size:train_size + valid_size]
#     test_df = df.iloc[train_size + valid_size:]
#     return train_df, valid_df, test_df


# option 2 from sklearn

In [132]:
from sklearn.model_selection import train_test_split

note train test split can make two splits train and test, we will first create 70% train_df and 30% temp_test_df split, then we will further split on temp_test_df to make 
10% val_df and 20% test_df. In this way we can get splitting in the ratio 70:10:20

In [133]:
training_df, temp_test_df = train_test_split(balanced_df,test_size=0.3, random_state=42) #
val_df, test_df = train_test_split(temp_test_df, test_size=2/3, random_state = 42)


In [134]:
training_df.head()

Unnamed: 0,Label,Text
1301,0,Great to hear you are settling well. So what's...
2023,0,"I don't have anybody's number, I still haven't..."
5521,0,No. I dont want to hear anything
2695,0,All these nice new shirts and the only thing I...
3485,0,"Hello, my love! How goes that day ? I wish you..."


In [135]:
val_df.head()

Unnamed: 0,Label,Text
5028,1,Ur cash-balance is currently 500 pounds - to m...
1127,1,For taking part in our mobile survey yesterday...
5286,1,URGENT! You have won a 1 week FREE membership ...
2575,1,Congrats 2 mobile 3G Videophones R yours. call...
697,0,Good. Good job. I like entrepreneurs


In [136]:
test_df.head()

Unnamed: 0,Label,Text
2934,0,Only 2% students solved this CAT question in '...
3863,1,Free Msg: Ringtone!From: http://tms. widelive....
5428,1,Santa Calling! Would your little ones like a c...
1219,0,"Damn, can you make it tonight or do you want t..."
4827,0,I am going to sleep. I am tired of travel.


# converting these dataframe into csv files so that we can use them for training and testing and validation also

In [137]:
training_df.to_csv("./balanced_dataset_prepared/splits/train.csv", index=None)
test_df.to_csv("./balanced_dataset_prepared/splits/testing.csv", index=None)
val_df.to_csv("./balanced_dataset_prepared/splits/validation.csv", index=None)

In [138]:
print(balanced_df['Label'].value_counts())
print(training_df.shape)
print(val_df.shape)
print(test_df.shape)

Label
0    747
1    747
Name: count, dtype: int64
(1045, 2)
(149, 2)
(300, 2)


# dataset loader for training and validation and testing df
Note: okenized inputs must have the same length because deep learning models and DataLoaders operate on fixed-shape tensors, and padding enables batching, parallel computation, and efficient training without affecting model learning (via masking).


In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
max_token = 0
for i in training_df['Text']:
    token_count = len(tokenizer.encode(i))
    max_token = max(token_count, max_token)
    # print(len(tokenizer.encode(i)) == 137)
    
print(max_token)

137


# as you can see in our case in the trainig dataframe we have email of max_token length 137 , so our input batch size should be 137 tokens in each tokenized input sample for training , validation and test data

# note
### Dataset:

A Dataset defines how individual data samples are loaded and preprocessed, providing one sample at a time.

### DataLoader:

A DataLoader handles batching, shuffling, and parallel loading of data from a Dataset for efficient model training.# """

## Dataset

In [177]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None,pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [ tokenizer.encode(text) for text in self.data['Text']]
        # self.labels = self.data['label'].values
        if max_length is None:
            self.max_length = self.__longest__encoded_length()
        else:
            #truncate the encoded texts if their size is greate than the max_length

            self.max_length = max_length
            self.encoded_text = [encoded_text[:max_length]for encoded_text in self.encoded_texts]
            
            #pad the encoded texts if their size is less than the max_length
            self.encoded_text = [encoded_text + [pad_token_id] * (max_length - len(encoded_text)) for encoded_text in self.encoded_texts]
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        encoded = self.encoded_texts[idx]
        label = self.data.iloc[idx]['Label']
        return (
            torch.tensor(encoded,dtype = torch.long),
            torch.tensor(label,dtype = torch.long)
        )
    def __longest__encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_text_len = len(encoded_text)
            if(encoded_text_len > max_length):
                max_length = encoded_text_len
        return max_length


In [164]:
train_dataset = SpamDataset(csv_file = "balanced_dataset_prepared/splits/train.csv",max_length=None,tokenizer=tokenizer)
print(train_dataset.max_length)

137


In [179]:
valid_dataset = SpamDataset(csv_file = "balanced_dataset_prepared/splits/validation.csv",max_length=train_dataset.max_length,tokenizer=tokenizer)
print(valid_dataset.max_length)


137


In [180]:

test_dataset = SpamDataset(csv_file = "balanced_dataset_prepared/splits/testing.csv",max_length=train_dataset.max_length,tokenizer=tokenizer)
print(test_dataset.max_length)


137


# DataLoader 