# BERT Classification Model

In [1]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import accuracy_score
import re

### Extracting data

In [4]:
filename = "CLS_Dataset.csv"
filepath = os.path.abspath(filename)
df = pd.read_csv(filepath)

df["text"] = df["text"].astype(str)

df = df[df["text"].notnull() & df["text"].str.strip() != ""]

df = df.drop_duplicates(subset=["text"])

def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)

df["text"] = df["text"].apply(clean_text)
df = df[df["text"].str.split().apply(len) >= 5]

dataset = Dataset.from_pandas(df)

dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 39722
})

### Creating Labels for Sentiments

In [None]:
labels_dict={0:"Negative",1:"Positive"}

## Class for BERT Classification

In [None]:
class BERTCLassification:

    def __init__(self, modelname, col_name, num_labels):
        self.modelname = modelname
        self.text = col_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(self.modelname)
        self.trainer = None

    def encoding_data(self, dataset, batched=True, batch_size=2000):
        def tokenize(batch):
            return self.tokenizer(batch[self.text],padding="max_length", truncation=True)
        return dataset.map(tokenize,batched=batched, batch_size=batch_size )

    def train(self, train, test, learning_rate, epochs):
        
        model = AutoModelForSequenceClassification.from_pretrained(self.modelname, num_labels=self.num_labels).to(device)

        training_args = TrainingArguments(
            output_dir='./results',
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate= learning_rate,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            num_train_epochs= epochs,
            weight_decay=0.01,
        )

        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = logits.argmax(axis=-1)
            return {"accuracy": accuracy_score(labels, predictions)}

        self.trainer = Trainer(
            model= model,
            args = training_args,
            train_dataset= train,
            eval_dataset= test,
            compute_metrics= compute_metrics
        )

        self.trainer.train()

    def evaluate(self):
        return self.trainer.evaluate()
    
    def save_model(self, path):
        self.trainer.save_model(path)
        self.tokenizer.save_pretrained(path)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
bert_cls = BERTCLassification("distilbert-base-uncased","Comment",3)

tokenzied_data = bert_cls.encoding_data(dataset, batch_size=3000)

Map: 100%|██████████| 207201/207201 [00:42<00:00, 4874.41 examples/s]


In [None]:
split = tokenzied_data.train_test_split(test_size=0.2,seed=42)

trained = bert_cls.train(split["train"],split['test'],2e-5,3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
result = bert_cls.evaluate()
result

In [None]:
bert_cls.save_model("./bert_cls_model")