In [None]:
!pip install -q torch transformers datasets accelerate scikit-learn seqeval pandas matplotlib seaborn peft

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.5 MB/s[0m eta [36m

In [None]:
# Colab-only step: prompt for a file upload into the runtime's working
# directory. The loading cell further down expects to find CLS_Dataset.csv there.
from google.colab import files
# Keep the return value of the upload call in case the uploaded file names are needed.
uploaded = files.upload()

Saving CLS_Dataset.csv to CLS_Dataset.csv


# BERT Classification Model

In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import torch
from sklearn.metrics import accuracy_score
import re

### Extracting data

In [None]:
# Load the raw CSV from the current working directory.
filename = "CLS_Dataset.csv"
filepath = os.path.abspath(filename)
df = pd.read_csv(filepath)

# Drop rows with a missing text BEFORE casting to str: astype(str) turns NaN
# into the literal string "nan", after which a null check can never match.
df = df.dropna(subset=["text"])
df = df.assign(text=df["text"].astype(str))

# Remove blank / whitespace-only texts. The comparison stands on its own here;
# note that `&` binds tighter than `!=`, so combining it with another mask
# would need explicit parentheses around the comparison.
df = df[df["text"].str.strip() != ""]

# Keep only the first occurrence of each distinct text.
df = df.drop_duplicates(subset=["text"])

def clean_text(text):
    """Return ``text`` stripped of every character that is not an ASCII letter,
    a digit, whitespace, or one of the punctuation marks ``. , ! ?``."""
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,!?]')
    return disallowed.sub('', text)

# Strip unsupported characters. assign() builds a new frame instead of writing
# a column into a frame that was produced by boolean filtering.
df = df.assign(text=df["text"].apply(clean_text))

# Keep only texts with at least 5 whitespace-separated tokens.
df = df[df["text"].str.split().apply(len) >= 5]

# The filters above leave gaps in the pandas index; preserve_index=False keeps
# that leftover index from being exported as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Show the dataset summary as the cell output.
dataset

### Creating Labels for Sentiments

In [None]:
# Maps each integer sentiment class id to its display name.
labels_dict = {
    0: "Negative",
    1: "Positive",
}

## Class for BERT Classification

In [None]:
def find_lora_targets(model, keywords=None):
    """Print the names of submodules that look like LoRA target candidates.

    A module is listed when its lower-cased dotted name contains any of the
    keywords (substring match).

    Args:
        model: Any object exposing ``named_modules()`` that yields
            ``(name, module)`` pairs.
        keywords: Optional iterable of substrings to look for. When omitted, a
            default list of common attention-projection names is used,
            including the ``*_lin`` names used by DistilBERT.

    Returns:
        None. The matching names are printed, one per line, under a header.
    """
    if keywords is None:
        keywords = [
            "query", "value", "key", "q_proj", "v_proj", "k_proj", "c_attn", "proj",
            # DistilBERT names its attention projections *_lin; without these
            # entries nothing is listed for that architecture.
            "q_lin", "k_lin", "v_lin", "out_lin",
        ]
    # Module names are lower-cased below, so lower-case the keywords as well.
    needles = [kw.lower() for kw in keywords]

    print("\n Possible LoRA target modules:\n")
    for name, _module in model.named_modules():
        lowered = name.lower()
        if any(kw in lowered for kw in needles):
            print(name)

# Load the checkpoint once, only to inspect the names of its submodules.
model_name = "distilbert-base-uncased"
# No num_labels is passed here, so the library's default classification head is built.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Prints a header followed by every module name that matches a known keyword.
find_lora_targets(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Possible LoRA target modules:



In [None]:
class BERTCLassification:
    """LoRA fine-tuning wrapper around a Hugging Face sequence-classification model.

    Intended flow: ``encoding_data`` -> ``train`` -> ``evaluate`` -> ``save_model``.

    Args:
        modelname: Checkpoint name or path handed to ``from_pretrained``.
        col_name: Name of the dataset column that holds the raw text.
        num_labels: Number of target classes for the classification head.
    """

    def __init__(self, modelname, col_name, num_labels):
        self.modelname = modelname
        self.text = col_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(self.modelname)
        # Populated by train(); evaluate() and save_model() depend on it.
        self.trainer = None

    def encoding_data(self, dataset, batched=True, batch_size=2000):
        """Tokenize the text column of ``dataset`` and return the mapped dataset.

        Args:
            dataset: Object exposing ``map(fn, batched=..., batch_size=...)``.
            batched: Forwarded to ``dataset.map``.
            batch_size: Forwarded to ``dataset.map``.
        """
        def tokenize(batch):
            # Pad to the maximum length and truncate longer inputs, so every
            # example ends up with the same sequence length.
            return self.tokenizer(batch[self.text], padding="max_length", truncation=True)

        return dataset.map(tokenize, batched=batched, batch_size=batch_size)

    def train(self, train, test, learning_rate, epochs, target_modules=None):
        """Fine-tune the checkpoint with LoRA adapters and keep the Trainer on ``self``.

        Args:
            train: Tokenized training dataset.
            test: Tokenized evaluation dataset (evaluated after every epoch).
            learning_rate: Learning rate passed to ``TrainingArguments``.
            epochs: Number of training epochs.
            target_modules: Module names that receive LoRA adapters. Defaults to
                ``["q_lin", "v_lin"]``, the value this class has always used.

        Returns:
            Whatever ``Trainer.train()`` returns.
        """
        # Resolve the device here instead of relying on a notebook-level
        # `device` variable that may not exist when this method is called.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        on_cuda = device.type == "cuda"

        bert_model = AutoModelForSequenceClassification.from_pretrained(
            self.modelname, num_labels=self.num_labels
        ).to(device)

        if target_modules is None:
            target_modules = ["q_lin", "v_lin"]

        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=target_modules,
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.SEQ_CLS,
        )
        model = get_peft_model(bert_model, lora_config)

        training_args = TrainingArguments(
            output_dir='./results',
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            weight_decay=0.01,
            # Mixed precision is requested only when a CUDA device is present,
            # so the CPU fallback stays usable.
            fp16=on_cuda,
        )

        def compute_metrics(eval_pred):
            # eval_pred unpacks into (logits, labels); the predicted class is
            # the argmax over the last axis of the logits.
            logits, labels = eval_pred
            predictions = logits.argmax(axis=-1)
            return {"accuracy": accuracy_score(labels, predictions)}

        self.trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train,
            eval_dataset=test,
            compute_metrics=compute_metrics,
        )

        return self.trainer.train()

    def _require_trainer(self):
        """Return the stored Trainer, or raise if train() has not been run yet."""
        if self.trainer is None:
            raise RuntimeError("No trainer available: call train() first.")
        return self.trainer

    def evaluate(self):
        """Run evaluation with the stored Trainer and return its metrics.

        Raises:
            RuntimeError: If ``train`` has not been called yet.
        """
        return self._require_trainer().evaluate()

    def save_model(self, path):
        """Save the trained model and the tokenizer into ``path``.

        Raises:
            RuntimeError: If ``train`` has not been called yet.
        """
        self._require_trainer().save_model(path)
        self.tokenizer.save_pretrained(path)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Build the wrapper: DistilBERT checkpoint, raw text read from the "text"
# column, two target classes.
bert_cls = BERTCLassification(
    modelname="distilbert-base-uncased",
    col_name="text",
    num_labels=2,
)

# Tokenize the whole dataset, 5000 rows per map batch.
tokenzied_data = bert_cls.encoding_data(dataset, batch_size=5000)

Map:   0%|          | 0/39722 [00:00<?, ? examples/s]

In [None]:
# Switch off the Weights & Biases integration before the Trainer is created.
os.environ["WANDB_DISABLED"] = "true"

# Hold out 20% of the tokenized rows for evaluation; the seed fixes the split.
split = tokenzied_data.train_test_split(test_size=0.2, seed=42)

# Fine-tune for 5 epochs with a learning rate of 2e-5.
trained = bert_cls.train(
    train=split["train"],
    test=split["test"],
    learning_rate=2e-5,
    epochs=5,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2772,0.247534,0.899937
2,0.254,0.227843,0.91051
3,0.2337,0.223546,0.912398
4,0.2245,0.219418,0.914789
5,0.23,0.219065,0.916174


In [None]:
# Evaluate with the trainer stored on the wrapper and show the returned
# metrics as the cell output.
result = bert_cls.evaluate()
result

{'eval_loss': 0.21906493604183197,
 'eval_accuracy': 0.9161736941472625,
 'eval_runtime': 34.8069,
 'eval_samples_per_second': 228.259,
 'eval_steps_per_second': 14.279,
 'epoch': 5.0}

In [None]:
# Write the trained model files and the tokenizer into ./bert_cls_model_LoRA.
bert_cls.save_model("bert_cls_model_LoRA")
# Bundle that directory into a single zip archive (shell command).
!zip -r bert_cls_model_LoRA.zip bert_cls_model_LoRA
# `files` is the google.colab helper imported in the upload cell; this offers
# the archive as a browser download.
files.download("bert_cls_model_LoRA.zip")

  adding: bert_cls_model_LoRA/ (stored 0%)
  adding: bert_cls_model_LoRA/training_args.bin (deflated 51%)
  adding: bert_cls_model_LoRA/adapter_config.json (deflated 55%)
  adding: bert_cls_model_LoRA/vocab.txt (deflated 53%)
  adding: bert_cls_model_LoRA/tokenizer.json (deflated 71%)
  adding: bert_cls_model_LoRA/README.md (deflated 66%)
  adding: bert_cls_model_LoRA/adapter_model.safetensors (deflated 7%)
  adding: bert_cls_model_LoRA/special_tokens_map.json (deflated 42%)
  adding: bert_cls_model_LoRA/tokenizer_config.json (deflated 75%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>