<a href="https://colab.research.google.com/github/boodscode237/scientific_seminary/blob/main/text_classification_using_lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORT LIBRARIES

In [None]:
%%capture
!pip install -q peft transformers datasets evaluate seqeval pymorphy2  datasets torch

In [None]:
%%capture
!pip install googletrans==4.0.0-rc1

In [None]:
!pip install --upgrade googletrans

[0m

In [None]:
%%capture
!pip install optuna scikit-learn matplotlib

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import KFold
import numpy as np
import torch
from googletrans import Translator
import random

In [None]:
%%capture
!pip install ipython-autotime

In [None]:
%load_ext autotime

time: 132 µs (started: 2024-06-03 06:00:08 +00:00)


## DATASET UPLOAD

In [None]:
from huggingface_hub import hf_hub_download

time: 7.09 ms (started: 2024-06-03 06:00:08 +00:00)


In [None]:
# Read the CSV with latin1 decoding; `names=` supplies the column labels
# ('sentiment', 'text') instead of taking them from the file.
data = pd.read_csv(
    '/kaggle/input/financial-time-all-data/ft-all-data.csv',
    names=['sentiment', 'text'],
    encoding='latin1',
)

In [None]:
data

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [None]:
# Translate numeric sentiment codes into their label names. Values that are not
# keys of the mapping (e.g. rows already holding a label name) are left untouched.
sentiment_mapping = {-1: 'negative', 0: 'neutral', 1: 'positive'}

data['sentiment'] = data['sentiment'].replace(to_replace=sentiment_mapping)

data.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


### Data augmentation Using Translator

In [None]:
# Pull out the raw sentences and encode each sentiment name as a class id
# (positive -> 0, neutral -> 1, negative -> 2).
texts = data['text'].to_list()
labels = (
    data['sentiment']
    .map({'positive': 0, 'neutral': 1, 'negative': 2})
    .to_list()
)

In [None]:
# Two-column frame ('text', 'label') built from the lists prepared above.
df = pd.DataFrame(data=dict(text=texts, label=labels))

### Create dataset

In [None]:
# Convert the pandas frame into a Hugging Face `Dataset` (features: 'text', 'label').
dataset = Dataset.from_pandas(df)

In [None]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 4846
})

## Split the dataset

In [None]:
# First hold out 20% as the test split, then carve 25% of the remainder off as
# the validation split (i.e. roughly 60/20/20 overall). Both splits use seed 42.
holdout_split = dataset.train_test_split(test_size=0.2, seed=42)
train_val_dataset = holdout_split['train']
test_dataset = holdout_split['test']

validation_split = train_val_dataset.train_test_split(test_size=0.25, seed=42)
train_dataset = validation_split['train']
val_dataset = validation_split['test']

### Tokenize the Data

## Set Up Hold-Out Validation and Define Objective Function

In [None]:
%%capture
!pip install peft

In [None]:
from transformers import DistilBertTokenizer

model_name = "distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize a batch of examples.

    Reads ``examples['text']`` and returns the tokenizer output, with every
    sequence truncated and padded to exactly 512 tokens.
    """
    # NOTE(review): padding every sequence to 512 is costly if the sentences are
    # short; dynamic padding would be cheaper but changes the tensors downstream
    # cells receive, so it is left as-is here.
    encoded = tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return encoded


# Tokenize each split in the same order as before: train, validation, test.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the model inputs and the label, returned as torch tensors.
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'label'],
    )

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Map:   0%|          | 0/2907 [00:00<?, ? examples/s]

Map:   0%|          | 0/969 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

In [None]:
from transformers import DistilBertForSequenceClassification

# Load the pretrained checkpoint with a 3-class sequence-classification head
# (num_labels=3).
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)
# Print the module tree to inspect the layer names (e.g. q_lin / k_lin / v_lin).
print(model)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
%%capture
!pip install optuna

In [None]:
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from peft import LoraConfig, get_peft_model
import optuna

In [None]:
def compute_metrics(p):
    """Compute accuracy and support-weighted precision/recall/F1 for the Trainer.

    Args:
        p: Evaluation output exposing ``predictions`` (per-class scores; the
            predicted class is the argmax over axis 1) and ``label_ids``
            (the true class ids).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    preds = np.argmax(p.predictions, axis=1)
    # zero_division=0: a class that is never predicted contributes precision 0.
    # With zero_division=1 such a class was scored as *perfect* precision, which
    # inflated the weighted precision whenever the model collapsed onto a single
    # class (high precision reported next to a much lower accuracy and F1).
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids,
        preds,
        average='weighted',
        zero_division=0,
    )
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

def objective(trial):
    """Optuna objective: fine-tune a LoRA-adapted DistilBERT and score it on validation.

    Samples the learning rate, batch size, number of epochs and the LoRA
    ``lora_alpha`` / ``r`` values from ``trial``, trains a fresh model on
    ``train_dataset`` and evaluates it on ``val_dataset``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The ``eval_accuracy`` value reported by ``trainer.evaluate`` on
        ``val_dataset`` after training.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # (same bounds, still sampled on a log scale) and silences the deprecation
    # warning that was emitted at the start of every trial.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 4)
    lora_alpha = trial.suggest_int('lora_alpha', 8, 32)
    r = trial.suggest_int('r', 4, 16)

    # Start every trial from the pretrained checkpoint so trials do not share weights.
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)
    config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        # LoRA adapters are attached to the attention query/key/value projections.
        target_modules=["attention.q_lin", "attention.k_lin", "attention.v_lin"],
        lora_dropout=0.1,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(model, config)

    training_args = TrainingArguments(
        output_dir='./results',
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        logging_dir='./logs',
        # No experiment tracker is configured in this notebook; without this the
        # Trainer picked up wandb and blocked on an interactive API-key prompt,
        # which prevents an unattended "Run All".
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()
    eval_result = trainer.evaluate(eval_dataset=val_dataset)

    # The study maximizes this value.
    return eval_result['eval_accuracy']

# Run the hyperparameter search: 10 trials, keeping the configuration with the
# highest value returned by `objective`.
# NOTE(review): no sampler seed is set here, so the sampled configurations are
# presumably not reproducible between runs; confirm whether that matters.
N_TRIALS = 10
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=N_TRIALS)

print("Best hyperparameters: ", study.best_params)

[I 2024-06-05 14:29:28,480] A new study created in memory with name: no-name-3a591cdd-00a3-40a7-a334-47ef354e9ddd
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.864967,0.610939,0.465543,0.762621,0.610939
2,No log,0.778836,0.659443,0.588547,0.672987,0.659443
3,0.857000,0.749089,0.673891,0.618592,0.691373,0.673891
4,0.857000,0.741202,0.672859,0.620187,0.691771,0.672859


[I 2024-06-05 14:35:05,893] Trial 0 finished with value: 0.672858617131063 and parameters: {'learning_rate': 1.927633804869361e-05, 'batch_size': 16, 'num_train_epochs': 4, 'lora_alpha': 25, 'r': 7}. Best is trial 0 with value: 0.672858617131063.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.882166,0.608875,0.460855,0.761854,0.608875
2,0.931800,0.829806,0.626419,0.513111,0.6266,0.626419
3,0.843600,0.792806,0.658411,0.580505,0.671362,0.658411
4,0.843600,0.783604,0.664603,0.592165,0.678138,0.664603


[I 2024-06-05 14:40:23,901] Trial 1 finished with value: 0.6646026831785345 and parameters: {'learning_rate': 1.2530862119024624e-05, 'batch_size': 8, 'num_train_epochs': 4, 'lora_alpha': 9, 'r': 7}. Best is trial 0 with value: 0.672858617131063.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.752945,0.651187,0.598761,0.679761,0.651187
2,0.830100,0.684918,0.693498,0.651657,0.666376,0.693498
3,0.687200,0.667801,0.687307,0.652097,0.663246,0.687307


[I 2024-06-05 14:44:25,530] Trial 2 finished with value: 0.6873065015479877 and parameters: {'learning_rate': 3.085847247531291e-05, 'batch_size': 8, 'num_train_epochs': 3, 'lora_alpha': 24, 'r': 10}. Best is trial 2 with value: 0.6873065015479877.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.737677,0.656347,0.604103,0.685012,0.656347
2,0.812900,0.697179,0.680083,0.635419,0.654602,0.680083


[I 2024-06-05 14:47:08,797] Trial 3 finished with value: 0.6800825593395253 and parameters: {'learning_rate': 4.1436660106206595e-05, 'batch_size': 8, 'num_train_epochs': 2, 'lora_alpha': 17, 'r': 16}. Best is trial 2 with value: 0.6873065015479877.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.787414,0.662539,0.597485,0.680306,0.662539
2,0.858500,0.729862,0.681115,0.628952,0.656147,0.681115
3,0.735100,0.717461,0.678019,0.630247,0.66085,0.678019


[I 2024-06-05 14:51:11,086] Trial 4 finished with value: 0.6780185758513931 and parameters: {'learning_rate': 2.6858546565186742e-05, 'batch_size': 8, 'num_train_epochs': 3, 'lora_alpha': 15, 'r': 7}. Best is trial 2 with value: 0.6873065015479877.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.787677,0.661507,0.593606,0.677609,0.661507
2,0.863700,0.722132,0.678019,0.624263,0.677077,0.678019
3,0.729900,0.708126,0.676987,0.627541,0.665113,0.676987


[I 2024-06-05 14:55:12,634] Trial 5 finished with value: 0.6769865841073271 and parameters: {'learning_rate': 2.032266677854727e-05, 'batch_size': 8, 'num_train_epochs': 3, 'lora_alpha': 28, 'r': 9}. Best is trial 2 with value: 0.6873065015479877.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.871369,0.608875,0.460855,0.761854,0.608875
2,0.921600,0.789122,0.656347,0.577829,0.667556,0.656347
3,0.803700,0.757309,0.669763,0.605828,0.688356,0.669763
4,0.803700,0.74978,0.670795,0.608683,0.690793,0.670795


[I 2024-06-05 15:00:29,357] Trial 6 finished with value: 0.6707946336429309 and parameters: {'learning_rate': 1.0901155557428154e-05, 'batch_size': 8, 'num_train_epochs': 4, 'lora_alpha': 28, 'r': 4}. Best is trial 2 with value: 0.6873065015479877.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.909399,0.608875,0.460855,0.761854,0.608875
2,No log,0.89952,0.608875,0.460855,0.761854,0.608875


[I 2024-06-05 15:03:02,227] Trial 7 finished with value: 0.608875128998968 and parameters: {'learning_rate': 1.7191186281911664e-05, 'batch_size': 32, 'num_train_epochs': 2, 'lora_alpha': 11, 'r': 7}. Best is trial 2 with value: 0.6873065015479877.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.897848,0.608875,0.460855,0.761854,0.608875
2,No log,0.869103,0.608875,0.460855,0.761854,0.608875
3,No log,0.840069,0.621259,0.498619,0.612,0.621259
4,No log,0.828775,0.631579,0.524911,0.510837,0.631579


[I 2024-06-05 15:07:57,568] Trial 8 finished with value: 0.631578947368421 and parameters: {'learning_rate': 1.99972848755622e-05, 'batch_size': 32, 'num_train_epochs': 4, 'lora_alpha': 18, 'r': 12}. Best is trial 2 with value: 0.6873065015479877.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.899968,0.608875,0.460855,0.761854,0.608875
2,No log,0.867948,0.608875,0.460855,0.761854,0.608875
3,No log,0.834422,0.628483,0.51562,0.632058,0.628483
4,No log,0.822087,0.643963,0.54671,0.652117,0.643963


[I 2024-06-05 15:12:54,682] Trial 9 finished with value: 0.6439628482972136 and parameters: {'learning_rate': 1.779585889410396e-05, 'batch_size': 32, 'num_train_epochs': 4, 'lora_alpha': 29, 'r': 13}. Best is trial 2 with value: 0.6873065015479877.


Best hyperparameters:  {'learning_rate': 3.085847247531291e-05, 'batch_size': 8, 'num_train_epochs': 3, 'lora_alpha': 24, 'r': 10}


## **Train the Final Model with Best Hyperparameters and Quantize**

In [None]:
# Hyperparameters for the final run.
# NOTE(review): these values are hard-coded and differ from the best parameters
# printed by the study in this notebook; confirm they are the intended ones.
best_params = {'learning_rate': 4.993596574084884e-05, 'batch_size': 8, 'num_train_epochs': 8, 'lora_alpha': 32, 'r': 8}

# Fresh pretrained model wrapped with LoRA adapters on the attention
# query/key/value projections.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)
config = LoraConfig(
    r=best_params['r'],
    lora_alpha=best_params['lora_alpha'],
    target_modules=["attention.q_lin", "attention.k_lin", "attention.v_lin"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)
model = get_peft_model(model, config)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['batch_size'],
    per_device_eval_batch_size=best_params['batch_size'],
    num_train_epochs=best_params['num_train_epochs'],
    weight_decay=0.01,
    logging_dir='./logs',
    # No experiment tracker is configured in this notebook; this avoids the
    # interactive wandb API-key prompt on a fresh kernel.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

# Keep the held-out test metrics: the return value used to be discarded because
# the call was not the last expression of the cell, so the test scores never
# appeared in the notebook.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)

trainer.save_model('./optimized_lora_distilbert_model_yesterday_params')

print("Test metrics: ", test_metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.656603,0.723426,0.706423,0.710517,0.723426
2,0.753800,0.571934,0.747162,0.727963,0.737734,0.747162
3,0.538100,0.487077,0.80289,0.79945,0.799749,0.80289
4,0.538100,0.464848,0.819401,0.817629,0.816981,0.819401
5,0.436800,0.452068,0.825593,0.82342,0.822997,0.825593
6,0.407500,0.450334,0.827657,0.826734,0.826315,0.827657
7,0.391100,0.451007,0.827657,0.826337,0.825843,0.827657
8,0.391100,0.451998,0.826625,0.825517,0.824961,0.826625
