In [None]:
import pandas as pd
import ast
import torch
from torch.utils.data import Dataset
from torch import nn as nn
from transformers import DistilBertTokenizerFast, TrainingArguments, Trainer, EarlyStoppingCallback, BitsAndBytesConfig,AutoModel, AutoTokenizer,AdamW
from peft import get_peft_model, LoraConfig, TaskType
import numpy as np
import logging
from sklearn.metrics import accuracy_score as sk_accuracy, f1_score as sk_f1
from seqeval.metrics import accuracy_score, f1_score 
import numpy as np
import contractions
import bitsandbytes as bnb
from data_class import CustomNERTopicDataset
from model_class_lora import MultiTaskModel 


# Logger named after the current module (``__main__`` when run as a notebook).
logger = logging.getLogger(__name__)

# Prefer the GPU when CUDA is usable, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sequence-length constant. NOTE(review): not referenced by any cell visible here;
# presumably consumed by the dataset/model modules -- confirm.
MAX_LEN = 512

# Hub id of the pretrained checkpoint whose fast tokenizer is loaded below.
model_name = "distilbert/distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)


In [None]:
def collate_fn(batch):
    """Combine a list of examples into one dict of batched tensors.

    Args:
        batch: sequence of examples. Each example is indexed by the field
            names listed below and must provide a tensor for every field;
            tensors of the same field must be stackable (identical shapes).

    Returns:
        dict mapping each field name to ``torch.stack`` of that field taken
        across all examples, in the order the examples appear in ``batch``.
    """
    field_names = (
        "input_ids",
        "attention_mask",
        "topic_labels",
        "ner_labels",
        "sentiment",
    )
    stacked = {}
    for field in field_names:
        per_example = [example[field] for example in batch]
        stacked[field] = torch.stack(per_example)
    return stacked
    

In [None]:
# Lookup tables from integer class id to its string label.
# The position of each name in the list is its class id.

# Entity classes; used to decode the NER head's predictions.
id2ner_label = dict(enumerate(["O", "PER", "LOC", "ORG", "MISC"]))

# Span-position tags; used to decode the topic head's predictions.
id2bio_label = dict(enumerate(["O", "B", "I", "E", "S"]))

# Sentence-level sentiment classes.
id2sentiment_label = dict(enumerate(["negative", "neutral", "positive"]))

In [None]:

def compute_metrics(p):
    """Compute topic, NER and (when available) sentiment metrics for one evaluation.

    Args:
        p: evaluation result exposing ``predictions`` and ``label_ids``.
            ``predictions`` must contain 2 or 3 arrays ordered
            ``(topic, ner[, sentiment])``; ``label_ids`` holds the label
            arrays in the same order.

    Returns:
        dict with ``topic_f1``, ``topic_accuracy``, ``ner_f1`` and
        ``ner_accuracy``. ``sentiment_accuracy`` and ``sentiment_f1``
        (weighted) are added only when both sentiment predictions and
        sentiment labels are present.

    Raises:
        ValueError: if ``predictions`` does not contain 2 or 3 outputs.
    """
    predictions = p.predictions
    labels = p.label_ids

    if len(predictions) not in {2, 3}:
        raise ValueError(f"Unexpected number of prediction outputs: {len(predictions)}")

    topic_preds, ner_preds = predictions[:2]
    topic_labels, ner_labels = labels[:2]

    sentiment_preds = predictions[2] if len(predictions) == 3 else None
    sentiment_labels = labels[2] if len(labels) == 3 else None

    # A 3-D token-level array (2-D for sentiment) is treated as per-class
    # scores and reduced to class ids over the last axis; arrays that are
    # already class ids pass through unchanged.
    if topic_preds.ndim == 3:
        topic_preds = np.argmax(topic_preds, axis=-1)
    if ner_preds.ndim == 3:
        ner_preds = np.argmax(ner_preds, axis=-1)
    if sentiment_preds is not None and sentiment_preds.ndim == 2:
        sentiment_preds = np.argmax(sentiment_preds, axis=-1)

    def convert_to_labels(preds, labels, id2label):
        """Map ids to tag strings, dropping positions whose gold label is -100."""
        return [
            [id2label[p] for p, l in zip(pred_seq, label_seq) if l != -100]
            for pred_seq, label_seq in zip(preds, labels)
        ]

    # seqeval treats the FIRST character of every tag as the chunk marker and
    # the rest as the entity type. A bare 'ORG' is therefore read as marker
    # 'O' (outside) and ORG entities never count towards the entity-level F1,
    # while 'PER'/'LOC'/'MISC' trigger "seems not to be NE tag" warnings.
    # Emitting IOB1-style 'I-<TYPE>' tags keeps the same span grouping
    # (consecutive tokens of one type form one entity) and scores every type.
    seqeval_ner_label = {
        idx: name if name == "O" else f"I-{name}"
        for idx, name in id2ner_label.items()
    }

    topic_preds_str = convert_to_labels(topic_preds, topic_labels, id2bio_label)
    topic_labels_str = convert_to_labels(topic_labels, topic_labels, id2bio_label)

    ner_preds_str = convert_to_labels(ner_preds, ner_labels, seqeval_ner_label)
    ner_labels_str = convert_to_labels(ner_labels, ner_labels, seqeval_ner_label)

    # Sequence-level scores from seqeval (f1_score / accuracy_score).
    topic_f1 = f1_score(topic_labels_str, topic_preds_str)
    topic_acc = accuracy_score(topic_labels_str, topic_preds_str)

    ner_f1 = f1_score(ner_labels_str, ner_preds_str)
    ner_acc = accuracy_score(ner_labels_str, ner_preds_str)

    metrics = {
        "topic_f1": topic_f1,
        "topic_accuracy": topic_acc,
        "ner_f1": ner_f1,
        "ner_accuracy": ner_acc,
    }

    # Sentiment is one class per example, so it is scored with scikit-learn.
    if sentiment_preds is not None and sentiment_labels is not None:
        metrics.update({
            "sentiment_accuracy": sk_accuracy(sentiment_labels, sentiment_preds),
            "sentiment_f1": sk_f1(sentiment_labels, sentiment_preds, average='weighted'),
        })

    return metrics


In [None]:
# Read the training CSV, keep only the four columns below, and drop rows
# whose "text" is missing.
data = (
    pd.read_csv("/data/train_data.csv")[["text", "topics", "sentiment", "ner"]]
    .dropna(subset=["text"])
)

# Ordered 80/20 split: the first n rows train, the remaining rows evaluate
# (rows are not shuffled before splitting).
n = int(0.8 * len(data))
train_dataset = CustomNERTopicDataset(data.iloc[:n])
eval_dataset = CustomNERTopicDataset(data.iloc[n:])


In [None]:
# Training configuration. Evaluation, checkpointing and logging all happen
# once per epoch; the *_steps values only apply to step-based strategies and
# are therefore not used with the "epoch" settings below.
training_args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=200,
    per_device_eval_batch_size=200,
    gradient_accumulation_steps=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=2e-6,
    weight_decay=0.01,
    warmup_steps=100,
    logging_strategy="epoch",
    load_best_model_at_end=False,
    metric_for_best_model="topic_accuracy",
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=1000,
    dataloader_num_workers=4,
    fp16=False,
    bf16=False, 
    overwrite_output_dir=True,
    max_grad_norm=1.0,
)

model = MultiTaskModel().to(device=device)

# Explicit optimizer. Because it is supplied to the Trainer, its own settings
# (lr=1e-4 and the optimizer's default weight decay) drive training; the
# learning_rate / weight_decay given to TrainingArguments are not applied to it.
optimizer = AdamW(
    model.parameters(),
    lr=1e-4,
    no_deprecation_warning=True,
)

# NOTE(review): EarlyStoppingCallback is used with load_best_model_at_end=False;
# some transformers releases require it to be True for this callback -- confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    # Trainer has no `optimizer` keyword (passing one raises TypeError). A
    # pre-built optimizer goes in the `optimizers=(optimizer, lr_scheduler)`
    # tuple; None lets the Trainer create its default scheduler around it.
    optimizers=(optimizer, None),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# Persist the weights of the in-memory model after training.
torch.save(model.state_dict(), "./topic_sentiment_lora-config.bin")
print("Model Saved")

In [None]:
# Loaded inference models keyed by (checkpoint path, device) so repeated
# predictions() calls do not rebuild the model and re-read the weights.
_INFERENCE_MODELS = {}


def _load_inference_model(checkpoint_path, device):
    """Return a MultiTaskModel with weights from ``checkpoint_path`` on ``device`` (cached)."""
    cache_key = (checkpoint_path, str(device))
    if cache_key not in _INFERENCE_MODELS:
        loaded = MultiTaskModel()
        # The checkpoint is expected to be a state dict written with
        # torch.save(model.state_dict(), ...).
        loaded.load_state_dict(torch.load(checkpoint_path, map_location=device))
        _INFERENCE_MODELS[cache_key] = loaded.to(device)
    return _INFERENCE_MODELS[cache_key]


def predictions(text, model=None, checkpoint_path="multilingual-model.bin"):
    """Run the multi-task model on a single text and decode class ids.

    Args:
        text: the input string to analyse (one text per call).
        model: optional already-instantiated model to use. When omitted, a
            ``MultiTaskModel`` is built and its weights are loaded from
            ``checkpoint_path`` (the result is cached across calls).
        checkpoint_path: state-dict file used when ``model`` is not given.
            NOTE(review): the training cell saves to
            "./topic_sentiment_lora-config.bin"; confirm which file is
            intended here.

    Returns:
        Tuple ``(tokens, topic_predictions, sentiment_prediction,
        ner_predictions)``: the token strings of the padded input (length
        128, special/pad tokens included), the per-token topic class ids, a
        single sentiment class id, and the per-token NER class ids.

    Raises:
        ValueError: if the model output lacks ``topic_logits``,
            ``sentiment_logits`` or ``ner_logits``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Previously ``model`` was bound to the checkpoint *filename* (a str), so
    # ``model.to(device)`` raised AttributeError. Load a real model instead.
    if model is None:
        model = _load_inference_model(checkpoint_path, device)

    encodings = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    ).to(device)

    model.to(device)
    model.eval()

    token_ids = encodings["input_ids"].squeeze().tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    with torch.no_grad():
        outputs = model(
            input_ids=encodings["input_ids"],
            attention_mask=encodings["attention_mask"]
        )

    topic_logits = outputs.get("topic_logits")
    sentiment_logits = outputs.get("sentiment_logits")
    ner_logits = outputs.get("ner_logits")

    if topic_logits is None or sentiment_logits is None or ner_logits is None:
        raise ValueError("Model output does not contain the expected logits for topic, sentiment, or NER.")

    # argmax over the class axis; squeeze() drops the size-1 batch axis.
    # tolist()/item() copy to host memory themselves, so no device move is needed.
    topic_predictions = torch.argmax(topic_logits, dim=-1).squeeze().tolist()
    sentiment_prediction = torch.argmax(sentiment_logits, dim=-1).squeeze().item()
    ner_predictions = torch.argmax(ner_logits, dim=-1).squeeze().tolist()

    return tokens, topic_predictions, sentiment_prediction, ner_predictions


In [None]:
# Sample inputs used to eyeball the model's predictions.
texts = [
    "I tried the iphone pro. Its awesome",
    "Sarah said her calls are getting dropped or going to voicemail",
    "The ratings have gone down, their service is not good anymore",
    "Hold on! Okay would you tell her I called? Yeah. This is Martina. Call me back on 99999999",
]

for text in texts:
    print("Text:", text)
    tokens, topic_predictions, sentiment_prediction, ner_predictions = predictions(text)

    # One line per token with its topic tag and NER tag; special, padding and
    # unknown tokens are skipped.
    for position, token in enumerate(tokens):
        if token in ("[CLS]", "[SEP]", "[PAD]", "[UNK]"):
            continue
        topic_tag = id2bio_label[topic_predictions[position]]
        ner_tag = id2ner_label[ner_predictions[position]]
        print(f"{token} : {topic_tag}-TOPIC : {ner_tag}")

    # Sentence-level sentiment, then a separator before the next text.
    print(id2sentiment_label[sentiment_prediction])
    print("=====================================================================\n\n\n")