In [5]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

# Select the compute device once for this cell: CUDA if available, else CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# --- 1. Load data ---
# Forward slashes keep the Windows path free of backslash escapes: in a normal
# (non-raw) string literal, backslash-D / -L / -S / -o are invalid escape sequences
# and trigger a warning on recent Python versions.
df = pd.read_csv("D:/Data_Science/LangGraph/Senti_dat/output.csv").drop(columns=["Unnamed: 0"])

# Normalise the raw sentiment strings so case / whitespace variants share one class.
df["label_str"] = df["sentiment"].str.lower().str.strip()

label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}
df["label"] = df["label_str"].map(label2id)

# Fail fast on sentiment values that are missing from label2id: .map() turns them
# into NaN, which would otherwise only surface later as an obscure error in the
# stratified split or during training.
unmapped = df.loc[df["label"].isna(), "sentiment"].unique().tolist()
if unmapped:
    raise ValueError(f"Sentiment values without a label id: {unmapped}")

# Stratified 80/20 split with a fixed seed so the same split can be recreated later.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Only the text and the integer label are handed to the Hugging Face datasets.
train_ds = Dataset.from_pandas(train_df[["text", "label"]])
test_ds  = Dataset.from_pandas(test_df[["text", "label"]])

# --- 2. Tokenizer + model ---
MODEL_NAME = "l3cube-pune/hing-roberta-mixed"

# Tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model with one output per entry in label2id; both label
# mappings are passed along so they end up in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

# Put the model on the selected device (GPU when available).
model = model.to(device)

# --- 3. Tokenization fn ---
def preprocess(batch):
    """Tokenize the ``"text"`` entries of a batch for the classifier.

    The tokenizer is asked to truncate and to pad to ``max_length=128`` so that
    every example is encoded with the same length settings.

    Args:
        batch: Mapping with a ``"text"`` entry (a batch of strings when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize both splits in batches (train first, then test).
train_ds_enc, test_ds_enc = (
    split.map(preprocess, batched=True) for split in (train_ds, test_ds)
)

# Expose only the model inputs and the label, formatted as torch tensors.
cols = ["input_ids", "attention_mask", "label"]
train_ds_enc.set_format(columns=cols, type="torch")
test_ds_enc.set_format(columns=cols, type="torch")

# --- 4. Metrics ---
def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per example)
            and ``label_ids`` (the reference class ids).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    y_true = p.label_ids
    # The predicted class is the column with the highest score in each row.
    y_pred = np.asarray(p.predictions).argmax(axis=1)

    scores = {}
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    scores["macro_f1"] = f1_score(y_true, y_pred, average="macro")
    return scores

# --- 5. TrainingArguments + Trainer  ---
# Settings for the fine-tuning run: 3 epochs, evaluation and checkpointing once per
# epoch, and the checkpoint with the best "macro_f1" (the key returned by
# compute_metrics) is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./hing-roberta-mixed-sentiment",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
)

# NOTE(review): eval_dataset is the held-out test split, and it also drives
# load_best_model_at_end, so the test scores double as the model-selection signal.
# Consider a separate validation split — TODO confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_enc,
    eval_dataset=test_ds_enc,
    compute_metrics=compute_metrics,
)

# Start fine-tuning with the arguments configured above.
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-roberta-mixed and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2212/2212 [00:00<00:00, 18701.21 examples/s]
Map: 100%|██████████| 554/554 [00:00<00:00, 18671.31 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.9127,0.855936,0.604693,0.583251
2,0.511,0.749525,0.707581,0.689934
3,0.2744,0.743269,0.750903,0.744061


TrainOutput(global_step=417, training_loss=0.5759861303462113, metrics={'train_runtime': 301.6971, 'train_samples_per_second': 21.996, 'train_steps_per_second': 1.382, 'total_flos': 436505160010752.0, 'train_loss': 0.5759861303462113, 'epoch': 3.0})

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


Recreating the same train/test split as before, because the original DataFrames were lost when the kernel restarted.

In [12]:
import torch

# Re-select the compute device after the kernel restart: CUDA if available, else CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Reload the CSV and rebuild the label columns
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

df = (
    pd.read_csv("D:/Data_Science/LangGraph/Senti_dat/output.csv")
    .drop(columns=["Unnamed: 0"])
    .assign(
        # normalised sentiment string
        label_str=lambda frame: frame["sentiment"].str.lower().str.strip(),
        # integer class id used for stratification
        label=lambda frame: frame["label_str"].map(label2id),
        # convenience copy of the string label, for evaluation code that expects 'true_label'
        true_label=lambda frame: frame["label_str"],
    )
)

# 2) Recreate the split with the same arguments as the training cell
#    (same test_size, same random_state, stratified on the same label column).
train_df, test_df = train_test_split(
    df,
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)

print(len(train_df), len(test_df))

Using device: cuda
2212 554


In [None]:

# 1) Tokenizer: loaded from the original HF model name
BASE_MODEL_NAME = "l3cube-pune/hing-roberta-mixed"
roberta_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# 2) Weights: loaded from the fine-tuned checkpoint directory
#    (checkpoint-417 matches the global_step reported by the training run).
ROBERTA_SENTIMENT_PATH = r"D:\Data_Science\LangGraph\Senti_dat\hing-roberta-mixed-sentiment\checkpoint-417"

roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_SENTIMENT_PATH)
roberta_model = roberta_model.to(device)
roberta_model.eval()  # inference mode

# 3) Label mapping read from the fine-tuned config, with int keys and lower-cased names
IDX2LABEL = dict(
    (int(idx), name.lower()) for idx, name in roberta_model.config.id2label.items()
)
print("Label mapping:", IDX2LABEL)


Label mapping: {0: 'negative', 1: 'neutral', 2: 'positive'}


In [14]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
import json

In [15]:
# Routing thresholds read by need_gpt (values tried before: 0.75 and 0.55).
NEG_STRONG = 0.90  # a "negative" prediction at or above this confidence is treated as strong
CONF_LOW = 0.80    # predictions below this confidence are sent to GPT

# Substrings, matched against the lower-cased text, that force a GPT check.
HIGH_RISK_KEYWORDS = [
    "mar jaa",
    "marna",
    "suicide",
    "khatam",
    "zindagi khatam",
]

def run_roberta_sentiment(text: str):
    """Classify a single text with the fine-tuned RoBERTa model.

    Args:
        text: The message to classify.

    Returns:
        A ``(label, conf, probs_list)`` tuple: the predicted label name from
        ``IDX2LABEL``, the softmax probability of that label as a float, and the
        full list of class probabilities.
    """
    roberta_model.eval()

    encoded = roberta_tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
    )
    # Move every input tensor to the device the model lives on.
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        # A single text is passed in, so row 0 holds its logits.
        logits = roberta_model(**batch).logits[0]
        probs = torch.softmax(logits, dim=-1)

    best = int(torch.argmax(probs).item())
    probs_list = probs.detach().cpu().tolist()
    return IDX2LABEL[best], float(probs_list[best]), probs_list

def need_gpt(text: str, base_label: str, base_conf: float) -> bool:
    """Decide whether GPT should be consulted for this message.

    The decision uses the small model's label and confidence plus a keyword
    screen; it works with either BERT or RoBERTa as the small model.

    Args:
        text: The raw message.
        base_label: Label predicted by the small model.
        base_conf: Confidence of that prediction.

    Returns:
        True when GPT should be called, False when the small model's answer is
        used as-is.
    """
    # High-risk keywords are checked first. The hybrid result only gets a
    # risk_level / suicide_mention from GPT, so a confidently "negative" message
    # that mentions e.g. "suicide" must not skip GPT via the shortcut below.
    lowered = (text or "").lower()
    if any(keyword in lowered for keyword in HIGH_RISK_KEYWORDS):
        return True

    # A strong negative prediction without risk keywords is trusted as-is.
    if base_label == "negative" and base_conf >= NEG_STRONG:
        return False

    # Otherwise GPT is only needed when the small model is unsure.
    return base_conf < CONF_LOW

In [16]:
async def call_gpt_classifier(text: str) -> dict:
    """Ask GPT for a sentiment / risk classification of one message.

    Args:
        text: The message to classify.

    Returns:
        The JSON object from the model reply, parsed into a dict. The prompt asks
        for the keys sentiment, risk_level, suicide_mention, emotion and
        short_rationale; the reply is not validated beyond being a JSON object.

    Raises:
        ValueError: If the reply contains no parseable JSON object
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    system_prompt = (
        "You are a safety and sentiment classifier for Hinglish/Hindi/English text.\n"
        "Output STRICT JSON with keys: sentiment, risk_level, suicide_mention, "
        "emotion, short_rationale.\n"
        "- sentiment ∈ {\"negative\", \"neutral\", \"positive\"}\n"
        "- risk_level ∈ {\"green\", \"amber\", \"red\"}\n"
        "- suicide_mention is a boolean.\n"
        "No extra text, no markdown, ONLY JSON."
    )

    msgs = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=text),
    ]
    out = await llm.ainvoke(msgs)
    raw = out.content.strip()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # Despite the prompt, the reply may be wrapped in a Markdown fence (often
        # with a "json" language tag) or carry stray text. Stripping backticks alone
        # leaves the tag behind, so keep only the outermost {...} span and parse that.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end <= start:
            raise ValueError(f"GPT reply contains no JSON object: {raw!r}")
        data = json.loads(raw[start:end + 1])

    # Callers use .get() on the result, so anything but a JSON object is an error.
    if not isinstance(data, dict):
        raise ValueError(f"GPT reply is not a JSON object: {raw!r}")

    return data


async def run_hybrid_sentiment(text: str) -> dict:
    """Classify a message with the small model and, when needed, GPT.

    Steps:
    - run the small model (fine-tuned RoBERTa)
    - ask ``need_gpt`` whether GPT should be consulted
    - if so, call GPT and let a valid GPT sentiment override the small model

    A GPT failure is reported with ``print`` and the small model's result is kept.

    Args:
        text: The message to classify.

    Returns:
        Dict with the keys base_label, base_conf, gpt_sentiment, final_sentiment,
        risk_level, suicide_mention and used_gpt. ``used_gpt`` records that GPT was
        requested, even if the call failed.
    """
    # Values the system prompt allows; anything else from GPT is not trusted.
    valid_sentiments = ("negative", "neutral", "positive")
    valid_risk_levels = ("green", "amber", "red")

    base_label, base_conf, _ = run_roberta_sentiment(text)
    use_gpt = need_gpt(text, base_label, base_conf)

    # Defaults used when GPT is skipped, fails, or returns unusable fields.
    final_sentiment = base_label
    gpt_sentiment = None
    risk_level = "green"
    suicide_mention = False

    if use_gpt:
        try:
            gpt_data = await call_gpt_classifier(text)

            gpt_sentiment = str(gpt_data.get("sentiment") or "").strip().lower()
            # Only one of the three known labels may replace the small model's answer;
            # otherwise an unexpected string would become a new class downstream.
            if gpt_sentiment in valid_sentiments:
                final_sentiment = gpt_sentiment

            gpt_risk = str(gpt_data.get("risk_level") or "").strip().lower()
            if gpt_risk in valid_risk_levels:
                risk_level = gpt_risk

            flag = gpt_data.get("suicide_mention", False)
            if isinstance(flag, str):
                # bool("false") is True, so textual booleans are parsed explicitly.
                flag = flag.strip().lower() in ("true", "yes", "1")
            suicide_mention = bool(flag)
        except Exception as e:
            # Best effort: keep the small model's result when GPT is unavailable.
            print("GPT failed:", e)

    return {
        "base_label": base_label,
        "base_conf": base_conf,
        "gpt_sentiment": gpt_sentiment,
        "final_sentiment": final_sentiment,
        "risk_level": risk_level,
        "suicide_mention": suicide_mention,
        "used_gpt": use_gpt,
    }

In [11]:
# Show the first rows of the labelled frame as a quick sanity check.
df.head()

Unnamed: 0,text,sentiment,label,label_str
0,"module , ek program hoti hai , jismen ya to so...",neutral,1,neutral
1,aur hamne aume samood ke pas unke bhaee saleh ...,positive,2,positive
2,"aur jab unhen yad dilaya jata hai , to ve yad ...",neutral,1,neutral
3,tumhen २०११ ka ted prize mil gaya hai\n,positive,2,positive
4,unhonne bad science karne ke lie ye delhi univ...,neutral,1,neutral


In [None]:
from tqdm.auto import tqdm
import asyncio

# Evaluate on a copy of the held-out test split with a fresh 0..n-1 index.
# (Alternative subset: df.sample(n=300, random_state=42))
df_eval = test_df.copy().reset_index(drop=True)

# Inputs and reference labels; missing texts become empty strings.
texts = df_eval["text"].fillna("").tolist()
true_labels = df_eval["label_str"].tolist()

# Per-message results, filled in order by eval_loop.
roberta_preds, hybrid_preds, gpt_preds, used_gpt = [], [], [], []

async def eval_loop():
    """Score every entry of ``texts`` with the three classifiers.

    For each text, one entry is appended to each of the module-level lists
    ``roberta_preds``, ``hybrid_preds``, ``used_gpt`` and ``gpt_preds``, so the
    lists stay aligned with ``texts``.

    A failure of the GPT-only call is reported with ``print`` and recorded as an
    empty prediction instead of aborting the run.
    """
    for text in tqdm(texts):
        # Hybrid (the result also carries the small model's own label)
        h = await run_hybrid_sentiment(text)

        # RoBERTa-only: reuse the label the hybrid run already computed instead of
        # running the small model a second time on the same text.
        roberta_preds.append(h["base_label"])
        hybrid_preds.append(h["final_sentiment"])
        used_gpt.append(h["used_gpt"])

        # GPT-only. One failed request must not lose the whole (long) run or leave
        # the result lists with different lengths.
        try:
            g = await call_gpt_classifier(text)
            gpt_label = str(g.get("sentiment") or "").lower()
        except Exception as e:
            print("GPT-only failed:", e)
            gpt_label = ""
        gpt_preds.append(gpt_label)

# Top-level await: this relies on the notebook running the cell inside an event loop.
await eval_loop()

# Attach each classifier's predictions as columns; the lists were filled in the
# order of `texts`, which was taken from df_eval, so rows line up.
df_eval["roberta_pred"] = roberta_preds
df_eval["hybrid_pred"]  = hybrid_preds
df_eval["gpt_pred"]     = gpt_preds


100%|██████████| 554/554 [20:50<00:00,  2.26s/it]


In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reference labels and the three sets of predictions.
y_true = df_eval["label_str"]
y_small = df_eval["roberta_pred"]
y_hybrid = df_eval["hybrid_pred"]
y_gpt = df_eval["gpt_pred"]

# Fixed class order for the confusion matrix.
labels = ["negative", "neutral", "positive"]

# Overall accuracy per classifier.
for caption, y_pred in (
    ("RoBERTa accuracy:  ", y_small),
    ("Hybrid accuracy:   ", y_hybrid),
    ("GPT-only acc:      ", y_gpt),
):
    print(caption, accuracy_score(y_true, y_pred))

# Per-class precision / recall / F1 per classifier.
for banner, y_pred in (
    ("\n=== RoBERTa report ===", y_small),
    ("\n=== HYBRID report ===", y_hybrid),
    ("\n=== GPT-only report ===", y_gpt),
):
    print(banner)
    print(classification_report(y_true, y_pred, digits=3))

print("\nConfusion matrix (HYBRID: rows=true, cols=pred):")
print(confusion_matrix(y_true, y_hybrid, labels=labels))

# Share of messages for which the hybrid pipeline requested GPT.
gpt_share = sum(used_gpt) / len(used_gpt)
print("\nHybrid called GPT on {:.1%} of messages.".format(gpt_share))


RoBERTa accuracy:   0.7509025270758123
Hybrid accuracy:    0.7472924187725631
GPT-only acc:       0.5577617328519856

=== RoBERTa report ===
              precision    recall  f1-score   support

    negative      0.718     0.725     0.721       109
     neutral      0.787     0.777     0.782       247
    positive      0.725     0.732     0.729       198

    accuracy                          0.751       554
   macro avg      0.743     0.745     0.744       554
weighted avg      0.751     0.751     0.751       554


=== HYBRID report ===
              precision    recall  f1-score   support

    negative      0.697     0.780     0.736       109
     neutral      0.753     0.814     0.782       247
    positive      0.776     0.646     0.705       198

    accuracy                          0.747       554
   macro avg      0.742     0.747     0.741       554
weighted avg      0.750     0.747     0.746       554


=== GPT-only report ===
              precision    recall  f1-score   sup