In [None]:
# Print the NVIDIA driver / GPU status for this runtime (shell command via IPython's `!` escape).
!nvidia-smi


In [None]:
import os

# Show the entries of the current working directory ("." is os.listdir's default target).
os.listdir(".")



In [None]:
import pandas as pd

# Read the three CSV splits from the working directory in one pass.
train, val, test = map(pd.read_csv, ("Train.csv", "Val.csv", "Test.csv"))

# Quick look at the training split: size, column index, first rows.
print("Train shape:", train.shape)
print("Columns:", train.columns)
train.head(3)


In [None]:
# List the distinct label values of every split so mismatches between splits are visible.
for caption, frame in (("Train labels:", train), ("Val labels:", val), ("Test labels:", test)):
    print(caption, sorted(frame["Label"].unique()))


In [None]:
# Number of training rows per label value (a quick check of class balance).
print(train["Label"].value_counts())


In [None]:
from transformers import pipeline

MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Load the published checkpoint as a text classifier, before any fine-tuning.
clf = pipeline(task="text-classification", model=MODEL_NAME)

# Smoke test on two Bangla sentences ("I am very happy today" / "very bad service").
for sample in ("আমি আজকে খুব খুশি", "খুব খারাপ সার্ভিস"):
    print(clf(sample))


In [None]:
import pandas as pd

# Standardise the column names ("Data" -> "text", "Label" -> "label") and cast the
# labels to int; each chained expression yields a new frame, leaving the raw one intact.
train_df = train.rename(columns={"Data": "text", "Label": "label"}).astype({"label": int})
val_df = val.rename(columns={"Data": "text", "Label": "label"}).astype({"label": int})
test_df = test.rename(columns={"Data": "text", "Label": "label"}).astype({"label": int})

train_df.head(3)


In [None]:
# Smallest label id in the training split (shows whether label ids start at 0).
min_label = train_df["label"].unique().min()
print("Min label:", min_label)


In [None]:
!pip -q install -U transformers accelerate datasets evaluate


In [None]:
!pip -q install --upgrade --no-cache-dir \
  "pandas==2.2.2" \
  "requests==2.32.4"



In [None]:
!pip -q install --upgrade --no-cache-dir \
  "pyarrow>=21.0.0" \
  "datasets>=4.5.0" \
  "transformers" "accelerate" "evaluate" "scikit-learn"



In [None]:
import numpy as np, pandas as pd
import pyarrow as pa, datasets, transformers

# Print the installed version of each library the training pipeline relies on.
for caption, module in (
    ("numpy:", np),
    ("pandas:", pd),
    ("pyarrow:", pa),
    ("datasets:", datasets),
    ("transformers:", transformers),
):
    print(caption, module.__version__)



In [None]:
import torch
# Report whether PyTorch can see a CUDA device, then print the driver-level GPU status.
print("GPU available:", torch.cuda.is_available())
!nvidia-smi


In [None]:
import os

# Set the WANDB_DISABLED switch before any Trainer is built; intended to keep
# Weights & Biases logging off for this session.
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
import evaluate

MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# The classification head gets one output per distinct label in the training split.
NUM_LABELS = len(train_df["label"].unique())
print("NUM_LABELS =", NUM_LABELS)

# The collator pads each batch on the fly, so it needs the tokenizer to exist first.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def to_ds(df):
    """Build a `datasets.Dataset` from the `text` (as str) and `label` (as int) columns of `df`."""
    return Dataset.from_pandas(df[["text", "label"]].astype({"text": str, "label": int}))


def tok(batch):
    """Tokenize a batch of rows, truncating each text to the tokenizer's maximum length."""
    return tokenizer(batch["text"], truncation=True)


# Columns exposed to the model as torch tensors.
MODEL_COLUMNS = ["input_ids", "attention_mask", "label"]


def _encode(split_df):
    """Convert one split: DataFrame -> Dataset -> tokenized -> torch-formatted."""
    encoded = to_ds(split_df).map(tok, batched=True)
    encoded.set_format("torch", columns=MODEL_COLUMNS)
    return encoded


train_ds, val_ds, test_ds = (_encode(split) for split in (train_df, val_df, test_df))

# ignore_mismatched_sizes lets the checkpoint load even if its head has a different size.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)

acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for a `(logits, labels)` pair supplied by the Trainer."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    accuracy = acc.compute(predictions=preds, references=labels)["accuracy"]
    macro_f1 = f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    return {"accuracy": accuracy, "macro_f1": macro_f1}


args = TrainingArguments(
    output_dir="xlmr_finetuned_bn",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate and checkpoint once per epoch, then restore the best epoch by macro-F1
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    # logging
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Score the restored best checkpoint on the held-out test split.
final_test_metrics = trainer.evaluate(test_ds)
print("FINAL TEST:", final_test_metrics)



In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Run inference on the test split once; the arg-max over the logits is the predicted class.
pred = trainer.predict(test_ds)
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=1)

cm = confusion_matrix(y_true, y_pred)
report = classification_report(y_true, y_pred, digits=4)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Display names indexed by numeric label id (position 0 -> label 0, and so on).
# NOTE(review): this order is an assumption; confirm it against the dataset's label
# definition and change it if the mapping differs.
label_names = ["neutral", "positive", "negative"]

# Predict on the test split; the reference labels come straight from the dataset.
pred = trainer.predict(test_ds)
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=1)

report = classification_report(y_true, y_pred, target_names=label_names, digits=4)
cm = confusion_matrix(y_true, y_pred)

print(report)
print("Confusion Matrix:\n", cm)


In [None]:
# Shared fine-tuning routine: the baseline and every preprocessing variant call this.
def _check_label_encoding(train_df, val_df, test_df):
    """Validate the label ids of the three splits and return the number of classes.

    The classification head is sized from the number of distinct training labels,
    so the ids must be exactly 0..K-1 and the other splits may only contain ids
    that occur in training. Anything else would only fail later, inside the loss.

    Raises:
        ValueError: if the training split has no labels, if its ids are not
            exactly 0..K-1, or if val/test contain an id absent from training.
    """
    train_labels = sorted(int(v) for v in train_df["label"].unique())
    if not train_labels:
        raise ValueError("train_df contains no labels")
    if train_labels != list(range(len(train_labels))):
        raise ValueError(
            f"train labels must be exactly 0..{len(train_labels) - 1}, got {train_labels}; "
            "remap them before calling run_experiment"
        )
    known = set(train_labels)
    for split_name, frame in (("val", val_df), ("test", test_df)):
        unseen = sorted({int(v) for v in frame["label"].unique()} - known)
        if unseen:
            raise ValueError(f"{split_name} split has labels absent from train: {unseen}")
    return len(train_labels)


def run_experiment(train_df, val_df, test_df, exp_name="baseline"):
    """Fine-tune the sentiment checkpoint on `train_df` and evaluate on val and test.

    Args:
        train_df, val_df, test_df: DataFrames with a `text` column and an integer
            `label` column whose ids are 0..K-1.
        exp_name: suffix of the output directory; checkpoints go to `out_<exp_name>`.

    Returns:
        `(val_result, test_result)`: the metric dicts returned by `Trainer.evaluate`
        for the validation and test splits (keys carry the `eval_` prefix).

    Raises:
        ValueError: if the label ids are not a valid 0..K-1 encoding (see
            `_check_label_encoding`).
    """
    # Validate before the heavy imports and model download so bad input fails cheaply.
    NUM_LABELS = _check_label_encoding(train_df, val_df, test_df)

    import numpy as np
    from datasets import Dataset
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
    from transformers import DataCollatorWithPadding
    import evaluate

    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Pads each batch dynamically to its longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def to_ds(df):
        """Keep only `text` (as str) and `label` (as int) and wrap them in a Dataset."""
        df = df[["text","label"]].copy()
        df["text"] = df["text"].astype(str)
        df["label"] = df["label"].astype(int)
        return Dataset.from_pandas(df)

    train_ds = to_ds(train_df)
    val_ds   = to_ds(val_df)
    test_ds  = to_ds(test_df)

    def tok(batch):
        """Tokenize a batch of rows, truncating every text to at most 128 tokens."""
        return tokenizer(batch["text"], truncation=True, max_length=128)

    train_ds = train_ds.map(tok, batched=True)
    val_ds   = val_ds.map(tok, batched=True)
    test_ds  = test_ds.map(tok, batched=True)

    train_ds.set_format("torch", columns=["input_ids","attention_mask","label"])
    val_ds.set_format("torch", columns=["input_ids","attention_mask","label"])
    test_ds.set_format("torch", columns=["input_ids","attention_mask","label"])

    # ignore_mismatched_sizes lets the checkpoint load even if its head has a different size.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=NUM_LABELS, ignore_mismatched_sizes=True
    )

    acc = evaluate.load("accuracy")
    f1  = evaluate.load("f1")

    def compute_metrics(eval_pred):
        """Return accuracy and macro-F1 for a `(logits, labels)` pair."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        return {
            "accuracy": acc.compute(predictions=preds, references=labels)["accuracy"],
            "macro_f1": f1.compute(predictions=preds, references=labels, average="macro")["f1"]
        }

    # Evaluate and checkpoint every epoch; the best epoch by macro-F1 is restored at the end.
    args = TrainingArguments(
        output_dir=f"out_{exp_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        logging_steps=50,
        report_to="none",
        seed=42
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    val_result  = trainer.evaluate(val_ds)
    test_result = trainer.evaluate(test_ds)

    return val_result, test_result

In [None]:
# Baseline input (no preprocessing): rebuild the standardized frames from the raw CSV
# frames, renaming the columns to text/label and casting the labels to int.
train_df, val_df, test_df = (
    raw.rename(columns={"Data": "text", "Label": "label"}).astype({"label": int})
    for raw in (train, val, test)
)

print(train_df.columns)
train_df.head(3)

In [None]:
# Fine-tune on the untouched text; these scores are the reference for the later variants.
val_base, test_base = run_experiment(train_df, val_df, test_df, exp_name="baseline")
for caption, metrics in (("BASELINE VAL:", val_base), ("BASELINE TEST:", test_base)):
    print(caption, metrics)

In [None]:
# Preprocessing functions: run this cell once before the V1/V2/V3 experiment cells.
import re, unicodedata

# Invisible characters common in Bangla copy-paste: zero-width space, zero-width
# non-joiner, zero-width joiner and the byte-order mark.
# NOTE(review): ZWNJ/ZWJ can be orthographically meaningful in Bangla; confirm that
# dropping them is intended.
ZERO_WIDTH = r"[\u200b\u200c\u200d\uFEFF]"
zero_width_re = re.compile(ZERO_WIDTH)   # compiled once instead of being looked up per call
multi_space_re = re.compile(r"\s+")

url_re = re.compile(r"http\S+|www\.\S+")
mention_re = re.compile(r"@\w+")
rt_re = re.compile(r"^RT\s+@\w+:\s*")    # retweet header at the very start of the text only

repeat_punct_re = re.compile(r"([!?।])\1{2,}")   # the same mark three or more times in a row

bn_to_en_digits = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")


def preprocess_v1(text: str) -> str:
    """V1: safe cleaning (Unicode NFKC normalize + zero-width removal + spacing).

    Missing values (`None` or float NaN, which is how pandas represents an empty
    cell) become the empty string instead of the literal text "nan".
    """
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    text = unicodedata.normalize("NFKC", str(text))
    text = zero_width_re.sub("", text)
    return multi_space_re.sub(" ", text).strip()


def preprocess_v2(text: str) -> str:
    """V2: Twitter normalization (V1 + drop the RT header + mask URLs and user mentions)."""
    text = preprocess_v1(text)
    text = rt_re.sub("", text)
    text = url_re.sub(" [URL] ", text)
    text = mention_re.sub(" [USER] ", text)
    # The masks are inserted with surrounding spaces, so collapse whitespace again.
    return multi_space_re.sub(" ", text).strip()


def preprocess_v3(text: str) -> str:
    """V3: V2 + Bangla digits mapped to ASCII digits + runs of !, ? or । cut to two."""
    text = preprocess_v2(text)
    text = text.translate(bn_to_en_digits)
    text = repeat_punct_re.sub(r"\1\1", text)  # "!!!!!" -> "!!"
    return multi_space_re.sub(" ", text).strip()



In [None]:
# Preprocessing experiment V1: same protocol as the baseline, but the text column of
# every split is first passed through preprocess_v1 (each .assign returns a new frame).
train_v1, val_v1, test_v1 = (
    split.assign(text=split["text"].apply(preprocess_v1))
    for split in (train_df, val_df, test_df)
)

val_v1_res, test_v1_res = run_experiment(train_v1, val_v1, test_v1, exp_name="preprocess_v1")
for caption, metrics in (("V1 VAL:", val_v1_res), ("V1 TEST:", test_v1_res)):
    print(caption, metrics)



In [None]:
# Preprocessing experiment V2: the text column of every split goes through preprocess_v2
# before fine-tuning (each .assign returns a new frame; train_df etc. stay untouched).
train_v2, val_v2, test_v2 = (
    split.assign(text=split["text"].apply(preprocess_v2))
    for split in (train_df, val_df, test_df)
)

val_v2_res, test_v2_res = run_experiment(train_v2, val_v2, test_v2, exp_name="preprocess_v2")
for caption, metrics in (("V2 VAL:", val_v2_res), ("V2 TEST:", test_v2_res)):
    print(caption, metrics)


In [None]:
# Preprocessing experiment V3: the text column of every split goes through preprocess_v3
# before fine-tuning (each .assign returns a new frame; train_df etc. stay untouched).
train_v3, val_v3, test_v3 = (
    split.assign(text=split["text"].apply(preprocess_v3))
    for split in (train_df, val_df, test_df)
)

val_v3_res, test_v3_res = run_experiment(train_v3, val_v3, test_v3, exp_name="preprocess_v3")
for caption, metrics in (("V3 VAL:", val_v3_res), ("V3 TEST:", test_v3_res)):
    print(caption, metrics)


In [None]:
# Side-by-side comparison: collect (validation, test) macro-F1 for each run and print
# one aligned row per run.
_runs = {
    "baseline": (val_base, test_base),
    "v1": (val_v1_res, test_v1_res),
    "v2": (val_v2_res, test_v2_res),
    "v3": (val_v3_res, test_v3_res),
}
results = {
    run_name: (val_metrics["eval_macro_f1"], test_metrics["eval_macro_f1"])
    for run_name, (val_metrics, test_metrics) in _runs.items()
}

for run_name, (val_f1, test_f1) in results.items():
    print(f"{run_name:8s} | VAL macro-F1: {val_f1:.4f} | TEST macro-F1: {test_f1:.4f}")


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only) so artifacts can be stored under MyDrive.
drive.mount('/content/drive')


In [None]:
from google.colab import drive
# Repeats the mount call of the preceding cell; same mount point, /content/drive.
drive.mount('/content/drive')


In [None]:
# Create the Drive folder that receives the project artifacts (-p: no error if it already exists).
!mkdir -p /content/drive/MyDrive/colab_project


In [None]:
!mv out_baseline /content/drive/MyDrive/colab_project/
!mv out_preprocess_v1 /content/drive/MyDrive/colab_project/
!mv out_preprocess_v2 /content/drive/MyDrive/colab_project/
!mv sample_data /content/drive/MyDrive/colab_project/
!mv xlmr_finetuned_bn /content/drive/MyDrive/colab_project/

!mv Train.csv /content/drive/MyDrive/colab_project/
!mv Val.csv /content/drive/MyDrive/colab_project/
!mv Test.csv /content/drive/MyDrive/colab_project/
