In [None]:
# ✅ STEP 1: Install Dependencies
!pip install -q transformers datasets scikit-learn pandas

# ✅ STEP 2: Import Libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch


In [None]:

# ✅ STEP 3: Load Your Training and Test Data
# Both splits are read from CSV files under `sample_data/`.
TRAIN_CSV_PATH = "sample_data/complexity_labeled_queries.csv"
TEST_CSV_PATH = "sample_data/complexity_test_set.csv"

train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Preview the first rows of each split under its heading.
for heading, frame in (("✅ Training Data:", train_df), ("\n✅ Test Data:", test_df)):
    print(heading)
    print(frame.head())


In [None]:

# ✅ STEP 4: Encode labels
# Map the string classes in the `complexity` column to integer ids and store
# them in a new `label` column on both frames.
label2id = {"simple": 0, "medium": 1, "complex": 2}
id2label = {v: k for k, v in label2id.items()}

for split_name, split_df in (("train", train_df), ("test", test_df)):
    encoded = split_df["complexity"].map(label2id)
    # `.map` turns every value that is not a key of `label2id` (including
    # missing values) into NaN. Fail here, naming the offending values,
    # instead of letting NaN labels reach the training loop.
    unmapped = split_df.loc[encoded.isna(), "complexity"]
    if not unmapped.empty:
        raise ValueError(
            f"{split_name} data has complexity values not in label2id: "
            f"{sorted(unmapped.astype(str).unique())}"
        )
    split_df["label"] = encoded.astype("int64")

In [None]:
# ✅ STEP 5: Tokenizer & Model
BASE_MODEL_NAME = "bert-large-uncased"

tokenizer = BertTokenizer.from_pretrained(BASE_MODEL_NAME)

# The class count comes from the label mapping, and both mappings are handed
# to the model config so saved checkpoints carry the class names rather than
# the generic LABEL_0 / LABEL_1 / LABEL_2 placeholders.
model = BertForSequenceClassification.from_pretrained(
    BASE_MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# ✅ STEP 6: Tokenization
def preprocess_function(examples, max_length=64):
    """Tokenize the ``"text"`` entries of a batch with the notebook tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to encode,
            as passed by ``Dataset.map(..., batched=True)``.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 64, the value this notebook was written with.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

from datasets import Dataset

# Columns handed to the model as torch tensors.
MODEL_COLUMNS = ["input_ids", "attention_mask", "label"]


def build_tokenized_dataset(frame):
    """Turn a frame with `query`/`label` columns into a tokenized torch-formatted Dataset."""
    text_and_label = frame[["query", "label"]].rename(columns={"query": "text"})
    dataset = Dataset.from_pandas(text_and_label)
    dataset = dataset.map(preprocess_function, batched=True)
    dataset.set_format(type="torch", columns=MODEL_COLUMNS)
    return dataset


train_dataset = build_tokenized_dataset(train_df)
test_dataset = build_tokenized_dataset(test_df)

# ✅ STEP 7: Training Setup
import inspect

# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install step does not
# pin a version, so pick whichever keyword the installed release accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./bert-complexity",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none",
    # Evaluation and checkpointing both happen once per epoch so the best
    # checkpoint (by the "accuracy" metric) can be restored after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    **{_eval_strategy_key: "epoch"},
)

In [None]:
def compute_metrics(p):
    """Return the accuracy of an evaluation pass.

    `p` exposes `predictions` (per-class scores, one row per example) and
    `label_ids` (the reference class ids). The predicted class is the
    highest-scoring column of each row.
    """
    scores = p.predictions
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(p.label_ids, predicted_ids)
    return {"accuracy": accuracy}

import inspect

# Newer transformers releases take the tokenizer through `processing_class`;
# older ones only know `tokenizer`. The install step does not pin a version,
# so use whichever name the installed Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# NOTE(review): `eval_dataset` is the test set, and `training_args` restores
# the best checkpoint by accuracy on it, so the scores reported later on the
# same test set are optimistically biased. Consider holding out a separate
# validation split for checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# ✅ STEP 8: Train!
trainer.train()

# ✅ STEP 9: Evaluate
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Passing the class ids and names explicitly fixes the row order and length,
# so the report still works when a class is absent from the test split
# (otherwise `target_names` no longer matches the labels that were found).
class_ids = list(label2id.values())
class_names = list(label2id.keys())

print("✅ Classification Report:")
print(
    classification_report(
        # Reference labels as returned by `trainer.predict`, aligned with `preds`.
        predictions.label_ids,
        preds,
        labels=class_ids,
        target_names=class_names,
    )
)


In [None]:
from sklearn.metrics import classification_report

# Re-print the report from the predictions computed in the evaluation step.
# Explicit `labels` / `target_names` lists keep the report valid even when a
# class is missing from the test split.
print(
    classification_report(
        predictions.label_ids,
        preds,
        labels=list(label2id.values()),
        target_names=list(label2id.keys()),
    )
)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Fix the class order explicitly so the matrix always has one row/column per
# entry of `label2id`, matching the tick labels even if a class never occurs
# in the test labels or the predictions.
class_ids = list(label2id.values())
class_names = list(label2id.keys())

cm = confusion_matrix(predictions.label_ids, preds, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

fig, ax = plt.subplots()
disp.plot(cmap="Blues", ax=ax)
ax.set_title("Query complexity: confusion matrix (test set)")
plt.show()


In [None]:
# ✅ STEP 1: Install dependencies (if not already installed)
!pip install -q transformers torch onnx onnxruntime onnxruntime-tools

# ✅ STEP 2: Load fine-tuned model
import os

from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Checkpoint used when no trainer from the training cells is available.
# The step number depends on dataset size and batch size, so adjust it to a
# directory that actually exists under ./bert-complexity.
FALLBACK_CHECKPOINT = "./bert-complexity/checkpoint-564"

# If this kernel still holds the trainer, export the checkpoint it recorded
# as best instead of a hard-coded step number.
_active_trainer = globals().get("trainer")
_best_checkpoint = getattr(
    getattr(_active_trainer, "state", None), "best_model_checkpoint", None
)
model_path = _best_checkpoint or FALLBACK_CHECKPOINT

# Fail with a clear message rather than an opaque loading error.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"Checkpoint directory not found: {model_path!r}. "
        "Point model_path at an existing checkpoint under ./bert-complexity."
    )

tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()  # inference mode for the export below

# ✅ STEP 3: Prepare dummy input
# One example padded/truncated to 64 tokens serves as the example input
# for the export call.
sample_query = "Set a timer for 5 minutes"
inputs = tokenizer(
    sample_query,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=64,
)

# ✅ STEP 4: Export to ONNX (FP32)
fp32_onnx_file = "bert_complexity_fp32.onnx"
example_args = (inputs["input_ids"], inputs["attention_mask"])

# Axes declared dynamic: batch and sequence length on both inputs,
# batch only on the logits output.
axis_names = {
    "input_ids": {0: "batch_size", 1: "seq_len"},
    "attention_mask": {0: "batch_size", 1: "seq_len"},
    "logits": {0: "batch_size"},
}

torch.onnx.export(
    model,
    example_args,
    fp32_onnx_file,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes=axis_names,
    opset_version=14,
)
print("✅ Exported to bert_complexity_fp32.onnx")

# ✅ STEP 5: Quantize to INT8 (optional but faster)
from onnxruntime.quantization import quantize_dynamic, QuantType

# Read the FP32 export and write a dynamically quantized copy with
# signed 8-bit weights.
fp32_model_file = "bert_complexity_fp32.onnx"
int8_model_file = "bert_complexity_int8.onnx"

quantize_dynamic(
    model_input=fp32_model_file,
    model_output=int8_model_file,
    weight_type=QuantType.QInt8,
)
print("✅ Quantized to bert_complexity_int8.onnx")

# ✅ STEP 6: Download ONNX files (if in Colab)
# `google.colab` only exists inside Colab; elsewhere skip the browser
# download instead of crashing the cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print("Not running in Colab; ONNX files were left in the working directory.")
else:
    for onnx_file in ("bert_complexity_fp32.onnx", "bert_complexity_int8.onnx"):
        files.download(onnx_file)
