In [None]:
# Mount Google Drive so files under /content/drive are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Report whether PyTorch can see a CUDA GPU and, if it can, which device it is.
import torch

cuda_ok = torch.cuda.is_available()
print("🔥 CUDA available:", cuda_ok)

# Only query the device name when CUDA is actually usable; fall back to "CPU".
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "CPU"
print("📱 Device:", gpu_name)

🔥 CUDA available: True
📱 Device: Tesla T4


In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os
from datetime import datetime

# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"🚀 Using device: {device}")

🚀 Using device: cuda


In [None]:
import os
import pandas as pd

# Path to trusted dataset in Google Drive
data_path = "/content/drive/MyDrive/NLP_Project/trusted_discipline_dataset.csv"

# Check if file exists. When it does not, show what *is* in the folder and stop
# with an explicit error instead of falling through to pd.read_csv and failing
# there with a less helpful traceback.
if os.path.exists(data_path):
    print("✅ Dataset file found!")
else:
    print("❌ Dataset file not found. Please check the path:")
    print(f"\nLooking for: {data_path}")
    print("\n📂 Available files in your drive:")
    data_dir = os.path.dirname(data_path)
    # Pure-Python directory listing (no shell magic), guarded so a missing
    # folder produces a message rather than a second error.
    if os.path.isdir(data_dir):
        for entry in sorted(os.listdir(data_dir)):
            print(entry)
    else:
        print(f"(directory not found: {data_dir})")
    raise FileNotFoundError(f"Dataset file not found: {data_path}")

# Load data
df = pd.read_csv(data_path)

# Explore data
print(f"\n📊 Dataset shape: {df.shape}")
print(f"\n🧱 Columns: {list(df.columns)}")
print("\n📝 First few rows:")
print(df.head())

# Check for missing values
print("\n🚨 Missing values:")
print(df.isnull().sum())

# Check class distribution
# NOTE(review): this counts 'v2.2_predicted_label', while the cleaning cell
# builds the training label from 'Discipline' — confirm the two columns agree.
print("\n📈 Discipline distribution:")
print(df['v2.2_predicted_label'].value_counts())

✅ Dataset file found!

📊 Dataset shape: (4838, 7)

🧱 Columns: ['Title', 'Abstract', 'Discipline', 'Link', 'text_input', 'v2.2_predicted_label', 'v2.2_trust_score']

📝 First few rows:
                                               Title  \
0  VITA-Audio: Fast Interleaved Cross-Modal Token...   
1  AMO: Adaptive Motion Optimization for Hyper-De...   
2  FlexiAct: Towards Flexible Action Control in H...   
3  Actor-Critics Can Achieve Optimal Sample Effic...   
4  Demonstrating ViSafe: Vision-enabled Safety fo...   

                                            Abstract Discipline  \
0  With the growing requirement for natural human...         CS   
1  Humanoid robots derive much of their dexterity...         CS   
2  Action customization involves generating video...         CS   
3  Actor-critic algorithms have become a cornerst...         CS   
4  Assured safe-separation is essential for achie...         CS   

                                Link  \
0  http://arxiv.org/abs/2505.03739v1 

In [None]:
# Clean the data: drop incomplete rows, build the model input text, and attach
# integer class labels. The result is a two-column frame ("text", "label").
print("🧹 Cleaning data...")
original_size = len(df)
# .copy() so the column assignments below operate on an independent frame.
df = df.dropna(subset=["Title", "Abstract", "Discipline"]).copy()
print(f"Removed {original_size - len(df)} rows with missing values")

# Combine title and abstract
df["text"] = df["Title"].str.strip() + ". " + df["Abstract"].str.strip()

# Create label mappings
label2id = {"CS": 0, "IS": 1, "IT": 2}
id2label = {v: k for k, v in label2id.items()}

# Map disciplines to numeric labels (values missing from label2id become NaN)
df["label"] = df["Discipline"].map(label2id)

# Check if all disciplines were mapped correctly
unmapped = df["label"].isnull().sum()
if unmapped > 0:
    print(f"⚠️ Warning: {unmapped} rows have unmapped disciplines")
    print("Unique disciplines found:", df["Discipline"].unique())
    df = df.dropna(subset=["label"])

# Keep only necessary columns
df = df[["text", "label"]].copy()

# A NaN produced by .map() turns the whole column into float64; cast back to
# integers so the labels are valid class indices even after unmapped rows were
# dropped.
df["label"] = df["label"].astype("int64")

# Rows may have been removed above; restore a contiguous 0..n-1 index so the
# old row numbers are not carried along as data.
df = df.reset_index(drop=True)

print(f"✅ Final dataset shape: {df.shape}")
print(f"📊 Label distribution:")
print(df["label"].value_counts())

# Show sample processed data (at most two rows; fewer if the frame is smaller)
print(f"\n📝 Sample processed data:")
for i in range(min(2, len(df))):
    row = df.iloc[i]
    print(f"\nExample {i+1}:")
    print(f"Text: {row['text'][:200]}...")
    print(f"Label: {row['label']} ({id2label[row['label']]})")

🧹 Cleaning data...
Removed 0 rows with missing values
✅ Final dataset shape: (4838, 2)
📊 Label distribution:
label
0    2402
1    1400
2    1036
Name: count, dtype: int64

📝 Sample processed data:

Example 1:
Text: VITA-Audio: Fast Interleaved Cross-Modal Token Generation for Efficient Large Speech-Language Model. With the growing requirement for natural human-computer interaction, speech-based systems receive i...
Label: 0 (CS)

Example 2:
Text: AMO: Adaptive Motion Optimization for Hyper-Dexterous Humanoid Whole-Body Control. Humanoid robots derive much of their dexterity from hyper-dexterous whole-body movements, enabling tasks that require...
Label: 0 (CS)


In [None]:
print("🤖 Loading SciBERT model and tokenizer...")

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load model for sequence classification.
# num_labels is derived from the label map instead of being hard-coded, so the
# classifier head always matches the id2label/label2id dictionaries passed in.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

print("✅ Model and tokenizer loaded successfully!")
print(f"📏 Model parameters: {model.num_parameters():,}")

🤖 Loading SciBERT model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model and tokenizer loaded successfully!
📏 Model parameters: 109,920,771


In [None]:
print("🔤 Tokenizing data...")

def tokenize_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: A batch mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 512 tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# Convert to HuggingFace Dataset.
# preserve_index=False keeps the pandas index out of the dataset; without it a
# non-Range index (e.g. after rows were dropped) is stored as an extra
# "__index_level_0__" column.
dataset = Dataset.from_pandas(df, preserve_index=False)
print(f"📦 Created dataset with {len(dataset)} samples")

# Tokenize the dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    desc="Tokenizing"
)

# Remove the original text column (not needed for training)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

print("✅ Tokenization completed!")

🔤 Tokenizing data...
📦 Created dataset with 4838 samples


Tokenizing:   0%|          | 0/4838 [00:00<?, ? examples/s]

✅ Tokenization completed!


In [None]:
print("📂 Splitting data into train/validation sets...")

# Hold out 20% of the examples for validation; the seed keeps the split repeatable.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# The target column is renamed from "label" to "labels" on both partitions.
train_dataset, eval_dataset = (
    split_dataset[part].rename_column("label", "labels")
    for part in ("train", "test")
)

print(f"📚 Training samples: {len(train_dataset)}")
print(f"🔍 Validation samples: {len(eval_dataset)}")

# Pull out the label columns once, then report per-class counts for each split.
train_labels = train_dataset["labels"]
eval_labels = eval_dataset["labels"]

for heading, split_labels in (
    ("\n📊 Training set distribution:", train_labels),
    ("\n📊 Validation set distribution:", eval_labels),
):
    print(heading)
    total = len(split_labels)
    for class_id, class_name in id2label.items():
        n_hits = sum(1 for value in split_labels if value == class_id)
        print(f"  {class_name}: {n_hits} ({n_hits/total*100:.1f}%)")

📂 Splitting data into train/validation sets...
📚 Training samples: 3870
🔍 Validation samples: 968

📊 Training set distribution:
  CS: 1924 (49.7%)
  IS: 1121 (29.0%)
  IT: 825 (21.3%)

📊 Validation set distribution:
  CS: 478 (49.4%)
  IS: 279 (28.8%)
  IT: 211 (21.8%)


In [None]:
from datetime import datetime
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import TrainingArguments
import torch

print("⚙️ Setting up training configuration...")

# ⬇️ Metrics Function
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class for
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        A dict with the keys "accuracy", "f1_macro" and "f1_weighted".
    """
    logits, gold = eval_pred
    predicted = np.argmax(logits, axis=1)

    scores = {"accuracy": accuracy_score(gold, predicted)}
    for averaging in ("macro", "weighted"):
        scores[f"f1_{averaging}"] = f1_score(gold, predicted, average=averaging)
    return scores

# 📁 Timestamped output path (one directory per run, so checkpoints never collide)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"/content/drive/MyDrive/NLP_Project/scibert_lora_v4.0_{timestamp}"

# ⚙️ TrainingArguments setup
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    logging_steps=50,
    save_total_limit=2,
    # Restore the checkpoint with the highest macro-F1 when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    seed=42,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU
    # The string "none" is what disables every reporting integration.
    # Passing None does NOT disable reporting: Transformers 4.x treats it as
    # "all installed integrations" (WandB, TensorBoard, ...).
    report_to="none",
)

print("✅ Training configuration ready!")
print(f"📁 Results will be saved to: {output_dir}")

⚙️ Setting up training configuration...
✅ Training configuration ready!
📁 Results will be saved to: /content/drive/MyDrive/NLP_Project/scibert_lora_v4.0_20250606_094449


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments

# Rebuild the training configuration with every reporting integration switched
# off. The settings are collected in one mapping and expanded into the call.
run_settings = {
    # schedule and batch sizes
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    # logging, evaluation and checkpoint cadence
    "logging_steps": 50,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    # best-checkpoint selection
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "greater_is_better": True,
    # numerics and reproducibility
    "fp16": torch.cuda.is_available(),
    "seed": 42,
    # empty list: disables wandb & tensorboard
    "report_to": [],
}
training_args = TrainingArguments(output_dir=output_dir, **run_settings)

print("✅ TrainingArguments created with WandB disabled.")

✅ TrainingArguments created with WandB disabled.


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is shared equally by the other ``num_classes - 1``
    classes.
    """

    def __init__(self, num_classes, smoothing=0.1):
        super().__init__()
        self.num_classes = num_classes
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, logits, target):
        """Return the mean smoothed cross-entropy over the batch.

        Args:
            logits: Unnormalised class scores; the class axis is the last one
                (and axis 1, since the target is scattered along dim 1).
            target: Integer class indices, one per row of ``logits``.
        """
        log_probs = F.log_softmax(logits, dim=-1)

        # Start every class at the "off" probability, then overwrite the
        # true-class entry of each row with the confidence value.
        off_value = self.smoothing / (self.num_classes - 1)
        soft_targets = torch.full_like(log_probs, off_value)
        soft_targets.scatter_(1, target.unsqueeze(1), self.confidence)

        per_example = torch.sum(-soft_targets * log_probs, dim=-1)
        return per_example.mean()


In [None]:
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is the notebook's LabelSmoothingLoss.

    Args:
        label_smoothing: Smoothing factor forwarded to LabelSmoothingLoss.
        num_labels: Number of classes forwarded to LabelSmoothingLoss.
        *args, **kwargs: Passed through unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, label_smoothing=0.1, num_labels=3, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = LabelSmoothingLoss(
            smoothing=label_smoothing,
            num_classes=num_labels,
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the smoothed loss for one batch.

        The "labels" entry is removed from ``inputs`` before the forward pass,
        so the model receives only the remaining entries. Extra keyword
        arguments are accepted and ignored.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        loss = self.loss_fn(outputs.logits, targets)
        if return_outputs:
            return loss, outputs
        return loss


In [None]:
print("🏋️ Initializing custom trainer with label smoothing...")

custom_trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated on Trainer (it triggers the warning seen in
    # this cell's output); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    label_smoothing=0.1,  # ε = 0.1
    # Derived from the label map so the loss always matches the classifier head.
    num_labels=len(id2label),
)

print("🚀 Starting training with label smoothing (ε = 0.1)...")

training_output = custom_trainer.train()

print("✅ Training completed!")

🏋️ Initializing custom trainer with label smoothing...
🚀 Starting training with label smoothing (ε = 0.1)...


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.7783,0.798887,0.756198,0.700871,0.737603
2,0.6966,0.782192,0.780992,0.762256,0.780443
3,0.6392,0.789135,0.786157,0.761774,0.783906
4,0.4831,0.815274,0.790289,0.766642,0.787146
5,0.4386,0.832718,0.792355,0.76856,0.789429


✅ Training completed!


In [None]:
from sklearn.metrics import classification_report

print("📊 Evaluating final model...")

# Use custom_trainer instead of trainer
eval_results = custom_trainer.evaluate()

print("\n🎯 Final Evaluation Results:")
print("=" * 50)
for key, value in eval_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

# Get predictions for analysis
print("\n🔍 Generating detailed predictions...")
predictions = custom_trainer.predict(eval_dataset)

# Predicted class = argmax over the class axis of the returned scores.
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Classification report
print("\n📋 Detailed Classification Report:")
print("=" * 50)
# Pass the class ids explicitly alongside their names: without `labels`,
# classification_report infers the classes from the data and raises when a
# class is absent from both y_true and y_pred (name count mismatch).
class_ids = sorted(id2label)
target_names = [id2label[i] for i in class_ids]
print(classification_report(y_true, y_pred, labels=class_ids, target_names=target_names))

📊 Evaluating final model...



🎯 Final Evaluation Results:
eval_loss: 0.8327
eval_accuracy: 0.7924
eval_f1_macro: 0.7686
eval_f1_weighted: 0.7894
eval_runtime: 7.3141
eval_samples_per_second: 132.3470
eval_steps_per_second: 8.3400
epoch: 5.0000

🔍 Generating detailed predictions...

📋 Detailed Classification Report:
              precision    recall  f1-score   support

          CS       0.81      0.87      0.84       478
          IS       0.80      0.78      0.79       279
          IT       0.74      0.63      0.68       211

    accuracy                           0.79       968
   macro avg       0.78      0.76      0.77       968
weighted avg       0.79      0.79      0.79       968



In [21]:
import joblib
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Create timestamped final directory (reuses the run's timestamp so the saved
# model can be matched to its training output directory)
hf_save_path = f"/content/drive/MyDrive/NLP_Project/discipline_classifier_scilora_v4.0_{timestamp}"
os.makedirs(hf_save_path, exist_ok=True)

# Save the fine-tuned model (HF format: config plus weights)
model.save_pretrained(hf_save_path)

# Save tokenizer (HF format)
tokenizer.save_pretrained(hf_save_path)  # saves vocab.txt, tokenizer_config.json, etc.

# Save label maps with versioned filenames
joblib.dump(label2id, f"{hf_save_path}/label2id_v4.0.pkl")
joblib.dump(id2label, f"{hf_save_path}/id2label_v4.0.pkl")

# Save model metadata.
# The discipline list and label mapping are derived from label2id so the
# metadata cannot drift from the mapping the model was trained with.
# "architecture"/"framework" describe what this notebook does: the model is
# loaded with AutoModelForSequenceClassification and trained directly; no
# LoRA/PEFT adapter is applied anywhere in the notebook.
model_info = {
    "model_name": "discipline_classifier_scilora_v4.0",
    "version": "4.0",
    "timestamp": timestamp,
    "disciplines": list(label2id),
    "architecture": "SciBERT (full fine-tuning)",
    "trained_on": "trusted_discipline_dataset.csv",
    "label_mapping": dict(label2id),
    "framework": "HuggingFace Transformers",
}
joblib.dump(model_info, f"{hf_save_path}/model_info_v4.0.pkl")

print("✅ Model, tokenizer, and metadata saved using Hugging Face standard.")
print(f"📁 Location: {hf_save_path}")

✅ Model, tokenizer, and metadata saved using Hugging Face standard.
📁 Location: /content/drive/MyDrive/NLP_Project/discipline_classifier_scilora_v4.0_20250606_094449
