In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Check if we have GPU
import torch
print("🔥 CUDA available:", torch.cuda.is_available())
print("📱 Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

🔥 CUDA available: True
📱 Device: Tesla T4


In [3]:
!pip install transformers==4.21.0 datasets==2.4.0 torch scikit-learn pandas numpy==1.21.6
!pip install accelerate evaluate

Collecting transformers==4.21.0
  Downloading transformers-4.21.0-py3-none-any.whl.metadata (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/82.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.4.0
  Downloading datasets-2.4.0-py3-none-any.whl.metadata (20 kB)
[31mERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement numpy==1.21.6 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0,

In [4]:
import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os
from datetime import datetime

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

🚀 Using device: cuda


In [5]:
data_path = "/content/drive/MyDrive/NLP_Project/Expanded Discipline Dataset.csv"

# Check if file exists
if os.path.exists(data_path):
    print("✅ Dataset file found!")
else:
    print("❌ Dataset file not found. Please check the path:")
    print(f"Looking for: {data_path}")
    print("\n📁 Available files in your drive:")
    !ls "/content/drive/MyDrive/"

df = pd.read_csv(data_path)

# Explore the data
print(f"📊 Dataset shape: {df.shape}")
print(f"\n📋 Columns: {list(df.columns)}")
print(f"\n🔍 First few rows:")
print(df.head())

# Check for missing values
print(f"\n❓ Missing values:")
print(df.isnull().sum())

# Check discipline distribution
print(f"\n📈 Discipline distribution:")
print(df['Discipline'].value_counts())

✅ Dataset file found!
📊 Dataset shape: (5402, 4)

📋 Columns: ['Title', 'Abstract', 'Discipline', 'Link']

🔍 First few rows:
                                               Title  \
0  VITA-Audio: Fast Interleaved Cross-Modal Token...   
1  AMO: Adaptive Motion Optimization for Hyper-De...   
2  FlexiAct: Towards Flexible Action Control in H...   
3  Actor-Critics Can Achieve Optimal Sample Effic...   
4  Demonstrating ViSafe: Vision-enabled Safety fo...   

                                            Abstract Discipline  \
0  With the growing requirement for natural human...         CS   
1  Humanoid robots derive much of their dexterity...         CS   
2  Action customization involves generating video...         CS   
3  Actor-critic algorithms have become a cornerst...         CS   
4  Assured safe-separation is essential for achie...         CS   

                                Link  
0  http://arxiv.org/abs/2505.03739v1  
1  http://arxiv.org/abs/2505.03738v1  
2  http://arxiv.org

In [6]:
# Clean the data
print("🧹 Cleaning data...")
original_size = len(df)
df = df.dropna(subset=["Title", "Abstract", "Discipline"])
print(f"Removed {original_size - len(df)} rows with missing values")

# Combine title and abstract
df["text"] = df["Title"].str.strip() + ". " + df["Abstract"].str.strip()

# Create label mappings
label2id = {"CS": 0, "IS": 1, "IT": 2}
id2label = {v: k for k, v in label2id.items()}

# Map disciplines to numeric labels
df["label"] = df["Discipline"].map(label2id)

# Check if all disciplines were mapped correctly
unmapped = df["label"].isnull().sum()
if unmapped > 0:
    print(f"⚠️ Warning: {unmapped} rows have unmapped disciplines")
    print("Unique disciplines found:", df["Discipline"].unique())
    df = df.dropna(subset=["label"])

# Keep only necessary columns
df = df[["text", "label"]].copy()

print(f"✅ Final dataset shape: {df.shape}")
print(f"📊 Label distribution:")
print(df["label"].value_counts())

# Show sample processed data
print(f"\n📝 Sample processed data:")
for i in range(2):
    print(f"\nExample {i+1}:")
    print(f"Text: {df.iloc[i]['text'][:200]}...")
    print(f"Label: {df.iloc[i]['label']} ({id2label[df.iloc[i]['label']]})")

🧹 Cleaning data...
Removed 0 rows with missing values
✅ Final dataset shape: (5402, 2)
📊 Label distribution:
label
0    2570
1    1581
2    1251
Name: count, dtype: int64

📝 Sample processed data:

Example 1:
Text: VITA-Audio: Fast Interleaved Cross-Modal Token Generation for Efficient Large Speech-Language Model. With the growing requirement for natural human-computer interaction, speech-based systems receive i...
Label: 0 (CS)

Example 2:
Text: AMO: Adaptive Motion Optimization for Hyper-Dexterous Humanoid Whole-Body Control. Humanoid robots derive much of their dexterity from hyper-dexterous whole-body movements, enabling tasks that require...
Label: 0 (CS)


In [7]:
print("🤖 Loading SciBERT model and tokenizer...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Load model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

print("✅ Model and tokenizer loaded successfully!")
print(f"📏 Model parameters: {model.num_parameters():,}")

🤖 Loading SciBERT model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model and tokenizer loaded successfully!
📏 Model parameters: 109,920,771


In [8]:
print("🔤 Tokenizing data...")

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)
print(f"📦 Created dataset with {len(dataset)} samples")

# Tokenize the dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    desc="Tokenizing"
)

# Remove the original text column (not needed for training)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

print("✅ Tokenization completed!")

🔤 Tokenizing data...
📦 Created dataset with 5402 samples


Tokenizing:   0%|          | 0/5402 [00:00<?, ? examples/s]

✅ Tokenization completed!


In [9]:
print("📂 Splitting data into train/validation sets...")

# Split the data (80% train, 20% validation)
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Rename label column to match Trainer expectations
train_dataset = split_dataset["train"].rename_column("label", "labels")
eval_dataset = split_dataset["test"].rename_column("label", "labels")

print(f"📚 Training samples: {len(train_dataset)}")
print(f"🔍 Validation samples: {len(eval_dataset)}")

# Check class distribution in splits
train_labels = train_dataset["labels"]
eval_labels = eval_dataset["labels"]

print(f"\n📊 Training set distribution:")
for label, name in id2label.items():
    count = sum(1 for x in train_labels if x == label)
    print(f"  {name}: {count} ({count/len(train_labels)*100:.1f}%)")

print(f"\n📊 Validation set distribution:")
for label, name in id2label.items():
    count = sum(1 for x in eval_labels if x == label)
    print(f"  {name}: {count} ({count/len(eval_labels)*100:.1f}%)")

📂 Splitting data into train/validation sets...
📚 Training samples: 4321
🔍 Validation samples: 1081

📊 Training set distribution:
  CS: 2053 (47.5%)
  IS: 1256 (29.1%)
  IT: 1012 (23.4%)

📊 Validation set distribution:
  CS: 517 (47.8%)
  IS: 325 (30.1%)
  IT: 239 (22.1%)


In [11]:
print("⚙️ Setting up training configuration...")

def compute_metrics(eval_pred):
    """Compute accuracy and F1 score for evaluation"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }

# Create output directory with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"/content/drive/MyDrive/NLP_Project/scibert_results_{timestamp}"

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Adjust based on GPU memory
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir=f"{output_dir}/logs",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=2,  # Keep only 2 best models
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    report_to=None,  # Disable wandb/tensorboard logging
    seed=42,
)

print("✅ Training configuration ready!")
print(f"📁 Results will be saved to: {output_dir}")

⚙️ Setting up training configuration...
✅ Training configuration ready!
📁 Results will be saved to: /content/drive/MyDrive/NLP_Project/scibert_results_20250605_093813


In [21]:
# Complete fix for wandb issues
!pip install wandb
import os
os.environ["WANDB_DISABLED"] = "true"

# Restart the trainer with explicit wandb disabling
from transformers import TrainingArguments, Trainer

# Recreate training arguments with wandb explicitly disabled
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to=[],  # This completely disables wandb
    seed=42,
)

print("✅ Training arguments recreated with wandb disabled")

✅ Training arguments recreated with wandb disabled


In [22]:
print("🏋️ Initializing trainer...")

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("🚀 Starting training...")
print("This will take approximately 30-45 minutes on GPU...")

# Start training
training_output = trainer.train()

print("✅ Training completed!")

🏋️ Initializing trainer...
🚀 Starting training...
This will take approximately 30-45 minutes on GPU...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.6596,0.587018,0.792784,0.773213,0.789031
2,0.489,0.543883,0.807586,0.787921,0.803066
3,0.2765,0.679862,0.820537,0.809153,0.819164


✅ Training completed!


In [23]:
print("📊 Evaluating final model...")

# Final evaluation
eval_results = trainer.evaluate()

print("\n🎯 Final Evaluation Results:")
print("=" * 50)
for key, value in eval_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

# Get detailed predictions for analysis
print("\n🔍 Generating detailed predictions...")
predictions = trainer.predict(eval_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Classification report
print("\n📋 Detailed Classification Report:")
print("=" * 50)
target_names = [id2label[i] for i in range(len(id2label))]
print(classification_report(y_true, y_pred, target_names=target_names))


📊 Evaluating final model...



🎯 Final Evaluation Results:
eval_loss: 0.6799
eval_accuracy: 0.8205
eval_f1_macro: 0.8092
eval_f1_weighted: 0.8192
eval_runtime: 9.4392
eval_samples_per_second: 114.5220
eval_steps_per_second: 7.2040
epoch: 3.0000

🔍 Generating detailed predictions...

📋 Detailed Classification Report:
              precision    recall  f1-score   support

          CS       0.81      0.89      0.85       517
          IS       0.85      0.79      0.82       325
          IT       0.82      0.72      0.76       239

    accuracy                           0.82      1081
   macro avg       0.82      0.80      0.81      1081
weighted avg       0.82      0.82      0.82      1081



In [25]:
import joblib
import os

# Save only the essential components as .pkl files
final_model_path = f"/content/drive/MyDrive/NLP_Project/lora_discipline_classifier_v3.1_{timestamp}"
os.makedirs(final_model_path, exist_ok=True)

# Essential files only
joblib.dump(model, f"{final_model_path}/lora_model_v3.1.pkl")
joblib.dump(tokenizer, f"{final_model_path}/tokenizer_v3.1.pkl")
joblib.dump(label2id, f"{final_model_path}/label2id_v3.1.pkl")
joblib.dump(id2label, f"{final_model_path}/id2label_v3.1.pkl")

# Basic model info
model_info = {
    "model_name": "lora_discipline_classifier_v3.1",
    "version": "3.1",
    "disciplines": ["CS", "IS", "IT"]
}
joblib.dump(model_info, f"{final_model_path}/model_info_v3.1.pkl")

print("✅ Essential files saved as .pkl!")
print("🤖 Model, tokenizer, and labels ready to use!")

✅ Essential files saved as .pkl!
🤖 Model, tokenizer, and labels ready to use!
