In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    DebertaV2Tokenizer,
    Trainer,
    pipeline,
)
from sklearn.metrics import f1_score, accuracy_score

# -----------------------------------------------------------------------------
# 1. Set Random Seeds for Reproducibility
# -----------------------------------------------------------------------------

# One seed value shared by every random number generator used in this notebook.
seed = 239

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch's CPU RNG.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# Seed every visible CUDA device as well, when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)


In [None]:

# -----------------------------------------------------------------------------
# 2. Define Training Parameters & Model Configuration
# -----------------------------------------------------------------------------

model_name = "microsoft/mdeberta-v3-base"

# Fetch the pretrained configuration and override both dropout rates at load
# time; keyword arguments naming config attributes replace the stored values.
config = AutoConfig.from_pretrained(
    model_name,
    hidden_dropout_prob=0.05,
    attention_probs_dropout_prob=0.05,
)

# Explicitly request the slow tokenizer (use_fast=False); it needs SentencePiece.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
except ImportError:
    print("Error: The slow tokenizer requires SentencePiece. Install it with: pip install sentencepiece")
    # Re-raise the same exception after printing the installation hint.
    raise

In [None]:

# -----------------------------------------------------------------------------
# 3. Load and Sanitize Dataset
# -----------------------------------------------------------------------------

# Directory holding the pre-processed train/val/test CSV files.
# Set the PROCESSED_DATA_DIR environment variable to point somewhere else
# without editing the notebook; the default is the original cluster location.
DATA_DIR = os.environ.get("PROCESSED_DATA_DIR", "/nas/longleaf/home/bmaaron/processed")

TRAIN_DATA_PATH = os.path.join(DATA_DIR, "train.csv")
VALIDATION_DATA_PATH = os.path.join(DATA_DIR, "val.csv")
TEST_DATA_PATH = os.path.join(DATA_DIR, "test.csv")

def sanitize_text(text):
    """Return `text` with leading and trailing whitespace removed.

    Args:
        text: A single cell value from the "text" column. Normally a string,
            but `pd.read_csv` yields NaN (a float) for empty cells.

    Returns:
        str: The stripped string. Missing values (None / NaN / pd.NA) become
        the empty string, and any other non-string value is converted with
        `str()` before stripping, so the result is always a `str`.
    """
    if isinstance(text, str):
        return text.strip()
    # Missing cells have no .strip(); map them to "" instead of raising.
    if text is None or pd.isna(text):
        return ""
    return str(text).strip()

# Read the three CSV splits into pandas DataFrames (train, validation, test).
train_df, validation_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA_PATH, VALIDATION_DATA_PATH, TEST_DATA_PATH)
)
frames = (train_df, validation_df, test_df)

# Clean the 'text' column of every split with the same sanitizer.
for frame in frames:
    frame["text"] = frame["text"].apply(sanitize_text)

# One label -> integer id mapping built from the labels seen in ALL splits,
# so train, validation and test share identical ids.
unique_labels = sorted(set().union(*(frame["label"].unique() for frame in frames)))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
print("Label mapping:", label2id)

# Replace the original labels by their integer ids in every split.
for frame in frames:
    frame["label"] = frame["label"].map(label2id)

# Quick visual check of the converted training labels.
print("Train labels after conversion:", train_df["label"].head(10))

# Wrap each DataFrame in a Hugging Face Dataset and bundle them by split name.
train_dataset, validation_dataset, test_dataset = map(Dataset.from_pandas, frames)
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

print("Datasets loaded, sanitized, and labels converted successfully!")

In [None]:

# -----------------------------------------------------------------------------
# 4. Tokenize the Dataset
# -----------------------------------------------------------------------------

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Args:
        examples: A batch exposing a "text" entry (as passed by `map`).

    Returns:
        The tokenizer's output for the batch, with every sequence padded to
        and truncated at 512 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize every split in batches.
tokenized_datasets = dataset_dict.map(function=tokenize_function, batched=True)

# Expose only the listed columns, as PyTorch tensors.
model_columns = ["input_ids", "attention_mask", "label"]
tokenized_datasets.set_format("torch", columns=model_columns)

# Persist the tokenized splits so tokenization need not be repeated later.
tokenized_datasets.save_to_disk("tokenized_data")


If the kernel has been restarted, run the cell below to reload the tokenized datasets from disk instead of re-running the tokenization step.

In [None]:
from datasets import load_from_disk

# Restore the tokenized splits previously written with save_to_disk.
tokenized_datasets = load_from_disk(dataset_path="tokenized_data")

In [None]:

# -----------------------------------------------------------------------------
# 5. Prepare the Model
# -----------------------------------------------------------------------------

# `label2id` (built in the dataset-loading cell) already holds one entry per
# distinct label across all splits, so its size is the number of classes.
num_labels = len(label2id)
config.num_labels = num_labels

# Store the human-readable label names in the config so that the saved
# checkpoint (and any pipeline built from it) reports the original labels
# instead of the generic LABEL_0 / LABEL_1 / ... placeholders.
config.label2id = {str(label): idx for label, idx in label2id.items()}
config.id2label = {idx: str(label) for label, idx in label2id.items()}

# Load the pretrained encoder with a classification head sized by `config`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,   # model_name should be "microsoft/mdeberta-v3-base"
    config=config,
)
print("mDeBERTa-v3-base model loaded successfully with", num_labels, "labels.")

In [None]:

# -----------------------------------------------------------------------------
# 6. Define Training Arguments
# -----------------------------------------------------------------------------

training_args = TrainingArguments(
    # --- Output locations and logging ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # --- Evaluate and checkpoint once per epoch; reload the checkpoint with
    #     the lowest eval_loss when training ends ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # --- Optimisation schedule ---
    learning_rate=3e-5,
    num_train_epochs=5,
    warmup_steps=100,
    weight_decay=0.01,
    # --- Batch sizes: adjust per device for FP32 on 48GB GPUs ---
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # --- Precision: fp16 stays off because training runs in single precision ---
    fp16=False,
    # --- Data loading and reproducibility ---
    dataloader_num_workers=8,
    seed=seed,
)


In [None]:

# -----------------------------------------------------------------------------
# 7. Define the Metrics for Evaluation
# -----------------------------------------------------------------------------

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation prediction pair.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        dict: {"accuracy": <accuracy>, "f1": <weighted F1>}.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.asarray(logits).argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "f1": f1_score(labels, predicted_ids, average="weighted"),
    }


In [None]:

# -----------------------------------------------------------------------------
# 8. Instantiate the Trainer
# -----------------------------------------------------------------------------

# Splits used during fitting: "train" for optimisation, "validation" for the
# evaluation configured in `training_args`.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    compute_metrics=compute_metrics,
)


In [None]:

# -----------------------------------------------------------------------------
# 9. Run Training and Evaluate the Model
# -----------------------------------------------------------------------------

# Fit the model; evaluation on the validation split runs once per epoch, as
# configured in `training_args`.
train_result = trainer.train()

# Write the trainer's current model to its own directory.
trainer.save_model("./final_model_checkpoint")

# Score that model on the held-out test split and report the metrics.
test_split = tokenized_datasets["test"]
test_metrics = trainer.evaluate(eval_dataset=test_split)
print("Test Metrics:", test_metrics)



In [None]:
# Store the tokenizer files next to the saved model so the checkpoint
# directory can be loaded on its own.
tokenizer.save_pretrained(save_directory="./final_model_checkpoint")

In [6]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Replace with the path or identifier of your saved model, e.g., "./final_model_checkpoint"
model_path = "/Users/bryce/Desktop/INLS697/INLS697_proj/DeBERTa/final_model_checkpoint"

# Load the fine-tuned model and its tokenizer from the checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a text-classification pipeline
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

print("Model is ready. Type text to analyze or 'exit' to quit.")

# Interactive loop: classify each entered line until the user types 'exit'.
while True:
    try:
        text = input("Enter text: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted: leave the loop cleanly
        # instead of surfacing a traceback.
        break
    if text.lower() == "exit":
        break
    if not text:
        # A blank line would still be scored by the model and produce a
        # meaningless prediction, so ask again instead.
        continue
    # Truncate to the 512-token length used at training time so that long
    # inputs do not exceed what the model was trained on.
    results = nlp(text, truncation=True, max_length=512)
    print("Results:", results)


Device set to use cpu


Model is ready. Type text to analyze or 'exit' to quit.
Results: [{'label': 'right', 'score': 0.4479616582393646}]
Results: [{'label': 'center', 'score': 0.6844319105148315}]
Results: [{'label': 'left', 'score': 0.49298906326293945}]
Results: [{'label': 'right', 'score': 0.4805333912372589}]
Results: [{'label': 'left', 'score': 0.49298906326293945}]
Results: [{'label': 'right', 'score': 0.4805333912372589}]
Results: [{'label': 'right', 'score': 0.5474206805229187}]
Results: [{'label': 'right', 'score': 0.6045874357223511}]
Results: [{'label': 'left', 'score': 0.49298906326293945}]
Results: [{'label': 'right', 'score': 0.5087749361991882}]
Results: [{'label': 'left', 'score': 0.49298906326293945}]


{'center': 0, 'left': 1, 'right': 2}