# Fine Tune Original LLM with General Examples to Create [Base Model]

Load base RoBERTa model

In [None]:
# %pip install transformers datasets peft accelerate

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "roberta-large"

# Single output unit: the model emits one raw logit per title, which is turned
# into a "productive" probability with a sigmoid at prediction time.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

# Tokenizer that matches the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Add LoRA layers using peft

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA setup for sequence classification: rank-8 adapters attached only to the
# attention query and value projections.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters, then report how many parameters
# remain trainable as a sanity check.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


Tokenize titles and attach labels

In [None]:
from datasets import load_dataset, Dataset
import torch

# Read the training CSV and take its 'train' split.
raw_splits = load_dataset("csv", data_files="base_dataset.csv")
dataset = raw_splits['train']

# Drop every column except the title text and its label.
wanted_columns = ("videoTitle", "label")
unused_columns = [name for name in dataset.column_names if name not in wanted_columns]
dataset = dataset.remove_columns(unused_columns)

# Store labels as floats; the tokenization step builds float32 label tensors.
dataset = dataset.map(lambda row: {"label": float(row["label"])})


# Tokenization function
def tokenize(batch):
    """Encode title text and attach the label as a float32 tensor.

    Args:
        batch: Mapping with a "videoTitle" entry (text handed to the
            tokenizer) and a "label" entry (numeric value(s) copied into
            "labels").

    Returns:
        The tokenizer output, padded/truncated to exactly 32 tokens, with an
        added "labels" entry.
    """
    encoded = tokenizer(
        batch["videoTitle"],
        max_length=32,
        truncation=True,
        padding="max_length",
    )
    label_tensor = torch.tensor(batch["label"], dtype=torch.float32)
    encoded["labels"] = label_tensor
    return encoded

# Tokenize in batches: `tokenize` accepts lists of titles/labels, and batched
# mapping avoids one tokenizer call per row. Because every row is padded to
# the same fixed length, the result matches row-by-row mapping.
tokenized_dataset = dataset.map(tokenize, batched=True)

# Return PyTorch tensors when rows are accessed.
tokenized_dataset.set_format("torch")

# Sanity check: inspect the first processed example.
print(tokenized_dataset[0])


Train model

In [None]:
import torch
from transformers import TrainingArguments, Trainer


class BCETrainer(Trainer):
    """Trainer that optimizes binary cross-entropy on the single output logit.

    With ``num_labels=1`` and no explicit ``problem_type``, the transformers
    sequence-classification heads fall back to a regression (MSE) loss on the
    raw logit. This notebook later applies a sigmoid to that logit and reports
    cross-entropy, so the training objective is overridden here to match:
    BCE-with-logits against the float 0/1 labels.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the BCE-with-logits loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch dict produced by the data collator; must contain
                "labels".
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        labels = inputs["labels"]
        # Withhold the labels so the model does not compute its built-in loss.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        # Flatten both sides so (batch, 1) logits line up with (batch,) labels.
        logits = outputs.logits.view(-1)
        targets = labels.float().view(-1)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, targets)
        return (loss, outputs) if return_outputs else loss


training_args = TrainingArguments(
    output_dir="./lora-roberta-productivity",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    report_to="none",
    label_names=["labels"],
)

trainer = BCETrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()


In [None]:
# Persist the fine-tuned PEFT model and its tokenizer side by side.
# NOTE(review): for a PEFT-wrapped model, save_pretrained is documented to
# write the adapter weights/config rather than a full copy of the base model;
# confirm before treating this directory as a standalone checkpoint.
save_dir = "./lora-roberta-productivity"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Predict on new titles

In [None]:
from torch.nn.functional import sigmoid

# Inference runs on CPU. If a GPU is selected instead, the tokenized inputs
# must be placed on the same device as the model before the forward pass.
device = torch.device("cpu")
model.to(device)

# Switch to evaluation mode so dropout is disabled and repeated predictions
# for the same title are identical; torch.no_grad() alone does not do this.
model.eval()

def predict_productivity(title):
    """Return the model's predicted "productive" probability for a title.

    Args:
        title: Video title text to score.

    Returns:
        float: Sigmoid of the model's single output logit.
    """
    # Dropout must be off for repeatable scores; torch.no_grad() only
    # disables gradient tracking, not dropout.
    if model.training:
        model.eval()

    inputs = tokenizer(title, return_tensors="pt", truncation=True, padding=True, max_length=32)

    # Place the inputs on whichever device the model currently lives on, so
    # the call also works after the model has been moved to a GPU.
    model_device = next(model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    # Convert the raw logit to a probability and unwrap it to a Python float.
    return sigmoid(logits).item()

# Example: score a handful of sample titles, printing one probability each.
example_titles = [
    "Pomodoro technique for deep work",
    "Machine Learning for Everybody – Full Course",
    "Grand Theft Auto VI Trailer 2",
    "QLoRA—How to Fine-tune an LLM on a Single GPU (w/ Python Code)",
    "Methodz & Scump Watch OpTic SLAM Cloud 9 (EWC 2025)",
]
for example_title in example_titles:
    print(predict_productivity(example_title))


In [None]:
import pandas as pd
from sklearn.metrics import log_loss

# Load the test set.
test_df = pd.read_csv("yt_watch_data_clean.csv")

# Score every title individually with the fine-tuned model.
title_scores = test_df["videoTitle"].map(predict_productivity)
test_df["pred_prob"] = title_scores

y_true = test_df["label"]      # true labels
y_prob = test_df["pred_prob"]  # predicted probabilities

# Log loss (cross-entropy) between the labels and the predicted probabilities.
ce_loss = log_loss(y_true, y_prob)
print("Cross-Entropy Loss:", ce_loss)
