In [1]:
"""
Instructions to install packages and libraries:
1. !nvidia-smi (Check CUDA)
https://www.carc.usc.edu/user-guides/data-science/building-conda-environment
2. module purge
3. module load conda
4. mamba init bash
5. source ~/.bashrc
6. mamba create --name <env_name>
7. mamba activate <env_name>
8. mamba install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
9. mamba install other libraries
10. Run a batch job
"""

'\nInstructions to install packages and libraries:\n1. !nvidia-smi (Check CUDA)\nhttps://www.carc.usc.edu/user-guides/data-science/building-conda-environment\n2. module purge\n3. module load conda\n4. mamba init bash\n5. source ~/.bashrc\n6. mamba create --name <env_name>\n7. mamba activate <env_name>\n8. mamba install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n9. mamba install other libraries\n10. Run a batch job\n'

In [2]:
# Batch Job Script. May have to change the env
# #!/bin/bash

# #SBATCH --account=yzhao010_1531
# #SBATCH --partition=gpu
# #SBATCH --nodes=1
# #SBATCH --ntasks=1
# #SBATCH --cpus-per-task=4
# #SBATCH --gpus-per-task=a100:1
# #SBATCH --mem=16G
# #SBATCH --time=2:30:00
# #SBATCH --output=/home1/kharwal/logs_stdout.txt
# #SBATCH --error=/home1/kharwal/logs_stderr.txt

# module purge
# module load conda
# eval "$(conda shell.bash hook)"

# mamba activate /home1/kharwal/.conda/envs/dl_100

# python dl_100.py



In [3]:
"""
Political Neutrality Model: Fine-tuning MPT-7B for neutral article generation
-------------------
This script fine-tunes a 7B parameter model to generate politically neutral
content given left-leaning and right-leaning articles using QLoRA techniques.
"""

import os
import json
import pandas as pd
import numpy as np
from itertools import cycle
from dataclasses import dataclass
from typing import Dict, List, Any
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

# For evaluation
import evaluate
from tqdm import tqdm
import nltk
nltk.download('punkt_tab', quiet=True)  # Needed for BLEU score calculation

# Set environment variables for better performance
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Check CUDA availability
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
GPU: NVIDIA A100 80GB PCIe
GPU Memory: 85.10 GB


In [4]:
# Configuration
CONFIG = {
    "model_name": "mosaicml/mpt-7b-8k-instruct",  # Model with 8K context window
    "root_dir": "/home1/kharwal/",              # Base directory for data and outputs
    "output_dir": "mpt-lora-neutral",           # Output directory name
    "max_seq_length": 6000,                     # Maximum sequence length for training (increased for triplets)
    "per_device_batch_size": 1,                 # Per device batch size (reduced for longer sequences)
    "gradient_accumulation_steps": 8,           # For effective batch size of 8 (adjusted for batch size=1)
    "learning_rate": 2e-4,                      # Learning rate for LoRA fine-tuning
    "num_train_epochs": 5,     #TODO                 # Number of training epochs
    "lora_r": 8,                                # LoRA attention dimension
    "lora_alpha": 16,                           # LoRA alpha parameter
    "lora_dropout": 0.05,                       # Dropout probability for LoRA layers
    "weight_decay": 0.01,                       # Weight decay for AdamW
    "warmup_steps": 25,                         # Linear warmup steps
    "target_modules": ["Wqkv", "out_proj"],     # MPT-specific target modules for LoRA
    "max_new_tokens": 1800,                     # Max new tokens for generation (increased as requested)
    "evaluate_every": 30,                      # Evaluate every N steps
    "save_total_limit": 2,                      # Number of checkpoints to save
    "eval_accumulation_steps": 8,               # Accumulation steps for evaluation
    "save_strategy": "steps",                   # Save strategy (steps or epoch)
    "json_files": [                             # Data files
        "flipside_4K_articles_23-25.json",
        "flipside_4k_articles_21-22.json",
        "flipside_4k_articles_19-20.json",
    ]
}

In [5]:
# ----------------- DATA LOADING -----------------
def load_all_data(file_paths: List[str], root_dir: str) -> List[Dict[str, Any]]:
    """Load and combine all JSON data files."""
    all_entries = []
    for path in file_paths:
        full_path = os.path.join(root_dir, path)
        if os.path.exists(full_path):
            print(f"Loading data from {full_path}")
            with open(full_path, "r") as f:
                all_entries.extend(json.load(f))
        else:
            print(f"[WARN] File not found: {full_path}")

    print(f"Loaded {len(all_entries)} total entries from {len(file_paths)} files")
    return all_entries

def generate_triplets(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Generate left-right-neutral article triplets from raw data."""
    triplets = []

    for entry in data:
        left_articles = entry.get("left", [])
        right_articles = entry.get("right", [])
        neutral_articles = entry.get("neutral", [])

        # Skip if any perspective is missing
        if not (left_articles and right_articles and neutral_articles):
            continue

        # Create cycles to handle uneven article counts
        left_cycle = cycle(left_articles)
        right_cycle = cycle(right_articles)

        for neutral in neutral_articles:
            left = next(left_cycle)
            right = next(right_cycle)

            # Combine headline and text for each article
            left_combined = f"{left['headline']}. {left['text']}"
            right_combined = f"{right['headline']}. {right['text']}"
            neutral_combined = f"{neutral['headline']}. {neutral['text']}"

            triplets.append({
                "left": left_combined,
                "right": right_combined,
                "neutral": neutral_combined,
                "meta": {
                    "event_id": entry["id"],
                    "date": entry["date"],
                    "source_title": entry["source_title"]
                }
            })

    print(f"Generated {len(triplets)} triplets")
    return triplets

# ----------------- DATASET CLASS -----------------
class TripletDataset(Dataset):
    """Dataset for processing left-right-neutral article triplets."""

    def __init__(self, df, tokenizer, max_length=4096):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Format prompt with special tokens
        prompt = f"<|left|>\n{row['left']}\n<|right|>\n{row['right']}\n<|neutral|>\n{row['neutral']}"

        # Tokenize with truncation
        encoding = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # For causal LM, labels are the same as input_ids
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids.clone()
        }

@dataclass
class DataCollator:
    """Custom data collator for handling variable length sequences."""

    tokenizer: AutoTokenizer

    def __call__(self, batch):
        input_ids = [b["input_ids"] for b in batch]
        attention_mask = [b["attention_mask"] for b in batch]
        labels = [b["labels"] for b in batch]

        # Pad to the longest sequence in the batch
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            attention_mask, batch_first=True, padding_value=0
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=-100  # -100 ignores padding in loss calculation
        )

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


In [6]:

# ----------------- EVALUATION FUNCTIONS -----------------
def generate_neutral_article(left, right, model, tokenizer, max_length=4096, max_new_tokens=2048):
    """Generate a neutral article given left and right articles."""
    # Format prompt with special tokens
    prompt = f"<|left|>\n{left}\n<|right|>\n{right}\n<|neutral|>\n"

    # Tokenize the input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_length
    ).to(model.device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Use greedy decoding for evaluation
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract only the generated neutral article (after the last <|neutral|> tag)
    if "<|neutral|>" in generated_text:
        generated_neutral = generated_text.split("<|neutral|>")[-1].strip()
    else:
        # Fallback if no neutral tag is present
        generated_neutral = generated_text.split(prompt)[-1].strip()

    return generated_neutral

def evaluate_model(model, tokenizer, test_df, metrics_dict):
    """Evaluate the model using ROUGE and BLEU metrics."""
    model.eval()

    # During inference, we only need to provide left+right articles and generate the neutral article
    # The max_new_tokens is set to 2048 to allow for longer generated articles
    generated_list = []
    reference_list = []

    # Generate predictions for each test example
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating"):
        # Generate neutral article
        gen = generate_neutral_article(
            row["left"],
            row["right"],
            model,
            tokenizer,
            max_length=CONFIG["max_seq_length"] - CONFIG["max_new_tokens"],
            max_new_tokens=CONFIG["max_new_tokens"]
        )
        ref = row["neutral"]

        generated_list.append(gen)
        reference_list.append(ref)

        # Print a sample every 50 examples
        if idx % 25 == 0:
            print(f"\nSample {idx}:")
            print(f"Generated: {gen}")
            print(f"Reference: {ref}")
            torch.cuda.empty_cache()

    # Compute ROUGE scores
    rouge_result = metrics_dict["rouge"].compute(
        predictions=generated_list,
        references=reference_list,
        use_stemmer=True
    )

    # Compute BLEU score
    # Tokenize for BLEU (which expects list of tokens)
    tokenized_gen = [nltk.word_tokenize(text.lower()) for text in generated_list]
    tokenized_ref = [[nltk.word_tokenize(text.lower())] for text in reference_list]

    bleu_result = metrics_dict["bleu"].compute(
        predictions=tokenized_gen,
        references=tokenized_ref
    )

    # Print evaluation results
    print("\n===== EVALUATION RESULTS =====")
    print("ROUGE scores:", {k: v.mid.fmeasure for k, v in rouge_result.items()})
    print("BLEU score:", bleu_result["bleu"])

    # Return metrics for potential logging
    results = {
        "rouge1": rouge_result["rouge1"].mid.fmeasure,
        "rouge2": rouge_result["rouge2"].mid.fmeasure,
        "rougeL": rouge_result["rougeL"].mid.fmeasure,
        "bleu": bleu_result["bleu"],
    }

    return results, generated_list, reference_list

In [7]:
def check_gpu_memory():
    """Print current GPU memory usage."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        max_mem = torch.cuda.get_device_properties(0).total_memory / 1024**3
        
        print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {max_mem:.2f}GB total")
        return allocated < (max_mem * 0.9)  # Return True if we're using less than 90% of memory
    return True
    
def clear_memory():
    """Aggressive memory clearing function."""
    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        
    # Force garbage collection
    import gc
    gc.collect()
    
    # Check memory after clearing
    check_gpu_memory()

In [None]:
# ----------------- MAIN EXECUTION -----------------
def main():
    # Create output directory
    output_dir = os.path.join(CONFIG["root_dir"], CONFIG["output_dir"])
    os.makedirs(output_dir, exist_ok=True)

    # 1. Load and preprocess data
    print("\n===== LOADING DATA =====")
    combined_data = load_all_data(CONFIG["json_files"], CONFIG["root_dir"])
    triplet_data = generate_triplets(combined_data)
    df_triplets = pd.DataFrame(triplet_data)

    # Check data characteristics
    print(f"Dataset size: {len(df_triplets)} examples")
    text_lengths = df_triplets.apply(
        lambda x: len(x['left'].split()) + len(x['right'].split()) + len(x['neutral'].split()),
        axis=1
    )
    print(f"Average total word count per example: {text_lengths.mean():.1f}")
    print(f"Max total word count per example: {text_lengths.max()}")


    output_text_lengths = df_triplets.apply(
        lambda x: len(x['neutral'].split()),
        axis=1
    )
    estimated_output_token_lengths = output_text_lengths * 1.3
    print(f"Estimated average token count per triplet: {estimated_output_token_lengths.mean():.1f}")
    print(f"Estimated max token count per triplet: {estimated_output_token_lengths.max():.1f}")
    print(f"Number of examples that may exceed 6500 tokens: {(estimated_output_token_lengths > 2048).sum()}")


    # Estimate token counts (words * 1.3 is a rough approximation)
    estimated_token_lengths = text_lengths * 1.3
    print(f"Estimated average token count per triplet: {estimated_token_lengths.mean():.1f}")
    print(f"Estimated max token count per triplet: {estimated_token_lengths.max():.1f}")
    print(f"Number of examples that may exceed 6500 tokens: {(estimated_token_lengths > 6500).sum()}")

    # Display token length distribution
    token_bins = [0, 2000, 4000, 6000, 8000, float('inf')]
    token_counts = pd.cut(estimated_token_lengths, bins=token_bins).value_counts().sort_index()
    print("Estimated token length distribution:")
    for i, count in enumerate(token_counts):
        if i < len(token_bins) - 1:
            print(f"  {token_bins[i]}-{token_bins[i+1]}: {count} examples")

    # 2. Split data
    print("\n===== SPLITTING DATA =====")
    # Stratified split not necessary for this task; use random split
    df_train_val, df_test = train_test_split(df_triplets, test_size=0.1, random_state=42)
    df_train, df_val = train_test_split(df_train_val, test_size=0.1, random_state=42)
    print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

    # 3. Set up tokenizer
    print("\n===== SETTING UP TOKENIZER =====")
    # Note: We're using a max_seq_length of 6500 tokens during training to accommodate
    # all three articles (left + right + neutral) plus special tokens in the input.
    # This is important because during fine-tuning we provide the complete triplet,
    # while at inference time we only need to provide left + right articles.
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"], padding_side="right")
    tokenizer.pad_token = tokenizer.eos_token

    # Add special tokens for our task
    special_tokens_dict = {
        "additional_special_tokens": ["<|neutral|>", "<|left|>", "<|right|>"],
        "bos_token": "<|neutral|>"
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    # 4. Set up model with QLoRA
    print("\n===== SETTING UP MODEL WITH QLORA =====")

    # # Configure quantization
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch.float16
    # )

    # Load model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        CONFIG["model_name"],
        torch_dtype=torch.float16,
        device_map="auto",
        # quantization_config=bnb_config,
        # max_position_embeddings=CONFIG["max_seq_length"],  # Ensure model supports our longer sequence length
    )

    # for name, _ in model.named_modules():
    #     print(name)

    # Resize token embeddings for new special tokens
    model.resize_token_embeddings(len(tokenizer))

    # Configure LoRA
    lora_config = LoraConfig(
        r=CONFIG["lora_r"],
        lora_alpha=CONFIG["lora_alpha"],
        target_modules=CONFIG["target_modules"],
        lora_dropout=CONFIG["lora_dropout"],
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    # Enable gradient checkpointing for memory efficiency
    model.gradient_checkpointing_enable()

    # Apply LoRA to the model
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # 5. Create datasets
    train_dataset = TripletDataset(df_train, tokenizer, max_length=CONFIG["max_seq_length"])
    val_dataset = TripletDataset(df_val, tokenizer, max_length=CONFIG["max_seq_length"])

    # 6. Set up training arguments
    # Note: With the increased sequence length (6500), we need to be cautious of memory usage
    # This is why we reduced batch size to 1 and increased gradient accumulation steps to 8
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=CONFIG["per_device_batch_size"],
        per_device_eval_batch_size=CONFIG["per_device_batch_size"],
        gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
        eval_strategy="steps",
        eval_steps=CONFIG["evaluate_every"],
        save_strategy=CONFIG["save_strategy"],
        save_steps=CONFIG["evaluate_every"],
        logging_steps=10,
        learning_rate=CONFIG["learning_rate"],
        num_train_epochs=CONFIG["num_train_epochs"],
        warmup_steps=CONFIG["warmup_steps"],
        fp16=True,
        bf16=False,
        weight_decay=CONFIG["weight_decay"],
        save_total_limit=CONFIG["save_total_limit"],
        logging_dir=os.path.join(output_dir, "logs"),
        report_to="none",
        gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
        eval_accumulation_steps=CONFIG["eval_accumulation_steps"]
    )

    # 7. Set up trainer
    print("\n===== SETTING UP TRAINER =====")
    data_collator = DataCollator(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )

    # 8. Train the model
    print("\n===== STARTING TRAINING =====")
    try:
        trainable = any(p.requires_grad for p in model.parameters())
        if not trainable:
            raise ValueError("No parameters require gradients. LoRA adapter may not be properly applied.")

        trainer.train()
        # Save the trained model and tokenizer
        model.save_pretrained(os.path.join(output_dir, "adapters"))
        tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))
        print(f"Model saved to {os.path.join(output_dir, 'adapters')}")
    except Exception as e:
        print(f"Training error: {e}")
        import traceback
        traceback.print_exc()

    # # 9. Evaluate the model
    # print("\n===== EVALUATING MODEL =====")
    # # Load metrics
    # metrics = {
    #     "rouge": evaluate.load("rouge"),
    #     "bleu": evaluate.load("bleu")
    # }

    # # Make sure model is in evaluation mode
    # model.eval()

    # # Run evaluation
    # eval_results, generated_samples, reference_samples = evaluate_model(
    #     model, tokenizer, df_test, metrics
    # )

    # # 10. Save evaluation results
    # print("\n===== SAVING RESULTS =====")
    # # Save metrics
    # with open(os.path.join(output_dir, "eval_metrics.json"), "w") as f:
    #     json.dump(eval_results, f, indent=2)

    # # Save a few examples
    # examples = []
    # for i in range(len(generated_samples)):
    #     examples.append({
    #         "left": df_test.iloc[i]["left"],  # Truncate for readability
    #         "right": df_test.iloc[i]["right"],
    #         "generated": generated_samples[i],
    #         "reference": reference_samples[i]
    #     })

    # with open(os.path.join(output_dir, "sample_outputs.json"), "w") as f:
    #     json.dump(examples, f, indent=2)

    # print(f"Evaluation results saved to {output_dir}")
    print("\n===== TRAINING COMPLETE =====")

if __name__ == "__main__":
    # Clear CUDA cache before starting
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    main()


===== LOADING DATA =====
Loading data from /home1/kharwal/flipside_4K_articles_23-25.json
Loading data from /home1/kharwal/flipside_4k_articles_21-22.json
Loading data from /home1/kharwal/flipside_4k_articles_19-20.json
Loaded 1037 total entries from 3 files
Generated 1376 triplets
Dataset size: 1376 examples
Average total word count per example: 3105.5
Max total word count per example: 20174
Estimated average token count per triplet: 1463.3
Estimated max token count per triplet: 23159.5
Number of examples that may exceed 6500 tokens: 137
Estimated average token count per triplet: 4037.2
Estimated max token count per triplet: 26226.2
Number of examples that may exceed 6500 tokens: 67
Estimated token length distribution:
  0-2000: 28 examples
  2000-4000: 827 examples
  4000-6000: 422 examples
  6000-8000: 69 examples
  8000-inf: 30 examples

===== SPLITTING DATA =====
Train: 1114, Val: 124, Test: 138

===== SETTING UP TOKENIZER =====

===== SETTING UP MODEL WITH QLORA =====


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 6,291,456 || all params: 6,654,955,520 || trainable%: 0.0945

===== SETTING UP TRAINER =====

===== STARTING TRAINING =====


Step,Training Loss,Validation Loss
