## Preparing Training-Ready Data

In [2]:
import os
import json
import re
from tqdm import tqdm

# Define paths
data_dir = "narrative_classification/training_data/eng/raw_documents"  # Directory containing the text files
annotation_file = "narrative_classification/training_data/eng/subtask_2_annotations.txt"  # Annotation file
output_file = "prepared_dataset.json"  # Output file for training-ready data

# Step 1: Load Annotations
def load_annotations(annotation_path):
    annotations = {}
    with open(annotation_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                article_id, narrative, subnarrative = parts
                # Parse multiple narratives and sub-narratives if present
                narrative_list = [n.strip() for n in narrative.split(';')]
                subnarrative_list = [s.strip() for s in subnarrative.split(';')]
                annotations[article_id.replace(".txt", "")] = {
                    "narratives": narrative_list,
                    "subnarratives": subnarrative_list
                }
    return annotations

# Step 2: Load Text Files and Merge with Annotations
def prepare_data(text_dir, annotations):
    dataset = []
    for filename in tqdm(os.listdir(text_dir), desc="Processing text files"):
        if filename.endswith(".txt"):
            filepath = os.path.join(text_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read().strip()
            article_id = filename.replace(".txt", "")
            annotation = annotations.get(article_id, {
                "narratives": ["Other"],
                "subnarratives": ["Other"]
            })
            dataset.append({
                "article_id": article_id,
                "content": content,
                "narratives": annotation["narratives"],
                "subnarratives": annotation["subnarratives"]
            })
    return dataset

# Step 3: Save Prepared Dataset
def save_dataset(dataset, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(dataset, file, indent=4, ensure_ascii=False)

# Main
if __name__ == "__main__":
    # Load annotations
    print("Loading annotations...")
    annotations = load_annotations(annotation_file)

    # Prepare dataset
    print("Preparing dataset...")
    dataset = prepare_data(data_dir, annotations)

    # Save dataset
    print(f"Saving dataset to {output_file}...")
    save_dataset(dataset, output_file)

    print("Data preparation completed!")


Loading annotations...
Preparing dataset...


Processing text files: 100%|██████████| 399/399 [00:00<00:00, 68920.49it/s]

Saving dataset to prepared_dataset.json...
Data preparation completed!





## Splitting Dataset into Train, Validation, and Test Sets

In [3]:
import random

# Define paths
train_file = "train.json"  # Training set file
val_file = "val.json"  # Validation set file
test_file = "test.json"  # Test set file

# Splitting Dataset
def split_dataset(dataset, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    random.shuffle(dataset)
    total = len(dataset)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_set = dataset[:train_end]
    val_set = dataset[train_end:val_end]
    test_set = dataset[val_end:]

    return train_set, val_set, test_set

# Main
if __name__ == "__main__":

    # Split dataset
    print("Splitting dataset into train, validation, and test sets...")
    train_set, val_set, test_set = split_dataset(dataset, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1)

    # Save datasets
    print(f"Saving training set to {train_file}...")
    save_dataset(train_set, train_file)

    print(f"Saving validation set to {val_file}...")
    save_dataset(val_set, val_file)

    print(f"Saving test set to {test_file}...")
    save_dataset(test_set, test_file)

    print("Data preparation and splitting completed!")


Splitting dataset into train, validation, and test sets...
Saving training set to train.json...
Saving validation set to val.json...
Saving test set to test.json...
Data preparation and splitting completed!


## Pre-Processing Train, Validation and Test Sets

In [None]:
#pip install nltk

In [None]:
#nltk.download('punkt')  # For tokenization
#nltk.download('stopwords')  # For stopwords
#nltk.download('punkt_tab')

In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Step 1: Initialize stopwords
stop_words = set(stopwords.words('english'))

# Step 2: Preprocess Text
def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Step 3: Apply Preprocessing to Splits
def preprocess_split(split):
    for entry in split:
        entry["content"] = preprocess_text(entry["content"])
    return split

# Main
if __name__ == "__main__":

    # Preprocess train, validation, and test sets
    print("Preprocessing train, validation, and test sets...")
    train_set = preprocess_split(train_set)
    val_set = preprocess_split(val_set)
    test_set = preprocess_split(test_set)

    # Save datasets
    print(f"Saving training set to {train_file}...")
    save_dataset(train_set, train_file)

    print(f"Saving validation set to {val_file}...")
    save_dataset(val_set, val_file)

    print(f"Saving test set to {test_file}...")
    save_dataset(test_set, test_file)

    print("Data preparation and splitting completed!")    

Preprocessing train, validation, and test sets...
Saving training set to train.json...
Saving validation set to val.json...
Saving test set to test.json...
Data preparation and splitting completed!


## Tokenizing and Formatting Input

In [None]:
#pip install transformers

In [None]:
#pip install sentencepiece

In [1]:
# Tokenization script for fine-tuning
import os
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Define paths
train_file = "train.json"
val_file = "val.json"
test_file = "test.json"
tokenized_train_file = "tokenized_train.json"
tokenized_val_file = "tokenized_val.json"
tokenized_test_file = "tokenized_test.json"

# Step 1: Initialize tokenizer 
login("add_your_tokenizer")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B", use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B", use_auth_token=True)

# Set a padding token (use eos_token if a specific padding token is not defined)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 2: Function to tokenize a dataset
def tokenize_dataset(input_file, output_file, tokenizer):
    with open(input_file, 'r', encoding='utf-8') as file:
        dataset = json.load(file)

    tokenized_dataset = []
    for entry in dataset:
        # Format the input as per model requirements
        formatted_input = f"Content: {entry['content']} Provide narrative and subnarrative labels."

        # Tokenize the formatted input
        tokenized_input = tokenizer(
            formatted_input,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )

        # Format the output as per model requirements
        formatted_output = (
            f"Narrative: {'; '.join(entry['narratives'])}; "
            f"Subnarrative: {'; '.join(entry['subnarratives'])}"
        )

        # Tokenize the output if the model requires structured outputs
        tokenized_output = tokenizer(
            formatted_output,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )

        # Store tokenized content and output
        tokenized_dataset.append({
            "article_id": entry["article_id"],
            "input_ids": tokenized_input["input_ids"].tolist(),
            "attention_mask": tokenized_input["attention_mask"].tolist(),
            "output_ids": tokenized_output["input_ids"].tolist(),
            "output_attention_mask": tokenized_output["attention_mask"].tolist()
        })

    # Save tokenized dataset
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(tokenized_dataset, file, indent=4, ensure_ascii=False)

# Step 3: Tokenize train, validation, and test sets
if __name__ == "__main__":
    print("Tokenizing training set...")
    tokenize_dataset(train_file, tokenized_train_file, tokenizer)

    print("Tokenizing validation set...")
    tokenize_dataset(val_file, tokenized_val_file, tokenizer)

    print("Tokenizing test set...")
    tokenize_dataset(test_file, tokenized_test_file, tokenizer)

    print("Tokenization completed!")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizing training set...
Tokenizing validation set...
Tokenizing test set...
Tokenization completed!


## Model Training

In [None]:
#pip install 'accelerate>=0.26.0'
#pip install datasets 
#pip install peft
#pip install bitsandbytes
#pip install WandB

In [4]:
import os
import json
import torch
import wandb  # ✅ Weights & Biases tracking
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset

# ✅ Initialize WandB (Weights & Biases for tracking)
wandb.init(project="llama-lora-finetuning")

# Free GPU memory before training
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ✅ Load tokenized dataset (Ensure correct tensor dtype)
def load_tokenized_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    inputs = [torch.tensor(d["input_ids"], dtype=torch.long) for d in data]  # Force long dtype
    outputs = [torch.tensor(d["output_ids"], dtype=torch.long) for d in data]  
    return Dataset.from_dict({"input_ids": inputs, "labels": outputs})

# Paths for tokenized data
tokenized_train_file = "tokenized_train.json"
tokenized_val_file = "tokenized_val.json"
train_dataset = load_tokenized_dataset(tokenized_train_file)
val_dataset = load_tokenized_dataset(tokenized_val_file)

# ✅ Ensure tokenizer has padding
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B", use_auth_token=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ✅ Load model with 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # ✅ Ensures correct data type
    bnb_4bit_quant_type="fp4",  # ✅ Fixes tensor shape mismatch
)

base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
    use_auth_token=True
)

# ✅ Disable KV Cache (Prevents unpacking errors)
base_model.config.use_cache = False

# ✅ Apply LoRA with restricted layer modifications
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # ✅ Lower rank for efficiency
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "o_proj"],  # ✅ Restrict LoRA to avoid interference
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# ✅ Training arguments (No evaluation)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # ✅ No eval, only training
    save_strategy="epoch",
    per_device_train_batch_size=1,  # ✅ Prevents memory overflow
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # ✅ Reduces memory load
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=False,  # ✅ No eval, so no best model selection
    fp16=True,  # ✅ Mixed precision for efficiency
    optim="adamw_torch",
    weight_decay=0.01,
    report_to="wandb"  # ✅ Log loss in WandB
)

# ✅ Trainer (without evaluation)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# ✅ Train (WandB will track loss/error rate)
trainer.train()

# ✅ Save fine-tuned LoRA model
trainer.save_model("./lora_finetuned_model")

# ✅ Finish WandB logging
wandb.finish()




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 3,670,016 || all params: 3,216,419,840 || trainable%: 0.1141


  trainer = Trainer(


ValueError: too many values to unpack (expected 4)