In [1]:
!pip install --upgrade transformers datasets accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.1

In [2]:
# ==============================================================================
# EmoLLM PHASE 3: DATA AUGMENTATION PIPELINE
# Author: S. M. Hozaifa Hossain
# Date: September 2, 2025
# Objective: Use a fine-tuned Llama 3.1 8B model to generate synthetic
#            training data to improve the performance of the champion classifier.
# ==============================================================================

# Third-party and stdlib dependencies for the whole notebook.
import pandas as pd
import torch
# NOTE(review): `Dataset` is not referenced by any cell visible in this notebook -
# confirm it is unused before removing it.
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
# NOTE(review): `peft` is not part of the pip install line in the first cell;
# presumably it ships with the runtime image - confirm.
from peft import PeftModel
from tqdm import tqdm
import os
# Sets the WANDB_DISABLED environment variable for this process.
# Presumably meant to stop Weights & Biases integrations from prompting for a login;
# no visible cell runs a trainer, so verify this is still required.
os.environ["WANDB_DISABLED"] = "true"
from sklearn.model_selection import train_test_split


2025-09-06 06:35:36.521650: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757140536.728811      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757140536.792695      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# --- 1. SETUP & AUTHENTICATION (LESSON LEARNED) ---
# This must be run in an environment with your HF_TOKEN secret.
#
# Environment detection is based solely on whether `kaggle_secrets` can be imported.
# The try body is kept to that single import so that an ImportError raised by
# anything else (e.g. a broken `huggingface_hub` install) surfaces as a real error
# instead of being misreported as "not in a Kaggle environment".
try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    print("Not in a Kaggle environment. Assuming Hugging Face token is available.")
    # Ensure you have run `huggingface-cli login` in your local terminal.
else:
    from huggingface_hub import login

    # Read the token from Kaggle's secret store; failures here (missing secret,
    # rejected token) propagate so the notebook stops before the gated model download.
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HF_TOKEN")
    login(token=hf_token)
    print("Kaggle environment detected. Logged into Hugging Face.")

Kaggle environment detected. Logged into Hugging Face.


In [4]:
# --- 2. CONFIGURATION ---
# Root of the attached Kaggle dataset; every read-only input below lives under it.
KAGGLE_INPUT_DIR = "/kaggle/input/emollm"

# Inputs ----------------------------------------------------------------------
# Base model that the PEFT adapters were trained on top of.
BASE_MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Saved PEFT adapters (the final Llama checkpoint).
PEFT_MODEL_PATH = f"{KAGGLE_INPUT_DIR}/llama_ft"
# Original, manually labeled data.
DATA_FILE = f"{KAGGLE_INPUT_DIR}/labeled_2k.csv"
# Progress file carried over from an earlier run, used for resumption.
PROGRESS_INPUT_FILE = f"{KAGGLE_INPUT_DIR}/augmentation_progress.csv"

# Outputs ---------------------------------------------------------------------
# Output file for the new, augmented dataset.
OUTPUT_DATA_FILE = "augmented_training_data.csv"
# Progress file written periodically by the augmentation loop.
PROGRESS_OUTPUT_FILE = "augmentation_progress.csv"
# Final merged dataset (original training split + generated samples).
FINAL_OUTPUT_FILE = "/kaggle/working/augmented_training_data_final.csv"



In [5]:
# Augmentation Parameters
BATCH_SIZE = 4              # original messages prompted together in one generate() call
N_VARIANTS_PER_SAMPLE = 2   # rewrites sampled for each original message
MAX_NEW_TOKENS = 100        # upper bound on tokens generated per rewrite
SAVE_INTERVAL = 25          # Save progress every 25 batches (25 * 4 = 100 original messages)



In [6]:
# --- 3. LOAD THE FINE-TUNED AUGMENTOR MODEL ---
print("--- Loading base model and applying PEFT adapters for augmentation ---")

# 4-bit NF4 quantization (QLoRA-style loading) with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized base model; device_map="auto" lets accelerate place the weights.
# TODO: optionally pass attn_implementation="flash_attention_2" here.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    quantization_config=quantization_config,
    device_map="auto",
)

# Attach the fine-tuned PEFT adapters on top of the base model.
# The adapters are applied as a wrapper here; they are NOT merged into the base weights.
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_PATH)
# This notebook only runs inference, so make evaluation mode explicit.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
# No dedicated pad token is configured here, so padding reuses the EOS token.
tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the LAST position of each row. Batched prompts
# must therefore be padded on the left; with right padding the shorter prompts in a
# batch would be continued from pad tokens and produce degraded text.
tokenizer.padding_side = "left"

print("Augmentor model loaded successfully.")

--- Loading base model and applying PEFT adapters for augmentation ---


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Augmentor model loaded successfully.


In [7]:
# --- 4. PREPARE SOURCE DATA & RESUMPTION LOGIC ---
print("--- Preparing source data and checking for progress ---")
df = pd.read_csv(DATA_FILE)
# Stratified 80/20 split with a fixed seed; only the training part is augmented.
train_df, _ = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['reconciled_emotion']
)

# Resumption Logic
# Prefer the progress file written by the augmentation loop in the working directory:
# it is saved as "loaded progress + newly generated rows", so it is at least as
# complete as the uploaded copy. Falling back to the uploaded dataset copy covers a
# fresh session. Without this, re-running the notebook in the same session would
# ignore (and later overwrite) everything generated since the upload.
resume_file = next(
    (path for path in (PROGRESS_OUTPUT_FILE, PROGRESS_INPUT_FILE) if os.path.exists(path)),
    None,
)

if resume_file is not None:
    print(f"Progress file found at '{resume_file}'. Resuming augmentation.")
    progress_df = pd.read_csv(resume_file)
    # Identify which original messages have already been fully processed
    processed_messages = set(progress_df['original_message'].unique())
    print(f"Found {len(processed_messages)} already processed messages. Skipping them.")
else:
    print("No progress file found. Starting from scratch.")
    progress_df = pd.DataFrame()
    processed_messages = set()

# Filter out the messages that are already done
train_df_remaining = train_df[~train_df['message'].isin(processed_messages)].copy()
print(f"Remaining messages to augment: {len(train_df_remaining)}")



--- Preparing source data and checking for progress ---
Progress file found at '/kaggle/input/emollm/augmentation_progress.csv'. Resuming augmentation.
Found 1491 already processed messages. Skipping them.
Remaining messages to augment: 98


In [8]:
# --- 5. EXECUTE THE RESUMABLE AUGMENTATION LOOP ---
# For every batch of not-yet-processed training messages, build a rewrite prompt per
# message, sample N_VARIANTS_PER_SAMPLE continuations, and buffer the rewrites together
# with the original label and the original text (used as the resumption key).
# Progress is flushed to PROGRESS_OUTPUT_FILE every SAVE_INTERVAL batches AND after the
# final batch, so the progress file on disk is always complete when the loop ends.
augmented_data_buffer = []
n_remaining = len(train_df_remaining)

for i in tqdm(range(0, n_remaining, BATCH_SIZE), desc="Augmenting Data"):
    batch_df = train_df_remaining.iloc[i:i+BATCH_SIZE]
    if batch_df.empty:
        continue

    prompts = [
        (f"### Human:\n"
         f"You are a software developer. Your task is to rewrite the following GitHub commit message. "
         f"The new message must preserve the original meaning and the emotional tone of '{row['reconciled_emotion']}'. "
         f"Do not include any preamble, just provide the rewritten message.\n\n"
         f"### Original Message:\n"
         f"'{row['message']}'\n\n"
         f"### Rewritten Message:\n")
        for _, row in batch_df.iterrows()
    ]
    # Send the inputs to wherever the model lives instead of hard-coding "cuda".
    inputs = tokenizer(
        prompts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(model.device)
    # generate() returns prompt tokens followed by new tokens for each row; remember the
    # (padded) prompt length so only the newly generated part is decoded below.
    prompt_length = inputs["input_ids"].shape[1]

    for variant_idx in range(N_VARIANTS_PER_SAMPLE):
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                # Explicit pad id: avoids the per-call "Setting `pad_token_id`" warning.
                pad_token_id=tokenizer.pad_token_id,
            )
        # Decode only the continuation. Relying on a string split of the full text
        # breaks when a long prompt was truncated (the marker is cut off) and would
        # store the whole prompt as the "rewritten" message.
        generated_tokens = outputs[:, prompt_length:]
        decoded_outputs = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        for idx, output in enumerate(decoded_outputs):
            # Harmless guard: if the model re-emits the marker, keep the text after it.
            new_message = output.split("### Rewritten Message:\n")[-1].strip()
            if not new_message:
                # An empty rewrite would become a blank/NaN training row; drop it.
                print("Warning: Model produced an empty rewrite for a sample. Skipping it.")
                continue
            original_row = batch_df.iloc[idx]
            augmented_data_buffer.append({
                'message': new_message,
                'reconciled_emotion': original_row['reconciled_emotion'],
                'original_message': original_row['message'] # For tracking progress
            })

    # --- Save Progress Periodically ---
    current_batch_number = i // BATCH_SIZE
    save_due = (current_batch_number + 1) % SAVE_INTERVAL == 0
    is_last_batch = i + BATCH_SIZE >= n_remaining
    if (save_due or is_last_batch) and augmented_data_buffer:
        print(f"\n--- Saving progress at batch {current_batch_number + 1}... ---")
        progress_df = pd.concat(
            [progress_df, pd.DataFrame(augmented_data_buffer)], ignore_index=True
        )
        progress_df.to_csv(PROGRESS_OUTPUT_FILE, index=False)
        augmented_data_buffer = [] # Clear buffer after saving
        print(f"Progress saved. {len(progress_df)} samples generated so far.")



Augmenting Data:   0%|          | 0/25 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Augmenting Data:   4%|▍         | 1/25 [01:53<45:31, 113.82s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Augmenting Data:   8%|▊         | 2/25 [03:47<43:42, 114.01s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Augmenting Data:  12%|█▏        | 3/25 [05:39<41:25, 112.99s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Augmenting Data:  16%|█▌        | 4/25 [07:32<39:32, 112.96s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_t


--- Saving progress at batch 25... ---
Progress saved. 3196 samples generated so far.





In [9]:
# --- 6. FINAL SAVE ---
# Merges the original training split with every generated sample and writes the result
# to FINAL_OUTPUT_FILE. The cell is safe to re-run: the buffer is cleared once merged
# and `progress_df` keeps its tracking column (a separate frame is used for the export).
print("\n--- Augmentation loop finished. Performing final save. ---")

# Add any remaining data from the buffer
if augmented_data_buffer:
    temp_df = pd.DataFrame(augmented_data_buffer)
    progress_df = pd.concat([progress_df, temp_df], ignore_index=True)
    augmented_data_buffer = []  # prevent a re-run from appending the same rows twice

# Export view of the generated samples without the tracking column.
# errors='ignore' covers the "nothing generated yet" case where progress_df is an
# empty frame with no columns at all.
augmented_df = progress_df.drop(columns=['original_message'], errors='ignore')

# Combine original training data with all augmented data
final_training_df = pd.concat(
    [train_df[['message', 'reconciled_emotion']], augmented_df], ignore_index=True
)
final_training_df.to_csv(FINAL_OUTPUT_FILE, index=False)

print("\n--- MISSION COMPLETE ---")
print(f"Successfully created augmented dataset with {len(final_training_df)} total training samples.")
print(f"Original samples: {len(train_df)}")
print(f"Newly generated samples: {len(augmented_df)}")
print(f"Final dataset saved to '{FINAL_OUTPUT_FILE}'")



--- Augmentation loop finished. Performing final save. ---

--- MISSION COMPLETE ---
Successfully created augmented dataset with 4796 total training samples.
Original samples: 1600
Newly generated samples: 3196
Final dataset saved to '/kaggle/working/augmented_training_data_final.csv'
