# Slang‑Detector Fine‑Tuning Notebook
This notebook is generated from your **train_text_model.py** script so you can run each stage interactively.

## 1. Setup and Imports

In [None]:
# !pip install peft

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# !pip install urllib3==1.26.17
# !pip install pandas
# !pip install scipy

Defaulting to user installation because normal site-packages is not writeable
Collecting scipy
  Using cached scipy-1.15.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached scipy-1.15.2-cp312-cp312-win_amd64.whl (40.9 MB)
Installing collected packages: scipy
Successfully installed scipy-1.15.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scprep 1.2.3 requires pandas<2.1,>=0.25, but you have pandas 2.2.3 which is incompatible.

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import sys
import yaml
import logging
import json
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Setup logging
# Configure the root logger for the notebook: records at INFO level and above,
# each rendered as "<timestamp> - <level> - <message>".
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers, so re-running this cell with a different format has no effect.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

## 2. Helper Functions


In [80]:
def yaml_cfg(path: Path, section: str):
    """Load one top-level section from a YAML configuration file.

    Args:
        path: Location of the YAML file.
        section: Top-level key whose value should be returned.

    Returns:
        The value stored under ``section``, or ``None`` when the file is
        missing, cannot be read or parsed, does not hold a mapping at its top
        level, or lacks the requested section. Every failure is reported via
        ``logging.error``; no exception is propagated to the caller.
    """
    if not path.is_file():
        logging.error(f"Configuration file not found: {path}")
        return None
    try:
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # can mis-decode UTF-8 configuration files.
        with open(path, "r", encoding="utf-8") as f:
            # An empty file parses to None; treat it as an empty mapping.
            data = yaml.safe_load(f) or {}
    except yaml.YAMLError as e:
        logging.error(f"Error parsing YAML file {path}: {e}")
        return None
    except Exception as e:
        logging.error(f"Error reading YAML file {path}: {e}")
        return None

    # A YAML root that is a list or scalar has no named sections. Without this
    # check a string root could pass the `in` test by substring match and then
    # fail on the lookup below.
    if not isinstance(data, dict):
        logging.error(
            f"Expected a mapping at the top level of {path}, got {type(data).__name__}"
        )
        return None

    if section not in data:
        logging.error(f"Section '{section}' missing in configuration file: {path}")
        return None
    return data[section]

def create_prompt(instruction, output):
    """Wrap an instruction/response pair in the Qwen2.5-Instruct chat markup.

    ``instruction`` is placed in the ``user`` turn and ``output`` in the
    ``assistant`` turn; each turn is closed with ``<|im_end|>``.
    """
    chat_text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"
    return chat_text



In [None]:
# # NOTE: This multiprocessing version of tokenize_dataset produced errors when run with several workers.
# # It is kept here, commented out, so the problem can be debugged later.
# # The next cell therefore tokenizes in a single process (no num_proc argument) to get tokenization working first.

# def tokenize_dataset(dataset, tokenizer, max_length):
#     """Tokenizes the dataset using the specified format with batching and parallelism."""
#     import logging  # Import inside the function so it's available to workers

#     logging.info(f"Tokenizing dataset with max_length={max_length}...")

#     # Define create_prompt inside tokenize_dataset with the same name as the global function 
#     # we define this because we want multi-workers to be able to access this function so we don't have error when we
#     # call this tokenize_dataset function in multi-workers
#     def create_prompt(instruction, output):
#         # Format specific to Qwen2.5-Instruct model
#         return f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"

#     def preprocess_function(examples):
#         # Add debugging to see what's coming in
#         if not isinstance(examples, dict):
#             logging.error(f"Expected examples to be a dict, got {type(examples)}")
#             logging.error(f"Examples content: {examples}")
#             # Handle the error case
#             return {"input_ids": [], "attention_mask": [], "labels": []}

#         # Check if the expected keys exist
#         if 'instruction' not in examples or 'output' not in examples:
#             logging.error(f"Missing expected keys. Available keys: {examples.keys()}")
#             # Handle the error case
#             return {"input_ids": [], "attention_mask": [], "labels": []}

#         # Now process normally
#         prompts = [
#             create_prompt(instr, out) 
#             for instr, out in zip(examples['instruction'], examples['output'])
#         ]

#         tokenized_outputs = tokenizer(
#             prompts,
#             max_length=max_length,
#             truncation=True,
#             padding=False,
#         )
#         tokenized_outputs["labels"] = tokenized_outputs["input_ids"].copy()
#         return tokenized_outputs

#     try:
#         # Use batching and multiprocessing for speed
#         num_proc = max(os.cpu_count() // 2, 1)
#         logging.info(f"Using {num_proc} processes for tokenization.")
#         tokenized_ds = dataset.map(
#             preprocess_function,
#             batched=True,
#             num_proc=num_proc,
#             remove_columns=dataset.column_names,
#             desc="Running tokenizer on dataset",
#         )
#         logging.info("Tokenization complete.")
#         return tokenized_ds
#     except Exception as e:
#         logging.error(f"Error during dataset tokenization: {e}", exc_info=True)
#         return None

In [98]:
## This chunk is map without batching or multiprocessing for now

def tokenize_dataset(dataset, tokenizer, max_length):
    """Tokenize every example of ``dataset`` one at a time (no batching, no workers).

    Each example's ``instruction`` and ``output`` fields are rendered into the
    Qwen chat markup, passed to ``tokenizer`` with truncation at ``max_length``
    and no padding, and given a ``labels`` entry that is a copy of
    ``input_ids``. Progress is reported through both ``print`` and ``logging``.

    Returns the mapped dataset, or ``None`` if mapping (or the summary
    printing that follows it) raises.
    """
    import logging
    import traceback

    logging.info(f"Tokenizing dataset with max_length={max_length}...")
    print(f"Starting tokenization of dataset with {len(dataset)} examples")
    print(f"Dataset columns: {dataset.column_names}")

    def process_example(example):
        # Render the chat-formatted text for this single example.
        text = f"<|im_start|>user\n{example['instruction']}<|im_end|>\n<|im_start|>assistant\n{example['output']}<|im_end|>"
        encoded = tokenizer(text, max_length=max_length, truncation=True, padding=False)
        # Causal-LM training: the labels start out as a copy of the inputs.
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    try:
        # Plain per-example map: the original columns are dropped from the result.
        tokenized_ds = dataset.map(process_example, remove_columns=dataset.column_names)

        print(f"Tokenization complete. Result size: {len(tokenized_ds)}")
        if not len(tokenized_ds) > 0:
            print("WARNING: Empty tokenized dataset returned")
        else:
            print(f"Tokenized dataset features: {tokenized_ds.features}")
            print(f"First example keys: {list(tokenized_ds[0].keys())}")

        logging.info("Tokenization complete.")
        return tokenized_ds
    except Exception as e:
        # Report the failure on every channel, then signal it with None.
        logging.error(f"Error during tokenization: {e}", exc_info=True)
        print(f"Error during tokenization: {e}")
        traceback.print_exc()
        return None

In [133]:
# Optimized version for batching and parallelism

def tokenize_dataset(dataset, tokenizer, max_length, num_proc=None, batch_size=100):
    """Tokenize ``dataset`` in batches, optionally across several worker processes.

    Each example's ``instruction`` and ``output`` fields are rendered into the
    Qwen chat markup, tokenized with truncation at ``max_length`` and no
    padding, and given ``labels`` equal to its ``input_ids``.

    Args:
        dataset: Object exposing ``column_names`` and a ``map`` method that
            accepts the keyword arguments used below.
        tokenizer: Callable taking a list of strings plus ``max_length``,
            ``truncation`` and ``padding`` and returning a mapping that
            contains ``"input_ids"``.
        max_length: Truncation length passed to the tokenizer.
        num_proc: Number of worker processes. When ``None`` (the default) half
            of the detected CPUs is used, with a minimum of one. Pass ``1`` to
            force single-process tokenization.
        batch_size: Number of examples handed to the tokenizer per call.

    Returns:
        The tokenized dataset, or ``None`` if tokenization raised (the error
        is logged with its traceback).
    """
    # Function-scope import kept from the original; presumably so the name is
    # available if this function is serialized to worker processes — TODO confirm.
    import logging
    logging.info(f"Tokenizing dataset with max_length={max_length}...")

    # Defined locally so the batch function is self-contained when it is
    # shipped to worker processes.
    def create_prompt(instruction, output):
        return f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"

    def process_batch(examples):
        # `examples` is a column-oriented batch: one list per column.
        prompts = [
            create_prompt(instr, out)
            for instr, out in zip(examples['instruction'], examples['output'])
        ]

        tokenized_outputs = tokenizer(
            prompts,
            max_length=max_length,
            truncation=True,
            padding=False,
        )

        # Give every example its own labels list, so labels never alias the
        # corresponding input_ids list.
        tokenized_outputs["labels"] = [list(ids) for ids in tokenized_outputs["input_ids"]]
        return tokenized_outputs

    try:
        if num_proc is None:
            # os.cpu_count() returns None when the CPU count cannot be
            # determined; fall back to a single process instead of failing
            # on `None // 2`.
            num_proc = max((os.cpu_count() or 1) // 2, 1)
        logging.info(f"Using {num_proc} processes for tokenization.")

        tokenized_ds = dataset.map(
            process_batch,
            batched=True,
            batch_size=batch_size,
            num_proc=num_proc,
            remove_columns=dataset.column_names,
            desc="Map: ",
        )

        logging.info("Tokenization complete.")
        return tokenized_ds
    except Exception as e:
        logging.error(f"Error during tokenization: {e}", exc_info=True)
        return None

## 3. Configuration Parameters

In [134]:
# 3. Configuration parameters
# Locates the project root, makes the project package importable, and sets the
# switches that control the training process.


# Working directory of the kernel (normally the folder containing the notebook)
current_dir = os.getcwd()

# The project root is taken to be the PARENT of the working directory ('../').
# NOTE(review): an earlier comment said the notebook lives in
# data/processed/text, which would need three levels up; the recorded output
# shows this resolving to the Lingo folder, so the notebook appears to sit one
# level below the root — confirm.
project_root = os.path.abspath(os.path.join(current_dir,'../'))
print(f"Project root: {project_root}")

# Important: put the project root on sys.path so `src.*` can be imported below
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added {project_root} to Python path")

# Diagnostics only: report whether the package layout needed by the import exists
src_dir = os.path.join(project_root, 'src')
utils_dir = os.path.join(src_dir, 'utils')
config_file = os.path.join(utils_dir, 'config.py')
print(f"Checking if src exists: {os.path.exists(src_dir)}")
print(f"Checking if utils exists: {os.path.exists(utils_dir)}")
print(f"Checking if config.py exists: {os.path.exists(config_file)}")

# Project-local import; must run after the sys.path change above.
# Only CONFIGS_DIR is used in this cell.
from src.utils.config import RAW_DATA_DIR, PROCESSED_DATA_DIR, CONFIGS_DIR


# %%
# Set parameters
section = "text_slang_detector"  # Section name in config files
configs_dir = Path(CONFIGS_DIR)      # Directory containing config YAML files
use_4bit = True                    # Whether to use 4-bit quantization
bf16 = True                        # Whether to use bfloat16 precision
max_train_samples = None           # Limit training samples (set to number for testing)
max_eval_samples = None            # Limit evaluation samples (set to number for testing)

# Print current settings (a limit of None, or 0, is shown as 'All')
print(f"Configuration Section: {section}")
print(f"Using 4-bit quantization: {use_4bit}")
print(f"Using bfloat16 precision: {bf16}")
print(f"Max training samples: {max_train_samples or 'All'}")
print(f"Max evaluation samples: {max_eval_samples or 'All'}")

Project root: c:\Users\jiang\Desktop\Projects\Lingo
Checking if src exists: True
Checking if utils exists: True
Checking if config.py exists: True
Configuration Section: text_slang_detector
Using 4-bit quantization: True
Using bfloat16 precision: True
Max training samples: All
Max evaluation samples: All


## 4. Load Configurations
Load the training, model, and data configurations from YAML files.

In [135]:
# Load configurations
try:
    # Rebind project_root as a Path (the parent of the configs directory).
    project_root = configs_dir.parent

    # Each call returns the requested section, or None after logging the reason.
    train_cfg = yaml_cfg(configs_dir / "training_config.yaml", section)
    model_cfg = yaml_cfg(configs_dir / "model_config.yaml", section)
    data_cfg = yaml_cfg(configs_dir / "data_config.yaml", section)

    loaded_cfgs = [
        ("Training", "training_config.yaml", train_cfg),
        ("Model", "model_config.yaml", model_cfg),
        ("Data", "data_config.yaml", data_cfg),
    ]

    # yaml_cfg signals failure with None, so check before claiming success;
    # otherwise the cell would print "Successfully loaded" followed by "null".
    failed_files = [file_name for _, file_name, cfg in loaded_cfgs if cfg is None]
    if failed_files:
        raise ValueError(
            f"could not load section '{section}' from: {', '.join(failed_files)} "
            "(see the logged errors for details)"
        )

    print("Successfully loaded configurations:")
    for title, _, cfg in loaded_cfgs:
        print(f"\n{title} Configuration:")
        # default=str keeps the dump from failing on YAML values that JSON
        # cannot represent natively (e.g. dates).
        print(json.dumps(cfg, indent=2, default=str))
except Exception as e:
    print(f"Error loading configurations: {e}")

Successfully loaded configurations:

Training Configuration:
{
  "batch_size": 4,
  "epochs": 3,
  "gradient_accumulation_steps": 4,
  "lr": 0.0002,
  "max_length": 512,
  "warmup_ratio": 0.03
}

Model Configuration:
{
  "lora_params": {
    "alpha": 32,
    "dropout": 0.1,
    "r": 16,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj"
    ]
  },
  "name": "Qwen/Qwen2.5-1.5B-Instruct",
  "output_dir": "c:\\Users\\jiang\\Desktop\\Projects\\Lingo\\models\\text_models"
}

Data Configuration:
{
  "test": "c:\\Users\\jiang\\Desktop\\Projects\\Lingo\\data\\processed\\text\\test.json",
  "train": "c:\\Users\\jiang\\Desktop\\Projects\\Lingo\\data\\processed\\text\\train.json",
  "validation": "c:\\Users\\jiang\\Desktop\\Projects\\Lingo\\data\\processed\\text\\val.json"
}


## 5. Process Paths
Resolve and validate the output and data paths.

In [136]:
# Sanity check: show the project root currently bound in the kernel.
print(project_root)

c:\Users\jiang\Desktop\Projects\Lingo


In [137]:
try:
    # The config loader returns None on failure; stop with a clear message
    # instead of an AttributeError/TypeError further down.
    if model_cfg is None or data_cfg is None:
        raise ValueError("model_cfg / data_cfg are not loaded; re-run the configuration cell")

    # project_root is a str after the setup cell and a Path after the
    # config-loading cell; normalise it so the `/` joins below always work.
    root_dir = Path(project_root)

    # Set up output directory
    output_dir_from_config = model_cfg.get("output_dir")
    if output_dir_from_config is None:
        print(f"WARNING: No output_dir specified in config, falling back to 'models/{section}_output'")
        output_dir = root_dir / f"models/{section}_output"
    else:
        # Joining with an absolute configured path yields that absolute path.
        output_dir = root_dir / output_dir_from_config
        print(f"Using output directory from config: {output_dir}")

    # Create the output directory if it doesn't exist.
    #   parents=True  -> also create any missing parent directories
    #                    (for '/a/b/c', 'a' and 'b' are created as needed)
    #   exist_ok=True -> no FileExistsError when the directory already exists
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Output directory: {output_dir}")

    # Resolve and check data paths
    required_keys = ('train', 'validation')  # 'test' is optional here
    resolved_paths = {}
    for key in ['train', 'validation', 'test']:
        if key not in data_cfg:
            if key in required_keys:
                print(f"ERROR: Required data path for '{key}' missing!")
            else:
                print(f"NOTE: Optional data path for '{key}' not specified.")
            continue

        path_str = data_cfg[key]
        if os.path.isabs(path_str):
            path_obj = Path(path_str)
        else:
            # Relative paths (e.g. data/processed/text/train.json) are
            # interpreted relative to the project root.
            path_obj = (root_dir / path_str).resolve()
        resolved_paths[key] = str(path_obj)

        print(f"{key} path: {resolved_paths[key]}")
        print(f"  - Exists: {path_obj.exists()}")

    # Update data_cfg with resolved paths
    data_cfg.update(resolved_paths)

except Exception as e:
    print(f"Error processing paths: {e}")

Using output directory from config: c:\Users\jiang\Desktop\Projects\Lingo\models\text_models
Output directory: c:\Users\jiang\Desktop\Projects\Lingo\models\text_models
train path: c:\Users\jiang\Desktop\Projects\Lingo\data\processed\text\train.json
  - Exists: True
validation path: c:\Users\jiang\Desktop\Projects\Lingo\data\processed\text\val.json
  - Exists: True
test path: c:\Users\jiang\Desktop\Projects\Lingo\data\processed\text\test.json
  - Exists: True


## 6. Load Tokenizer
Load the tokenizer for the specified model.

Now we are talking about Qwen 2.5-1.5B as the specific model.


**Why implement causal LM?** 

Model Architecture: Qwen 2.5 is fundamentally designed and pre-trained as a causal language model. Its architecture was built from the ground up to predict the next token given the previous tokens, and we can't change this core architecture during fine-tuning. Therefore, we fine-tune it as a causal LM.


**why padding?**

Many causal LM vocabularies (GPT-style, Qwen-style) don't define a pad_token by default, because pretraining usually streams text without padding. Therefore, when we fine-tune or evaluate with batches of mixed lengths, we must pad the shorter sequences up to a common length (the longest sequence in the batch, or the chosen max_length); otherwise the tensors won't line up.



We set padding_side to left because this is more efficient for causal LM.

**Why Left Padding for Causal LM?**

For causal LMs, left padding is critical because:

1. Attention Masking: In causal LMs, each token can only attend to previous tokens and itself (this is the "causal" part)

2. With Right Padding: If padding is on the right, the model would try to use meaningful tokens to predict padding tokens, which is wasteful and potentially confusing:
   ```
   "Hello world [PAD] [PAD]"
            ↑        ↑
      Model tries to predict padding based on "world"
   ```

3. With Left Padding: The padding tokens come before the real content, so they don't interfere with prediction of meaningful tokens:
   ```
   "[PAD] [PAD] Hello world"
              ↑       ↑
       Model predicts "world" based on "Hello" (not padding)
   ```

4. Efficiency: Left padding allows the model to ignore the padding completely during generation, as attention to padding would only be relevant when predicting the first real token

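**Note**: Even though we're using a causal LM with left padding, the model still considers the entire input sentence for slang detection: the full user turn comes before the assistant turn, so every token of the answer can attend to all of it.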


In [138]:
try:
    # Load the tokenizer that matches the model named in the model config
    model_name = model_cfg["name"]
    print(f"Loading tokenizer for: {model_name}")
    
    # NOTE(review): trust_remote_code=True permits running code shipped with the
    # model repository. The recorded output shows the built-in Qwen2TokenizerFast
    # class being used, so the flag may be unnecessary — confirm before removing.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    
    # Padding need: sequences in a batch must have the same length, so shorter
    # ones are padded to match the longest.
    # Fall back to the EOS token only if the tokenizer defines no pad token
    # (the recorded output shows no fallback message for this model).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set pad_token to eos_token: {tokenizer.eos_token}")
    
    # Qwen2.5 is a causal (decoder-only) LM; pad on the left.
    # NOTE(review): left padding is the usual choice for batched generation; for
    # training, right padding is more common — confirm this is intended.
    tokenizer.padding_side = "left"
    print(f"Set padding_side to: {tokenizer.padding_side}")
    
    # Print tokenizer info
    print(f"Tokenizer type: {type(tokenizer)}")  # Shows exact tokenizer class
    print(f"Vocabulary size: {len(tokenizer)}")
    print(f"Model max length: {tokenizer.model_max_length}")
    print("Tokenizer loaded successfully!")
    
except Exception as e:
    # Any failure is only printed; `tokenizer` may then be undefined (or stale)
    # for the cells that follow.
    print(f"Error loading tokenizer: {e}")

Loading tokenizer for: Qwen/Qwen2.5-1.5B-Instruct
Set padding_side to: left
Tokenizer type: <class 'transformers.models.qwen2.tokenization_qwen2_fast.Qwen2TokenizerFast'>
Vocabulary size: 151665
Model max length: 131072
Tokenizer loaded successfully!


## 7. Load and Tokenize Datasets
Load the datasets and tokenize them for training.

In [139]:
try:
    # Load raw datasets
    print("Loading datasets...")
    data_files = {'train': data_cfg['train'], 'validation': data_cfg['validation']}
    raw_datasets = load_dataset('json', data_files=data_files)

    print(f"Raw datasets loaded: {raw_datasets}")
    print(f"Train dataset size: {len(raw_datasets['train'])}")
    print(f"Validation dataset size: {len(raw_datasets['validation'])}")

    # Display a few examples
    print("\nSample training examples:")
    for i in range(min(3, len(raw_datasets['train']))):
        example = raw_datasets['train'][i]
        print(f"Example {i}:")
        print(f"  Instruction: {example['instruction']}")
        print(f"  Output: {example['output']}")
        print()

    def reformat_dataset(dataset):
        """Map each example onto its ``instruction`` and ``output`` fields.

        The values are copied unchanged from columns that already exist, and
        no columns are removed (the recorded output still lists ``input``).
        """
        return dataset.map(
            lambda x: {
                "instruction": x["instruction"],
                "output": x["output"]
            }
        )

    train_dataset = reformat_dataset(raw_datasets["train"])
    val_dataset = reformat_dataset(raw_datasets["validation"])
    print(f"train_dataset: {train_dataset}")
    print(f"val_dataset: {val_dataset}")

    # Apply sample limits BEFORE tokenizing, so a quick test run does not pay
    # for tokenizing the full dataset. The selected rows are the same leading
    # rows the post-tokenization limit would have kept.
    train_source = train_dataset
    val_source = val_dataset
    if max_train_samples:
        train_source = train_dataset.select(range(min(max_train_samples, len(train_dataset))))
        print(f"Limited training dataset to {len(train_source)} samples")

    if max_eval_samples:
        val_source = val_dataset.select(range(min(max_eval_samples, len(val_dataset))))
        print(f"Limited validation dataset to {len(val_source)} samples")

    # Tokenize datasets
    max_length = train_cfg.get("max_length", 512)
    print(f"Tokenizing with max_length={max_length}...")

    train_ds = tokenize_dataset(train_source, tokenizer, max_length)
    val_ds = tokenize_dataset(val_source, tokenizer, max_length)

    # tokenize_dataset signals failure by returning None; report that directly
    # rather than as a TypeError from len(None) below.
    if train_ds is None or val_ds is None:
        raise RuntimeError("Tokenization failed; see the logged error for details")

    print(f"Final dataset sizes: Train={len(train_ds)}, Validation={len(val_ds)}")

    # Show a tokenized example (skipped when the training set is empty)
    if len(train_ds) > 0:
        print("\nSample tokenized example:")
        sample_idx = 0
        sample_ids = train_ds[sample_idx]['input_ids']
        decoded = tokenizer.decode(sample_ids)
        print(decoded)

except Exception as e:
    print(f"Error loading or tokenizing datasets: {e}")
    import traceback
    traceback.print_exc()

Loading datasets...
Raw datasets loaded: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 9984
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1248
    })
})
Train dataset size: 9984
Validation dataset size: 1248

Sample training examples:
Example 0:
  Instruction: Identify any slang in this video subtitle: "You gonna give me a ticket?"
  Output: slang detected: gonna
slang context: You gonna give me a ticket?

Example 1:
  Instruction: Identify any slang in this video subtitle: "No, the woman I lived with."
  Output: no slang detected

Example 2:
  Instruction: Identify any slang in this video subtitle: "And they still say it's impossible to get to the bottom of this."
  Output: slang detected: bottom
slang context: And they still say it's impossible to get to the bottom of this.

train_dataset: Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 998

2025-04-21 20:34:15,036 - INFO - Tokenizing dataset with max_length=512...
2025-04-21 20:34:15,037 - INFO - Using 8 processes for tokenization.
2025-04-21 20:34:15,163 - INFO - Tokenization complete.
2025-04-21 20:34:15,163 - INFO - Tokenizing dataset with max_length=512...
2025-04-21 20:34:15,165 - INFO - Using 8 processes for tokenization.
2025-04-21 20:34:15,285 - INFO - Tokenization complete.


Final dataset sizes: Train=9984, Validation=1248

Sample tokenized example:
<|im_start|>user
Identify any slang in this video subtitle: "You gonna give me a ticket?"<|im_end|>
<|im_start|>assistant
slang detected: gonna
slang context: You gonna give me a ticket?<|im_end|>


In [140]:
#     # Load raw datasets
#     print("Loading datasets...")
#     data_files = {'train': data_cfg['train'], 'validation': data_cfg['validation']}
#     raw_datasets = load_dataset('json', data_files=data_files)
    
#     print(f"Raw datasets loaded: {raw_datasets}")
#     print(f"Train dataset size: {len(raw_datasets['train'])}")
#     print(f"Validation dataset size: {len(raw_datasets['validation'])}")
    
#     # ADDED: Raw dataset structure debugging
#     print("\nRaw dataset structure check:")
#     print(f"First example keys: {list(raw_datasets['train'][0].keys())}")
#     print(f"First example content: {raw_datasets['train'][0]}")
    
#     # Display a few examples
#     print("\nSample training examples:")
#     for i in range(min(1, len(raw_datasets['train']))):
#         print(f"Example {i}:")
#         print(f"  Instruction: {raw_datasets['train'][i]['instruction']}")
#         print(f"  Output: {raw_datasets['train'][i]['output']}")
#         print()
    
#     # Reformat dataset if needed
#     def reformat_dataset(dataset):
#         # ADDED: Debugging before reformatting
#         print(f"Reformatting dataset with {len(dataset)} examples")
#         if len(dataset) > 0:
#             print(f"First example keys before reformatting: {list(dataset[0].keys())}")
            
#         # Original code
#         reformatted = dataset.map(
#             lambda x: {
#                 "instruction": x["instruction"],
#                 "output": x["output"]
#             }
#         )
        
#         # ADDED: Debugging after reformatting
#         print(f"Reformatted dataset size: {len(reformatted)}")
#         if len(reformatted) > 0:
#             print(f"First example keys after reformatting: {list(reformatted[0].keys())}")
            
#         return reformatted
#     print("-------------------------------- ")
#     print("--------------")
#     print("--------------")
#     print("--------------")

#     train_dataset = reformat_dataset(raw_datasets["train"])
#     val_dataset = reformat_dataset(raw_datasets["validation"])
    
#     # ADDED: More debugging
    
#     print(f"train_dataset type: {type(train_dataset)}")
#     print(f"val_dataset type: {type(val_dataset)}" )

#     print("-------------------------------- ")
#     print("--------------")
#     print("--------------")
#     print("--------------")

#     print(train_dataset)


# # After this debug chunk, I know that the problem is in tokenize_dataset function defined above, so now I want to go back and rewrite tokenize_dataset() function

# Understanding the Prompt Format for Qwen2.5-Instruct

Yes, the prompt format you're seeing is specific to the Qwen2.5-Instruct model. This is a chat-based instruction-tuning format that the model was trained on.

## The Prompt Structure

The format `<|im_start|>user ... <|im_end|> <|im_start|>assistant ... <|im_end|>` is a special chat template that Qwen2.5-Instruct understands:

1. `<|im_start|>user` - Indicates the beginning of user input
2. `<|im_end|>` - Indicates the end of user input
3. `<|im_start|>assistant` - Indicates the beginning of assistant (model) output
4. `<|im_end|>` - Indicates the end of assistant output

These special tokens help the model distinguish between:

* What part of the text is the instruction/question (user part)
* What part of the text is the expected response (assistant part)

## Why This Format Is Used

This format is crucial because:

1. Model Conditioning: It conditions the model to understand the role separation between user and assistant
2. Fine-tuning Alignment: When fine-tuning, this format helps the model learn to generate responses in the assistant's voice after seeing user instructions
3. Consistent with Pre-training: The model was pre-trained to recognize these special tokens and respond accordingly

## In Your Specific Case

For your slang detection task:

* User part: Contains the instruction and the text to analyze (`"Identify any slang in this video subtitle: "You gonna give me a ticket?""`)
* Assistant part: Contains the expected output (`"slang detected: gonna\nslang context: You gonna give me a ticket?"`)

When fine-tuning, this teaches the model to:

1. Recognize the instruction pattern
2. Identify slang in the provided text
3. Format the response in the expected way

This chat-based format is common across many instruction-tuned models, though the exact tokens may differ (some use `<s>`, `</s>`, `[INST]`, etc.). Qwen2.5 specifically uses the `<|im_start|>` and `<|im_end|>` tokens to mark the boundaries of different speakers in the conversation.


### Understanding Qwen2.5 Formatting Requirements

**Why We're Manually Adding Special Tokens**

The special tokens (`<|im_start|>`, `<|im_end|>`) are added manually because this is the specific chat format that Qwen2.5-Instruct expects. This format is documented in the Qwen2.5 model card and documentation.

According to the Qwen2.5 documentation, the model expects inputs in this chat format:

```python
<|im_start|>user
{user message}<|im_end|>
<|im_start|>assistant
{assistant message}<|im_end|>
```

This is not something we're inventing - it's the official format recommended by the model creators.

## About the Reformatting Step

Looking at your dataset sample, I can see why you're confused. Your dataset is already well-structured with:

* `instruction`: The task instruction
* `input`: An empty field (not used in this case)
* `output`: The expected response

The reformatting step in your code:

```python
def reformat_dataset(dataset):
    reformatted = dataset.map(
        lambda x: {
            "instruction": x["instruction"],
            "output": x["output"]
        }
    )
    return reformatted
```

This step is actually redundant for your dataset because:
1. Your data already has the fields we need
2. The reformatting is just selecting the same fields that already exist

You could safely remove this reformatting step since your dataset is already in the right structure. It's likely a leftover from a more general pipeline that might handle datasets with different field names.

## Qwen2.5's Dataset Format Requirements

Qwen2.5-Instruct was fine-tuned on a dataset that follows this general structure:

1. Raw data format: Similar to your JSON with instruction/output pairs
2. Tokenization format: The raw data converted to the chat format with special tokens

The key transformation happens when we convert from your JSON format:

```json
{
    "instruction": "Identify any slang...",
    "output": "slang detected: gonna..."
}
```

To the chat format:

```text
<|im_start|>user
Identify any slang...<|im_end|>
<|im_start|>assistant
slang detected: gonna...<|im_end|>
```

This transformation is what the `create_prompt` function does. It's not redoing your dataset preparation - it's just formatting it in the way Qwen2.5 expects for training.

## Clarification on Your Dataset

Your dataset preparation work was valuable and correct! You created a well-structured dataset with instruction/output pairs. The reformatting step in the code is just ensuring consistency, and the tokenization step is converting it to Qwen's expected format with the special tokens.

If you want to simplify your pipeline, you could remove the redundant reformatting step since your data is already properly structured.
```

This markdown should render correctly in Google Colab, with proper code block formatting and structure. The code examples are properly fenced and syntax-highlighted where appropriate. Let me know if you need any adjustments!

In [141]:
try:
    # Read the train/validation JSON files whose paths are listed in data_cfg.
    print("Loading datasets...")
    data_files = {split: data_cfg[split] for split in ('train', 'validation')}
    raw_datasets = load_dataset('json', data_files=data_files)

    # Summarise what was loaded: the DatasetDict repr plus per-split row counts.
    print("Raw datasets loaded:", raw_datasets)
    print("Train dataset size:", len(raw_datasets['train']))
    print("Validation dataset size:", len(raw_datasets['validation']))

    # Structure check on the very first training record (raises if the split is empty).
    print("\nRaw dataset structure check:")
    print("First example keys:", list(raw_datasets['train'][0].keys()))
    print("First example content:", raw_datasets['train'][0])

    # Preview at most one training example, one field per line, followed by a blank line.
    print("\nSample training examples:")
    for i in range(min(1, len(raw_datasets['train']))):
        print(
            f"Example {i}:",
            f"  Instruction: {raw_datasets['train'][i]['instruction']}",
            f"  Output: {raw_datasets['train'][i]['output']}",
            "",
            sep="\n",
        )
    
    # Reformat dataset if needed
    def reformat_dataset(dataset):
        """Map every example onto its ``instruction``/``output`` fields, with debug prints.

        Args:
            dataset: Object supporting ``len()``, integer indexing and ``.map(fn)``.

        Returns:
            Whatever ``dataset.map`` returns for the field-selecting function.

        NOTE(review): the mapping only returns keys the examples already have, and the
        notebook's recorded output still lists an ``input`` column after this step, so
        it does not appear to drop anything -- confirm whether the step is needed.
        """
        def _pick_fields(example):
            # Re-emit the two fields the rest of the pipeline reads.
            return {"instruction": example["instruction"], "output": example["output"]}

        # Diagnostics on the incoming dataset.
        print(f"Reformatting dataset with {len(dataset)} examples")
        if len(dataset) != 0:
            print(f"First example keys before reformatting: {list(dataset[0].keys())}")

        reformatted = dataset.map(_pick_fields)

        # Diagnostics on the mapped dataset.
        print(f"Reformatted dataset size: {len(reformatted)}")
        if len(reformatted) != 0:
            print(f"First example keys after reformatting: {list(reformatted[0].keys())}")

        return reformatted

    # Run both splits through the field-selection step.
    train_dataset = reformat_dataset(raw_datasets["train"])
    val_dataset = reformat_dataset(raw_datasets["validation"])

    # Report the object types that came back from the mapping step.
    print("train_dataset type:", type(train_dataset))
    print("val_dataset type:", type(val_dataset))

    # Sequence-length cap for tokenization; 512 is used when train_cfg has no "max_length".
    max_length = train_cfg.get("max_length", 512)
    print("Tokenizing with max_length={}...".format(max_length))
    
    # ADDED: Defensive tokenization wrapper
    def tokenize_with_logging(dataset, tokenizer, max_length):
        """Call ``tokenize_dataset`` and print diagnostics about its input and result.

        Args:
            dataset: Dataset to tokenize; must expose ``len()`` and ``column_names``.
            tokenizer: Passed through unchanged to ``tokenize_dataset``.
            max_length: Passed through unchanged to ``tokenize_dataset``.

        Returns:
            The value returned by ``tokenize_dataset``, or ``None`` when that call
            itself returned ``None``.
        """
        print(f"Starting tokenization of dataset with {len(dataset)} examples")
        print(f"Dataset columns: {dataset.column_names}")

        # Delegate the real work to the notebook's tokenize_dataset helper.
        tokenized = tokenize_dataset(dataset, tokenizer, max_length)

        # Bail out early if the helper signalled failure by returning None.
        if tokenized is None:
            print("ERROR: tokenize_dataset returned None")
            return None

        print(f"Tokenization complete. Result size: {len(tokenized)}")
        if len(tokenized) == 0:
            print("WARNING: Empty tokenized dataset returned")
        else:
            print(f"First tokenized example keys: {list(tokenized[0].keys())}")

        return tokenized
    
    # Tokenize both splits through the logging wrapper.
    train_ds = tokenize_with_logging(train_dataset, tokenizer, max_length)
    val_ds = tokenize_with_logging(val_dataset, tokenizer, max_length)

    # Optionally truncate each split to the first N rows; a falsy limit or a failed
    # (None) tokenization leaves the split untouched.
    if max_train_samples and train_ds is not None:
        train_ds = train_ds.select(range(min(len(train_ds), max_train_samples)))
        print("Limited training dataset to {} samples".format(len(train_ds)))

    if max_eval_samples and val_ds is not None:
        val_ds = val_ds.select(range(min(len(val_ds), max_eval_samples)))
        print("Limited validation dataset to {} samples".format(len(val_ds)))

    # Report final sizes, counting a missing (None) split as zero rows.
    train_size = 0 if train_ds is None else len(train_ds)
    val_size = 0 if val_ds is None else len(val_ds)
    print("Final dataset sizes: Train={}, Validation={}".format(train_size, val_size))
    
    # Walk the first training example through the pipeline: raw text, hand-built
    # prompt, decoded tokens, raw ids, and a per-token table.
    print("\n\n\nSample tokenized example:")

    if train_ds is None or len(train_ds) == 0 or train_dataset is None:
        print("ERROR: No examples in the tokenized dataset")
    else:
        sample_idx = 0
        print(f"Available keys in first example: {list(train_ds[0].keys())}")

        # The untokenized fields come from the reformatted (pre-tokenization) dataset.
        original_instruction = train_dataset[sample_idx]['instruction']
        original_output = train_dataset[sample_idx]['output']

        # 0. Raw text before tokenization.
        print(
            "\n0. ORIGINAL TEXT (BEFORE TOKENIZATION):",
            "-" * 40,
            f"Instruction: {original_instruction}",
            f"Output: {original_output}",
            sep="\n",
        )

        # Rebuild the chat-formatted prompt by hand for comparison.
        # NOTE(review): presumably mirrors the template tokenize_dataset applies -- confirm.
        original_prompt = (
            f"<|im_start|>user\n{original_instruction}<|im_end|>\n"
            f"<|im_start|>assistant\n{original_output}<|im_end|>"
        )
        print("\nCombined into prompt:", original_prompt, "-" * 40, sep="\n")

        if 'input_ids' not in train_ds[sample_idx]:
            print("ERROR: 'input_ids' not found in the first example")
        else:
            sample_ids = train_ds[sample_idx]['input_ids']

            # 1. Round-trip the ids back to text.
            decoded = tokenizer.decode(sample_ids)
            print(
                "\n1. DECODED TEXT (AFTER TOKENIZATION):",
                "-" * 40,
                decoded,
                "-" * 40,
                sep="\n",
            )

            # 2. Raw ids: head always, tail only when the sequence exceeds 40 ids
            #    so the two 20-id windows cannot overlap.
            print(
                "\n2. TOKEN IDs (WHAT THE MODEL ACTUALLY SEES):",
                "-" * 40,
                f"First 20 tokens: {sample_ids[:20]}",
                sep="\n",
            )
            if len(sample_ids) > 40:
                print(f"Last 20 tokens: {sample_ids[-20:]}")
            print(f"Total tokens: {len(sample_ids)}", "-" * 40, sep="\n")

            # 3. One table row per id for the first 20 ids, each decoded on its own.
            print(
                "\n3. TOKEN-BY-TOKEN BREAKDOWN (FIRST 20 TOKENS):",
                "-" * 40,
                f"{'Index':<8} {'Token ID':<10} {'Token Text':<30}",
                "-" * 60,
                sep="\n",
            )
            for i, token_id in enumerate(sample_ids[:20]):
                token_text = tokenizer.decode([token_id])
                print(f"{i:<8} {token_id:<10} {token_text!r:<30}")
            print("-" * 40)

            # 4. Exact string comparison between the hand-built prompt and the decoded text.
            print("\n4. COMPARISON (ORIGINAL vs DECODED):", "-" * 40, sep="\n")
            print(f"Are they identical?  {original_prompt == decoded}")
            if original_prompt != decoded:
                print("\nDifferences might be due to tokenization and detokenization process.")
                print("This is normal as tokenizers may normalize text, handle whitespace differently, etc.")
            print("-" * 40)



    #######
except Exception as e:
    print(f"Error loading or tokenizing datasets: {e}")
    import traceback
    traceback.print_exc()

Loading datasets...


2025-04-21 20:34:15,540 - INFO - Tokenizing dataset with max_length=512...
2025-04-21 20:34:15,541 - INFO - Using 8 processes for tokenization.


Raw datasets loaded: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 9984
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1248
    })
})
Train dataset size: 9984
Validation dataset size: 1248

Raw dataset structure check:
First example keys: ['instruction', 'input', 'output']
First example content: {'instruction': 'Identify any slang in this video subtitle: "You gonna give me a ticket?"', 'input': '', 'output': 'slang detected: gonna\nslang context: You gonna give me a ticket?'}

Sample training examples:
Example 0:
  Instruction: Identify any slang in this video subtitle: "You gonna give me a ticket?"
  Output: slang detected: gonna
slang context: You gonna give me a ticket?

Reformatting dataset with 9984 examples
First example keys before reformatting: ['instruction', 'input', 'output']
Reformatted dataset size: 9984
First example keys after reformatting: ['instruction

2025-04-21 20:34:15,696 - INFO - Tokenization complete.
2025-04-21 20:34:15,698 - INFO - Tokenizing dataset with max_length=512...
2025-04-21 20:34:15,699 - INFO - Using 8 processes for tokenization.


Tokenization complete. Result size: 9984
First tokenized example keys: ['input_ids', 'attention_mask', 'labels']
Starting tokenization of dataset with 1248 examples
Dataset columns: ['instruction', 'input', 'output']


2025-04-21 20:34:15,856 - INFO - Tokenization complete.


Tokenization complete. Result size: 1248
First tokenized example keys: ['input_ids', 'attention_mask', 'labels']
Final dataset sizes: Train=9984, Validation=1248



Sample tokenized example:
Available keys in first example: ['input_ids', 'attention_mask', 'labels']

0. ORIGINAL TEXT (BEFORE TOKENIZATION):
----------------------------------------
Instruction: Identify any slang in this video subtitle: "You gonna give me a ticket?"
Output: slang detected: gonna
slang context: You gonna give me a ticket?

Combined into prompt:
<|im_start|>user
Identify any slang in this video subtitle: "You gonna give me a ticket?"<|im_end|>
<|im_start|>assistant
slang detected: gonna
slang context: You gonna give me a ticket?<|im_end|>
----------------------------------------

1. DECODED TEXT (AFTER TOKENIZATION):
----------------------------------------
<|im_start|>user
Identify any slang in this video subtitle: "You gonna give me a ticket?"<|im_end|>
<|im_start|>assistant
slang detected: gonna
slang co

## 8. Load Base Model with Quantization
Load the model with 4-bit quantization to fit in limited GPU memory.

In [145]:
# ## 8. Load Base Model with Quantization
# Load the model with 4-bit quantization to fit in limited GPU memory

# %%
try:
    # Keyword arguments prepared for AutoModelForCausalLM.from_pretrained().
    model_load_kwargs = dict(trust_remote_code=True)

    if not use_4bit:
        # Un-quantized path: only the weight dtype and device placement are set.
        model_load_kwargs["torch_dtype"] = torch.float16 if not bf16 else torch.bfloat16
        model_load_kwargs["device_map"] = "auto"
        print(f"Loading with torch_dtype: {model_load_kwargs['torch_dtype']}")
    else:
        print("Setting up 4-bit quantization...")
        # The import is only an availability probe; a missing package is reported
        # and then re-raised so the outer handler prints the traceback.
        try:
            import bitsandbytes
        except ImportError:
            print("ERROR: bitsandbytes library not found. Please install with: pip install bitsandbytes")
            raise
        else:
            print("bitsandbytes library found.")

        # NF4 4-bit weights with double quantization; compute dtype follows the bf16 flag.
        quantization_config = BitsAndBytesConfig(
            bnb_4bit_compute_dtype=torch.float16 if not bf16 else torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            load_in_4bit=True,
        )
        model_load_kwargs.update(
            quantization_config=quantization_config,
            device_map="auto",
        )
        print(f"4-bit quantization config: {quantization_config.to_dict()}")

    # Report the first CUDA device, or warn when none is visible.
    if not torch.cuda.is_available():
        print("WARNING: No GPU detected! Training will be very slow.")
    else:
        print(f"GPU available: {torch.cuda.get_device_name(0)}")
        print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

    print(f"Loading base model: {model_cfg['name']}...")
    print("This may take several minutes...")

    # The actual load is left commented out, so this cell only prepares
    # model_load_kwargs; `base_model` is NOT created until these lines are restored.
    # base_model = AutoModelForCausalLM.from_pretrained(
    #     model_cfg["name"],
    #     **model_load_kwargs
    # )
    # print("Base model loaded successfully!")

except Exception as e:
    # Best-effort cell: report the failure with a traceback instead of propagating it.
    print(f"Error loading base model: {e}")
    import traceback
    traceback.print_exc()

Setting up 4-bit quantization...
bitsandbytes library found.
4-bit quantization config: {'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>, '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}
Loading base model: Qwen/Qwen2.5-1.5B-Instruct...
This may take several minutes...


## 9. Training

In [None]:
# Build the output directory, data collator, TrainingArguments and Trainer.
#
# NOTE(review): this cell reads `tok`, `BF16` and `model`, whereas the dataset and
# model-loading cells of this notebook use `tokenizer` and `bf16`, and the base-model
# load is commented out -- confirm these names are defined before running.
import inspect

output_dir = Path(model_cfg['output_dir'])
output_dir.mkdir(parents=True, exist_ok=True)

# mlm=False selects causal-LM collation rather than masked-LM.
collator = DataCollatorForLanguageModeling(tok, mlm=False)

# transformers renamed TrainingArguments' `evaluation_strategy` to `eval_strategy`
# and deprecated the old spelling; use whichever name the installed version accepts
# so the cell works on both older and newer releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

args = TrainingArguments(
    output_dir=str(output_dir),
    per_device_train_batch_size=train_cfg['batch_size'],
    # Evaluation batch size defaults to twice the training batch size.
    per_device_eval_batch_size=train_cfg.get('eval_batch_size', train_cfg['batch_size']*2),
    gradient_accumulation_steps=train_cfg['gradient_accumulation_steps'],
    num_train_epochs=train_cfg['epochs'],
    learning_rate=train_cfg['lr'],
    warmup_ratio=train_cfg['warmup_ratio'],
    # Exactly one of fp16/bf16 mixed precision is enabled, driven by BF16.
    fp16=not BF16,
    bf16=BF16,
    save_strategy='epoch',
    logging_steps=train_cfg.get('logging_steps', 50),
    report_to='none',
    **{_eval_strategy_key: 'epoch'},
)

# Likewise, newer Trainer versions take the tokenizer as `processing_class` and
# deprecate the `tokenizer=` keyword; fall back to the old name when it is absent.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  data_collator=collator,
                  **{_tokenizer_key: tok})

# Uncomment to run
# trainer.train()

## 10. Save Model & Optional Test Evaluation

In [None]:
# trainer.save_model()
# tok.save_pretrained(str(output_dir))

# if 'test' in data_cfg and Path(data_cfg['test']).exists():
#     test_raw = load_dataset('json', data_files={'test': data_cfg['test']})
#     test_ds  = tokenize_dataset(test_raw['test'], tok, train_cfg.get('max_length', 512))
#     trainer.evaluate(eval_dataset=test_ds, metric_key_prefix='test')