# Data Preparation

We have `slang_OpenSub.tsv` and `slang_OpenSub_negatives.tsv` in our `data/raw` folder. We now want to combine them into a single dataset so we can train the model. Our goal is to split the combined dataset 80/10/10 into training/validation/test sets.


In [5]:
# Set Up
#
# Put the repository root on sys.path so the project's `src` package can be
# imported from this notebook, then read the data directories from config.

import os
import sys

# The kernel's working directory is taken to be the folder holding the notebook.
current_dir = os.getcwd()

# The notebook is assumed to sit in data/processed/text, i.e. three levels
# below the project root (LINGO folder).
project_root = os.path.abspath(
    os.path.join(current_dir, os.pardir, os.pardir, os.pardir)
)
print(f"Project root: {project_root}")

# Insert at the front of sys.path; skipped when the entry is already present
# (e.g. when this cell is re-run in the same kernel).
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added {project_root} to Python path")

# Report whether the expected src/utils/config.py layout is present.
src_dir = os.path.join(project_root, 'src')
utils_dir = os.path.join(src_dir, 'utils')
config_file = os.path.join(utils_dir, 'config.py')
for label, path in (('src', src_dir), ('utils', utils_dir), ('config.py', config_file)):
    print(f"Checking if {label} exists: {os.path.exists(path)}")

# This import has to come after the sys.path update above.
from src.utils.config import RAW_DATA_DIR, PROCESSED_DATA_DIR


Project root: c:\Users\jiang\Desktop\Projects\Lingo
Checking if src exists: True
Checking if utils exists: True
Checking if config.py exists: True


In [7]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

## 1.1 Load the Raw Datasets

In [8]:
# Read the two raw TSV files: sentences containing slang, and the negative
# (non-slang) sentences.
slang_file = os.path.join(RAW_DATA_DIR, 'slang_OpenSub.tsv')
nonslang_file = os.path.join(RAW_DATA_DIR, 'slang_OpenSub_negatives.tsv')

for source_path in (slang_file, nonslang_file):
    print(f"Loading data from: {source_path}")

slang_df = pd.read_csv(slang_file, sep='\t')
nonslang_df = pd.read_csv(nonslang_file, sep='\t')

# Quick overview of what was loaded: shapes first, then the column names.
loaded_frames = (("Slang", slang_df), ("Non-slang", nonslang_df))
for label, frame in loaded_frames:
    print(f"{label} dataset shape: {frame.shape}")
for label, frame in loaded_frames:
    print(f"{label} dataset columns: {frame.columns.tolist()}")

Loading data from: c:\Users\jiang\Desktop\Projects\Lingo\data\raw\slang_OpenSub.tsv
Loading data from: c:\Users\jiang\Desktop\Projects\Lingo\data\raw\slang_OpenSub_negatives.tsv
Slang dataset shape: (7488, 11)
Non-slang dataset shape: (17512, 6)
Slang dataset columns: ['SENTENCE', 'FULL_CONTEXT', 'SLANG_TERM', 'ANNOTATOR_CONFIDENCE', 'MOVIE_ID', 'SENT_ID', 'REGION', 'YEAR', 'DEFINITION_SENTENCE', 'DEFINITION_SOURCE_URL', 'LITERAL_PARAPHRASE_OF_SLANG']
Non-slang dataset columns: ['SENTENCE', 'FULL_CONTEXT', 'MOVIE_ID', 'SENT_ID', 'REGION', 'YEAR']


## 1.2 Label and Combine the Datasets

In [13]:
## Label and Combine the Datasets

# Tag every row with the binary target: 1 = slang, 0 = non-slang.
for frame, label in ((slang_df, 1), (nonslang_df, 0)):
    frame['has_slang'] = label

# Stack the two frames on top of each other with a fresh 0..n-1 index.
frames_to_stack = [slang_df, nonslang_df]
combined_df = pd.concat(frames_to_stack, ignore_index=True)

# Expected shape: 25000 rows and 12 columns (the 11 columns of slang_df plus
# the "has_slang" column added above).
print(f"Combined dataset shape: {combined_df.shape}")

# Spot-check a row that comes from the negative dataset: the columns that
# nonslang_df did not have (SLANG_TERM, ANNOTATOR_CONFIDENCE, ...) should be NaN.
sample_row = combined_df.iloc[15000]
print(f"row 15000: {sample_row}")



Combined dataset shape: (25000, 12)
row 15000: SENTENCE                               When I was nine, I really wanted a horse.
FULL_CONTEXT                   The horse story? <i> When I was nine, I really...
SLANG_TERM                                                                   NaN
ANNOTATOR_CONFIDENCE                                                         NaN
MOVIE_ID                                                                 6692456
SENT_ID                                                                     1556
REGION                                                                        US
YEAR                                                                        2016
DEFINITION_SENTENCE                                                          NaN
DEFINITION_SOURCE_URL                                                        NaN
LITERAL_PARAPHRASE_OF_SLANG                                                  NaN
has_slang                                                     

## 1.3 Split the Dataset into Training, Validation, and Test Sets



In [16]:
## Rebalance the classes (done before the train/validation/test split)

print("Original class distribution:")
slang_count = combined_df['has_slang'].sum()  # Count all 1s (slang examples)
nonslang_count = len(combined_df) - slang_count  # Count all 0s (non-slang examples)

# Print percentages
print(f"Non-slang examples: {nonslang_count} ({nonslang_count/len(combined_df)*100:.1f}%)")
print(f"Slang examples: {slang_count} ({slang_count/len(combined_df)*100:.1f}%)")

# For high recall, create a 60/40 distribution (slang/non-slang)
target_ratio = 0.6  # Want 60% slang, 40% non-slang

# Separate the two classes so the non-slang side can be down-sampled on its own
slang_subset = combined_df[combined_df['has_slang'] == 1]    # Only slang examples
nonslang_subset = combined_df[combined_df['has_slang'] == 0] # Only non-slang examples

# Keep ALL slang examples and adjust non-slang to achieve the target ratio.
#
# Required non-slang count = slang_count * (1 - target_ratio) / target_ratio
# Example: with 2000 slang examples and target_ratio = 0.6
#   int(2000 * 0.4 / 0.6) = int(1333.33...) = 1333 non-slang examples
#   Final distribution: 2000 slang + 1333 non-slang = 3333 total
#   2000/3333 = 60% slang (approximately)
#   1333/3333 = 40% non-slang (approximately)
target_nonslang = int(len(slang_subset) * (1-target_ratio) / target_ratio)

# Sampling without replacement cannot draw more rows than exist; cap the
# request (and say so) instead of letting DataFrame.sample raise ValueError.
if target_nonslang > len(nonslang_subset):
    print(
        f"Warning: only {len(nonslang_subset)} non-slang examples available "
        f"(wanted {target_nonslang}); using all of them, so the target ratio is not reached."
    )
    target_nonslang = len(nonslang_subset)

# Sample non-slang (no replacement needed); fixed seed for reproducibility
nonslang_adjusted = nonslang_subset.sample(n=target_nonslang, random_state=42)

balanced_df = pd.concat([slang_subset, nonslang_adjusted], ignore_index=True)
# Shuffle the balanced dataset so the two classes are interleaved
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the new distribution
new_slang_count = balanced_df['has_slang'].sum()
new_nonslang_count = len(balanced_df) - new_slang_count
print("\nAdjusted class distribution:")
print(f"Non-slang examples: {new_nonslang_count} ({new_nonslang_count/len(balanced_df)*100:.1f}%)")
print(f"Slang examples: {new_slang_count} ({new_slang_count/len(balanced_df)*100:.1f}%)")



Original class distribution:
Non-slang examples: 17512 (70.0%)
Slang examples: 7488 (30.0%)

Adjusted class distribution:
Non-slang examples: 4992 (40.0%)
Slang examples: 7488 (60.0%)


In [21]:
print(f"Total number of data in the adjusted dataset with {new_nonslang_count/len(balanced_df)*100:.1f}% non-slang and {new_slang_count/len(balanced_df)*100:.1f}% slang: {len(balanced_df)}")

# 80/10/10 split: hold out 20% first, then cut that hold-out in half to get
# the validation and test sets. Both steps stratify on the label so the class
# balance is maintained.
train_df, temp_df = train_test_split(
    balanced_df,
    test_size=0.2,
    random_state=42,
    stratify=balanced_df['has_slang'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['has_slang'],
)

print(f"\nTraining examples: {len(train_df)}")
print(f"Validation examples: {len(val_df)}")
print(f"Test examples: {len(test_df)}")

# Check the class distribution of the training split
train_total = len(train_df)
train_nonslang = (train_df['has_slang'] == 0).sum()
train_slang = (train_df['has_slang'] == 1).sum()
print(f"\nNow we verify the distribution of slang and non-slang data in the training set:")
print(f"Class distribution in training set:")
print(f"Non-slang: {train_nonslang} ({train_nonslang/train_total*100:.1f}%)")
print(f"Slang: {train_slang} ({train_slang/train_total*100:.1f}%)")

Total number of data in the adjusted dataset with 40.0% non-slang and 60.0% slang: 12480

Training examples: 9984
Validation examples: 1248
Test examples: 1248

Now we verify the distribution of slang and non-slang data in the training set
Class distribution in training set:
Non-slang: 3994 (40.0%)
Slang: 5990 (60.0%)


## 2. Format the Dataset for Instruction Tuning


Design Choice:

**Option 1: Single Model to detect both slang and provide definitions.**
Train the Qwen model to both detect slang AND provide definitions simultaneously.

Pros:

- Simpler architecture (one API call)
- Lower latency
- Works offline

Cons:

- Limited to definitions available in your training data
- Can't easily update definitions without retraining


**Option 2: Two-Model Approach**
Fine-tune the model only to detect slang, then use GPT-4o or another API to supply definitions and extra capabilities such as examples and synonyms.

Pros:   

- Leverages latest models like GPT-4o's comprehensive knowledge base
- Can generate creative examples and more detailed explanations
- Definitions stay up-to-date with API updates

Cons:

- Additional API cost (GPT-4o usage)
- Higher latency (two sequential API calls)
- Requires internet connection
- More complex architecture

**We will use Option 2 for now.**


In [22]:
def format_example(row):
    """
    Format a subtitle row into an instruction-output pair for slang detection.

    Args:
        row: A pandas Series (any mapping that supports ``[]`` and ``.get``
            works) containing 'SENTENCE', 'has_slang', and optionally
            'SLANG_TERM'.

    Returns:
        dict: Formatted example with "instruction", "input" (always an empty
        string) and "output" fields.

    Notes:
        If 'SLANG_TERM' is absent, or present but missing (NaN/None), the
        term is reported as 'unknown'.

    Example:
        Input row:
        {
            'SENTENCE': 'That party was lit',
            'has_slang': 1,
            'SLANG_TERM': 'lit'
        }

        Returns:
        {
            'instruction': 'Identify any slang in this video subtitle: "That party was lit"',
            'input': '',
            'output': 'slang detected: lit' + newline + 'slang context: That party was lit'
        }

        For the non-slang case (has_slang = 0):
        {
            'instruction': 'Identify any slang in this video subtitle: "The weather is nice"',
            'input': '',
            'output': 'no slang detected'
        }
    """

    # Clear instruction for slang detection
    instruction = f"Identify any slang in this video subtitle: \"{row['SENTENCE']}\""

    if row['has_slang'] == 1:
        slang_term = row.get('SLANG_TERM', 'unknown')
        # .get() only falls back when the key is absent. A key that exists but
        # holds NaN/None would otherwise be rendered as the literal text "nan".
        if pd.isna(slang_term):
            slang_term = 'unknown'

        # Simple, consistent two-line format that's easy to parse
        response = f"slang detected: {slang_term}\n"
        response += f"slang context: {row['SENTENCE']}"
    else:
        response = "no slang detected"

    return {
        "instruction": instruction,
        "input": "",
        "output": response
    }

# Turn every split into a list of instruction-tuning records
print("Formatting examples for instruction tuning...")

# Each row of a split is passed through format_example; iterrows() yields
# (index, row) pairs and the index is not needed.
splits_to_format = {"train": train_df, "val": val_df, "test": test_df}
formatted_by_split = {
    split_name: [format_example(record) for _, record in frame.iterrows()]
    for split_name, frame in splits_to_format.items()
}
train_formatted = formatted_by_split["train"]
val_formatted = formatted_by_split["val"]
test_formatted = formatted_by_split["test"]
print("Formatting complete!")

# Show one example of each kind to verify the formatting.
# next(filter(...)) returns the first training record whose output contains
# the marker text.
print("\nExample of formatted training data (slang example):")
slang_example = next(filter(lambda ex: "slang detected:" in ex["output"], train_formatted))
print(f"Instruction: {slang_example['instruction']}")
print(f"Output: {slang_example['output']}")

print("\nExample of formatted training data (non-slang example):")
nonslang_example = next(filter(lambda ex: "no slang detected" in ex["output"], train_formatted))
print(f"Instruction: {nonslang_example['instruction']}")
print(f"Output: {nonslang_example['output']}")

Formatting examples for instruction tuning...
Formatting complete!

Example of formatted training data (slang example):
Instruction: Identify any slang in this video subtitle: "You gonna give me a ticket?"
Output: slang detected: gonna
slang context: You gonna give me a ticket?

Example of formatted training data (non-slang example):
Instruction: Identify any slang in this video subtitle: "No, the woman I lived with."
Output: no slang detected


In [25]:
# Save Processed Data


# Define output paths
train_output = os.path.join(PROCESSED_DATA_DIR, 'train.json')
val_output = os.path.join(PROCESSED_DATA_DIR, 'val.json')
test_output = os.path.join(PROCESSED_DATA_DIR, 'test.json')
sample_output = os.path.join(PROCESSED_DATA_DIR, 'sample.json')

# Create the output folder if it is missing so open() below cannot fail with
# FileNotFoundError.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Save to JSON files (json is already imported in the imports cell near the top)
print("Saving processed datasets...")

# Full splits are written compactly. The encoding is pinned so the files do
# not depend on the platform's default text encoding.
for output_path, records in (
    (train_output, train_formatted),
    (val_output, val_formatted),
    (test_output, test_formatted),
):
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(records, f)

# Save a small sample with indentation for easy inspection
with open(sample_output, 'w', encoding='utf-8') as f:
    # Save 5 examples with formatting for readability
    json.dump(train_formatted[:5], f, indent=2)

print(f"Saved training data to: {train_output}")
print(f"Saved validation data to: {val_output}")
print(f"Saved test data to: {test_output}")
print(f"Saved sample data to: {sample_output}")

Saving processed datasets...
Saved training data to: c:\Users\jiang\Desktop\Projects\Lingo\data\processed\text\train.json
Saved validation data to: c:\Users\jiang\Desktop\Projects\Lingo\data\processed\text\val.json
Saved test data to: c:\Users\jiang\Desktop\Projects\Lingo\data\processed\text\test.json
Saved sample data to: c:\Users\jiang\Desktop\Projects\Lingo\data\processed\text\sample.json


In [None]:
# Step 7: Create Training Configuration and Update existing YAML configuration files in .configs folder
import yaml
import os
from src.utils.config import PROJECT_ROOT

def safe_update_yaml(file_path, new_config, section_name):
    """
    Safely update one top-level section of a YAML file, preserving the others.

    The file is loaded (a missing or empty file counts as an empty config),
    the section ``section_name`` is replaced with ``new_config[section_name]``,
    and the whole mapping is written back.

    Args:
        file_path (str): Path to the YAML file
        new_config (dict): Dictionary containing the new configuration; it
            must have ``section_name`` as a key
        section_name (str): Name of the section to update

    Returns:
        str: Path to the updated file

    Raises:
        KeyError: if ``section_name`` is not a key of ``new_config``
        TypeError: if the existing file's top level is not a mapping

    Example:
    If we have a YAML file like this:
        text_slang_detector:
            lr: 0.0002
            epochs: 3
            # Other parameters...

        video_slang_detector:  # Future addition
            lr: 0.0001
            epochs: 5
            # Other parameters...

    When we update text_slang_detector we do not want to overwrite or delete
    other keys (like video_slang_detector and all its parameters). That is
    why this function rewrites only the requested section.

    Note: the file is round-tripped through the YAML parser, so comments in
    the existing file are not carried over to the rewritten file.
    """
    # Validate the input before touching the file.
    if section_name not in new_config:
        raise KeyError(f"section '{section_name}' not found in new_config")

    # 1. Try to load existing config
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            existing_config = yaml.safe_load(f) or {}
    except FileNotFoundError:
        # If file doesn't exist, start with empty config
        existing_config = {}

    # A YAML document whose root is a list or scalar has no sections to update.
    if not isinstance(existing_config, dict):
        raise TypeError(
            f"expected a mapping at the top level of {file_path}, "
            f"got {type(existing_config).__name__}"
        )

    # 2. Update only the specified section
    existing_config[section_name] = new_config[section_name]

    # 3. Serialize first, then write. Opening with 'w' truncates the file, so
    #    dumping straight into it would wipe every section if serialization
    #    failed part-way through.
    serialized = yaml.safe_dump(existing_config, default_flow_style=False)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    return file_path

# Each settings dict is nested under the "text_slang_detector" key so that other
# sections of the same YAML file (e.g. a future video_slang_detector) are not
# affected when the file is updated.

# 1. Update training_config.yaml
training_params = {
    "text_slang_detector": dict(
        lr=2e-4,
        epochs=3,
        batch_size=4,
        gradient_accumulation_steps=4,
        warmup_ratio=0.03,
        max_length=512,
    )
}

training_config_path = os.path.join(PROJECT_ROOT, 'configs', 'training_config.yaml')
safe_update_yaml(training_config_path, training_params, "text_slang_detector")
print(f"Updated training configuration in: {training_config_path}")

# 2. Update model_config.yaml
lora_settings = dict(
    r=16,
    alpha=32,
    dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model_params = {
    "text_slang_detector": dict(
        name="Qwen/Qwen2.5-1.5B-Instruct",
        lora_params=lora_settings,
        output_dir=os.path.join(PROJECT_ROOT, "models", "text_slang_detector"),
    )
}

model_config_path = os.path.join(PROJECT_ROOT, 'configs', 'model_config.yaml')
safe_update_yaml(model_config_path, model_params, "text_slang_detector")
print(f"Updated model configuration in: {model_config_path}")

# 3. Update data_config.yaml with the absolute paths of the saved splits
split_files = (("train", train_output), ("validation", val_output), ("test", test_output))
data_params = {
    "text_slang_detector": {
        split_name: os.path.abspath(split_path) for split_name, split_path in split_files
    }
}

data_config_path = os.path.join(PROJECT_ROOT, 'configs', 'data_config.yaml')
safe_update_yaml(data_config_path, data_params, "text_slang_detector")
print(f"Updated data paths in: {data_config_path}")

Updated training configuration in: c:\Users\jiang\Desktop\Projects\Lingo\configs\training_config.yaml
Updated model configuration in: c:\Users\jiang\Desktop\Projects\Lingo\configs\model_config.yaml
Updated data paths in: c:\Users\jiang\Desktop\Projects\Lingo\configs\data_config.yaml
