# Data Preparation

We have `slang_OpenSub.tsv` and `slang_OpenSub_negatives.tsv` in our `data/raw` folder. We now want to combine them into a single dataset so we can train the model. Our goal is to split the combined dataset 80/10/10 into training/validation/test sets.


In [5]:
# Set Up


import os 
import sys 

# Get the current directory (where the notebook is)
current_dir = os.getcwd()

# Navigate up to the project root (LINGO folder)
# Assuming notebook is in data/processed/text
project_root = os.path.abspath(os.path.join(current_dir, '../../../'))
print(f"Project root: {project_root}")

# Important: Add project root to Python's path
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added {project_root} to Python path")


# Check if src directory exists
src_dir = os.path.join(project_root, 'src')
utils_dir = os.path.join(src_dir, 'utils')
config_file = os.path.join(utils_dir, 'config.py')
print(f"Checking if src exists: {os.path.exists(src_dir)}")
print(f"Checking if utils exists: {os.path.exists(utils_dir)}")
print(f"Checking if config.py exists: {os.path.exists(config_file)}")

# Import config
from src.utils.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Load data
#slang_df = pd.read_csv(os.path.join(RAW_DATA_DIR, 'slang_OpenSub.tsv'), sep='\t')


Project root: c:\Users\jiang\Desktop\Projects\Lingo
Checking if src exists: True
Checking if utils exists: True
Checking if config.py exists: True


In [7]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

## 1.1 Load the Raw Datasets

In [8]:
# Load the slang and non-slang datasets
slang_file = os.path.join(RAW_DATA_DIR, 'slang_OpenSub.tsv')
nonslang_file = os.path.join(RAW_DATA_DIR, 'slang_OpenSub_negatives.tsv')

print(f"Loading data from: {slang_file}")
print(f"Loading data from: {nonslang_file}")

slang_df = pd.read_csv(slang_file, sep='\t')
nonslang_df = pd.read_csv(nonslang_file, sep='\t')

# Print basic dataset information
print(f"Slang dataset shape: {slang_df.shape}")
print(f"Non-slang dataset shape: {nonslang_df.shape}")
print(f"Slang dataset columns: {slang_df.columns.tolist()}")
print(f"Non-slang dataset columns: {nonslang_df.columns.tolist()}")

Loading data from: c:\Users\jiang\Desktop\Projects\Lingo\data\raw\slang_OpenSub.tsv
Loading data from: c:\Users\jiang\Desktop\Projects\Lingo\data\raw\slang_OpenSub_negatives.tsv
Slang dataset shape: (7488, 11)
Non-slang dataset shape: (17512, 6)
Slang dataset columns: ['SENTENCE', 'FULL_CONTEXT', 'SLANG_TERM', 'ANNOTATOR_CONFIDENCE', 'MOVIE_ID', 'SENT_ID', 'REGION', 'YEAR', 'DEFINITION_SENTENCE', 'DEFINITION_SOURCE_URL', 'LITERAL_PARAPHRASE_OF_SLANG']
Non-slang dataset columns: ['SENTENCE', 'FULL_CONTEXT', 'MOVIE_ID', 'SENT_ID', 'REGION', 'YEAR']


## 1.2 Label and Combine the Datasets

In [13]:
## Label and Combine the Datasets

# Add label columns (1 for slang, 0 for non-slang)
slang_df['has_slang'] = 1
nonslang_df['has_slang'] = 0

# Combine into a single dataset
combined_df = pd.concat([slang_df, nonslang_df], ignore_index=True)

#  Check the combined dataset shape, it should have 25000 rows and 12 columns (11 columns from slang_df and the "has_slang" column we just defined)
print(f"Combined dataset shape: {combined_df.shape}")


#  Check the negative dataset entry here should have NaN in Slang term and annotator_confidence columns and etc.
# Columns that non_slang_df didn't have should be NaN
print(f"row 15000: {combined_df.iloc[15000]}")  



Combined dataset shape: (25000, 12)
row 15000: SENTENCE                               When I was nine, I really wanted a horse.
FULL_CONTEXT                   The horse story? <i> When I was nine, I really...
SLANG_TERM                                                                   NaN
ANNOTATOR_CONFIDENCE                                                         NaN
MOVIE_ID                                                                 6692456
SENT_ID                                                                     1556
REGION                                                                        US
YEAR                                                                        2016
DEFINITION_SENTENCE                                                          NaN
DEFINITION_SOURCE_URL                                                        NaN
LITERAL_PARAPHRASE_OF_SLANG                                                  NaN
has_slang                                                     

## 1.3 Split the Dataset into Training, Validation, and Test Sets



In [16]:
## Split the Dataset into Training, Validation, and Test Sets


print("Original class distribution:")
slang_count = combined_df['has_slang'].sum()  # Count all 1s (slang examples)
nonslang_count = len(combined_df) - slang_count  # Count all 0s (non-slang examples)

# Print percentages
print(f"Non-slang examples: {nonslang_count} ({nonslang_count/len(combined_df)*100:.1f}%)")
print(f"Slang examples: {slang_count} ({slang_count/len(combined_df)*100:.1f}%)")

# For high recall, create a 60/40 distribution (slang/non-slang)
target_ratio = 0.6  # Want 60% slang, 40% non-slang

# Separating the data based on the combined_df
# Create two separate dataframes
slang_subset = combined_df[combined_df['has_slang'] == 1]    # Only slang examples
nonslang_subset = combined_df[combined_df['has_slang'] == 0] # Only non-slang examples


# Keep ALL slang examples and adjust non-slang to achieve target ratio

# Calculate required non-slang examples
target_nonslang = int(len(slang_subset) * (1-target_ratio) / target_ratio)
""" 
Math here for the int(len....) calculation above
Example: Suppose we have slang_dataset = 2000 examples, and target ratio = 0.6
# = int(2000 * 0.4 / 0.6)
# = int(2000 * 0.667)
# ≈ 1333 non-slang examples

# Final distribution:
# 2000 slang + 1333 non-slang = 3333 total
# 2000/3333 = 60% slang (approximately)
# 1333/3333 = 40% non-slang (approximately)
""" 
# Sample non-slang (no replacement needed)
nonslang_adjusted = nonslang_subset.sample(n=target_nonslang, random_state=42)

balanced_df = pd.concat([slang_subset, nonslang_adjusted], ignore_index=True)
# Shuffle the balanced dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the new distribution
new_slang_count = balanced_df['has_slang'].sum()
new_nonslang_count = len(balanced_df) - new_slang_count
print("\nAdjusted class distribution:")
print(f"Non-slang examples: {new_nonslang_count} ({new_nonslang_count/len(balanced_df)*100:.1f}%)")
print(f"Slang examples: {new_slang_count} ({new_slang_count/len(balanced_df)*100:.1f}%)")



Original class distribution:
Non-slang examples: 17512 (70.0%)
Slang examples: 7488 (30.0%)

Adjusted class distribution:
Non-slang examples: 4992 (40.0%)
Slang examples: 7488 (60.0%)


In [21]:
print(f"Total number of data in the adjusted dataset with {new_nonslang_count/len(balanced_df)*100:.1f}% non-slang and {new_slang_count/len(balanced_df)*100:.1f}% slang: {len(balanced_df)}")
# Split data using 80/10/10 ratio with stratification to maintain class balance
train_df, temp_df = train_test_split(balanced_df, test_size=0.2, random_state=42, stratify=balanced_df['has_slang'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['has_slang'])

print(f"\nTraining examples: {len(train_df)}")
print(f"Validation examples: {len(val_df)}")
print(f"Test examples: {len(test_df)}")

# Verify class distribution in splits
print(f"\nNow we verify the distribution of slang and non-slang data in the training set:")
print(f"Class distribution in training set:")
print(f"Non-slang: {(train_df['has_slang'] == 0).sum()} ({(train_df['has_slang'] == 0).sum()/len(train_df)*100:.1f}%)")
print(f"Slang: {(train_df['has_slang'] == 1).sum()} ({(train_df['has_slang'] == 1).sum()/len(train_df)*100:.1f}%)")

Total number of data in the adjusted dataset with 40.0% non-slang and 60.0% slang: 12480

Training examples: 9984
Validation examples: 1248
Test examples: 1248

Now we verify the distribution of slang and non-slang data in the training set
Class distribution in training set:
Non-slang: 3994 (40.0%)
Slang: 5990 (60.0%)


## 2. Format the Dataset for Instruction Tuning
