# 02 - Data Preprocessing
This notebook handles text preprocessing and data preparation for model training.

## Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [2]:
# Load training data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

print(f"Original train shape: {train_df.shape}")
print(f"Original test shape: {test_df.shape}")

Original train shape: (57477, 9)
Original test shape: (3, 4)


## Text Cleaning Functions

In [3]:
def clean_text(text):
    """
    Clean text by:
    - Converting to lowercase
    - Removing special characters
    - Removing extra whitespace
    """
    if pd.isna(text):
        return ""
    
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

print("Text cleaning function defined.")

Text cleaning function defined.


## Apply Text Cleaning

In [None]:
# Ensure dependencies are available
import pandas as pd
import re

# Define clean_text function if not already defined
if 'clean_text' not in locals():
    def clean_text(text):
        """
        Clean text by:
        - Converting to lowercase
        - Removing special characters
        - Removing extra whitespace
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = str(text).lower()
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text

# Ensure data is loaded
if 'train_df' not in locals():
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    print("Data loaded.")

# Apply cleaning to training data
print("Cleaning text columns...")
text_columns = ['prompt', 'response_a', 'response_b']

for col in text_columns:
    if col in train_df.columns:
        print(f"Cleaning {col}...")
        train_df[f'{col}_clean'] = train_df[col].apply(clean_text)
        test_df[f'{col}_clean'] = test_df[col].apply(clean_text)

print("\nCleaning complete!")
print(f"\nExample of cleaning (prompt):")
if 'prompt' in train_df.columns:
    print(f"Original: {train_df['prompt'].iloc[0][:100]}...")
    print(f"Cleaned: {train_df['prompt_clean'].iloc[0][:100]}...")

Data loaded.
Cleaning text columns...
Cleaning prompt...


NameError: name 'clean_text' is not defined

## Handle Missing Values

In [2]:
# Check for empty strings after cleaning
empty_masks = []
for col in ['prompt_clean', 'response_a_clean', 'response_b_clean']:
    if col in train_df.columns:
        empty_masks.append(train_df[col].str.len() == 0)

if empty_masks:
    combined_empty = empty_masks[0]
    for mask in empty_masks[1:]:
        combined_empty = combined_empty | mask
    
    print(f"Rows with empty text fields after cleaning: {combined_empty.sum()}")
    
    # Remove rows with empty text
    train_df = train_df[~combined_empty].reset_index(drop=True)

print(f"Train shape after cleaning: {train_df.shape}")

Train shape after cleaning: (57477, 9)


## Encode Labels

In [4]:
# Create target label based on winner columns
# We'll create a target that indicates which model won

if 'winner_model_a' in train_df.columns and 'winner_model_b' in train_df.columns:
    # Create a label: 0 for model_a wins, 1 for model_b wins, 2 for tie
    train_df['target'] = -1  # Initialize
    
    # Check if any column is truly binary (contains True/False or 1/0)
    a_sum = train_df['winner_model_a'].sum()
    b_sum = train_df['winner_model_b'].sum()
    
    if a_sum > 0 or b_sum > 0:
        # Columns contain numeric values
        train_df.loc[train_df['winner_model_a'] == 1, 'target'] = 0
        train_df.loc[train_df['winner_model_b'] == 1, 'target'] = 1
        if 'winner_tie' in train_df.columns:
            train_df.loc[train_df['winner_tie'] == 1, 'target'] = 2
    else:
        # Columns might contain boolean values
        train_df.loc[train_df['winner_model_a'] == True, 'target'] = 0
        train_df.loc[train_df['winner_model_b'] == True, 'target'] = 1
        if 'winner_tie' in train_df.columns:
            train_df.loc[train_df['winner_tie'] == True, 'target'] = 2
    
    print("Target label created:")
    print(f"  Model A wins (0): {(train_df['target'] == 0).sum()}")
    print(f"  Model B wins (1): {(train_df['target'] == 1).sum()}")
    print(f"  Tie (2): {(train_df['target'] == 2).sum()}")
else:
    print("Winner columns not found.")

Target label created:
  Model A wins (0): 20064
  Model B wins (1): 19652
  Tie (2): 17761


## Create Train/Validation Split

In [2]:
# Ensure data is loaded and preprocessed
import pandas as pd
from sklearn.model_selection import train_test_split

if 'train_df' not in locals():
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    print("Data loaded.")

# Ensure target column exists
if 'target' not in train_df.columns:
    if 'winner_model_a' in train_df.columns and 'winner_model_b' in train_df.columns:
        # Create a label: 0 for model_a wins, 1 for model_b wins, 2 for tie
        train_df['target'] = -1  # Initialize
        
        # Check if any column is truly binary (contains True/False or 1/0)
        a_sum = train_df['winner_model_a'].sum()
        b_sum = train_df['winner_model_b'].sum()
        
        if a_sum > 0 or b_sum > 0:
            # Columns contain numeric values
            train_df.loc[train_df['winner_model_a'] == 1, 'target'] = 0
            train_df.loc[train_df['winner_model_b'] == 1, 'target'] = 1
            if 'winner_tie' in train_df.columns:
                train_df.loc[train_df['winner_tie'] == 1, 'target'] = 2
        else:
            # Columns might contain boolean values
            train_df.loc[train_df['winner_model_a'] == True, 'target'] = 0
            train_df.loc[train_df['winner_model_b'] == True, 'target'] = 1
            if 'winner_tie' in train_df.columns:
                train_df.loc[train_df['winner_tie'] == True, 'target'] = 2
        
        print("Target label created.")

# Create train/validation split
if 'target' in train_df.columns:
    train, val = train_test_split(
        train_df,
        test_size=0.2,
        random_state=42,
        stratify=train_df['target']
    )
else:
    train, val = train_test_split(
        train_df,
        test_size=0.2,
        random_state=42
    )

print(f"Training set size: {len(train)}")
print(f"Validation set size: {len(val)}")
if 'target' in train_df.columns:
    print(f"\nTarget distribution in train set:")
    print(train['target'].value_counts().sort_index())
    print(f"\nTarget distribution in validation set:")
    print(val['target'].value_counts().sort_index())

Data loaded.
Target label created.
Training set size: 45981
Validation set size: 11496

Target distribution in train set:
target
0    16051
1    15721
2    14209
Name: count, dtype: int64

Target distribution in validation set:
target
0    4013
1    3931
2    3552
Name: count, dtype: int64


## Save Processed Data

In [3]:
# Save processed data
train.to_csv('../data/train_processed.csv', index=False)
val.to_csv('../data/val_processed.csv', index=False)
test_df.to_csv('../data/test_processed.csv', index=False)

print("Processed data saved!")
print(f"  - ../data/train_processed.csv")
print(f"  - ../data/val_processed.csv")
print(f"  - ../data/test_processed.csv")

Processed data saved!
  - ../data/train_processed.csv
  - ../data/val_processed.csv
  - ../data/test_processed.csv
