In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw annotations and discard the annotator column in one pass.
# NOTE(review): errors='ignore' makes the drop a silent no-op if the CSV
# spells the column differently from 'annotaters' — confirm against the file.
df = (
    pd.read_csv('dataset.csv')
    .drop(columns=['annotaters'], errors='ignore')
)

In [None]:
def preprocess_data(df):
    """Return a cleaned copy of ``df`` with numeric ``hate`` and ``target`` columns.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column (``'h'`` / ``'nh'``) and a ``target``
        column (``'p'`` / ``'e'`` / ``'r'``, or anything else for "no target").
        Both columns are matched case-insensitively, ignoring surrounding
        whitespace.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where

        * ``hate`` is 1 for ``'h'``, 0 for ``'nh'`` and NaN for any label
          that is not recognised (left as NaN so it can be detected later
          instead of being silently guessed);
        * ``target`` is 0 / 1 / 2 for ``'p'`` / ``'e'`` / ``'r'`` and -100
          when there is no valid target;
        * a hate row without a valid target is relabelled as non-hate;
        * every non-hate row has ``target == -100``.
    """
    df_clean = df.copy()

    hate_map = {'h': 1, 'nh': 0}  # binary hate, non-hate
    target_map = {'p': 0, 'e': 1, 'r': 2}  # numeric mapping of target categories

    def _normalise(column):
        # Casting to str first keeps the .str accessor usable even when the
        # column holds only missing values (float dtype); missing entries
        # never match a mapping key, so they still end up unmapped.
        return column.astype(str).str.strip().str.lower()

    # Normalise labels the same way as targets so ' H ' and 'h' agree.
    df_clean['hate'] = _normalise(df_clean['label']).map(hate_map)

    # .map() already yields NaN for anything outside target_map; those become
    # the -100 "no target" sentinel.
    df_clean['target'] = (
        _normalise(df_clean['target'])
        .map(target_map)
        .fillna(-100)
        .astype(int)
    )

    # A hate sample without a usable target is treated as non-hate.
    invalid_hate_mask = (df_clean['hate'] == 1) & (df_clean['target'] == -100)
    df_clean.loc[invalid_hate_mask, 'hate'] = 0

    # non-hate labels have no target: drop any target attached to them.
    df_clean.loc[df_clean['hate'] == 0, 'target'] = -100

    return df_clean

In [None]:
def validate_dataset(df):
    """Assert that a preprocessed frame satisfies the label/target invariants.

    Checks, in order:

    1. ``text``, ``hate`` and ``target`` contain no missing values;
    2. ``hate`` only holds 0 or 1;
    3. ``target`` only holds -100, 0, 1 or 2;
    4. every row with ``hate == 0`` has ``target == -100``.

    Prints a confirmation message when every check passes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``text``, ``hate`` and ``target`` columns.

    Raises
    ------
    AssertionError
        On the first violated check, with a message naming the problem.
        Note that ``assert`` statements are skipped when Python runs with -O.
    """
    # Missing-value checks run first: a NaN would also trip the membership
    # checks below, but with a less specific message.
    assert not df['text'].isna().any(), "NaN in text column"
    assert not df['hate'].isna().any(), "NaN in hate column"
    assert not df['target'].isna().any(), "NaN in target column"

    assert set(df['hate'].unique()).issubset({0, 1}), f"Invalid hate labels: {df['hate'].unique()}"

    valid_targets = {-100, 0, 1, 2}
    invalid_targets = set(df['target'].unique()) - valid_targets
    assert not invalid_targets, f"Invalid targets detected: {invalid_targets}"

    # Non-hate rows must carry the -100 "no target" sentinel.
    nh_mask = df['hate'] == 0
    assert (df.loc[nh_mask, 'target'] == -100).all(), "Non-hate samples have invalid targets"

    print("All dataset validation checks passed!")

In [34]:
# Build the cleaned frame from the raw `df`, then run the assertion-based
# checks on it; an AssertionError here means the cleaned data is inconsistent.
df_clean = preprocess_data(df)

validate_dataset(df_clean)

All dataset validation checks passed!
