In [3]:
import pandas as pd

# Category columns that are folded into the single 'label' column below.
toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the raw training data.
df = pd.read_csv("train.csv")

# 'label' is the row-wise maximum over the category columns, so a comment is
# marked 1 as soon as any single category is 1. Only the comment text and
# that label are carried forward.
df = df.assign(label=df[toxicity_columns].max(axis=1))[['comment_text', 'label']]

# Report how many comments fall under each label value.
label_counts = df['label'].value_counts()
print("Label distribution:\n", label_counts)

# Persist the reduced frame (row index omitted) for the later steps.
df.to_csv("cleaned_train.csv", index=False)


Label distribution:
 label
0    143346
1     16225
Name: count, dtype: int64


In [5]:
# Balance the two classes by downsampling to the size of the smaller class.
# A single fixed seed keeps both the sampled rows and the final shuffle
# reproducible across runs.
BALANCE_SEED = 42

df_toxic = df[df['label'] == 1]
df_non_toxic = df[df['label'] == 0]

# Never request more rows than a class holds: DataFrame.sample raises
# ValueError when n exceeds the population (sampling without replacement),
# which is what happened before if label 0 was not the larger class.
n_per_class = min(len(df_toxic), len(df_non_toxic))

# When label 0 is the majority this draws exactly the rows the previous
# version drew; the toxic rows are only sampled if they are the larger class.
df_non_toxic = df_non_toxic.sample(n=n_per_class, random_state=BALANCE_SEED)
if len(df_toxic) > n_per_class:
    df_toxic = df_toxic.sample(n=n_per_class, random_state=BALANCE_SEED)

# Combine both classes and shuffle so the labels are interleaved.
df_balanced = pd.concat([df_non_toxic, df_toxic]).sample(frac=1, random_state=BALANCE_SEED)

# Save the balanced dataset (row index omitted).
df_balanced.to_csv("balanced_train.csv", index=False)

print("Balanced dataset saved. New distribution:")
print(df_balanced['label'].value_counts())


Balanced dataset saved. New distribution:
label
0    16225
1    16225
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split

# Load the balanced dataset.
# keep_default_na=False: by default read_csv converts fields such as "NA",
# "null", "nan" or an empty field to float NaN. A comment whose whole text is
# one of those tokens must stay a string, otherwise the text lists below end
# up containing floats.
df = pd.read_csv("balanced_train.csv", keep_default_na=False)

# Split dataset into train and validation sets (80% train, 20% validation).
# stratify on the label so both splits keep the same class ratio as the
# balanced input instead of leaving it to chance.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['comment_text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Save split data (the text column is renamed to 'text' in the output files).
pd.DataFrame({'text': train_texts, 'label': train_labels}).to_csv("train_split.csv", index=False)
pd.DataFrame({'text': val_texts, 'label': val_labels}).to_csv("val_split.csv", index=False)

print(f"Train samples: {len(train_texts)}, Validation samples: {len(val_texts)}")


Train samples: 25960, Validation samples: 6490
