In [2]:
# Install the third-party packages this notebook imports (pandas, datasets, raid-bench).
# NOTE(review): versions are not pinned, so a fresh run may pull different releases.
%pip install pandas datasets raid-bench

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
from datasets import load_dataset
from raid.utils import load_data as load_raid_data
import os

In [6]:
# Stream the RAID train split and materialise only its first 10,000 records.
ds = load_dataset('liamdugan/raid', split='train', streaming=True)

# Keep the generated text and its source model, then derive:
#   label -> 0 when the model field reads "human" (case-insensitive), otherwise 1
#   text  -> same text with U+2028 / U+2029 separators replaced by a plain space
df_raid = (
    pd.DataFrame(list(ds.take(10000)))
    .loc[:, ['generation', 'model']]
    .rename(columns={'generation': 'text'})
    .assign(
        label=lambda frame: frame['model'].map(lambda m: int(str(m).lower() != 'human')),
        text=lambda frame: frame['text'].str.replace(r'[\u2028\u2029]', ' ', regex=True),
    )
)

# Persist a local snapshot of the cleaned sample and report its size.
df_raid.to_csv("raid_local.csv", index=False)
print(f"RAID Loaded: {len(df_raid)} rows")

RAID Loaded: 10000 rows


In [None]:
# Load the first 10,000 DAIGT rows, keeping only the essay text and its label.
# `usecols` / `nrows` let the parser stop early instead of reading the whole file
# and discarding most of it afterwards. The trailing column selection pins the
# column order to (text, label), since `usecols` itself does not reorder columns.
# NOTE(review): dtypes are now inferred from the 10,000 rows actually read rather
# than from the full file — confirm nothing downstream relied on the old inference.
df_daigt = pd.read_csv(
    'train_v2_drcat_02.csv',
    usecols=['text', 'label'],
    nrows=10000,
)[['text', 'label']]

# Replace Unicode line/paragraph separators (U+2028 / U+2029) with a plain space.
df_daigt['text'] = df_daigt['text'].str.replace(r'[\u2028\u2029]', ' ', regex=True)
print(f"DAIGT Loaded: {len(df_daigt)} rows")

DAIGT Loaded: 10000 rows


In [15]:
#### Combining the RAID and DAIGT datasets

In [11]:
# Stack the two sources row-wise; columns present in only one frame
# (e.g. RAID's `model`) are filled with NaN for the other frame's rows.
df_master = pd.concat([df_raid, df_daigt], ignore_index=True)

print(f"Rows after stacking: {len(df_master)}")

# Count rows whose text already appeared earlier in the stacked frame.
duplicate_count = df_master.duplicated(subset=['text']).sum()
print(f"Found {duplicate_count} duplicate essays.")

# Keep the first occurrence of each distinct text.
df_master = df_master.drop_duplicates(subset=['text'])

# Compute the class distribution once; `.get(..., 0)` reports 0 instead of
# raising KeyError if one of the classes is absent after de-duplication.
label_counts = df_master['label'].value_counts()

print(f"Total Unique Rows: {len(df_master)}")
print(f"Total Rows: {len(df_master)}")
print(f"AI samples: {label_counts.get(1, 0)}")
print(f"Human samples: {label_counts.get(0, 0)}")

Rows after stacking: 20000
Found 0 duplicate essays.
Total Unique Rows: 20000
Total Rows: 20000
AI samples: 9017
Human samples: 10983


#### balancing the data

In [14]:
# Shuffle every row with a fixed seed and renumber the index.
# NOTE: this overwrites `df_master`, so re-running the cell without re-running
# the previous one shuffles the already-shuffled frame and selects different rows.
df_master = df_master.sample(frac=1, random_state=42).reset_index(drop=True)

# Size of the rarest class; each class is truncated to this many rows.
min_size = df_master['label'].value_counts().min()

# After the shuffle, the first `min_size` rows of each label form the balanced subset.
df_balanced = df_master.groupby('label').head(min_size)

# Tally each class via a boolean mask.
ai_count = int((df_balanced['label'] == 1).sum())
human_count = int((df_balanced['label'] == 0).sum())

df_balanced.to_csv("master_training_data.csv", index=False)

print(f"Final Count: {len(df_balanced)} rows ({ai_count:,} AI & {human_count:,} Human)")

Final Count: 18034 rows (9,017 AI & 9,017 Human)
