In [84]:
%pip install pandas datasets raid-bench

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [85]:
import pandas as pd
from datasets import load_dataset
from raid.utils import load_data as load_raid_data
import os

In [86]:
# -----------------------------
# CONFIGURATION
# -----------------------------
MIN_WORDS = 20         # drop very short texts
MAX_WORDS = 800        # drop extremely long texts
SHORT_SAMPLE_COUNT = 3000
SHORT_MIN_WORDS = 30
SHORT_MAX_WORDS = 80
MIN_TEXT_CHARS = 20    # minimum characters for any text


In [87]:
ds = load_dataset('liamdugan/raid', split='train', streaming=True)
df_raid = pd.DataFrame(list(ds.take(10000)))

# Basic cleaning
df_raid = df_raid[['generation', 'model']].rename(columns={'generation': 'text'})
df_raid['label'] = df_raid['model'].apply(lambda x: 0 if str(x).lower() == 'human' else 1)

df_raid['text'] = df_raid['text'].str.replace(r'[\u2028\u2029]', ' ', regex=True)
df_raid.to_csv("../data/raid_local.csv", index=False)
print(f"RAID Loaded: {len(df_raid)} rows")

RAID Loaded: 10000 rows


In [88]:
df_daigt = pd.read_csv('../data/train_v2_drcat_02.csv').head(10000)[['text', 'label']]
df_daigt['model'] = 'unknown'  # Add model column for consistency

df_daigt['text'] = df_daigt['text'].str.replace(r'[\u2028\u2029]', ' ', regex=True)
print(f"DAIGT Loaded: {len(df_daigt)} rows")

DAIGT Loaded: 10000 rows


In [89]:
#### joining csv files

In [90]:
df_master = pd.concat([df_raid, df_daigt], ignore_index=True)

print(f"Rows after stacking: {len(df_master)}")

duplicate_count = df_master.duplicated(subset=['text']).sum()
print(f"Found {duplicate_count} duplicate essays.")

df_master = df_master.drop_duplicates(subset=['text'])

print(f"Total Unique Rows: {len(df_master)}")
print(f"Total Rows: {len(df_master)}")
print(f"AI samples: {df_master['label'].value_counts()[1]}")
print(f"Human samples: {df_master['label'].value_counts()[0]}")

Rows after stacking: 20000
Found 0 duplicate essays.
Total Unique Rows: 20000
Total Rows: 20000
AI samples: 9017
Human samples: 10983


#### balancing the data

In [91]:
df_master = df_master.sample(frac=1, random_state=42).reset_index(drop=True)

min_size = min(df_master['label'].value_counts())
df_balanced = df_master.groupby('label').head(min_size)

df_balanced.to_csv("../data/master_training_data.csv", index=False)

ai_count = len(df_balanced[df_balanced['label'] == 1])
human_count = len(df_balanced[df_balanced['label'] == 0])

print(f"Final Count: {len(df_balanced)} rows ({ai_count:,} AI & {human_count:,} Human)")
print("Saved to: ../data/master_training_data.csv")

Final Count: 18034 rows (9,017 AI & 9,017 Human)
Saved to: ../data/master_training_data.csv


## Adding HC3 Dataset (Human vs ChatGPT Comparison)
Source: https://huggingface.co/datasets/Hello-SimpleAI/HC3

This dataset contains ~24,000 Q&A pairs with human answers vs ChatGPT answers.

In [92]:
# Download HC3 dataset (JSONL export)
print("Downloading HC3 dataset from Hugging Face...")

from huggingface_hub import HfFileSystem

fs = HfFileSystem()

# Find JSONL files in the dataset repo
all_files = fs.find("datasets/Hello-SimpleAI/HC3")
jsonl_files = [f for f in all_files if f.endswith(".jsonl")]

if not jsonl_files:
    print("No JSONL files found for HC3.")
    print("Top-level files:")
    print(fs.ls("datasets/Hello-SimpleAI/HC3"))
    raise ValueError("HC3 JSONL files not found. Check dataset structure or network.")

jsonl_files = [f"hf://{path}" for path in jsonl_files]

# Load via JSONL (dataset scripts are deprecated)
ds_hc3 = load_dataset("json", data_files=jsonl_files, split="train")
print(f"HC3 Downloaded: {len(ds_hc3)} Q&A pairs")

Downloading HC3 dataset from Hugging Face...
HC3 Downloaded: 48644 Q&A pairs


In [93]:
# Convert HC3 to our training format
hc3_samples = []

for item in ds_hc3:
    question = item['question']
    
    # Add human answers (label = 0)
    for answer in item['human_answers']:
        if answer and len(answer.strip()) > MIN_TEXT_CHARS:
            hc3_samples.append({
                'text': f"{question} {answer}",
                'label': 0,
                'model': 'human'
            })
    
    # Add ChatGPT answers (label = 1)
    for answer in item['chatgpt_answers']:
        if answer and len(answer.strip()) > MIN_TEXT_CHARS:
            hc3_samples.append({
                'text': f"{question} {answer}",
                'label': 1,
                'model': 'chatgpt'
            })

df_hc3 = pd.DataFrame(hc3_samples)
df_hc3['text'] = df_hc3['text'].str.replace(r'[\u2028\u2029]', ' ', regex=True)

print(f"HC3 Converted: {len(df_hc3)} samples")
print(f"  Human: {len(df_hc3[df_hc3['label'] == 0])}")
print(f"  AI:    {len(df_hc3[df_hc3['label'] == 1])}")

HC3 Converted: 170840 samples
  Human: 117080
  AI:    53760


## Adding GPT-wiki-intro Dataset (Wikipedia vs GPT)
Source: https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro

We will sample a limited number of rows to avoid over-weighting this genre.

In [94]:
# Download GPT-wiki-intro dataset (sampled)
print("Downloading GPT-wiki-intro dataset from Hugging Face...")

WIKI_SAMPLE_SIZE = 20000  # adjust if you want more/less

# Streaming keeps memory low
wiki_stream = load_dataset("aadityaubhat/GPT-wiki-intro", split="train", streaming=True)
wiki_rows = list(wiki_stream.take(WIKI_SAMPLE_SIZE))

print(f"GPT-wiki-intro loaded: {len(wiki_rows)} rows")

# Build human vs AI rows
wiki_samples = []
for row in wiki_rows:
    wiki_intro = row.get("wiki_intro", "")
    gen_intro = row.get("generated_intro", "")
    
    if wiki_intro and len(wiki_intro.strip()) > MIN_TEXT_CHARS:
        wiki_samples.append({
            "text": wiki_intro,
            "label": 0,
            "model": "human"
        })
    if gen_intro and len(gen_intro.strip()) > MIN_TEXT_CHARS:
        wiki_samples.append({
            "text": gen_intro,
            "label": 1,
            "model": "gpt"
        })

df_wiki = pd.DataFrame(wiki_samples)
df_wiki["text"] = df_wiki["text"].str.replace(r"[\u2028\u2029]", " ", regex=True)

print(f"GPT-wiki-intro Converted: {len(df_wiki)} samples")
print(f"  Human: {len(df_wiki[df_wiki['label'] == 0])}")
print(f"  AI:    {len(df_wiki[df_wiki['label'] == 1])}")

Downloading GPT-wiki-intro dataset from Hugging Face...
GPT-wiki-intro loaded: 20000 rows
GPT-wiki-intro Converted: 40000 samples
  Human: 20000
  AI:    20000


## Combining All Datasets
Merge RAID + DAIGT + HC3 + GPT-wiki-intro into one master dataset

In [95]:
# Combine all datasets
df_combined = pd.concat([df_raid, df_daigt, df_hc3, df_wiki], ignore_index=True)

print(f"Rows after stacking all datasets: {len(df_combined)}")

# Basic cleaning and length filtering
df_combined["text"] = df_combined["text"].astype(str)
df_combined["text"] = df_combined["text"].str.replace(r"\s+", " ", regex=True).str.strip()

df_combined = df_combined[df_combined["text"].str.len() >= MIN_TEXT_CHARS]

df_combined["word_count"] = df_combined["text"].apply(lambda x: len(x.split()))
df_combined = df_combined[(df_combined["word_count"] >= MIN_WORDS) & (df_combined["word_count"] <= MAX_WORDS)]

# Remove duplicates
duplicate_count = df_combined.duplicated(subset=['text']).sum()
print(f"Found {duplicate_count} duplicate texts")

df_combined = df_combined.drop_duplicates(subset=['text'])
print(f"Total Unique Rows: {len(df_combined)}")

Rows after stacking all datasets: 230840
Found 87585 duplicate texts
Total Unique Rows: 140951


## Generating Short Text Samples
Create shorter samples from long texts to improve detection on short answers

In [96]:
import numpy as np

# Get long texts (200+ words)
long_texts = df_combined[df_combined['word_count'] > 200]
print(f"Found {len(long_texts)} texts with 200+ words")

# Generate short samples from long texts
np.random.seed(42)
short_samples = []

for _, row in long_texts.sample(min(SHORT_SAMPLE_COUNT, len(long_texts))).iterrows():
    text = str(row['text'])
    words = text.split()
    
    if len(words) > (SHORT_MAX_WORDS + 20):
        # Extract a chunk from the middle (SHORT_MIN_WORDSâ€“SHORT_MAX_WORDS)
        start = np.random.randint(10, len(words) - (SHORT_MAX_WORDS + 10))
        length = np.random.randint(SHORT_MIN_WORDS, SHORT_MAX_WORDS + 1)
        short_text = ' '.join(words[start:start+length])
        
        short_samples.append({
            'text': short_text,
            'label': row['label'],
            'model': row.get('model', 'unknown')
        })

df_short = pd.DataFrame(short_samples)
print(f"Generated {len(df_short)} short samples ({SHORT_MIN_WORDS}-{SHORT_MAX_WORDS} words each)")

Found 51173 texts with 200+ words
Generated 3000 short samples (30-80 words each)


## Final Balancing and Export
Balance the enhanced dataset and save

In [97]:
# Add short samples to combined dataset
df_final = pd.concat([
    df_combined.drop('word_count', axis=1), 
    df_short
], ignore_index=True)

# Shuffle
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Balance the dataset
min_size = min(df_final['label'].value_counts())
df_balanced_final = df_final.groupby('label').apply(
    lambda x: x.sample(min_size, random_state=42)
).reset_index(drop=True)

# Shuffle again
df_balanced_final = df_balanced_final.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Final Enhanced Dataset: {len(df_balanced_final)} samples")
print(f"  Human: {len(df_balanced_final[df_balanced_final['label'] == 0])}")
print(f"  AI:    {len(df_balanced_final[df_balanced_final['label'] == 1])}")

Final Enhanced Dataset: 112976 samples
  Human: 56488
  AI:    56488


  df_balanced_final = df_final.groupby('label').apply(


In [98]:
# Show text length distribution
df_balanced_final['word_count'] = df_balanced_final['text'].astype(str).apply(lambda x: len(x.split()))

print("\nText Length Distribution:")
print(f"  Min:    {df_balanced_final['word_count'].min()} words")
print(f"  Max:    {df_balanced_final['word_count'].max()} words")
print(f"  Mean:   {df_balanced_final['word_count'].mean():.0f} words")
print(f"  Median: {df_balanced_final['word_count'].median():.0f} words")

print(f"\n  Short (<50 words):    {len(df_balanced_final[df_balanced_final['word_count'] < 50])}")
print(f"  Medium (50-200):      {len(df_balanced_final[(df_balanced_final['word_count'] >= 50) & (df_balanced_final['word_count'] < 200)])}")
print(f"  Long (200+):          {len(df_balanced_final[df_balanced_final['word_count'] >= 200])}")


Text Length Distribution:
  Min:    20 words
  Max:    800 words
  Mean:   186 words
  Median: 167 words

  Short (<50 words):    4828
  Medium (50-200):      66895
  Long (200+):          41253


In [99]:
# Save the final dataset (single file)
df_balanced_final.drop('word_count', axis=1).to_csv("../data/master_training_data.csv", index=False)
print("Saved: ../data/master_training_data.csv")

print("\n" + "="*50)
print("DONE! Run 'python train.py' to retrain the model")
print("="*50)

Saved: ../data/master_training_data.csv

DONE! Run 'python train.py' to retrain the model
