In [12]:
import importlib
import subprocess
import sys

# Dict of required packages
required_packages = {
    "numpy": "numpy",
    "pandas": "pandas",
    "matplotlib": "matplotlib",
    "sklearn": "scikit-learn",
    "tensorflow": "tensorflow",
    "datasets": "datasets",
    "transformers": "transformers",
    "tf-keras": "tf-keras"
}

def install_and_import(pkg_name, install_name=None):
    install_name = install_name or pkg_name
    try:
        importlib.import_module(pkg_name)
        print(f"{pkg_name} is already installed.")
    except ImportError:
        print(f"{pkg_name} not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", install_name])

# Loop and ensure all are installed
for pkg, pip_name in required_packages.items():
    install_and_import(pkg, pip_name)

####################################################################################################################################
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import sklearn
from sklearn.utils import check_random_state
import tensorflow as tf
from datasets import load_dataset
from transformers import TFBertForSequenceClassification


numpy is already installed.
pandas is already installed.
matplotlib is already installed.
sklearn is already installed.
tensorflow is already installed.
datasets is already installed.
transformers is already installed.
tf-keras not found. Installing...


In [5]:
# Set random states for reproducability
RandomState = 42
random.seed(RandomState)
np.random.seed(RandomState)
skl_rand = check_random_state(RandomState)
tf.random.set_seed(RandomState)

print("Random seeds defined.")

Random seeds defined.


TO DO:
- Dataset Loading/Choosing ✅
- Dataset Cleaning
- Exploratory Data Analysis
- Baseline (TBD)
- BERT fine-tuning to classify text
- Error Analysis / Robustness Testing

# Dataset Loading/Choosing

- LLM - Detect AI Generated Text Dataset (28k essays)
https://www.kaggle.com/datasets/sunilthite/llm-detect-ai-generated-text-dataset.
- Dataset Card for AI Text Dectection Pile (1.4mil essays)
https://huggingface.co/datasets/artem9k/ai-text-detection-pile
- Raid (10+mil essays from 10 genres) https://github.com/liamdugan/raid

# Dataset Cleaning

- Lowercasing (optional with BERT since it's often case-aware depending on the model)
- Removing HTML tags, extra spaces
- Filtering by length (exclude very short texts)
- Removing duplicates
- Language detection if you need only English
- Where possible tag which model AI text is from

In [None]:
# Different selection of data
# Takes a long time to load first time around...
data_all = load_dataset("liamdugan/raid", "raid")
train_data = data_all["train"]
train_data_subset = train_data.select(range(1000))

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

In [8]:
train_data_subset.features

{'id': Value(dtype='string', id=None),
 'adv_source_id': Value(dtype='string', id=None),
 'source_id': Value(dtype='string', id=None),
 'model': Value(dtype='string', id=None),
 'decoding': Value(dtype='string', id=None),
 'repetition_penalty': Value(dtype='string', id=None),
 'attack': Value(dtype='string', id=None),
 'domain': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'prompt': Value(dtype='string', id=None),
 'generation': Value(dtype='string', id=None)}

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
from typing import Dict

# Combine title + generation into one for training
def prepare_text(dataset: Dict) -> Dict:
    if dataset["title"] is None or dataset["generation"] is None:
        return None
    dataset["text"] = dataset["title"].strip() + " " + dataset["generation"].strip()
    return dataset

# Encode binary labels
def encode_label(dataset: Dict, label_map={"human": 0, "machine": 1}) -> Dict:
    dataset["label"] = label_map.get(dataset["model"], -1)
    return dataset

# Tokenization function
def tokenize_example(dataset: Dict, tokenizer) -> Dict:
    tokens = tokenizer(
        dataset["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    tokens["label"] = dataset["label"]
    return tokens

# Full "main" pipeline
def prepare_dataset_for_bert(dataset, tokenizer_name="bert-base-uncased"):
    # Filter rows with missing title or generation
    dataset = dataset.filter(lambda x: x["title"] is not None and x["generation"] is not None)
    dataset = dataset.map(prepare_text)
    
    # Encode labels
    dataset = dataset.map(encode_label)
    dataset = dataset.filter(lambda x: x["label"] != -1)
    
    # Tokenize
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    dataset = dataset.map(lambda x: tokenize_example(x, tokenizer), batched=False)

    # Set format for tf.data.Dataset
    dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
    
    # Convert to tf.data.Dataset
    features = {
        "input_ids": tf.TensorSpec(shape=(512,), dtype=tf.int32),
        "attention_mask": tf.TensorSpec(shape=(512,), dtype=tf.int32),
    }

    tf_dataset = dataset.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols="label",
        shuffle=True,
        batch_size=16,
        collate_fn=None
    )
    
    return tf_dataset, tokenizer


In [None]:
pd.DataFrame(train_data_subset[:5])

Unnamed: 0,id,adv_source_id,source_id,model,decoding,repetition_penalty,attack,domain,title,prompt,generation
0,e5e058ce-be2b-459d-af36-32532aaba5ff,e5e058ce-be2b-459d-af36-32532aaba5ff,e5e058ce-be2b-459d-af36-32532aaba5ff,human,,,none,abstracts,FUTURE-AI: Guiding Principles and Consensus Re...,,The recent advancements in artificial intellig...
1,f95b107b-d176-4af5-90f7-4d0bb20caf93,f95b107b-d176-4af5-90f7-4d0bb20caf93,f95b107b-d176-4af5-90f7-4d0bb20caf93,human,,,none,abstracts,EdgeFlow: Achieving Practical Interactive Segm...,,High-quality training data play a key role in ...
2,856d8972-9e3d-4544-babc-0fe16f21e04d,856d8972-9e3d-4544-babc-0fe16f21e04d,856d8972-9e3d-4544-babc-0fe16f21e04d,human,,,none,abstracts,Semi-supervised Contrastive Learning for Label...,,The success of deep learning methods in medica...
3,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,human,,,none,abstracts,Combo Loss: Handling Input and Output Imbalanc...,,Simultaneous segmentation of multiple organs f...
4,72c41b8d-0069-4886-b734-a4000ffca286,72c41b8d-0069-4886-b734-a4000ffca286,72c41b8d-0069-4886-b734-a4000ffca286,human,,,none,abstracts,Attention-Based 3D Seismic Fault Segmentation ...,,Detection faults in seismic data is a crucial ...


The fetching of data will take some minutes...

In [18]:
# Super basic model to validate data cleaning worked.
train_tf_dataset, tokenizer = prepare_dataset_for_bert(train_data_subset)
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)
model.fit(train_tf_dataset, epochs=3)


Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


 4/31 [==>...........................] - ETA: 12:02 - loss: 0.3544 - accuracy: 0.9844

KeyboardInterrupt: 

Possible extra feature engineering, usefullness unsure.
BERT embeddings + handcrafted features like:
- Average sentence length
- N-gram repetition
- Ratio of stopwords Then feed that into a LightGBM/XGBoost model to compare.

# Exploratory Data Analysis (EDA)
- Text length distributions
- Vocabulary richness (e.g. unique words)
- POS tag distribution (maybe AI uses more nouns, fewer adjectives?)
- Visualizations: word clouds, frequency plots
- Clustering to check for seperability of classes
- .
- Comparing perplexity charts of AI model text and human text, can help understand the complexity of the task at hand, due to the variety of distributions.

# Baseline (TBD)

- Basic baseline logistic regression etc (Might not be relevant)
- Basic Deep learning artitecture
- Base BERT
- Maybe a basic baseline and a base BERT to see how much performance BERT adds and how much fine-tuned BERT additionally adds.

# BERT fine-tuning to classify text

- BERT vs RoBERTa vs DistilBERT
- RoBERTa often performs better in classification tasks

# Error Analysis / Robustness Testing

- What types of errors does it make confusion matrix?
- Is the model biased toward longer/shorter texts?
- Attention analysis (using tools like BertViz)
- Check if BERT overfits to text length or formatting
- Does it misclassify texts on certain topics?
- Could it unfairly flag texts written by non-native speakers?
- Does it perform better on specific outpurs from specific models?
- Small edits (punctuation, synonyms) and how does they affect the model?
- Test synonym replacements (e.g., "happy" → "joyful") with slight paraphrasing

# Explainability

- Attention Heatmap (with bertviz or transformers-interpret)
- Visualize token importance
- SHAP map

# (Option extra if time allows) Own trained text generator, compare its outputs predictions to the ones from other models