### Importing required libraries

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


### Loading a pre-trained LLM

In [2]:
model_name = "EleutherAI/gpt-neo-2.7B" # Can be replaced to test other models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval() # Setting to evaluation mode

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 2560)
    (wpe): Embedding(2048, 2560)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-31): 32 x GPTNeoBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
        )
        (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2560, out_features=10240, bias=True)
          (c_proj)

### Defining function to compute SURP Tokens

In [3]:
def compute_surprisal_score(text, model, tokenizer, entropy_threshold = 2.0, prob_percentile=20):
    # Tokenizing the input text
    tokens = tokenizer.encode_plus(text, return_tensors='pt')
    input_ids = tokens['input_ids']

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        logits = outputs.logits # Model's token prediction probabilities

    probs = torch.nn.functional.softmax(logits, dim=-1) # Converting logits to probabilities
    entropy = -torch.sum(probs * torch.log(probs + 1e-9), dim=-1) # Computing entropy of the token predictions

    gt_probs = probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1) # Getting the probabilities of the ground truth tokens

    # Convert to numpy
    entropy_values = entropy.squeeze().numpy()
    gt_probs_values = gt_probs.squeeze().numpy()

    # Identify surprising tokens (low entropy and low probability)
    low_entropy_indices = np.where(entropy_values < entropy_threshold)[0]
    prob_threshold = np.percentile(gt_probs_values, prob_percentile)
    low_prob_indices = np.where(gt_probs_values < prob_threshold)[0]

    # Computer average log probability over the surprising tokens
    idxs = list(set(low_entropy_indices) & set(low_prob_indices))
    if idxs:
        score = np.mean(np.log(gt_probs_values[idxs] + 1e-9))
    else:
        score = np.log(np.min(gt_probs_values) + 1e-9) # Fallback to minimum log probability if no surprising tokens found
    return float(score)

### Classifying text accordingly

In [4]:
from typing import List

def get_surprisal(texts: List[str], model, tokenizer) -> np.ndarray:
    """
    Compute the surprisal scores for a list of texts.
    """
    return np.array([compute_surprisal_score(text, model, tokenizer) for text in texts])

### Importing labeled dataset

In [5]:
import kagglehub
from sklearn.model_selection import train_test_split
import pandas as pd
import os
# Downloading the dataset

path = kagglehub.dataset_download("shanegerami/ai-vs-human-text")

# Find the CSV file in the downloaded directory
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
assert len(csv_files) == 1, "Expected exactly one CSV file in the dataset directory."
csv_path = os.path.join(path, csv_files[0])
df = pd.read_csv(csv_path, encoding='utf-8')

# Splitting the dataset into training and testing sets
x_train, x_test = train_test_split(df['text'], test_size=0.3, random_state=42)
y_train, y_test = train_test_split(df['generated'], test_size=0.3, random_state=42)

### Compute surprisal scores and sweep lambda

In [6]:
%pip install numpy

from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Computing scores
scores_dev = get_surprisal(x_test, model, tokenizer)

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, scores_dev)
roc_auc = auc(fpr, tpr)
print(f"ROC AUC: {roc_auc:.3f}")

# Precision-Recall curve
prec, rec, thresholds = precision_recall_curve(y_test, scores_dev)

# Plotting the Precision-Recall curve
j_scores = tpr- fpr
best_idx = np.argmax(j_scores)
best_lambda = thresholds[best_idx]
print(f"Best threshold by Youden's J: {best_lambda:.3f}")

Note: you may need to restart the kernel to use updated packages.


Token indices sequence length is longer than the specified maximum sequence length for this model (2645 > 2048). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

### Simple classifier

In [None]:
# Feature extraction
def extract_features(texts):
    s = get_surprisal(texts, model, tokenizer)
    # If you want std-dev too, modify compute_surprisal_score to return both mean & std.
    # Here we’ll just use s.
    return np.vstack([s]).T

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight="balanced", max_iter=1000)
)
pipe.fit(extract_features(x_train), y_train)

# Evaluate on dev
y_pred_proba = pipe.predict_proba(extract_features(x_test))[:,1]
fpr2, tpr2, _ = sklearn.metrics.roc_curve(y_test, y_pred_proba)
from sklearn.metrics import auc
print("Pipeline ROC AUC:", auc(fpr2, tpr2))