### Importing required libraries

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt

### Loading a pre-trained LLM

In [8]:
model_name = "EleutherAI/gpt-neo-2.7B" # Can be replaced to test other models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval() # Setting to evaluation mode

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 2560)
    (wpe): Embedding(2048, 2560)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-31): 32 x GPTNeoBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
        )
        (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2560, out_features=10240, bias=True)
          (c_proj)

### Defining function to compute SURP Tokens

In [9]:
def compute_surprising_tokens(text, model, tokenizer, entropy_threshold = 2.0, prob_percentile=20):
    # Tokenizing the input text
    tokens = tokenizer.encode_plus(text, return_tensors='pt')
    input_ids = tokens['input_ids']

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        logits = outputs.logits # Model's token prediction probabilities

    probs = torch.nn.functional.softmax(logits, dim=-1) # Converting logits to probabilities
    entropy = -torch.sum(probs * torch.log(probs + 1e-9), dim=-1) # Computing entropy of the token predictions

    gt_probs = probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1) # Getting the probabilities of the ground truth tokens

    # Convert to numpy
    entropy_values = entropy.squeeze().numpy()
    gt_probs_values = gt_probs.squeeze().numpy()

    # Identify surprising tokens (low entropy and low probability)
    low_entropy_indices = np.where(entropy_values < entropy_threshold)[0]
    prob_threshold = np.percentile(gt_probs_values, prob_percentile)
    low_prob_indices = np.where(gt_probs_values < prob_threshold)[0]

    suprising_indices = set(low_entropy_indices) & set(low_prob_indices)
    surprising_tokens = [tokenizer.decode(input_ids[0, i].item()) for i in suprising_indices]

    return surprising_tokens, gt_probs_values, entropy_values


### Classifying text accordingly

In [10]:
def classify_text(text, model, tokenizer, lambda_threshold=-5.0):
    """
    Classifies text as AI-generated or Human-Written based on SURP score.
    """
    surprising_tokens, gt_probs, entropy = compute_surprising_tokens(text, model, tokenizer)

    # Compute SURP Score (average log probability of surprising tokens)
    surprising_indices = [i for i in range(len(gt_probs)) if tokenizer.decode([tokenizer(text, return_tensors="pt")["input_ids"][0][i]]) in surprising_tokens]

    if surprising_indices:
        surp_score = np.mean(np.log([gt_probs[idx] + 1e-9 for idx in surprising_indices]))
    else:
        surp_score = -10  # Assign a very low value if no surprising tokens are found

    print(f"\nText: {text}")
    print(f"SURP Score: {surp_score}")
    print(f"Surprising Tokens: {surprising_tokens}")

    return "Human-Written" if surp_score >= lambda_threshold else "AI-Generated"

### Putting program to the test

In [11]:
text_ai = "Pipeline hazards arise in pipelined CPU architectures and can cause delays in instruction execution."
text_human = "Can’t believe Shannon’s entropy is helping me read a ML scientific paper"

print("AI Text Classification: ", classify_text(text_ai, model, tokenizer))
print("Human Text Classification: ", classify_text(text_human, model, tokenizer))


Text: Pipeline hazards arise in pipelined CPU architectures and can cause delays in instruction execution.
SURP Score: -16.26364517211914
Surprising Tokens: [' pip', 'el']
AI Text Classification:  AI-Generated

Text: Can’t believe Shannon’s entropy is helping me read a ML scientific paper
SURP Score: -17.70323371887207
Surprising Tokens: ['�', '�', '�', '�']
Human Text Classification:  AI-Generated
