In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")


In [None]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# GPU diagnostics are only printed when the selected device is a CUDA device.
if device.type == "cuda":
  print(f"GPU: {torch.cuda.get_device_name(0)}")
  print(f"CUDA Version: {torch.version.cuda}")

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the CSV files loaded
# further down are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install --upgrade Pillow

In [None]:
pip install sentencepiece

# Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score
from tqdm import tqdm
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import time
import subprocess

# Dataset Load

In [None]:
# Read the three source corpora from the mounted Drive.
# NOTE(review): the preparation cells below call prepare_dataset with its default
# column names, so each frame is expected to have 'text' and 'generated' columns —
# confirm for every CSV.
# Load the essays dataset
essays_df = pd.read_csv("/content/drive/MyDrive/Training_Essay_Data.csv")
# Load the text dataset
text_df = pd.read_csv("/content/drive/MyDrive/human_vs_ai_sentences.csv")
# Load the fake news dataset
news_df = pd.read_csv("/content/drive/MyDrive/fake_news_dataset.csv")

# Data Pre-processing

In [None]:
# Drop rows of the sentence dataset that are missing either the text or its label.
text_df = text_df.dropna(subset=['text', 'generated'])


In [None]:
# Cast the labels to int on a new frame instead of assigning into a frame that
# may be a filtered view (avoids pandas' SettingWithCopyWarning).
text_df = text_df.assign(generated=text_df['generated'].astype(int))


In [None]:
def prepare_dataset(df, text_column='text', label_column='generated', sample_size=None, max_length=512):
    """Clean, cast and length-limit a text-classification frame.

    Steps, in order:
      1. drop rows whose text or label is missing (the caller's frame is not modified);
      2. optionally draw ``sample_size`` rows with a fixed seed (42);
      3. cast the label column to ``int`` (values are cast, not validated);
      4. strip HTML, remove characters other than word characters, whitespace
         and ``. , ! ? -``, and collapse whitespace;
      5. truncate every text to at most ``max_length`` GPT-2 tokens.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the raw text.
        label_column: Name of the column holding the label.
        sample_size: Number of rows to keep; falsy values keep every row.
        max_length: Maximum number of GPT-2 tokens kept per text.

    Returns:
        A new DataFrame with cleaned text and integer labels.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when ``sample_size``
            exceeds the number of rows left after dropping missing values.
    """
    def preprocess_text(text):
        # Remove HTML tags
        text = BeautifulSoup(str(text), "html.parser").get_text()

        # Remove special characters and standardize punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)

        # Standardize whitespace
        return re.sub(r'\s+', ' ', text).strip()

    # Work on an explicit copy so the column assignments below neither touch the
    # caller's frame nor trigger SettingWithCopyWarning.
    df = df.dropna(subset=[text_column, label_column]).copy()

    # Sample BEFORE the expensive cleaning/tokenization. With a fixed seed the
    # selection depends only on the row count, which the later steps do not
    # change, so the same rows are chosen as when sampling last.
    if sample_size:
        df = df.sample(sample_size, random_state=42)

    df[label_column] = df[label_column].astype(int)
    df[text_column] = df[text_column].apply(preprocess_text)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    def truncate_or_pad(text):
        # Only truncation has an effect: any padding would be removed again by
        # decode(skip_special_tokens=True), so none is requested.
        token_ids = tokenizer.encode(text, max_length=max_length, truncation=True)
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    df[text_column] = df[text_column].apply(truncate_or_pad)

    return df

In [None]:
# For essays dataset
def prepare_essays_dataset(essays_df, sample_size=None, max_length=512):
    """Run ``prepare_dataset`` on the essays frame using its default column names."""
    return prepare_dataset(
        essays_df,
        max_length=max_length,
        sample_size=sample_size,
    )



In [None]:
# Prepared essays dataset: 1000 sampled rows, texts limited to 512 tokens.
processed_essays_data = prepare_essays_dataset(essays_df, sample_size=1000, max_length=512)

In [None]:
# Rows per class in the prepared essays sample
label_counts = processed_essays_data['generated'].value_counts()
print("Class distribution in processed_essays_data:", label_counts, sep="\n")

# Optional: minority/majority ratio (1.00 means perfectly balanced)
ratio = label_counts.min() / label_counts.max()
print(f"Balance ratio: {ratio:.2f}")


In [None]:
import pandas as pd

# Draw the same number of essays from each class (half of `sample_size` apiece).
# NOTE(review): the experiment cells further down sample from
# processed_essays_data, not from this balanced frame — confirm that is intended.
sample_size = 512  # or any even number (max 2 × 373 = 746 total possible)
human_samples = processed_essays_data.loc[processed_essays_data['generated'].eq(0)].sample(
    n=sample_size // 2, random_state=42)
ai_samples = processed_essays_data.loc[processed_essays_data['generated'].eq(1)].sample(
    n=sample_size // 2, random_state=42)

# Stack both halves, then shuffle the rows with a fixed seed
balanced_essays_sample = pd.concat([human_samples, ai_samples], axis=0).sample(frac=1, random_state=42)


In [None]:
# Count class distribution of the balanced sample
label_counts = balanced_essays_sample['generated'].value_counts()

# The message names the frame that is actually being summarised.
print("Class distribution in balanced_essays_sample:")
print(label_counts)

# Optional: minority/majority ratio (1.00 means perfectly balanced)
ratio = label_counts.min() / label_counts.max()
print(f"Balance ratio: {ratio:.2f}")


In [None]:
def prepare_text_dataset(text_df, sample_size=None, max_length=512):
    """Run ``prepare_dataset`` on the sentence frame using its default column names."""
    return prepare_dataset(
        text_df,
        max_length=max_length,
        sample_size=sample_size,
    )

In [None]:
# Prepared sentence dataset: 1000 sampled rows, texts limited to 512 tokens.
processed_text_data = prepare_text_dataset(text_df, sample_size=1000, max_length=512)

In [None]:
# Rows per label in the raw news frame (0 = human, 1 = AI)
label_counts = news_df['generated'].value_counts()
print("Class distribution:", label_counts, sep="\n")

# Optional: minority/majority ratio (1.00 means perfectly balanced)
balance_ratio = label_counts.min() / label_counts.max()
print(f"\nBalance ratio: {balance_ratio:.2f}")


In [None]:
# For news dataset
def prepare_news_dataset(news_df, sample_size=None, max_length=512):
    """Run ``prepare_dataset`` on the news frame using its default column names."""
    return prepare_dataset(
        news_df,
        max_length=max_length,
        sample_size=sample_size,
    )



In [None]:
# Prepared news dataset: 1000 sampled rows, texts limited to 512 tokens.
processed_news_data = prepare_news_dataset(news_df, sample_size=1000, max_length=512)

# Perturbations (T5 Model)

In [None]:
def generate_perturbations_batch(texts, t5_tokenizer, t5_model, device, num_perturbations=10, max_length=512, mask_ratio=0.15):
    """Create T5 mask-and-fill perturbations of each input text.

    For every perturbation, tokens are masked independently with probability
    ``mask_ratio`` (the first and last token are never masked). Each run of
    consecutive masked tokens is replaced by its own sentinel
    (``<extra_id_0>``, ``<extra_id_1>``, ...), the masked token ids are passed
    to ``t5_model.generate``, and the text generated after each sentinel is
    spliced back into the unmasked remainder of the original.

    This fixes three problems of the previous implementation: every span used
    ``<extra_id_0>``; the sentinels were removed again by
    ``decode(skip_special_tokens=True)`` before T5 saw them; and the raw T5
    output was returned instead of the original text with the fills inserted.

    Args:
        texts: Iterable of input strings.
        t5_tokenizer: Tokenizer providing ``__call__``, ``convert_tokens_to_ids``
            and ``decode`` (e.g. ``T5Tokenizer``).
        t5_model: Model providing ``generate`` (e.g. ``T5ForConditionalGeneration``),
            already placed on ``device``.
        device: Device on which the encoder input tensor is created.
        num_perturbations: Number of perturbed variants per text.
        max_length: Maximum input length in tokens and maximum generated length.
        mask_ratio: Per-token masking probability.

    Returns:
        A list with one entry per input text, each a list of
        ``num_perturbations`` strings. When no token ends up masked the variant
        is the (tokenizer round-tripped) original text. A span for which the
        model produced no fill is dropped from the variant.
    """
    # Collect the sentinel ids the tokenizer actually knows (T5 checkpoints ship
    # 100 of them); stop at the first one that is missing or maps to <unk>.
    unk_id = getattr(t5_tokenizer, "unk_token_id", None)
    sentinel_ids = []
    for k in range(100):
        sentinel_id = t5_tokenizer.convert_tokens_to_ids(f"<extra_id_{k}>")
        if sentinel_id is None or sentinel_id == unk_id:
            break
        sentinel_ids.append(sentinel_id)
    sentinel_index = {sentinel_id: k for k, sentinel_id in enumerate(sentinel_ids)}

    all_perturbed_texts = []

    for text in texts:
        input_ids = list(t5_tokenizer(text, truncation=True, max_length=max_length).input_ids)
        n_tokens = len(input_ids)

        perturbed_texts = []

        for _ in range(num_perturbations):
            # Random mask indices (avoid first and last token)
            mask = (torch.rand(n_tokens) < mask_ratio).tolist()
            if n_tokens:
                mask[0] = False
                mask[-1] = False

            # Split the sequence into kept segments separated by masked spans:
            # segments[k] is the kept text before span k, segments[-1] the tail.
            segments = [[]]
            in_span = False
            for token_id, is_masked in zip(input_ids, mask):
                # A new span may only start while an unused sentinel remains;
                # otherwise the token is simply kept.
                if is_masked and (in_span or len(segments) <= len(sentinel_ids)):
                    if not in_span:
                        segments.append([])
                        in_span = True
                else:
                    segments[-1].append(token_id)
                    in_span = False

            n_spans = len(segments) - 1
            if n_spans == 0:
                # Nothing was masked: the perturbation is the text itself.
                perturbed_texts.append(t5_tokenizer.decode(input_ids, skip_special_tokens=True))
                continue

            # Encoder input: kept segments joined by increasing sentinels. The ids
            # are fed directly so the sentinels cannot be lost in a decode step.
            masked_ids = list(segments[0])
            for k in range(n_spans):
                masked_ids.append(sentinel_ids[k])
                masked_ids.extend(segments[k + 1])

            encoder_ids = torch.tensor([masked_ids], dtype=torch.long, device=device)
            outputs = t5_model.generate(
                input_ids=encoder_ids,
                attention_mask=torch.ones_like(encoder_ids),
                max_length=max_length,
                do_sample=True,
                top_p=0.9,
            )

            # The generated sequence is "<extra_id_0> fill0 <extra_id_1> fill1 ...":
            # tokens following sentinel k are the fill for span k.
            fills = {}
            current = None
            for token_id in outputs[0].tolist():
                k = sentinel_index.get(token_id)
                if k is not None:
                    # Only the first occurrence of a sentinel opens a fill.
                    current = k if k not in fills else None
                    if current is not None:
                        fills[current] = []
                elif current is not None:
                    fills[current].append(token_id)

            # Splice the fills back between the kept segments. Pad/EOS ids that
            # ended up inside a fill are removed by skip_special_tokens below.
            rebuilt = list(segments[0])
            for k in range(n_spans):
                rebuilt.extend(fills.get(k, []))
                rebuilt.extend(segments[k + 1])

            perturbed_texts.append(t5_tokenizer.decode(rebuilt, skip_special_tokens=True))

        all_perturbed_texts.append(perturbed_texts)

    return all_perturbed_texts


# NPR

In [None]:
def compute_log_ranks(texts, model, tokenizer, device, max_length=512):
    """Mean log-rank of each text's tokens under a causal language model.

    For a causal LM the logits at position ``i`` score the token at position
    ``i + 1``. Each token from the second one onward is therefore ranked under
    the logits of the preceding position (the previous implementation ranked
    token ``i`` under logits ``i``, i.e. under the prediction for the *next*
    token). Rank 1 means the observed token had the highest logit.

    Args:
        texts: Iterable of strings.
        model: Callable returning an object with ``.logits`` for ``model(**inputs)``.
        tokenizer: Tokenizer called with ``return_tensors="pt"``; its result must
            support ``.to(device)``, ``.input_ids`` and ``**`` unpacking.
        device: Device the inputs are moved to.
        max_length: Truncation length in tokens.

    Returns:
        A list with one float per text: the mean of ``log(rank)`` over the
        predicted tokens, or ``nan`` when the text has fewer than two tokens
        (no next-token prediction exists).
    """
    log_ranks = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)

        # At least two tokens are needed: one of context and one to be ranked.
        if inputs.input_ids.size(1) < 2:
            print(f"Skipping input with fewer than 2 tokens: {repr(text[:50])}")
            log_ranks.append(float('nan'))
            continue

        input_ids = inputs.input_ids[0]

        with torch.no_grad():
            logits = model(**inputs).logits[0]

        # Align predictions with the tokens they predict.
        prediction_logits = logits[:-1]
        target_ids = input_ids[1:].unsqueeze(-1)
        target_logits = prediction_logits.gather(-1, target_ids)

        # Rank = 1 + number of vocabulary entries scored strictly higher than the
        # observed token. Softmax is monotonic, so raw logits give the same order.
        ranks = (prediction_logits > target_logits).sum(dim=-1) + 1

        log_ranks.append(torch.log(ranks.double()).mean().item())

    return log_ranks


In [None]:
def compute_npr_batch(original_texts, perturbed_texts_list, model, tokenizer, device, max_length=512):
    """Compute one NPR score per original text.

    The score is the mean log-rank of a text's perturbed variants divided by the
    log-rank of the original text (plus 1e-8 to avoid division by zero). A text
    whose own log-rank, or any of its variants' log-ranks, is NaN gets NaN.

    Args:
        original_texts: The unperturbed texts.
        perturbed_texts_list: One list of perturbed variants per original text,
            in the same order.
        model, tokenizer, device, max_length: Forwarded to ``compute_log_ranks``.

    Returns:
        A list of scores aligned with ``original_texts``.
    """
    baseline_log_ranks = compute_log_ranks(original_texts, model, tokenizer, device, max_length)

    def _score(baseline, variants):
        # Variants are always scored, even when the baseline is already NaN.
        variant_log_ranks = compute_log_ranks(variants, model, tokenizer, device, max_length)
        if np.isnan(baseline) or np.isnan(variant_log_ranks).any():
            return np.nan
        return np.mean(variant_log_ranks) / (baseline + 1e-8)

    return [
        _score(baseline, variants)
        for baseline, variants in zip(baseline_log_ranks, perturbed_texts_list)
    ]


# Model Evaluation

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, roc_curve
def print_gpu_utilization():
    """Print utilisation and memory usage for every GPU reported by ``nvidia-smi``.

    Best-effort diagnostic: when ``nvidia-smi`` is missing, exits with an error,
    or prints something that cannot be parsed as integers, a fallback message is
    printed instead of raising. Only those expected failures are caught, so
    ``KeyboardInterrupt`` and programming errors are no longer swallowed.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=utilization.gpu,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        raw_output = subprocess.check_output(command)
        # One CSV line per GPU: "<util>, <mem used>, <mem total>"
        for line in raw_output.decode('utf-8').strip().split('\n'):
            gpu_util, mem_used, mem_total = map(int, line.split(','))
            print(f"GPU Utilization: {gpu_util}%, Memory Used: {mem_used}MB / {mem_total}MB")
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary not found / not executable; SubprocessError: non-zero
        # exit status; ValueError: undecodable or unparsable output.
        print("Unable to fetch GPU stats")

def evaluate_npr_metrics(npr_scores, labels):
    """Score NPR values against binary labels.

    Samples whose score is NaN or infinite are dropped (with a message) before
    any metric is computed, because the scikit-learn metrics reject them. The
    decision threshold is the ROC point maximising Youden's J (``tpr - fpr``);
    samples with ``score >= threshold`` are predicted as class 1.

    NOTE: the threshold is chosen on the same samples it is evaluated on, so
    precision/recall/F1/accuracy are optimistic estimates.

    Args:
        npr_scores: Array-like of float scores.
        labels: Array-like of 0/1 labels aligned with ``npr_scores``.

    Returns:
        Tuple ``(auroc, precision, recall, f1, accuracy, best_threshold)``.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when only one class is
            present among the remaining labels.
    """
    # Accept plain lists as well as arrays.
    npr_scores = np.asarray(npr_scores, dtype=float)
    labels = np.asarray(labels)

    finite = np.isfinite(npr_scores)
    if not finite.all():
        print(f"Dropping {int((~finite).sum())} sample(s) with non-finite NPR scores")
        npr_scores = npr_scores[finite]
        labels = labels[finite]

    auroc = roc_auc_score(labels, npr_scores)

    # ROC curve to get best threshold
    fpr, tpr, thresholds = roc_curve(labels, npr_scores)
    best_threshold = thresholds[(tpr - fpr).argmax()]

    # Classify using the best threshold
    predictions = (npr_scores >= best_threshold).astype(int)

    # zero_division=0 keeps a degenerate "predict nothing positive" threshold
    # from emitting undefined-metric warnings.
    precision = precision_score(labels, predictions, zero_division=0)
    recall = recall_score(labels, predictions, zero_division=0)
    f1 = f1_score(labels, predictions, zero_division=0)
    accuracy = accuracy_score(labels, predictions)

    return auroc, precision, recall, f1, accuracy, best_threshold



def evaluate_model_npr(model_name, dataset, t5_tokenizer, t5_model, num_perturbations=10, max_length=512, batch_size=32):
    """Evaluate NPR-based detection with one causal LM on one dataset.

    Loads ``model_name`` as the scoring model, perturbs every text of
    ``dataset`` with the given T5 model, computes one NPR score per text and
    reports the detection metrics.

    Args:
        model_name: Hugging Face id of the causal LM used to compute log-ranks.
        dataset: DataFrame with a ``'text'`` column and a ``'generated'`` label column.
        t5_tokenizer: Tokenizer for the perturbation model.
        t5_model: Perturbation model; it is moved to the selected device.
        num_perturbations: Perturbed variants generated per text.
        max_length: Token limit for perturbation and scoring.
        batch_size: Number of texts processed per progress-bar step.

    Returns:
        Tuple ``(auroc, precision, recall, f1, accuracy, best_threshold)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    t5_model.to(device)

    # Inference only: make sure dropout is disabled in both models.
    model.eval()
    t5_model.eval()

    print(f"Model ({model_name}) is on: {next(model.parameters()).device}")
    print(f"T5 model is on: {next(t5_model.parameters()).device}")
    print_gpu_utilization()

    texts = dataset['text'].tolist()
    labels = dataset['generated'].tolist()

    npr_scores = []

    with torch.no_grad():
        # Consecutive slices keep scores aligned with `labels`; no DataLoader is
        # needed for plain Python strings.
        for start in tqdm(range(0, len(texts), batch_size), desc=f"Evaluating {model_name}"):
            batch_texts = texts[start:start + batch_size]
            batch_perturbed_texts = generate_perturbations_batch(batch_texts, t5_tokenizer, t5_model, device, num_perturbations, max_length)
            batch_npr = compute_npr_batch(batch_texts, batch_perturbed_texts, model, tokenizer, device, max_length)

            npr_scores.extend(batch_npr)

    auroc, precision, recall, f1, accuracy, best_threshold = evaluate_npr_metrics(np.array(npr_scores), np.array(labels))

    print(f"AUROC: {auroc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Best Threshold: {best_threshold:.4f}")

    return auroc, precision, recall, f1, accuracy, best_threshold


# EleutherAI small, large - 5 Perturbations, 128 Sample size

In [None]:
from tqdm import tqdm
import torch
import pandas as pd
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Experiment: GPT-Neo 125M and GPT-J 6B, 5 perturbations per text, 128 texts per dataset.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

t5_tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

sample_size = 128
num_perturbations = 5
max_length = 512
batch_size = 32

subset_models = [
    'EleutherAI/gpt-neo-125M',
    'EleutherAI/gpt-j-6B'
]

# Datasets are evaluated in this order; each key prefixes its result columns.
eval_datasets = {
    'essays': processed_essays_data,
    'text': processed_text_data,
    'news': processed_news_data,
}
# Same order as the tuple returned by evaluate_model_npr.
metric_names = ('auc', 'precision', 'recall', 'f1', 'accuracy', 'best_threshold')

results = []
start_time = time.time()

for model_name in tqdm(subset_models, desc="Overall Progress"):
    print(f"\n=== Evaluating {model_name} ")

    row = {'model': model_name}
    for dataset_name, dataset in eval_datasets.items():
        sample = dataset.sample(sample_size, random_state=42)
        metrics = evaluate_model_npr(
            model_name, sample, t5_tokenizer, t5_model, num_perturbations, max_length, batch_size)
        auc, precision, recall, f1, accuracy, best_threshold = metrics

        print(f"{dataset_name.capitalize()} - AUROC: {auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
              f"F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Best Threshold: {best_threshold:.4f}")

        # Columns are named '<dataset>_<metric>' (fixes the former 'tet_best_threshold' typo).
        row.update({f'{dataset_name}_{metric}': value for metric, value in zip(metric_names, metrics)})

    results.append(row)

    # Timestamped checkpoint after every model so a crash does not lose finished work.
    pd.DataFrame(results).to_csv(f'npr_results_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)
    print(f"Results saved for {model_name}")

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time / 3600:.2f} hours")

# Final save
# NOTE(review): the other experiment cells write the same file name, so each run
# overwrites the previous 'npr_results_final.csv'.
results_df = pd.DataFrame(results)
results_df.to_csv('npr_results_final.csv', index=False)
print("\n=== Final Results ===")
print(results_df.to_string())


# Pythia small, large - 5 Perturbations, 128 Sample size

In [None]:
from tqdm import tqdm
import torch
import pandas as pd
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Experiment: Pythia 410M and 6.9B, 5 perturbations per text, 128 texts per dataset.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

t5_tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

sample_size = 128
num_perturbations = 5
max_length = 512
batch_size = 32

subset_models = [
    'EleutherAI/pythia-410m',
    'EleutherAI/pythia-6.9b'
]

# Datasets are evaluated in this order; each key prefixes its result columns.
eval_datasets = {
    'essays': processed_essays_data,
    'text': processed_text_data,
    'news': processed_news_data,
}
# Same order as the tuple returned by evaluate_model_npr.
metric_names = ('auc', 'precision', 'recall', 'f1', 'accuracy', 'best_threshold')

results = []
start_time = time.time()

for model_name in tqdm(subset_models, desc="Overall Progress"):
    print(f"\n=== Evaluating {model_name} ")

    row = {'model': model_name}
    for dataset_name, dataset in eval_datasets.items():
        sample = dataset.sample(sample_size, random_state=42)
        metrics = evaluate_model_npr(
            model_name, sample, t5_tokenizer, t5_model, num_perturbations, max_length, batch_size)
        auc, precision, recall, f1, accuracy, best_threshold = metrics

        print(f"{dataset_name.capitalize()} - AUROC: {auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
              f"F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Best Threshold: {best_threshold:.4f}")

        # Columns are named '<dataset>_<metric>' (fixes the former 'tet_best_threshold' typo).
        row.update({f'{dataset_name}_{metric}': value for metric, value in zip(metric_names, metrics)})

    results.append(row)

    # Timestamped checkpoint after every model so a crash does not lose finished work.
    pd.DataFrame(results).to_csv(f'npr_results_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)
    print(f"Results saved for {model_name}")

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time / 3600:.2f} hours")

# Final save
# NOTE(review): the other experiment cells write the same file name, so each run
# overwrites the previous 'npr_results_final.csv'.
results_df = pd.DataFrame(results)
results_df.to_csv('npr_results_final.csv', index=False)
print("\n=== Final Results ===")
print(results_df.to_string())


In [None]:
# Release cached CUDA memory held by PyTorch's allocator before the next experiment.
import torch
torch.cuda.empty_cache()


# GPT2 small, large - 5 Perturbations, 128 Sample size

In [None]:
from tqdm import tqdm
import torch
import pandas as pd
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Experiment: GPT-2 small and large, 5 perturbations per text, 128 texts per dataset.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

t5_tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

sample_size = 128
num_perturbations = 5
max_length = 512
batch_size = 32

subset_models = [
    'gpt2',
    'gpt2-large'  # was 'gpt-large', which is not a Hugging Face model id
]

# Datasets are evaluated in this order; each key prefixes its result columns.
eval_datasets = {
    'essays': processed_essays_data,
    'text': processed_text_data,
    'news': processed_news_data,
}
# Same order as the tuple returned by evaluate_model_npr.
metric_names = ('auc', 'precision', 'recall', 'f1', 'accuracy', 'best_threshold')

results = []
start_time = time.time()

for model_name in tqdm(subset_models, desc="Overall Progress"):
    print(f"\n=== Evaluating {model_name} ")

    row = {'model': model_name}
    for dataset_name, dataset in eval_datasets.items():
        sample = dataset.sample(sample_size, random_state=42)
        metrics = evaluate_model_npr(
            model_name, sample, t5_tokenizer, t5_model, num_perturbations, max_length, batch_size)
        auc, precision, recall, f1, accuracy, best_threshold = metrics

        print(f"{dataset_name.capitalize()} - AUROC: {auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
              f"F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Best Threshold: {best_threshold:.4f}")

        # Columns are named '<dataset>_<metric>' (fixes the former 'tet_best_threshold' typo).
        row.update({f'{dataset_name}_{metric}': value for metric, value in zip(metric_names, metrics)})

    results.append(row)

    # Timestamped checkpoint after every model so a crash does not lose finished work.
    pd.DataFrame(results).to_csv(f'npr_results_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)
    print(f"Results saved for {model_name}")

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time / 3600:.2f} hours")

# Final save
# NOTE(review): the other experiment cells write the same file name, so each run
# overwrites the previous 'npr_results_final.csv'.
results_df = pd.DataFrame(results)
results_df.to_csv('npr_results_final.csv', index=False)
print("\n=== Final Results ===")
print(results_df.to_string())


# EleutherAI small, large - 10 Perturbations, 256 Sample size

In [None]:
from tqdm import tqdm
import torch
import pandas as pd
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Experiment: GPT-Neo 125M and GPT-J 6B, 10 perturbations per text, 256 texts per dataset.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

t5_tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

sample_size = 256
num_perturbations = 10
max_length = 512
batch_size = 32

subset_models = [
    'EleutherAI/gpt-neo-125M',
    'EleutherAI/gpt-j-6B'
]

# Datasets are evaluated in this order; each key prefixes its result columns.
eval_datasets = {
    'essays': processed_essays_data,
    'text': processed_text_data,
    'news': processed_news_data,
}
# Same order as the tuple returned by evaluate_model_npr.
metric_names = ('auc', 'precision', 'recall', 'f1', 'accuracy', 'best_threshold')

results = []
start_time = time.time()

for model_name in tqdm(subset_models, desc="Overall Progress"):
    print(f"\n=== Evaluating {model_name} ")

    row = {'model': model_name}
    for dataset_name, dataset in eval_datasets.items():
        sample = dataset.sample(sample_size, random_state=42)
        metrics = evaluate_model_npr(
            model_name, sample, t5_tokenizer, t5_model, num_perturbations, max_length, batch_size)
        auc, precision, recall, f1, accuracy, best_threshold = metrics

        print(f"{dataset_name.capitalize()} - AUROC: {auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
              f"F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Best Threshold: {best_threshold:.4f}")

        # Columns are named '<dataset>_<metric>' (fixes the former 'tet_best_threshold' typo).
        row.update({f'{dataset_name}_{metric}': value for metric, value in zip(metric_names, metrics)})

    results.append(row)

    # Timestamped checkpoint after every model so a crash does not lose finished work.
    pd.DataFrame(results).to_csv(f'npr_results_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)
    print(f"Results saved for {model_name}")

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time / 3600:.2f} hours")

# Final save
# NOTE(review): the other experiment cells write the same file name, so each run
# overwrites the previous 'npr_results_final.csv'.
results_df = pd.DataFrame(results)
results_df.to_csv('npr_results_final.csv', index=False)
print("\n=== Final Results ===")
print(results_df.to_string())


# Pythia small, large - 10 Perturbations, 256 Sample size

In [None]:
from tqdm import tqdm
import torch
import pandas as pd
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Experiment: Pythia 410M and 6.9B, 10 perturbations per text, 256 texts per dataset.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

t5_tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

sample_size = 256
num_perturbations = 10
max_length = 512
batch_size = 32

subset_models = [
    'EleutherAI/pythia-410m',
    'EleutherAI/pythia-6.9b'
]

# Datasets are evaluated in this order; each key prefixes its result columns.
eval_datasets = {
    'essays': processed_essays_data,
    'text': processed_text_data,
    'news': processed_news_data,
}
# Same order as the tuple returned by evaluate_model_npr.
metric_names = ('auc', 'precision', 'recall', 'f1', 'accuracy', 'best_threshold')

results = []
start_time = time.time()

for model_name in tqdm(subset_models, desc="Overall Progress"):
    print(f"\n=== Evaluating {model_name} ")

    row = {'model': model_name}
    for dataset_name, dataset in eval_datasets.items():
        sample = dataset.sample(sample_size, random_state=42)
        metrics = evaluate_model_npr(
            model_name, sample, t5_tokenizer, t5_model, num_perturbations, max_length, batch_size)
        auc, precision, recall, f1, accuracy, best_threshold = metrics

        print(f"{dataset_name.capitalize()} - AUROC: {auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
              f"F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Best Threshold: {best_threshold:.4f}")

        # Columns are named '<dataset>_<metric>' (fixes the former 'tet_best_threshold' typo).
        row.update({f'{dataset_name}_{metric}': value for metric, value in zip(metric_names, metrics)})

    results.append(row)

    # Timestamped checkpoint after every model so a crash does not lose finished work.
    pd.DataFrame(results).to_csv(f'npr_results_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)
    print(f"Results saved for {model_name}")

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time / 3600:.2f} hours")

# Final save
# NOTE(review): the other experiment cells write the same file name, so each run
# overwrites the previous 'npr_results_final.csv'.
results_df = pd.DataFrame(results)
results_df.to_csv('npr_results_final.csv', index=False)
print("\n=== Final Results ===")
print(results_df.to_string())


# GPT2 small, large - 10 Perturbations, 256 Sample size

In [None]:
from tqdm import tqdm
import torch
import pandas as pd
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Experiment: GPT-2 small and large, 10 perturbations per text, 256 texts per dataset.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

t5_tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

sample_size = 256
num_perturbations = 10
max_length = 512
batch_size = 32

subset_models = [
    'gpt2',
    'gpt2-large'  # was 'gpt-large', which is not a Hugging Face model id
]

# Datasets are evaluated in this order; each key prefixes its result columns.
eval_datasets = {
    'essays': processed_essays_data,
    'text': processed_text_data,
    'news': processed_news_data,
}
# Same order as the tuple returned by evaluate_model_npr.
metric_names = ('auc', 'precision', 'recall', 'f1', 'accuracy', 'best_threshold')

results = []
start_time = time.time()

for model_name in tqdm(subset_models, desc="Overall Progress"):
    print(f"\n=== Evaluating {model_name} ")

    row = {'model': model_name}
    for dataset_name, dataset in eval_datasets.items():
        sample = dataset.sample(sample_size, random_state=42)
        metrics = evaluate_model_npr(
            model_name, sample, t5_tokenizer, t5_model, num_perturbations, max_length, batch_size)
        auc, precision, recall, f1, accuracy, best_threshold = metrics

        print(f"{dataset_name.capitalize()} - AUROC: {auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
              f"F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Best Threshold: {best_threshold:.4f}")

        # Columns are named '<dataset>_<metric>' (fixes the former 'tet_best_threshold' typo).
        row.update({f'{dataset_name}_{metric}': value for metric, value in zip(metric_names, metrics)})

    results.append(row)

    # Timestamped checkpoint after every model so a crash does not lose finished work.
    pd.DataFrame(results).to_csv(f'npr_results_{time.strftime("%Y%m%d-%H%M%S")}.csv', index=False)
    print(f"Results saved for {model_name}")

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time / 3600:.2f} hours")

# Final save
# NOTE(review): the other experiment cells write the same file name, so each run
# overwrites the previous 'npr_results_final.csv'.
results_df = pd.DataFrame(results)
results_df.to_csv('npr_results_final.csv', index=False)
print("\n=== Final Results ===")
print(results_df.to_string())
