In [1]:
# !pip install -r ../requirements.txt

# Generate Topics from Human Text for Various LLMs

In [4]:
import json
import os
import time
from datetime import datetime
from tqdm import tqdm
from getpass import getpass

In [5]:
# --- Model and API Key Selection ---

# Registry of selectable models. The dictionary key is the menu number the user
# types, "id" is the identifier passed to the provider's API, and "provider"
# decides which client the authentication cell builds.
MODELS = {
    "1": {"name": "GPT-4.1", "id": "gpt-4.1-2025-04-14", "provider": "openai"},
    "2": {"name": "GPT o3", "id": "o3-2025-04-16", "provider": "openai"},
    "3": {"name": "Claude Opus 4", "id": "claude-opus-4-20250514", "provider": "anthropic"},
    "4": {"name": "Claude Sonnet 4", "id": "claude-sonnet-4-20250514", "provider": "anthropic"},
    "5": {"name": "Gemini 2.0 Flash", "id": "gemini-2.0-flash", "provider": "google"},
}

# Prompt user to select a model
print("Please choose a model to use:")
for key, model_info in sorted(MODELS.items()):
    print(f"{key}: {model_info['name']} ({model_info['provider']})")

# Strip surrounding whitespace so an answer such as " 1" still matches a menu key.
choice = input("Enter the number of your choice: ").strip()
selected_model = MODELS.get(choice)

if not selected_model:
    raise ValueError("Invalid choice. Please run the script again.")

print(f"You have selected: {selected_model['name']}")

Please choose a model to use:
1: GPT-4.1 (openai)
2: GPT o3 (openai)
3: Claude Opus 4 (anthropic)
4: Claude Sonnet 4 (anthropic)
5: Gemini 2.5 Flash (google)


Enter the number of your choice:  1


You have selected: GPT-4.1


In [22]:
# --- Client and Authentication Setup ---
# Google Cloud settings; they are only read when a "google" model is selected.
PROJECT_ID = "" ## GCP PROJECT
LOCATION = "" ## GCP REGION/LOCATION

# Build `client` for whichever provider the selected model belongs to.
# For the "google" provider `client` is the Gemini model object itself.
client = None
provider = selected_model["provider"]

if provider == "openai":
    from openai import OpenAI
    api_key = getpass("Please enter your OpenAI API key: ")
    client = OpenAI(api_key=api_key)

elif provider == "anthropic":
    from anthropic import Anthropic
    api_key = getpass("Please enter your Anthropic API key: ")
    client = Anthropic(api_key=api_key)

elif provider == "google":
    # Key-less authentication for Vertex AI
    print("Authenticating to Google Cloud via Vertex AI environment...")
    try:
        from vertexai.preview.generative_models import GenerativeModel, Part, HarmCategory, HarmBlockThreshold
        import vertexai

        vertexai.init(project=PROJECT_ID, location=LOCATION)
        # Instead of a 'client', we instantiate the specific model
        client = GenerativeModel(selected_model['id'])
        print("Successfully initialized Gemini model.")
    except ImportError:
        print("ERROR: 'google-cloud-aiplatform' library not found.")
        print("Please run: pip install google-cloud-aiplatform")
        # Re-raise rather than calling exit(): in a notebook exit() takes the
        # kernel down instead of surfacing the failure in this cell.
        raise
    except Exception as e:
        print(f"ERROR: Could not initialize Vertex AI. Your project ID may be incorrect or you may not be in a Vertex AI notebook.")
        print(f"Details: {e}")
        raise

else:
    # Fail here instead of leaving `client` as None for later cells to trip over.
    raise ValueError(f"Unsupported provider: {provider}")

Please enter your OpenAI API key:  ········


In [7]:
# --- Functions ---

def get_prompt(text):
    """Return the topic-summarization prompt with `text` embedded in it.

    The prompt asks for a very short topic (max 40 words), not written as full
    sentences, and for the topic alone with no explanation.
    """
    prompt_lines = [
        "You are a topic summarizer.",
        "Summarize the following paragraph into a very short topic (max 40 words).",
        "Avoid using full sentences. Be specific.",
        "Text:",
        f"{text}",
        "Return only the topic without any explanation.",
    ]
    return "\n".join(prompt_lines)

def generate_topic(text, model_id, provider):
    """
    Generates a short topic for `text` using the selected model and provider.

    Args:
        text: Passage to summarize.
        model_id: Provider-specific model identifier.
        provider: One of "openai", "anthropic" or "google"; selects how the
            module-level `client` is called.

    Returns:
        The topic with surrounding whitespace and double quotes removed, or an
        empty string if the request fails or the provider is not recognised
        (the error is printed, not raised).
    """
    try:
        if provider == "openai":
            request = {
                "model": model_id,
                "messages": [{"role": "user", "content": get_prompt(text)}],
            }
            if model_id.startswith(("o1", "o3", "o4")):
                # OpenAI o-series reasoning models reject `max_tokens` and a
                # non-default `temperature`. They take `max_completion_tokens`,
                # which also has to cover hidden reasoning tokens, so the budget
                # is far larger than the visible topic needs.
                request["max_completion_tokens"] = 2048
            else:
                request["temperature"] = 0.3
                request["max_tokens"] = 40
            response = client.chat.completions.create(**request)
            # `content` can be None (e.g. an empty completion); treat it as "".
            content = response.choices[0].message.content or ""
            return content.strip().strip('"')

        elif provider == "anthropic":
            response = client.messages.create(
                model=model_id,
                system="You are a topic summarizer. Summarize the following paragraph into a very short topic (max 40 words). Avoid using full sentences. Be specific. Return only the topic without any explanation.",
                messages=[{"role": "user", "content": f"Text:\n{text}"}],
                temperature=0.3, max_tokens=40
            )
            return response.content[0].text.strip().strip('"')

        elif provider == "google":
            # The 'client' here is actually the instantiated Gemini model
            prompt = get_prompt(text)
            response = client.generate_content(
                [Part.from_text(prompt)],
                generation_config={"temperature": 0.3, "max_output_tokens": 40},
                # Safety settings to reduce chances of blocking for benign content
                safety_settings={
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                }
            )
            return response.text.strip().strip('"')

        # Previously an unknown provider fell through and returned None; route
        # it through the same error path as every other failure.
        raise ValueError(f"Unsupported provider: {provider}")

    except Exception as e:
        print(f"❌ Error generating topic with {model_id}: {e}")
        return ""

In [10]:
# --- Main Execution Logic ---

## PROVIDE RAW DATA

# input_path = "../original_corpus/original_corpus.json"
input_path = ""
timestamp = datetime.now().strftime("%Y%m%d")
output_path = f"topics_{selected_model['id'].replace('/', '_')}_{timestamp}.json"


with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

try:
    for item in tqdm(data, desc=f"Generating topics with {selected_model['name']}"):
        # Items that already carry a non-empty topic are left untouched.
        if not item.get("topic"):
            item["topic"] = generate_topic(item["text"], selected_model["id"], provider)
            time.sleep(1)  # pause between API calls
finally:
    # Write in `finally` so topics generated before an interrupt or an
    # unexpected error are kept instead of being lost with the kernel state.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# generate_topic() returns "" on failure, so report how many items are still empty.
missing_topics = sum(1 for item in data if not item.get("topic"))
if missing_topics:
    print(f"Warning: {missing_topics} item(s) still have no topic; re-run on the output file to retry them.")

print(f"All topics have been generated and saved to {output_path}")

Generating topics with GPT-4.1: 100%|██████████| 2/2 [00:04<00:00,  2.42s/it]

All topics have been generated and saved to topics_gpt-4.1-2025-04-14_20250703.json





# Generate AI Texts using Various LLMs 

In [13]:
import os
import json
import time
from tqdm import tqdm
from getpass import getpass

# --- 1. Model and Provider Configuration ---
# A central dictionary to manage all models and their providers.
# Key = menu number typed by the user, "id" = identifier sent to the provider's
# API, "provider" = which client is constructed below.
MODELS = {
    "1": {"name": "GPT-4.1", "id": "gpt-4.1-2025-04-14", "provider": "openai"},
    "2": {"name": "GPT o3", "id": "o3-2025-04-16", "provider": "openai"},
    "3": {"name": "Claude Opus 4", "id": "claude-opus-4-20250514", "provider": "anthropic"},
    "4": {"name": "Claude Sonnet 4", "id": "claude-sonnet-4-20250514", "provider": "anthropic"},
    "5": {"name": "Gemini 2.0 Flash", "id": "gemini-2.0-flash", "provider": "google"}
}

# --- 2. Dynamic Model and Authentication Setup ---
print("Please choose a model to use for generation:")
for key, model_info in sorted(MODELS.items()):
    print(f"{key}: {model_info['name']} ({model_info['provider']})")

# Strip surrounding whitespace so an answer such as " 1" still matches a menu key.
choice = input("Enter the number of your choice: ").strip()
selected_model = MODELS.get(choice)

if not selected_model:
    raise ValueError("Invalid choice. Please run the script again.")

print(f"You have selected: {selected_model['name']}")

# Dynamically set up the client based on the chosen provider.
# For the "google" provider `client` is the Gemini model object itself.
client = None
provider = selected_model["provider"]

if provider == "openai":
    from openai import OpenAI
    api_key = getpass("Please enter your OpenAI API key: ")
    client = OpenAI(api_key=api_key)
elif provider == "anthropic":
    from anthropic import Anthropic
    api_key = getpass("Please enter your Anthropic API key: ")
    client = Anthropic(api_key=api_key)
elif provider == "google":
    print("Authenticating to Google Cloud via Vertex AI environment...")
    try:
        import vertexai
        from vertexai.generative_models import GenerativeModel, Part, HarmCategory, HarmBlockThreshold

        # NOTE(review): project and region are hard-coded here, unlike the
        # topic-generation section which leaves them blank; adjust for your
        # own Google Cloud environment.
        PROJECT_ID = "interviewai-456517"
        LOCATION = "us-central1"

        vertexai.init(project=PROJECT_ID, location=LOCATION)
        client = GenerativeModel(selected_model['id']) # Instantiate the specific model
        print("Successfully initialized Gemini model.")
    except Exception as e:
        print(f"❌ ERROR: Could not initialize Vertex AI. Ensure you are in a Vertex AI notebook and your Project ID is correct.")
        print(f"Details: {e}")
        # Re-raise rather than calling exit(): in a notebook exit() takes the
        # kernel down instead of surfacing the failure in this cell.
        raise
else:
    # Fail here instead of leaving `client` as None for later cells to trip over.
    raise ValueError(f"Unsupported provider: {provider}")

Please choose a model to use for generation:
1: GPT-4.1 (openai)
2: GPT o3 (openai)
3: Claude Opus 4 (anthropic)
4: Claude Sonnet 4 (anthropic)
5: Gemini 2.5 Flash (google)


Enter the number of your choice:  1


You have selected: GPT-4.1


Please enter your OpenAI API key:  ········


In [14]:
# --- 3. Generalized Text Generation Function ---
def generate_text(topic, word_count, model_id, provider):
    """
    Generates a passage of text using the dynamically selected model.

    Args:
        topic: Topic the passage should be about.
        word_count: Approximate target length in words; also sizes the output
            token budget (twice the word count, capped at 4096).
        model_id: Provider-specific model identifier.
        provider: One of "openai", "anthropic" or "google"; selects how the
            module-level `client` is called.

    Returns:
        The generated passage with surrounding whitespace removed, or an empty
        string if the request fails or the provider is not recognised (the
        error is printed, not raised).
    """
    prompt = f"""You are a writing assistant.

Write an original passage on the topic: '{topic}'. It should be approximately {word_count} words long. Be clear and human-like. Avoid copying or referencing specific texts.

⚠️ Do not include or repeat the topic or instructions in your output. Return only the generated passage text."""

    # Budget for the visible passage, shared by all three providers.
    token_budget = min(word_count * 2, 4096)

    try:
        if provider == "openai":
            request = {
                "model": model_id,
                "messages": [{"role": "user", "content": prompt}],
            }
            if model_id.startswith(("o1", "o3", "o4")):
                # OpenAI o-series reasoning models reject `max_tokens` and a
                # non-default `temperature`. `max_completion_tokens` also has to
                # cover hidden reasoning tokens, so add headroom on top of the
                # passage budget.
                request["max_completion_tokens"] = token_budget + 4096
            else:
                request["temperature"] = 0.7
                request["max_tokens"] = token_budget
            response = client.chat.completions.create(**request)
            # `content` can be None (e.g. an empty completion); treat it as "".
            content = response.choices[0].message.content or ""
            return content.strip()

        elif provider == "anthropic":
            # Anthropic uses a 'system' prompt and has a different response structure
            response = client.messages.create(
                model=model_id,
                system="You are a writing assistant. Your goal is to write clear, human-like, original passages on a given topic. ⚠️ Do not include or repeat the topic or instructions in your output. Return only the generated passage text.",
                messages=[{"role": "user", "content": f"Write an original passage on the topic: '{topic}'. It should be approximately {word_count} words long."}],
                temperature=0.7,
                max_tokens=token_budget
            )
            return response.content[0].text.strip()

        elif provider == "google":
            # The 'client' is the Gemini model instance itself
            response = client.generate_content(
                [prompt],
                generation_config={"temperature": 0.7, "max_output_tokens": token_budget},
                safety_settings={
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                }
            )
            return response.text.strip()

        # Previously an unknown provider fell through and returned None; route
        # it through the same error path as every other failure.
        raise ValueError(f"Unsupported provider: {provider}")

    except Exception as e:
        print(f"❌ Generation error for model {model_id} on topic '{topic[:30]}...': {e}")
        return ""

In [16]:
# --- 4. Main Processing Logic (with updates for dynamic models) ---

## PROVIDE OUTPUT FROM TOPIC GENERATION
file_path = ""

# Output path is now generated dynamically to separate results from different models
current_model_id = selected_model['id']
output_path = f"generated_output_{current_model_id.replace('/', '_')}.json"
print(f"Results will be saved to: {output_path}")

processed_count = 0

# Load existing data if the output file for this model already exists
if os.path.exists(output_path):
    print(f"Resuming from existing file: {output_path}")
    with open(output_path, "r", encoding="utf-8") as f:
        processed_data = json.load(f)
else:
    processed_data = []

# Create a lookup for already processed items to avoid duplicates.
# Keys are the *stripped* text because the loop below looks items up by
# stripped text; keying by the raw text made padded texts miss on resume, so
# they were regenerated and stored a second time.
processed_lookup = {item['text'].strip(): item for item in processed_data}

with open(file_path, "r", encoding="utf-8") as f:
    source_data = json.load(f)

def save_current_progress():
    """Write every item collected so far to `output_path` as a JSON list."""
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(list(processed_lookup.values()), f, ensure_ascii=False, indent=2)
    print(f"\nProgress saved. {len(processed_lookup)} samples in {output_path}")

interrupted = False

try:
    for item in tqdm(source_data, desc=f"🧠 Generating with {selected_model['name']}"):
        # `or ""` guards against JSON nulls, which .get() would return as None.
        topic = (item.get("topic") or "").strip()
        text = (item.get("text") or "").strip()

        if not topic or not text:
            continue

        # Skip items that already hold a passage from the *current* model.
        existing_item = processed_lookup.get(text)
        if existing_item is not None:
            if "ai_generated" not in existing_item:
                existing_item["ai_generated"] = {}
            if existing_item["ai_generated"].get(current_model_id):
                continue

        # Target length mirrors the human text's whitespace-delimited word count.
        word_count = len(text.split())
        ai_text = generate_text(topic, word_count, current_model_id, provider)
        if not ai_text:
            continue  # generation failed; leave the item for a later run

        if existing_item is None:
            existing_item = item.copy() # Avoid modifying the source data list
            existing_item["ai_generated"] = {}
            processed_lookup[text] = existing_item
        existing_item["ai_generated"][current_model_id] = ai_text
        processed_count += 1
        time.sleep(1) # Be a good citizen

        # Save every 10 new passages. Checked only right after an increment so
        # skipped items do not rewrite the file while the count sits on a
        # multiple of 10.
        if processed_count % 10 == 0:
            save_current_progress()

except KeyboardInterrupt:
    interrupted = True
    print("\nInterrupted by user. Saving final progress...")
except Exception as e:
    print(f"\nAn error occurred: {e}")
    save_current_progress()
    raise

# Final save at the end of the script
save_current_progress()
if interrupted:
    print(f"\nStopped early. Newly generated {processed_count} passages.")
else:
    print(f"\nAll done! Newly generated {processed_count} passages.")

Results will be saved to: generated_output_gpt-4.1-2025-04-14.json
Resuming from existing file: generated_output_gpt-4.1-2025-04-14.json


🧠 Generating with GPT-4.1: 100%|██████████| 2100/2100 [00:00<00:00, 1068547.66it/s]



Progress saved. 1992 samples in generated_output_gpt-4.1-2025-04-14.json

All done! Newly generated 0 passages.


# Use Four AI Detectors (3 APIs and 1 fine-tuned model) to Detect AI Texts and Provide Scores

In [None]:

# merge all ai-detector, and remain default prob
# Multi-detector AI text analysis with default prob
import json
import requests
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

from getpass import getpass

# === Prompt for API keys securely ===
# getpass keeps the keys out of the notebook source and the rendered output.
# The "pengram" dictionary key keeps its existing spelling because
# detect_ai_pangram() and the saved verdicts use it; only the on-screen prompt
# is corrected to the service's actual name (Pangram).
api_keys = {
    "pengram": getpass("Enter Pangram API Key: "),
    "originality": getpass("Enter Originality.AI API Key: "),
    "gptzero": getpass("Enter GPTZero API Key: ")
}

print("✅ API keys loaded for this session.")

# === Pangram detector wrapper ===
def detect_ai_pangram(text, timeout=30):
    """Score `text` with the Pangram API.

    Args:
        text: Text to classify.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole detection run.

    Returns:
        None when `text` is None, empty or whitespace-only; otherwise a dict
        with the response's "ai_likelihood" and "prediction" fields, or
        {"error": <message>} if the request or response parsing fails.
    """
    if not text or not text.strip():
        return None
    headers = {
        "Content-Type": "application/json",
        "x-api-key": api_keys["pengram"]
    }
    payload = {"text": text}
    try:
        response = requests.post(
            "https://text.api.pangramlabs.com",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        result = response.json()

        return {
            "ai_likelihood": result.get("ai_likelihood"),
            "prediction": result.get("prediction")
        }
    except Exception as e:
        # Errors are recorded per item rather than raised so one failed call
        # does not abort the batch.
        return {"error": str(e)}

# === Originality.ai wrapper ===
def detect_ai_originality(text, timeout=30):
    """Score `text` with the Originality.ai scan endpoint (AI check only).

    Args:
        text: Text to classify.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole detection run.

    Returns:
        None when `text` is None, empty or whitespace-only; otherwise a dict
        with "classification" and "confidence" sub-dicts, each holding the
        response's "AI" and "Original" values, or {"error": <message>} if the
        request or response parsing fails.
    """
    if not text or not text.strip():
        return None
    try:
        headers = {
            "X-OAI-API-KEY": api_keys["originality"],
            "Content-Type": "application/json"
        }
        # Only the AI check is enabled; every other scan feature is switched off.
        payload = {
            "check_ai": True,
            "check_plagiarism": False,
            "check_facts": False,
            "check_readability": False,
            "check_grammar": False,
            "check_contentOptimizer": False,
            "storeScan": False,
            "aiModelVersion": "lite",
            "content": text
        }
        response = requests.post(
            "https://api.originality.ai/api/v3/scan",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        ai_result = response.json().get("results", {}).get("ai", {})
        classification = ai_result.get("classification", {})
        confidence = ai_result.get("confidence", {})
        return {
            "classification": {
                "AI": classification.get("AI"),
                "Original": classification.get("Original")
            },
            "confidence": {
                "AI": confidence.get("AI"),
                "Original": confidence.get("Original")
            }
        }
    except Exception as e:
        # Errors are recorded per item rather than raised so one failed call
        # does not abort the batch.
        return {"error": str(e)}

# === GPTZero detector wrapper ===
def detect_ai_gptzero(text, timeout=30):
    """Score `text` with the GPTZero text-prediction endpoint.

    Args:
        text: Text to classify.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole detection run.

    Returns:
        None when `text` is None, empty or whitespace-only; otherwise a dict
        with the first returned document's "average_generated_prob", or
        {"error": <message>} if the request or response parsing fails.
    """
    if not text or not text.strip():
        return None
    try:
        headers = {
            "Accept": "application/json",
            "Content-Type": "application/json",
            "x-api-key": api_keys["gptzero"]
        }
        payload = {
            "document": text,
            "multilingual": False
        }
        response = requests.post(
            "https://api.gptzero.me/v2/predict/text",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        # Only the first document is read; an empty "documents" list raises
        # IndexError here and is reported through the error dict below.
        doc = response.json().get("documents", [{}])[0]
        return {
            "average_generated_prob": doc.get("average_generated_prob")
        }
    except Exception as e:
        # Errors are recorded per item rather than raised so one failed call
        # does not abort the batch.
        return {"error": str(e)}

# === RoBERTa-based OpenAI detector: tokenizer and classifier ===
MODEL_NAME = "roberta-base-openai-detector"

# Tokenizer and classifier both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
detector = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
detector.eval()  # switch to evaluation mode; eval() returns the same module

# === RoBERTa detector function ===
def detect_ai_roberta(text):
    """Return the local RoBERTa detector's probability that `text` is AI-generated.

    The input is truncated to 512 tokens before scoring.

    Args:
        text: Text to classify.

    Returns:
        None when `text` is None, empty or whitespace-only; otherwise a float
        probability for the machine-generated class.
    """
    if not text or not text.strip():
        return None
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        logits = detector(**inputs).logits

    # Look the machine-generated class up in the checkpoint's own label map
    # instead of assuming it is index 1.
    # NOTE(review): this checkpoint is believed to label index 0 "Fake"
    # (generated) and index 1 "Real" (human), in which case the previous
    # hard-coded index 1 was returning the *human* probability. Confirm against
    # detector.config.id2label. If no "Fake" label is present, the original
    # index (1) is kept.
    id2label = getattr(detector.config, "id2label", None) or {}
    ai_index = next(
        (int(idx) for idx, label in id2label.items() if str(label).strip().lower() == "fake"),
        1,
    )
    prob_ai = torch.softmax(logits, dim=-1)[0, ai_index].item()
    return prob_ai

# === Load input data ===
input_path = ""
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Derive the output path up front. Only a trailing ".json" is swapped out:
# str.replace() would rewrite every ".json" in the path and, when the path has
# none, return it unchanged, so the results would overwrite the input file.
if input_path.endswith(".json"):
    output_path = input_path[:-len(".json")] + "_all_detectors.json"
else:
    output_path = input_path + "_all_detectors.json"

# Run statistics
total_items = len(data)
skipped_items = 0
processed_items = 0

# === Run all detectors ===
try:
    for item in tqdm(data, desc="Running all AI detectors"):
        # Skip items that already carry a human verdict and a verdict entry for
        # every AI-generated text they contain.
        already_scored = (
            "human_verdict" in item
            and "ai_verdicts" in item
            and all(model in item["ai_verdicts"] for model in item.get("ai_generated", {}))
        )
        if already_scored:
            skipped_items += 1
            continue

        human_text = item.get("text", "")

        # Initialize verdict structure if needed
        item.setdefault("human_verdict", {})

        # Add results from each detector
        item["human_verdict"]["pengram"] = detect_ai_pangram(human_text)
        item["human_verdict"]["originality"] = detect_ai_originality(human_text)
        item["human_verdict"]["gptzero"] = detect_ai_gptzero(human_text)
        item["human_verdict"]["roberta-base-detector"] = detect_ai_roberta(human_text)

        # Process AI-generated texts
        ai_texts = item.get("ai_generated", {})
        item.setdefault("ai_verdicts", {})

        for model_name, text in ai_texts.items():
            item["ai_verdicts"].setdefault(model_name, {})
            item["ai_verdicts"][model_name]["pengram"] = detect_ai_pangram(text)
            item["ai_verdicts"][model_name]["originality"] = detect_ai_originality(text)
            item["ai_verdicts"][model_name]["gptzero"] = detect_ai_gptzero(text)
            item["ai_verdicts"][model_name]["roberta-base-detector"] = detect_ai_roberta(text)

        processed_items += 1
finally:
    # === Save output ===
    # Written in `finally` so verdicts collected before an interrupt or an
    # unexpected error are not lost; the run makes several network calls per item.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

print(f"✅ Multi-detector analysis completed.")
print(f"Total items: {total_items}")
print(f"Skipped items (already processed): {skipped_items}")
print(f"Processed items: {processed_items}")
print(f"Results saved to: {output_path}")

Enter Pengram API Key:  ········
Enter Originality.AI API Key:  ········
Enter GPTZero API Key:  ········


✅ API keys loaded for this session.


Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Running all AI detectors: 100%|██████████| 1992/1992 [4:28:38<00:00,  8.09s/it]   


✅ Multi-detector analysis completed.
Total items: 1992
Skipped items (already processed): 0
Processed items: 1992
Results saved to: generated_output_claude-opus-4-20250514_all_detectors.json
