In [None]:
# getting short text corpus
import json
import os

input_folder = '/content/drive/MyDrive/detection/pre2020_human_text_by_genre/all_json_files'
output_folder = '/content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files'
os.makedirs(output_folder, exist_ok=True)

# An entry counts as "short" when its text has fewer whitespace-separated
# tokens than this threshold.
SHORT_TEXT_MAX_WORDS = 50

# Sorted so the processing/log order is the same on every run
# (os.listdir returns names in arbitrary order).
json_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.json'))

for filename in json_files:
    input_path = os.path.join(input_folder, filename)
    # splitext only touches the trailing extension; str.replace would rewrite
    # every ".json" occurring anywhere inside the file name.
    stem, ext = os.path.splitext(filename)
    output_path = os.path.join(output_folder, f"{stem}_shortText{ext}")

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Entries without a "text" key have zero tokens and are therefore kept.
    short_entries = [
        item for item in data
        if len(item.get("text", "").split()) < SHORT_TEXT_MAX_WORDS
    ]

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(short_entries, f, ensure_ascii=False, indent=2)

    print(f"✅ {filename} → {len(short_entries)} short entries saved to {output_path}")


✅ merged_corpus_withThreshold_FP0.01.json → 193 short entries saved to /content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withThreshold_FP0.01_shortText.json
✅ merged_corpus_withThreshold_FP0.005.json → 193 short entries saved to /content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withThreshold_FP0.005_shortText.json
✅ merged_corpus_withThreshold_FP0.0001.json → 193 short entries saved to /content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withThreshold_FP0.0001_shortText.json
✅ merged_corpus_withThreshold_FP0.05.json → 193 short entries saved to /content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withThreshold_FP0.05_shortText.json
✅ merged_corpus_withThreshold_FP0.1.json → 193 short entries saved to /content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withThresho

In [None]:
# Table 1A – AUROC & Δ-Mean
import json
import pandas as pd
from sklearn.metrics import roc_auc_score

# === Load JSON file ===
with open("/content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withDetectors_defaultProb_shortText.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# === Supported detectors and their probability fields ===
# Each extractor receives one detector's stored verdict object and returns the
# score used for ranking, or None when the expected field is missing.
detectors = {
    "originality": lambda d: d.get("confidence", {}).get("AI", None),
    "pengram": lambda d: d.get("ai_likelihood", None),
    "gptzero": lambda d: d.get("average_generated_prob", None),
    "roberta-base-detector": lambda d: d if isinstance(d, (float, int)) else None
}

# === Collect scores ===
# One record per (text, detector) pair: label 0 for the entry's human text,
# label 1 for every AI-generated variant found under "ai_verdicts".
records = []

for entry in data:
    # Human text: "human_verdict" is read as {detector: verdict}.
    hv = entry.get("human_verdict", {})
    for det, extract in detectors.items():
        if det in hv:
            prob = extract(hv[det])
            if prob is not None:
                records.append({
                    "detector": det,
                    "source": "human",
                    "score": prob,
                    "label": 0
                })

    # AI-generated text: "ai_verdicts" is read as {model: {detector: verdict}}.
    for model, verdicts in entry.get("ai_verdicts", {}).items():
        for det, extract in detectors.items():
            if det in verdicts:
                prob = extract(verdicts[det])
                if prob is not None:
                    records.append({
                        "detector": det,
                        "source": f"ai_{model}",
                        "score": prob,
                        "label": 1
                    })

# === Convert to DataFrame ===
# Explicit columns keep the frame usable even when no record was collected.
df = pd.DataFrame(records, columns=["detector", "source", "score", "label"])

# === Compute AUROC & Δ-Mean for each detector ===
summary = []

for det in df["detector"].unique():
    sub = df[df["detector"] == det]
    ai_scores = sub[sub["label"] == 1]["score"]
    human_scores = sub[sub["label"] == 0]["score"]

    try:
        auroc = roc_auc_score(sub["label"], sub["score"])
    except (ValueError, TypeError) as err:
        # AUROC cannot be computed for this detector (e.g. only one class
        # present, or unusable scores). Report why instead of hiding it, and
        # keep the column numeric: a string sentinel such as "N/A" would make
        # the sort below fail when compared against float values.
        print(f"[warn] AUROC undefined for {det}: {err}")
        auroc = float("nan")

    # Mean AI score minus mean human score; NaN when either side is empty.
    delta_mean = ai_scores.mean() - human_scores.mean()

    summary.append({
        "detector": det,
        "AUROC": round(auroc, 4),
        "Δ-Mean": round(delta_mean, 4)
    })

# === Final Table 1A Output ===
# NaN AUROC rows are placed last by sort_values and written as empty CSV cells.
summary_df = pd.DataFrame(summary, columns=["detector", "AUROC", "Δ-Mean"]).sort_values(by="AUROC", ascending=False)
summary_df.to_csv("/content/drive/MyDrive/detection/pre2020_human_text_by_genre/result_forShorterText/Table1_overall.csv")
summary_df



Unnamed: 0,detector,AUROC,Δ-Mean
0,pengram,0.9991,0.9045
1,gptzero,0.9688,0.9375
2,roberta-base-detector,0.5928,0.1148
3,originality,,


In [None]:
# Table 1 By genre / domain

import json
import pandas as pd
from sklearn.metrics import roc_auc_score

# === Load JSON ===
with open("/content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withDetectors_defaultProb_shortText.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# === Detector: field extractors ===
# Each extractor receives one detector's stored verdict object and returns the
# score used for ranking, or None when the expected field is missing.
detectors = {
    "originality": lambda d: d.get("confidence", {}).get("AI", None),
    "pengram": lambda d: d.get("ai_likelihood", None),
    "gptzero": lambda d: d.get("average_generated_prob", None),
    "roberta-base-detector": lambda d: d if isinstance(d, (float, int)) else None
}

# === Collect records with genre ===
# One record per (text, detector) pair, tagged with the entry's genre
# ("unknown" when the entry has none). Label 0 = human text, 1 = AI text.
records = []

for entry in data:
    genre = entry.get("genre", "unknown")

    # Human text: "human_verdict" is read as {detector: verdict}.
    hv = entry.get("human_verdict", {})
    for det, extract in detectors.items():
        if det in hv:
            prob = extract(hv[det])
            if prob is not None:
                records.append({
                    "genre": genre,
                    "detector": det,
                    "score": prob,
                    "label": 0
                })

    # AI text: "ai_verdicts" is read as {model: {detector: verdict}}; the
    # model name is not kept, so all models are pooled per genre.
    for model, verdicts in entry.get("ai_verdicts", {}).items():
        for det, extract in detectors.items():
            if det in verdicts:
                prob = extract(verdicts[det])
                if prob is not None:
                    records.append({
                        "genre": genre,
                        "detector": det,
                        "score": prob,
                        "label": 1
                    })

# === Convert to DataFrame
# Explicit columns keep the frame usable even when no record was collected.
df = pd.DataFrame(records, columns=["genre", "detector", "score", "label"])

# === Compute AUROC & Δ-Mean by genre x detector
summary = []

for (genre, det), sub in df.groupby(["genre", "detector"]):
    ai_scores = sub[sub["label"] == 1]["score"]
    human_scores = sub[sub["label"] == 0]["score"]

    try:
        auroc = roc_auc_score(sub["label"], sub["score"])
    except (ValueError, TypeError) as err:
        # AUROC cannot be computed for this group (e.g. only one class
        # present, or unusable scores). Report why instead of hiding it, and
        # keep the column numeric: a string sentinel such as "N/A" would make
        # the sort below fail when compared against float values.
        print(f"[warn] AUROC undefined for genre={genre}, detector={det}: {err}")
        auroc = float("nan")

    # Mean AI score minus mean human score; NaN when either side is empty.
    delta_mean = ai_scores.mean() - human_scores.mean()

    summary.append({
        "genre": genre,
        "detector": det,
        "AUROC": round(auroc, 4),
        "Δ-Mean": round(delta_mean, 4),
        "n_ai": len(ai_scores),
        "n_human": len(human_scores)
    })

# === Assemble the per-(genre, detector) table
summary_df = pd.DataFrame(
    summary,
    columns=["genre", "detector", "AUROC", "Δ-Mean", "n_ai", "n_human"],
)

# Global sort by AUROC descending across all (genre, detector) combinations;
# rows with undefined (NaN) AUROC end up last and are written as empty cells.
summary_df = summary_df.sort_values("AUROC", ascending=False).reset_index(drop=True)
summary_df.to_csv("/content/drive/MyDrive/detection/pre2020_human_text_by_genre/result_forShorterText/Table1_ByGenre.csv")



In [None]:
# Type I / Type II error rates, precision and recall by genre and detector

import json
import pandas as pd
import numpy as np
import scipy.stats as st

# Load the thresholded corpus (FP 0.1 variant of the short-text files)
input_path = "/content/drive/MyDrive/detection/pre2020_human_text_by_genre/short_text_json_files/merged_corpus_withThreshold_FP0.1_shortText.json"

with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the corpus into one row per (entry, detector, text origin).
# NOTE(review): "ai_verdicts" is indexed directly by detector name here, while
# the Table 1 cells of this notebook walk it as {model: {detector: ...}}.
# Confirm the threshold files really use the flat layout; if they are nested
# per model, no AI rows are collected by this loop.
records = []
for item in data:
    # Entries without a genre are grouped under "unknown"
    genre = item.get("genre", "unknown")

    for detector_name in ["originality", "pengram", "gptzero", "roberta-base-detector"]:
        # Human verdict first, then AI verdict, so row order stays stable
        for field, is_human in (("human_verdict", True), ("ai_verdicts", False)):
            if field not in item or detector_name not in item[field]:
                continue

            verdict = item[field].get(detector_name)
            if verdict is None:
                # A detector that produced no verdict contributes no row
                continue

            records.append({
                "genre": genre,
                "detector": detector_name,
                "is_human": is_human,
                "verdict": verdict
            })

df = pd.DataFrame(records)

# Compute Wilson confidence interval
def wilson_ci(k, n, alpha=0.05):
    """Wilson score confidence interval for a binomial proportion.

    Args:
        k: Number of observed events (e.g. error count) out of ``n`` trials.
        n: Number of trials.
        alpha: Two-sided significance level; the default 0.05 yields a 95%
            interval.

    Returns:
        ``(lower, upper)`` bounds of the interval, each within [0, 1].
        ``(nan, nan)`` when ``n == 0``: with no trials the proportion is
        undefined, so a ``(0, 0)`` interval would wrongly read as "certainly
        zero" next to a NaN rate.
    """
    if n == 0:
        nan = float("nan")
        return (nan, nan)

    z = st.norm.ppf(1 - alpha / 2)
    phat = k / n
    denom = 1 + z**2 / n
    center = phat + z**2 / (2 * n)
    pm = z * ((phat * (1 - phat) / n + z**2 / (4 * n**2)) ** 0.5)

    # At k == 0 or k == n the exact bound is 0 or 1; clamp so floating-point
    # rounding cannot push the result slightly outside the valid range.
    lower = max(0.0, (center - pm) / denom)
    upper = min(1.0, (center + pm) / denom)
    return (lower, upper)

# Evaluate error rates, precision, and recall by genre and detector


def _error_rate_stats(errors, total):
    """Return (rate, Wilson CI, p-value) for `errors` observed among `total` texts.

    The p-value is a one-sided binomial test of "error rate > 0.5". Rate and
    p-value are NaN when `total` is zero.
    """
    rate = errors / total if total else np.nan
    interval = wilson_ci(errors, total)
    if total:
        p_value = st.binomtest(errors, total, p=0.5, alternative='greater').pvalue
    else:
        p_value = np.nan
    return rate, interval, p_value


def _share_with_ci(hits, denominator):
    """Return (hits / denominator, Wilson CI); NaN placeholders when the denominator is 0."""
    if denominator > 0:
        return hits / denominator, wilson_ci(hits, denominator)
    return np.nan, (np.nan, np.nan)


results = []
metrics_results = []

for (genre, detector), group in df.groupby(["genre", "detector"]):
    human_rows = group["is_human"]
    human = group[human_rows]
    ai = group[~human_rows]

    # Denominators: every non-null verdict on each side
    total_human = human["verdict"].notna().sum()
    total_ai = ai["verdict"].notna().sum()

    # Confusion-matrix cells; a verdict equal to 1 means "flagged as AI"
    fp = (human["verdict"] == 1).sum()  # human text flagged as AI
    tn = (human["verdict"] == 0).sum()  # human text passed as human (not used below)
    tp = (ai["verdict"] == 1).sum()     # AI text flagged as AI
    fn = (ai["verdict"] == 0).sum()     # AI text passed as human

    type1_stats = _error_rate_stats(fp, total_human)  # Type I: false positive rate
    type2_stats = _error_rate_stats(fn, total_ai)     # Type II: false negative rate

    precision, precision_ci = _share_with_ci(tp, tp + fp)
    recall, recall_ci = _share_with_ci(tp, tp + fn)

    group_keys = {"Genre": genre, "Detector": detector}

    # Two error-rate rows per group: Type I first, then Type II
    error_rows = (
        ("Type I (False Positive)", fp, total_human, type1_stats),
        ("Type II (False Negative)", fn, total_ai, type2_stats),
    )
    for error_type, count, total, (rate, interval, p_value) in error_rows:
        results.append({
            **group_keys,
            "Error Type": error_type,
            "Count": count,
            "Total": total,
            "Error Rate": rate,
            "CI Lower": interval[0],
            "CI Upper": interval[1],
            "p-value": p_value
        })

    # Two metric rows per group: precision first, then recall
    metrics_results.append({
        **group_keys,
        "Metric": "Precision",
        "Value": precision,
        "CI Lower": precision_ci[0],
        "CI Upper": precision_ci[1],
        "True Positives": tp,
        "False Positives": fp,
        "Total Detected as AI": tp + fp
    })
    metrics_results.append({
        **group_keys,
        "Metric": "Recall",
        "Value": recall,
        "CI Lower": recall_ci[0],
        "CI Upper": recall_ci[1],
        "True Positives": tp,
        "False Negatives": fn,
        "Total AI": tp + fn
    })

# NOTE(review): results_df (the Type I / Type II table) is built but never
# written to disk in this cell; only the precision/recall table is exported.
results_df = pd.DataFrame(results)
metrics_df = pd.DataFrame(metrics_results)

metrics_df.to_csv("/content/drive/MyDrive/detection/pre2020_human_text_by_genre/result_forShorterText/Table2_threshold0.1.csv")
