In [1]:
# !pip install textstat
import pandas as pd
import textstat
import numpy as np
import json

# Functions for Descriptive Stats

In [2]:
def calculate_ttr(text: str) -> float:
    """Calculates the Type-Token Ratio (TTR) for a given text.

    Tokens are the whitespace-separated pieces of the lower-cased text, so
    punctuation attached to a word stays part of that token.

    Args:
        text: The text to analyse.

    Returns:
        The number of distinct tokens divided by the total number of tokens,
        or ``np.nan`` when ``text`` is not a string or holds only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return np.nan
    # Anything that passes the guard above contains at least one
    # non-whitespace character, so split() always yields at least one token
    # and the division below can never be by zero.
    tokens = text.lower().split()
    return len(set(tokens)) / len(tokens)

def calculate_human_metrics_by_genre(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the human-written texts per genre.

    Rows whose ``text`` is missing are dropped before anything is measured.

    Args:
        df: Frame with at least a ``genre`` and a ``text`` column.

    Returns:
        A frame indexed by genre with the columns ``N_H`` (number of rows),
        ``Mean_words_H``, ``TTR_H`` and ``FK_grade_H`` (per-genre means).
    """
    def _word_count(value):
        # Non-string values that survive dropna() count as zero words.
        if isinstance(value, str):
            return len(value.split())
        return 0

    def _fk_grade(value):
        # Readability is only defined for actual strings.
        if isinstance(value, str):
            return textstat.flesch_kincaid_grade(value)
        return np.nan

    per_text = df[['genre', 'text']].copy().dropna(subset=['text'])

    # One measurement column per metric, computed text by text.
    per_text['word_count'] = per_text['text'].apply(_word_count)
    per_text['ttr'] = per_text['text'].apply(calculate_ttr)
    per_text['fk_grade'] = per_text['text'].apply(_fk_grade)

    by_genre = per_text.groupby('genre')
    return by_genre.agg(
        N_H=('text', 'size'),
        Mean_words_H=('word_count', 'mean'),
        TTR_H=('ttr', 'mean'),
        FK_grade_H=('fk_grade', 'mean')
    )

def calculate_ai_metrics_by_genre(df: pd.DataFrame) -> pd.DataFrame:
    """Consolidates all AI texts and calculates metrics, grouped by genre.

    Every value of a row's ``ai_generated`` dict that is a non-blank string is
    treated as one AI text of that row's genre; the dict keys are ignored.
    Rows whose ``ai_generated`` entry is not a dict contribute nothing.

    Args:
        df: Frame with at least a ``genre`` and an ``ai_generated`` column.

    Returns:
        A frame indexed by genre with the columns ``N_AI`` (number of texts),
        ``Mean_words_AI``, ``TTR_AI`` and ``FK_grade_AI`` (per-genre means).
        When no usable AI text exists the frame has those same columns and a
        ``genre`` index but no rows.
    """
    # Walk the two needed columns directly instead of materialising a Series
    # per row with iterrows().
    ai_texts_list = [
        {'genre': genre, 'text': text}
        for genre, ai_data_dict in zip(df['genre'], df['ai_generated'])
        if isinstance(ai_data_dict, dict)
        for text in ai_data_dict.values()
        if isinstance(text, str) and text.strip()
    ]

    if not ai_texts_list:
        # Keep the schema stable: a column-less frame would leave the merged
        # benchmark table without its AI columns, and selecting them later
        # would raise a KeyError.
        return pd.DataFrame(
            columns=['N_AI', 'Mean_words_AI', 'TTR_AI', 'FK_grade_AI'],
            index=pd.Index([], name='genre'),
            dtype=float,
        )

    ai_df = pd.DataFrame(ai_texts_list)

    # Every stored text is a non-blank string, so no type guards are needed.
    ai_df['word_count'] = ai_df['text'].apply(lambda x: len(x.split()))
    ai_df['ttr'] = ai_df['text'].apply(calculate_ttr)
    ai_df['fk_grade'] = ai_df['text'].apply(textstat.flesch_kincaid_grade)

    metrics = ai_df.groupby('genre').agg(
        N_AI=('text', 'size'),
        Mean_words_AI=('word_count', 'mean'),
        TTR_AI=('ttr', 'mean'),
        FK_grade_AI=('fk_grade', 'mean')
    )
    return metrics

def create_benchmark_table(df: pd.DataFrame) -> pd.DataFrame:
    """Build the per-genre summary table of human and AI text metrics.

    The human and AI metric frames are joined on their genre index with an
    outer join, so a genre present on only one side is still kept.

    Args:
        df: Frame passed unchanged to both metric calculators.

    Returns:
        The joined table with presentation-style column names, rounded to
        four decimal places.
    """
    # Internal metric names -> column headings used in the final table.
    display_names = {
        'N_H': 'NH',
        'N_AI': 'NAI',
        'Mean_words_H': 'Mean words H',
        'Mean_words_AI': 'Mean words AI',
        'TTR_H': 'TTR-H',
        'TTR_AI': 'TTR-AI',
        'FK_grade_H': 'FK grade-H',
        'FK_grade_AI': 'FK grade-AI'
    }

    combined = calculate_human_metrics_by_genre(df).merge(
        calculate_ai_metrics_by_genre(df),
        how='outer',
        left_index=True,
        right_index=True,
    )

    # Four decimals keeps the TTR values distinguishable.
    return combined.rename(columns=display_names).round(4)

In [44]:
import json
import pandas as pd

# 1. Define the list of JSON file paths
# These four files correspond to the model names shown in this cell's
# recorded output. With the list left empty the cell builds an empty
# DataFrame without a 'model' column, and the per-model filtering further
# down cannot run on a fresh kernel.
file_paths = [
    "generated_output_gpt-4.1-2025-04-14.json",
    "generated_output_claude-opus-4-20250514.json",
    "generated_output_claude-sonnet-4-20250514.json",
    "generated_output_gemini-2.0-flash.json"
]

# 2. Initialize an empty list to hold all records
all_records = []
loaded_files = []  # paths that were actually read and merged

# 3. Loop through each file, tag the data, and combine
print("Reading, tagging, and combining files...")
for file_path in file_paths:
    # Strip the fixed prefix and the extension to recover the model name
    model_name = file_path.replace("generated_output_", "").replace(".json", "")

    # Only the read/parse step is guarded, so unrelated errors are not hidden
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            records = json.load(f)
    except FileNotFoundError:
        print(f"Warning: File not found at '{file_path}'. Skipping.")
        continue
    except json.JSONDecodeError:
        print(f"Warning: Error decoding JSON from '{file_path}'. Skipping.")
        continue

    if not isinstance(records, list):
        print(f"Warning: Data in {file_path} is not a list. Skipping.")
        continue

    # Tag every dict record with the model that produced it
    for record in records:
        if isinstance(record, dict):
            record['model'] = model_name

    # Add the updated records to the master list
    all_records.extend(records)
    loaded_files.append(file_path)

# 4. Convert the combined list into a pandas DataFrame
combined_df = pd.DataFrame(all_records)

# 5. Display the results
# Claim success only when every listed file was really merged
if loaded_files and len(loaded_files) == len(file_paths):
    print("\n✅ Successfully combined all JSON files.")
else:
    print(f"\nWarning: only {len(loaded_files)} of {len(file_paths)} files were combined.")
print(f"Total number of records in the DataFrame: {len(combined_df)}")

# Display the unique model names found in the new column
if 'model' in combined_df.columns:
    print("\nModels found in the DataFrame:")
    print(list(combined_df['model'].unique()))

Reading, tagging, and combining files...

✅ Successfully combined all JSON files.
Total number of records in the DataFrame: 7968

Models found in the DataFrame:
['gpt-4.1-2025-04-14', 'claude-opus-4-20250514', 'claude-sonnet-4-20250514', 'gemini-2.0-flash']


In [53]:
# Split the combined records into one subset per generating model
df_gpt = combined_df.loc[combined_df['model'].eq('gpt-4.1-2025-04-14')]
df_opus = combined_df.loc[combined_df['model'].eq('claude-opus-4-20250514')]
df_sonnet = combined_df.loc[combined_df['model'].eq('claude-sonnet-4-20250514')]
df_gemini = combined_df.loc[combined_df['model'].eq('gemini-2.0-flash')]

In [55]:
# 2. Build one benchmark table per model; each call gets its own copy of the
#    model's subset
benchmark_df_gpt, benchmark_df_opus, benchmark_df_sonnet, benchmark_df_gemini = (
    create_benchmark_table(model_df.copy())
    for model_df in (df_gpt, df_opus, df_sonnet, df_gemini)
)

In [65]:
# 3. Show the GPT 4.1 benchmark table
print("📊 Final Benchmark Table:")
order_columns = [
    'model', 'genre',
    'NH', 'NAI',
    'Mean words H', 'Mean words AI',
    'TTR-H', 'TTR-AI',
    'FK grade-H', 'FK grade-AI',
]
# Label every row with the model, turn the index into a regular column, then
# put the columns in presentation order.
benchmark_df_gpt = benchmark_df_gpt.assign(model='GPT 4.1').reset_index()[order_columns]
benchmark_df_gpt

📊 Final Benchmark Table:


Unnamed: 0,model,genre,NH,NAI,Mean words H,Mean words AI,TTR-H,TTR-AI,FK grade-H,FK grade-AI
0,GPT 4.1,amazon review,200,200,79.57,76.68,0.7863,0.8581,7.3972,13.3593
1,GPT 4.1,blog,200,200,198.98,197.11,0.7144,0.7896,6.238,9.9451
2,GPT 4.1,news,300,300,459.8767,458.3967,0.6194,0.6629,10.9521,14.8075
3,GPT 4.1,novel,1000,1000,966.773,1037.753,0.5127,0.5191,44.7898,9.5643
4,GPT 4.1,restaurant review,100,100,133.08,129.39,0.7683,0.8262,6.0666,11.0551
5,GPT 4.1,resume,192,192,789.5104,833.651,0.5329,0.5781,19.638,16.8755


In [56]:
# 3. Show the Claude Opus 4 benchmark table
print("📊 Final Benchmark Table:")
order_columns = [
    'model', 'genre',
    'NH', 'NAI',
    'Mean words H', 'Mean words AI',
    'TTR-H', 'TTR-AI',
    'FK grade-H', 'FK grade-AI',
]
# Label every row with the model, turn the index into a regular column, then
# put the columns in presentation order.
benchmark_df_opus = benchmark_df_opus.assign(model='Claude Opus 4').reset_index()[order_columns]
benchmark_df_opus

📊 Final Benchmark Table:


Unnamed: 0,model,genre,NH,NAI,Mean words H,Mean words AI,TTR-H,TTR-AI,FK grade-H,FK grade-AI
0,Claude Opus 4,amazon review,200,200,79.57,79.01,0.7863,0.8929,7.3972,12.2501
1,Claude Opus 4,blog,200,200,198.98,202.54,0.7144,0.8153,6.238,8.214
2,Claude Opus 4,news,300,300,459.8767,450.5867,0.6194,0.7014,10.9521,14.7061
3,Claude Opus 4,novel,1000,1000,966.773,982.352,0.5127,0.5632,44.7898,8.0421
4,Claude Opus 4,restaurant review,100,100,133.08,133.51,0.7683,0.8574,6.0666,9.9321
5,Claude Opus 4,resume,192,192,789.5104,784.5104,0.5329,0.6094,19.638,15.8431


In [57]:
# 3. Show the Claude Sonnet 4 benchmark table
print("📊 Final Benchmark Table:")
order_columns = [
    'model', 'genre',
    'NH', 'NAI',
    'Mean words H', 'Mean words AI',
    'TTR-H', 'TTR-AI',
    'FK grade-H', 'FK grade-AI',
]
# Label every row with the model, turn the index into a regular column, then
# put the columns in presentation order.
benchmark_df_sonnet = benchmark_df_sonnet.assign(model='Claude Sonnet 4').reset_index()[order_columns]
benchmark_df_sonnet

📊 Final Benchmark Table:


Unnamed: 0,model,genre,NH,NAI,Mean words H,Mean words AI,TTR-H,TTR-AI,FK grade-H,FK grade-AI
0,Claude Sonnet 4,amazon review,200,200,79.57,82.23,0.7863,0.8931,7.3972,14.1086
1,Claude Sonnet 4,blog,200,200,198.98,200.255,0.7144,0.8145,6.238,9.6243
2,Claude Sonnet 4,news,300,300,459.8767,435.94,0.6194,0.6929,10.9521,16.5436
3,Claude Sonnet 4,novel,1000,1000,966.773,952.398,0.5127,0.5587,44.7898,10.3207
4,Claude Sonnet 4,restaurant review,100,100,133.08,135.25,0.7683,0.854,6.0666,11.3803
5,Claude Sonnet 4,resume,192,192,789.5104,744.3177,0.5329,0.5908,19.638,18.8773


In [58]:
# 3. Show the Gemini 2.0 Flash benchmark table
print("📊 Final Benchmark Table:")
order_columns = [
    'model', 'genre',
    'NH', 'NAI',
    'Mean words H', 'Mean words AI',
    'TTR-H', 'TTR-AI',
    'FK grade-H', 'FK grade-AI',
]
# Label every row with the model, turn the index into a regular column, then
# put the columns in presentation order.
benchmark_df_gemini = benchmark_df_gemini.assign(model='Gemini 2.0 Flash').reset_index()[order_columns]
benchmark_df_gemini

📊 Final Benchmark Table:


Unnamed: 0,model,genre,NH,NAI,Mean words H,Mean words AI,TTR-H,TTR-AI,FK grade-H,FK grade-AI
0,Gemini 2.0 Flash,amazon review,200,200,79.57,77.525,0.7863,0.8501,7.3972,9.6162
1,Gemini 2.0 Flash,blog,200,200,198.98,197.485,0.7144,0.7822,6.238,7.2119
2,Gemini 2.0 Flash,news,300,300,459.8767,430.5367,0.6194,0.6478,10.9521,12.909
3,Gemini 2.0 Flash,novel,1000,1000,966.773,970.347,0.5127,0.5246,44.7898,8.3934
4,Gemini 2.0 Flash,restaurant review,100,100,133.08,134.02,0.7683,0.8181,6.0666,7.9224
5,Gemini 2.0 Flash,resume,192,192,789.5104,787.8385,0.5329,0.5424,19.638,13.7291


In [66]:
# Stack the four per-model tables into a single frame. Each input was
# reset_index()-ed and so carries its own 0..n-1 labels; renumber the rows so
# the combined index stays unique instead of repeating those labels per model.
df_descriptive_stats = pd.concat(
    [
        benchmark_df_gpt,
        benchmark_df_opus,
        benchmark_df_sonnet,
        benchmark_df_gemini,
    ],
    axis=0,
    ignore_index=True,
)

In [67]:
# Display the combined descriptive-statistics table covering all four models
df_descriptive_stats

Unnamed: 0,model,genre,NH,NAI,Mean words H,Mean words AI,TTR-H,TTR-AI,FK grade-H,FK grade-AI
0,GPT 4.1,amazon review,200,200,79.57,76.68,0.7863,0.8581,7.3972,13.3593
1,GPT 4.1,blog,200,200,198.98,197.11,0.7144,0.7896,6.238,9.9451
2,GPT 4.1,news,300,300,459.8767,458.3967,0.6194,0.6629,10.9521,14.8075
3,GPT 4.1,novel,1000,1000,966.773,1037.753,0.5127,0.5191,44.7898,9.5643
4,GPT 4.1,restaurant review,100,100,133.08,129.39,0.7683,0.8262,6.0666,11.0551
5,GPT 4.1,resume,192,192,789.5104,833.651,0.5329,0.5781,19.638,16.8755
0,Claude Opus 4,amazon review,200,200,79.57,79.01,0.7863,0.8929,7.3972,12.2501
1,Claude Opus 4,blog,200,200,198.98,202.54,0.7144,0.8153,6.238,8.214
2,Claude Opus 4,news,300,300,459.8767,450.5867,0.6194,0.7014,10.9521,14.7061
3,Claude Opus 4,novel,1000,1000,966.773,982.352,0.5127,0.5632,44.7898,8.0421
