In [15]:
import pandas as pd

In [16]:
df = pd.read_csv(r'C:\Projects\36118 ANLP\Assignment 2\LLM\LegalEase_NSW\github_repo\eval\outputs\llm_generation_and_evaluation_results.csv')

In [17]:
# Remove API failures
df_success = df[df['generated_summary'] != 'API_CALL_FAILED']

# Model-level stats (averaged across ALL prompts)
model_stats = df_success.groupby('model_name').agg({
    'format_pass': 'mean',
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
})

print(model_stats)

# Prompt-level stats (averaged across ALL models)
prompt_stats = df_success.groupby('prompt_name').agg({
    'format_pass': 'mean',
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean'
})

print(prompt_stats)

                            format_pass  bertscore_f1  flesch_kincaid_grade  \
model_name                                                                    
claude-3-5-sonnet-20240620     0.297980      0.859115              6.549417   
claude-3-opus-20240229         0.350962      0.859282              8.631613   
gemini-2.5-flash               0.490385      0.869389              5.414282   
gemini-2.5-pro                 0.418269      0.867278              5.904126   
gpt-3.5-turbo                  0.500000      0.872966              8.573203   
gpt-4-turbo                    0.500000      0.875456              8.852759   
gpt-4o                         0.500000      0.870473              7.313176   

                            latency_seconds  
model_name                                   
claude-3-5-sonnet-20240620         5.317576  
claude-3-opus-20240229             7.548510  
gemini-2.5-flash                  20.223798  
gemini-2.5-pro                    19.306779  
gpt-3.5-turb

In [18]:
# Check sample sizes per group
print("SAMPLE SIZES BY PROMPT")
print(df_success.groupby('prompt_name')['format_pass'].agg(['mean', 'count', 'sum']))

print("SAMPLE SIZES BY MODEL")
print(df_success.groupby('model_name')['format_pass'].agg(['mean', 'count', 'sum']))

print("FULL MODEL×PROMPT BREAKDOWN")
combo_detail = df_success.groupby(['model_name', 'prompt_name'])['format_pass'].agg(['mean', 'count', 'sum'])
print(combo_detail.sort_values('mean', ascending=False))

SAMPLE SIZES BY PROMPT
                        mean  count  sum
prompt_name                             
Default             0.900000    360  324
Engaging            0.000000    362    0
Explain like I'm 5  0.000000    362    0
With example        0.853591    362  309
SAMPLE SIZES BY MODEL
                                mean  count  sum
model_name                                      
claude-3-5-sonnet-20240620  0.297980    198   59
claude-3-opus-20240229      0.350962    208   73
gemini-2.5-flash            0.490385    208  102
gemini-2.5-pro              0.418269    208   87
gpt-3.5-turbo               0.500000    208  104
gpt-4-turbo                 0.500000    208  104
gpt-4o                      0.500000    208  104
FULL MODEL×PROMPT BREAKDOWN
                                                   mean  count  sum
model_name                 prompt_name                             
gemini-2.5-flash           With example        1.000000     52   52
gpt-3.5-turbo              With exam

In [19]:
# Best model×prompt combination
best_combo = df_success.groupby(['model_name', 'prompt_name']).agg({
    'format_pass': 'mean'
}).sort_values('format_pass', ascending=False).head(1)

print(best_combo)

                               format_pass
model_name       prompt_name              
gemini-2.5-flash With example          1.0


In [27]:
import pandas as pd

# Load data
df = pd.read_csv(r'C:\Projects\36118 ANLP\Assignment 2\LLM\LegalEase_NSW\github_repo\eval\outputs\llm_generation_and_evaluation_results.csv')

# Remove API failures
df_success = df[df['generated_summary'] != 'API_CALL_FAILED'].copy()

# Helper function to group models by provider
def group_models_by_provider(df):
    """Add provider column and sort by provider then model"""
    df = df.copy()
    df['provider'] = df.index.to_series().apply(lambda x: 
        'Anthropic' if 'claude' in x.lower() else
        'OpenAI' if 'gpt' in x.lower() else
        'Google' if 'gemini' in x.lower() else 'Other'
    )
    return df.sort_values(['provider', df.index.name or 'model_name'])

print("="*80)
print("Overall model averages (across all 4 prompts):")
print("="*80)
model_overall = df_success.groupby('model_name').agg({
    'format_pass': 'mean',
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
}).round(3)
model_overall.columns = ['Format compliance', 'BERTScore F1', 'Readability (grade)', 'Latency (seconds)']
model_overall = group_models_by_provider(model_overall)
display(model_overall)

print("\n" + "="*80)
print("MODEL PERFORMANCE ON STRUCTURED PROMPTS ONLY (Default + With example):")
print("="*80)
df_structured = df_success[df_success['prompt_name'].isin(['Default', 'With example'])]

print("\nStructured prompt performance by model:")
structured_by_model = df_structured.groupby('model_name').agg({
    'format_pass': ['mean', 'sum', 'count'],
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
}).round(3)
structured_by_model.columns = ['Format compliance', 'Passed count', 'Total cases', 
                                'BERTScore F1', 'Readability (grade)', 'Latency (seconds)']
structured_by_model = group_models_by_provider(structured_by_model)
display(structured_by_model)

print("\n" + "="*80)
print("DETAILED MODEL×PROMPT BREAKDOWN (structured prompts only):")
print("="*80)

# Create a pivot-style display grouped by model
structured_detail = df_structured.groupby(['model_name', 'prompt_name']).agg({
    'format_pass': ['mean', 'sum', 'count'],
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
}).round(3)
structured_detail.columns = ['Format compliance', 'Passed', 'Total', 
                              'BERTScore F1', 'Readability (grade)', 'Latency (seconds)']

# Sort by model name first (grouped by provider), then by format compliance within each model
structured_detail = structured_detail.reset_index()
structured_detail['provider'] = structured_detail['model_name'].apply(lambda x: 
    'Anthropic' if 'claude' in x.lower() else
    'OpenAI' if 'gpt' in x.lower() else
    'Google' if 'gemini' in x.lower() else 'Other'
)

# Sort by provider, then model name, then format compliance descending
structured_detail_sorted = structured_detail.sort_values(
    ['provider', 'model_name', 'Format compliance'], 
    ascending=[True, True, False]
).set_index(['model_name', 'prompt_name'])

# Drop the provider column from display
structured_detail_sorted = structured_detail_sorted.drop('provider', axis=1)

display(structured_detail_sorted)

print("\n" + "="*80)
print("UNSTRUCTURED PROMPT PERFORMANCE BY MODEL:")
print("="*80)

# Filter for unstructured prompts only
df_unstructured = df_success[df_success['prompt_name'].isin(['Engaging', 'Explain like I\'m 5'])]

# Group by model and prompt, excluding format_pass metrics
unstructured_detail = df_unstructured.groupby(['model_name', 'prompt_name']).agg({
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
}).round(3)

unstructured_detail.columns = ['BERTScore F1', 'Readability (grade)', 'Latency (seconds)']

# Sort by provider, then model name, then prompt name
unstructured_detail = unstructured_detail.reset_index()
unstructured_detail['provider'] = unstructured_detail['model_name'].apply(lambda x: 
    'Anthropic' if 'claude' in x.lower() else
    'OpenAI' if 'gpt' in x.lower() else
    'Google' if 'gemini' in x.lower() else 'Other'
)

unstructured_detail_sorted = unstructured_detail.sort_values(
    ['provider', 'model_name', 'prompt_name'], 
    ascending=[True, True, True]
).set_index(['model_name', 'prompt_name'])

# Drop the provider column from display
unstructured_detail_sorted = unstructured_detail_sorted.drop('provider', axis=1)

display(unstructured_detail_sorted)

print("\n" + "="*80)
print("PROMPT ENGINEERING IMPACT")
print("="*80)
print("\nPrompt averages (across all 7 models):")
prompt_overall = df_success.groupby('prompt_name').agg({
    'format_pass': ['mean', 'sum', 'count'],
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean'
}).round(3)
prompt_overall.columns = ['Format compliance', 'Passed count', 'Total cases', 
                          'BERTScore F1', 'Readability (grade)']
display(prompt_overall)

print("\n" + "="*80)
print("STRUCTURED vs UNSTRUCTURED PROMPT COMPARISON:")
print("="*80)
df_success['prompt_type'] = df_success['prompt_name'].apply(
    lambda x: 'Structured' if x in ['Default', 'With example'] else 'Unstructured'
)
prompt_type_comparison = df_success.groupby('prompt_type').agg({
    'format_pass': ['mean', 'sum', 'count'],
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean'
}).round(3)
prompt_type_comparison.columns = ['Format compliance', 'Passed count', 'Total cases',
                                   'BERTScore F1', 'Readability (grade)']
display(prompt_type_comparison)

print("\n" + "="*80)
print("TOP 10 MODEL×PROMPT COMBINATIONS (by format compliance):")
print("="*80)
top_combos = df_success.groupby(['model_name', 'prompt_name']).agg({
    'format_pass': ['mean', 'sum', 'count'],
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
}).round(3)
top_combos.columns = ['Format compliance', 'Passed', 'Total', 
                      'BERTScore F1', 'Readability (grade)', 'Latency (seconds)']
display(top_combos.sort_values('Format compliance', ascending=False).head(10))

print("\n" + "="*80)
print("BOTTOM 5 MODEL×PROMPT COMBINATIONS (structured prompts only):")
print("="*80)
bottom_structured = structured_detail.sort_values('Format compliance', ascending=True).head(5)
display(bottom_structured)

print("\n" + "="*80)
print("PROVIDER COMPARISON (structured prompts only):")
print("="*80)
df_structured['provider'] = df_structured['model_name'].apply(lambda x: 
    'Anthropic' if 'claude' in x.lower() else
    'OpenAI' if 'gpt' in x.lower() else
    'Google' if 'gemini' in x.lower() else 'Other'
)
provider_comparison = df_structured.groupby('provider').agg({
    'format_pass': ['mean', 'sum', 'count'],
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
}).round(3)
provider_comparison.columns = ['Format compliance', 'Passed count', 'Total cases',
                               'BERTScore F1', 'Readability (grade)', 'Latency (seconds)']
display(provider_comparison)

print("\n" + "="*80)
print("CLAUDE 3.5 SONNET: DEFAULT vs WITH EXAMPLE COMPARISON:")
print("="*80)
claude_comparison = df_structured[df_structured['model_name'] == 'claude-3-5-sonnet-20240620'].groupby('prompt_name').agg({
    'format_pass': ['mean', 'sum', 'count'],
    'bertscore_f1': 'mean',
    'flesch_kincaid_grade': 'mean',
    'latency_seconds': 'mean'
}).round(3)
claude_comparison.columns = ['Format compliance', 'Passed', 'Total',
                             'BERTScore F1', 'Readability (grade)', 'Latency (seconds)']
display(claude_comparison)

Overall model averages (across all 4 prompts):


Unnamed: 0_level_0,Format compliance,BERTScore F1,Readability (grade),Latency (seconds),provider
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
claude-3-5-sonnet-20240620,0.298,0.859,6.549,5.318,Anthropic
claude-3-opus-20240229,0.351,0.859,8.632,7.549,Anthropic
gemini-2.5-flash,0.49,0.869,5.414,20.224,Google
gemini-2.5-pro,0.418,0.867,5.904,19.307,Google
gpt-3.5-turbo,0.5,0.873,8.573,2.279,OpenAI
gpt-4-turbo,0.5,0.875,8.853,6.067,OpenAI
gpt-4o,0.5,0.87,7.313,4.192,OpenAI



MODEL PERFORMANCE ON STRUCTURED PROMPTS ONLY (Default + With example):

Structured prompt performance by model:


Unnamed: 0_level_0,Format compliance,Passed count,Total cases,BERTScore F1,Readability (grade),Latency (seconds),provider
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
claude-3-5-sonnet-20240620,0.602,59,98,0.847,8.281,5.207,Anthropic
claude-3-opus-20240229,0.702,73,104,0.851,9.495,8.838,Anthropic
gemini-2.5-flash,0.981,102,104,0.855,6.3,22.705,Google
gemini-2.5-pro,0.837,87,104,0.861,7.044,19.522,Google
gpt-3.5-turbo,1.0,104,104,0.85,8.478,2.718,OpenAI
gpt-4-turbo,1.0,104,104,0.855,9.672,7.456,OpenAI
gpt-4o,1.0,104,104,0.852,8.384,4.793,OpenAI



DETAILED MODEL×PROMPT BREAKDOWN (structured prompts only):


Unnamed: 0_level_0,Unnamed: 1_level_0,Format compliance,Passed,Total,BERTScore F1,Readability (grade),Latency (seconds)
model_name,prompt_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
claude-3-5-sonnet-20240620,Default,0.938,45,48,0.854,8.23,5.018
claude-3-5-sonnet-20240620,With example,0.28,14,50,0.84,8.33,5.389
claude-3-opus-20240229,Default,0.731,38,52,0.858,11.208,9.582
claude-3-opus-20240229,With example,0.673,35,52,0.844,7.782,8.094
gemini-2.5-flash,With example,1.0,52,52,0.846,5.221,22.364
gemini-2.5-flash,Default,0.962,50,52,0.865,7.379,23.045
gemini-2.5-pro,With example,1.0,52,52,0.853,6.146,20.442
gemini-2.5-pro,Default,0.673,35,52,0.868,7.941,18.603
gpt-3.5-turbo,Default,1.0,52,52,0.853,9.089,2.917
gpt-3.5-turbo,With example,1.0,52,52,0.847,7.867,2.518



UNSTRUCTURED PROMPT PERFORMANCE BY MODEL:


Unnamed: 0_level_0,Unnamed: 1_level_0,BERTScore F1,Readability (grade),Latency (seconds)
model_name,prompt_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claude-3-5-sonnet-20240620,Engaging,0.886,8.834,4.338
claude-3-5-sonnet-20240620,Explain like I'm 5,0.856,0.872,6.514
claude-3-opus-20240229,Engaging,0.889,12.962,6.849
claude-3-opus-20240229,Explain like I'm 5,0.846,2.575,5.67
gemini-2.5-flash,Engaging,0.9,8.787,16.717
gemini-2.5-flash,Explain like I'm 5,0.866,0.271,18.77
gemini-2.5-pro,Engaging,0.896,9.688,16.79
gemini-2.5-pro,Explain like I'm 5,0.852,-0.158,21.392
gpt-3.5-turbo,Engaging,0.902,12.102,1.928
gpt-3.5-turbo,Explain like I'm 5,0.89,5.234,1.755



PROMPT ENGINEERING IMPACT

Prompt averages (across all 7 models):


Unnamed: 0_level_0,Format compliance,Passed count,Total cases,BERTScore F1,Readability (grade)
prompt_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Default,0.9,324,360,0.859,9.2
Engaging,0.0,0,362,0.895,10.566
Explain like I'm 5,0.0,0,362,0.87,2.268
With example,0.854,309,362,0.847,7.277



STRUCTURED vs UNSTRUCTURED PROMPT COMPARISON:


Unnamed: 0_level_0,Format compliance,Passed count,Total cases,BERTScore F1,Readability (grade)
prompt_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Structured,0.877,633,722,0.853,8.236
Unstructured,0.0,0,724,0.882,6.417



TOP 10 MODEL×PROMPT COMBINATIONS (by format compliance):


Unnamed: 0_level_0,Unnamed: 1_level_0,Format compliance,Passed,Total,BERTScore F1,Readability (grade),Latency (seconds)
model_name,prompt_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gemini-2.5-flash,With example,1.0,52,52,0.846,5.221,22.364
gpt-3.5-turbo,With example,1.0,52,52,0.847,7.867,2.518
gpt-3.5-turbo,Default,1.0,52,52,0.853,9.089,2.917
gemini-2.5-pro,With example,1.0,52,52,0.853,6.146,20.442
gpt-4-turbo,Default,1.0,52,52,0.859,10.993,8.416
gpt-4-turbo,With example,1.0,52,52,0.85,8.351,6.496
gpt-4o,With example,1.0,52,52,0.849,7.285,4.693
gpt-4o,Default,1.0,52,52,0.856,9.483,4.892
gemini-2.5-flash,Default,0.962,50,52,0.865,7.379,23.045
claude-3-5-sonnet-20240620,Default,0.938,45,48,0.854,8.23,5.018



BOTTOM 5 MODEL×PROMPT COMBINATIONS (structured prompts only):


Unnamed: 0,model_name,prompt_name,Format compliance,Passed,Total,BERTScore F1,Readability (grade),Latency (seconds),provider
1,claude-3-5-sonnet-20240620,With example,0.28,14,50,0.84,8.33,5.389,Anthropic
3,claude-3-opus-20240229,With example,0.673,35,52,0.844,7.782,8.094,Anthropic
6,gemini-2.5-pro,Default,0.673,35,52,0.868,7.941,18.603,Google
2,claude-3-opus-20240229,Default,0.731,38,52,0.858,11.208,9.582,Anthropic
0,claude-3-5-sonnet-20240620,Default,0.938,45,48,0.854,8.23,5.018,Anthropic



PROVIDER COMPARISON (structured prompts only):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_structured['provider'] = df_structured['model_name'].apply(lambda x:


Unnamed: 0_level_0,Format compliance,Passed count,Total cases,BERTScore F1,Readability (grade),Latency (seconds)
provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Anthropic,0.653,132,202,0.849,8.906,7.076
Google,0.909,189,208,0.858,6.672,21.113
OpenAI,1.0,312,312,0.852,8.845,4.989



CLAUDE 3.5 SONNET: DEFAULT vs WITH EXAMPLE COMPARISON:


Unnamed: 0_level_0,Format compliance,Passed,Total,BERTScore F1,Readability (grade),Latency (seconds)
prompt_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Default,0.938,45,48,0.854,8.23,5.018
With example,0.28,14,50,0.84,8.33,5.389


In [29]:
df_success[(df_success['model_name'] == 'gemini-2.5-pro') & (df_success['prompt_name'] == 'Explain like I\'m 5')]['generated_summary']

19                                                                                                                                                                                                                                                                                                                                                                                             Uh oh. The car got a note.\nThe car stayed in a spot for too long.\nThis broke a rule.\nSo you must pay some money.\nYou must pay $135.\nPay by October 13.
47                                                                                                                                                                                                        This paper is for a home.\nIt was made on October 1 2025.\nPriya owns the home.\nWei lives in the home.\nThe home is at Unit 3/45 Windsor Road Kellyville.\nWei can live there for one year.\nThis starts on October 1 2025.\nEach week Wei must pay $580.\nThis mu