In [293]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import os 

def calculate_metrics(df):
    """Compute overall binary-classification metrics for one prediction frame.

    Rows whose 'pred' equals -1 are dropped before scoring. According to the
    original author's note, -1 marks responses in which the model (Llama3 in
    that note) answered neither yes nor no. This is rare (for example, once
    in 8000 prompts for dirty_Walmart-amazon), which is why such rows are
    simply omitted.

    Args:
        df: DataFrame with 'label' and 'pred' columns.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    # Bind the filtered rows to a new name instead of rebinding the parameter.
    answered = df[df['pred'] != -1]
    y_true, y_pred = answered['label'], answered['pred']
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        # zero_division=0 keeps the value scikit-learn already returns for an
        # undefined metric (0) but does so without emitting a warning.
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1': f1_score(y_true, y_pred, zero_division=0)
    }

# Directory holding one prediction CSV per dataset.
# Named PREDICTIONS_DIR rather than `dir` so the builtin dir() is not shadowed.
PREDICTIONS_DIR = 'gemini_predictions'

# Map dataset name (file name without its extension) -> predictions DataFrame.
# sorted() makes the row order of the result tables reproducible, because
# os.listdir returns entries in arbitrary order.
datasets = {}
for file_name in sorted(os.listdir(PREDICTIONS_DIR)):
    if file_name.endswith('.csv'):
        # splitext strips only the final extension, so a dataset name that
        # itself contains a dot is kept intact.
        dataset_name = os.path.splitext(file_name)[0]
        datasets[dataset_name] = pd.read_csv(os.path.join(PREDICTIONS_DIR, file_name))

# One metrics dict per dataset; after the transpose each dataset is a row.
results = {name: calculate_metrics(frame) for name, frame in datasets.items()}

df_all = pd.DataFrame(results).T
df_all.style.background_gradient(cmap='RdYlGn', axis=1)

Unnamed: 0,Accuracy,Precision,Recall,F1
structured_Beer,0.928375,0.916667,0.589286,0.717391
dirty_iTunes-Amazon,0.71789,0.462687,0.861111,0.601942
structured_Fodors-Zagats,0.948413,0.728972,0.886364,0.8
structured_iTunes-Amazon,0.642202,0.398305,0.87037,0.546512


In [294]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(df):
    """Score predictions separately for every prompt-type combination.

    Rows whose 'pred' equals -1 are excluded. The remaining rows are grouped
    by the ('general_or_domain', 'simple_or_complex') column pair and each
    group is scored on its own.

    Args:
        df: DataFrame with 'label', 'pred', 'general_or_domain' and
            'simple_or_complex' columns.

    Returns:
        dict mapping each (general_or_domain, simple_or_complex) group key to
        a dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    answered = df.loc[df['pred'] != -1]
    grouping = ['general_or_domain', 'simple_or_complex']

    per_group = {}
    for group_key, rows in answered.groupby(grouping):
        y_true, y_pred = rows['label'], rows['pred']
        per_group[group_key] = dict(
            Accuracy=accuracy_score(y_true, y_pred),
            Precision=precision_score(y_true, y_pred, zero_division=0),
            Recall=recall_score(y_true, y_pred, zero_division=0),
            F1=f1_score(y_true, y_pred, zero_division=0),
        )
    return per_group

# Flatten the per-dataset metrics into a single dict keyed by
# (dataset, general_or_domain, simple_or_complex) tuples.
results = {}
for name, df in datasets.items():
    for group_key, group_metrics in calculate_metrics(df).items():
        results[(name,) + group_key] = group_metrics

# The tuple keys become a three-level row index after the transpose.
df_all = pd.DataFrame(results).T
df_all.index.names = ['Dataset', 'GeneralOrDomain', 'SimpleOrComplex']
df_all.style.background_gradient(
    subset=['Accuracy', 'Precision', 'Recall', 'F1'],
    cmap='RdYlGn',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Accuracy,Precision,Recall,F1
Dataset,GeneralOrDomain,SimpleOrComplex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
structured_Beer,domain,complex,0.945055,0.909091,0.714286,0.8
structured_Beer,domain,simple,0.912088,1.0,0.428571,0.6
structured_Beer,general,complex,0.945055,0.909091,0.714286,0.8
structured_Beer,general,simple,0.911111,0.875,0.5,0.636364
dirty_iTunes-Amazon,domain,complex,0.633028,0.402985,1.0,0.574468
dirty_iTunes-Amazon,domain,simple,0.926606,0.952381,0.740741,0.833333
dirty_iTunes-Amazon,general,complex,0.449541,0.305882,0.962963,0.464286
dirty_iTunes-Amazon,general,simple,0.862385,0.714286,0.740741,0.727273
structured_Fodors-Zagats,domain,complex,0.968254,0.807692,0.954545,0.875
structured_Fodors-Zagats,domain,simple,0.962963,1.0,0.681818,0.810811


In [295]:
def calculate_metrics(df):
    """Compute F1 per value of each prompt-type column, taken one at a time.

    Rows whose 'pred' equals -1 are excluded. The remaining rows are grouped
    first by 'general_or_domain' and then by 'simple_or_complex', and F1 is
    computed for every group.

    Args:
        df: DataFrame with 'label', 'pred', 'general_or_domain' and
            'simple_or_complex' columns.

    Returns:
        dict mapping each group value to {'F1': score}. Both groupings write
        into the same dict, so a value that occurred in both columns would be
        overwritten by the second grouping.
    """
    answered = df[df['pred'] != -1]
    metrics = {}
    for category in ['general_or_domain', 'simple_or_complex']:
        for name, group in answered.groupby(category):
            metrics[name] = {
                # zero_division=0 keeps the value scikit-learn already returns
                # for an undefined F1 (0) without emitting a warning.
                'F1': f1_score(group['label'], group['pred'], zero_division=0)
            }
    return metrics

# Group value looked up in the metrics dict -> column label in the table.
f1_columns = {
    'domain': 'Domain F1',
    'general': 'General F1',
    'simple': 'Simple F1',
    'complex': 'Complex F1',
}

# One row per dataset: its name followed by the F1 of each group.
# A group missing from the metrics yields None.
results = []
for name, df in datasets.items():
    metrics = calculate_metrics(df)
    scores = tuple(metrics.get(group, {}).get('F1', None) for group in f1_columns)
    results.append((name,) + scores)

score_columns = list(f1_columns.values())
df_all = pd.DataFrame(results, columns=['Dataset'] + score_columns)
df_all = df_all.set_index('Dataset')
df_all.style.background_gradient(subset=score_columns, cmap='RdYlGn', axis=1)



Unnamed: 0_level_0,Domain F1,General F1,Simple F1,Complex F1
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
structured_Beer,0.711111,0.723404,0.619048,0.8
dirty_iTunes-Amazon,0.661972,0.550898,0.776699,0.514563
structured_Fodors-Zagats,0.847059,0.763636,0.864198,0.754386
structured_iTunes-Amazon,0.619718,0.49505,0.636364,0.490566


In [296]:
def calculate_metrics(df):
    """Compute F1 for every prompt-type combination, keyed by a joined name.

    Rows whose 'pred' equals -1 are excluded. The remaining rows are grouped
    by the ('general_or_domain', 'simple_or_complex') column pair.

    Args:
        df: DataFrame with 'label', 'pred', 'general_or_domain' and
            'simple_or_complex' columns.

    Returns:
        dict mapping '<general_or_domain>_<simple_or_complex>' to
        {'F1': score}.
    """
    answered = df[df['pred'] != -1]
    metrics = {}
    grouped = answered.groupby(['general_or_domain', 'simple_or_complex'])
    for (general_or_domain, simple_or_complex), group in grouped:
        name = f'{general_or_domain}_{simple_or_complex}'
        metrics[name] = {
            # zero_division=0 keeps the value scikit-learn already returns
            # for an undefined F1 (0) without emitting a warning.
            'F1': f1_score(group['label'], group['pred'], zero_division=0)
        }
    return metrics

# Combined '<general_or_domain>_<simple_or_complex>' key -> column label.
f1_columns = {
    'domain_simple': 'Domain Simple F1',
    'domain_complex': 'Domain Complex F1',
    'general_simple': 'General Simple F1',
    'general_complex': 'General Complex F1',
}

# One row per dataset: its name followed by the F1 of each combination.
# A combination missing from the metrics yields None.
results = []
for name, df in datasets.items():
    metrics = calculate_metrics(df)
    scores = tuple(metrics.get(key, {}).get('F1', None) for key in f1_columns)
    results.append((name,) + scores)

score_columns = list(f1_columns.values())
df_all = pd.DataFrame(results, columns=['Dataset'] + score_columns)
# Row-wise mean over the four combination scores.
df_all['Mean F1'] = df_all[score_columns].mean(axis=1)
df_all = df_all.set_index('Dataset')
df_all.style.background_gradient(subset=score_columns + ['Mean F1'], cmap='RdYlGn', axis=1)

Unnamed: 0_level_0,Domain Simple F1,Domain Complex F1,General Simple F1,General Complex F1,Mean F1
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
structured_Beer,0.6,0.8,0.636364,0.8,0.709091
dirty_iTunes-Amazon,0.833333,0.574468,0.727273,0.464286,0.64984
structured_Fodors-Zagats,0.810811,0.875,0.909091,0.666667,0.815392
structured_iTunes-Amazon,0.734694,0.55914,0.578313,0.436975,0.57728
