In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import os 

def calculate_metrics(df):
    """
    Compute classification metrics for one dataset's predictions.

    Llama3 sometimes answers with neither "yes" nor "no"; those rows are
    encoded as ``pred == -1``. This is rare (for example, in
    dirty_Walmart-Amazon it happened a single time out of 8000 prompts),
    so such rows are dropped before scoring.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column (passed to sklearn as the true
        labels) and a ``pred`` column (passed as the predictions).

    Returns
    -------
    dict
        The keys 'Accuracy', 'Precision', 'Recall' and 'F1', each mapped to
        the corresponding scikit-learn score.
    """
    answered = df[df['pred'] != -1]
    y_true, y_pred = answered['label'], answered['pred']
    # zero_division=0 yields the same 0.0 that sklearn's default returns for
    # an undefined metric, but without emitting UndefinedMetricWarning.
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1': f1_score(y_true, y_pred, zero_division=0)
    }

# Names of the benchmark datasets; each one maps to '<name>.csv' on disk
dataset_names = [
    'dirty_DBLP-ACM',
    'dirty_iTunes-Amazon',
    'dirty_Walmart-Amazon',
    'structured_DBLP-ACM',
    'structured_iTunes-Amazon',
    'structured_Walmart-Amazon',
    'structured_Beer',
    'structured_Fodors-Zagats',
    'structured_Amazon-Google',
    'textual_Abt-Buy'
]

# Load every prediction file that exists; a missing file is reported and skipped
datasets = {}
for name in dataset_names:
    csv_path = os.path.join('llama3_predictions', f'{name}.csv')
    try:
        datasets[name] = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"File {name}.csv not found in the directory 'llama3_predictions'.")

# Score each dataset that was loaded
results = {}
for name, frame in datasets.items():
    results[name] = calculate_metrics(frame)

# One row per dataset, one column per metric; colour each row with a gradient
metrics_by_dataset = pd.DataFrame(results)
df_all = metrics_by_dataset.transpose()
df_all.style.background_gradient(cmap='RdYlGn', axis=1)

  from pandas.core import (


Unnamed: 0,Accuracy,Precision,Recall,F1
dirty_DBLP-ACM,0.48308,0.236651,0.847285,0.369968
dirty_iTunes-Amazon,0.373853,0.270195,0.898148,0.415418
dirty_Walmart-Amazon,0.635387,0.185585,0.84715,0.304469
structured_DBLP-ACM,0.438621,0.232828,0.931445,0.372536
structured_iTunes-Amazon,0.40367,0.27381,0.851852,0.414414
structured_Walmart-Amazon,0.565955,0.16774,0.910622,0.283296
structured_Beer,0.708791,0.34375,0.982143,0.509259
structured_Fodors-Zagats,0.748677,0.294355,0.829545,0.434524
structured_Amazon-Google,0.679463,0.203083,0.731838,0.317939
textual_Abt-Buy,0.716077,0.260376,0.891859,0.403075


In [46]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(df):
    """
    Compute classification metrics per prompt-category combination.

    Rows with ``pred == -1`` are dropped, then the remaining rows are grouped
    by the ('general_or_domain', 'simple_or_complex') column pair and every
    group is scored on its own.

    Returns
    -------
    dict
        Maps each (general_or_domain, simple_or_complex) group key to a dict
        with the keys 'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    answered = df[df['pred'] != -1]
    grouped = answered.groupby(['general_or_domain', 'simple_or_complex'])

    per_group = {}
    for group_key, rows in grouped:
        y_true, y_pred = rows['label'], rows['pred']
        scores = {}
        scores['Accuracy'] = accuracy_score(y_true, y_pred)
        scores['Precision'] = precision_score(y_true, y_pred, zero_division=0)
        scores['Recall'] = recall_score(y_true, y_pred, zero_division=0)
        scores['F1'] = f1_score(y_true, y_pred, zero_division=0)
        per_group[group_key] = scores
    return per_group

# Flatten to {(dataset, general_or_domain, simple_or_complex): metric dict}
results = {}
for name, df in datasets.items():
    for group_key, scores in calculate_metrics(df).items():
        results[(name,) + group_key] = scores

# One row per (dataset, category combination), one column per metric
df_all = pd.DataFrame(results).transpose()
df_all.index.names = ['Dataset', 'GeneralOrDomain', 'SimpleOrComplex']
df_all.style.background_gradient(
    subset=['Accuracy', 'Precision', 'Recall', 'F1'],
    cmap='RdYlGn',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Accuracy,Precision,Recall,F1
Dataset,GeneralOrDomain,SimpleOrComplex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dirty_DBLP_ACM,domain,complex,0.720469,0.373073,0.817568,0.51235
dirty_DBLP_ACM,domain,simple,0.396426,0.208031,0.849658,0.334229
dirty_DBLP_ACM,general,complex,0.452891,0.238643,0.934685,0.380211
dirty_DBLP_ACM,general,simple,0.361754,0.190137,0.786848,0.306267
dirty_iTunes_Amazon,domain,complex,0.449541,0.301205,0.925926,0.454545
dirty_iTunes_Amazon,domain,simple,0.46789,0.298701,0.851852,0.442308
dirty_iTunes_Amazon,general,complex,0.256881,0.25,1.0,0.4
dirty_iTunes_Amazon,general,simple,0.321101,0.241758,0.814815,0.372881
dirty_Walmart_Amazon,domain,complex,0.712055,0.236388,0.92228,0.376321
dirty_Walmart_Amazon,domain,simple,0.776964,0.265957,0.777202,0.396301


In [59]:
def calculate_metrics(df):
    """
    Compute the F1 score for each value of the two prompt-category columns.

    Rows with ``pred == -1`` are dropped first. The remaining rows are then
    grouped once by 'general_or_domain' and once by 'simple_or_complex', and
    every group is scored on its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'label', 'pred', 'general_or_domain' and
        'simple_or_complex'.

    Returns
    -------
    dict
        Maps each group value to ``{'F1': score}``. Both category columns
        write into the same dict, so a value that occurred in both columns
        would keep only the 'simple_or_complex' entry.
    """
    answered = df[df['pred'] != -1]
    metrics = {}
    for category in ['general_or_domain', 'simple_or_complex']:
        for name, group in answered.groupby(category):
            metrics[name] = {
                # zero_division=0 returns the same 0.0 as sklearn's default for
                # an undefined F1, but without emitting UndefinedMetricWarning.
                'F1': f1_score(group['label'], group['pred'], zero_division=0)
            }
    return metrics

# Group value produced by calculate_metrics -> column title in the table
category_to_column = {
    'domain': 'Domain F1',
    'general': 'General F1',
    'simple': 'Simple F1',
    'complex': 'Complex F1',
}
f1_columns = list(category_to_column.values())

# One tuple per dataset: (name, F1 per category); None where a category is absent
results = []
for name, df in datasets.items():
    metrics = calculate_metrics(df)
    f1_values = tuple(metrics.get(category, {}).get('F1') for category in category_to_column)
    results.append((name,) + f1_values)

df_all = pd.DataFrame(results, columns=['Dataset'] + f1_columns).set_index('Dataset')
df_all.style.background_gradient(subset=f1_columns, cmap='RdYlGn', axis=1)

Unnamed: 0_level_0,Domain F1,General F1,Simple F1,Complex F1
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dirty_DBLP_ACM,0.403398,0.342549,0.320142,0.432222
dirty_iTunes_Amazon,0.448598,0.387352,0.405405,0.42449
dirty_Walmart_Amazon,0.385203,0.251446,0.272205,0.340384
structured_DBLP_ACM,0.419464,0.336344,0.326837,0.430256
structured_iTunes_Amazon,0.481675,0.363636,0.38009,0.44843
structured_Walmart_Amazon,0.393201,0.222639,0.269698,0.297678
structured_Beer,0.5,0.518519,0.355263,0.875
structured_Fodors_Zagats,0.526316,0.358696,0.284553,0.844444
structured_Amazon_Google,0.324247,0.311993,0.312444,0.323855
textual_Abt_Buy,0.475946,0.348536,0.402286,0.403805


In [55]:
def calculate_metrics(df):
    """
    Compute the F1 score for each combination of the two prompt categories.

    Rows with ``pred == -1`` are dropped first. The remaining rows are grouped
    by the ('general_or_domain', 'simple_or_complex') column pair and every
    group is scored on its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'label', 'pred', 'general_or_domain' and
        'simple_or_complex'.

    Returns
    -------
    dict
        Maps '<general_or_domain>_<simple_or_complex>' (for example
        'domain_simple') to ``{'F1': score}``.
    """
    answered = df[df['pred'] != -1]
    metrics = {}
    for (general_or_domain, simple_or_complex), group in answered.groupby(['general_or_domain', 'simple_or_complex']):
        name = f'{general_or_domain}_{simple_or_complex}'
        metrics[name] = {
            # zero_division=0 returns the same 0.0 as sklearn's default for an
            # undefined F1, but without emitting UndefinedMetricWarning.
            'F1': f1_score(group['label'], group['pred'], zero_division=0)
        }
    return metrics

# Combination key produced by calculate_metrics -> column title in the table
combo_to_column = {
    'domain_simple': 'Domain Simple F1',
    'domain_complex': 'Domain Complex F1',
    'general_simple': 'General Simple F1',
    'general_complex': 'General Complex F1',
}
f1_columns = list(combo_to_column.values())

# One tuple per dataset: (name, F1 per combination); None where a combination is absent
results = []
for name, df in datasets.items():
    metrics = calculate_metrics(df)
    f1_values = tuple(metrics.get(combo, {}).get('F1') for combo in combo_to_column)
    results.append((name,) + f1_values)

df_all = pd.DataFrame(results, columns=['Dataset'] + f1_columns)
# Row-wise average over the four combination scores
df_all['Mean F1'] = df_all[f1_columns].mean(axis=1)
df_all = df_all.set_index('Dataset')
df_all.style.background_gradient(subset=f1_columns + ['Mean F1'], cmap='RdYlGn', axis=1)

Unnamed: 0_level_0,Domain Simple F1,Domain Complex F1,General Simple F1,General Complex F1,Mean F1
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dirty_DBLP_ACM,0.334229,0.51235,0.306267,0.380211,0.383264
dirty_iTunes_Amazon,0.442308,0.454545,0.372881,0.4,0.417434
dirty_Walmart_Amazon,0.396301,0.376321,0.209827,0.309108,0.322889
structured_DBLP_ACM,0.330146,0.568846,0.323681,0.34931,0.392995
structured_iTunes_Amazon,0.44,0.527473,0.330579,0.393939,0.422998
structured_Walmart_Amazon,0.43832,0.358923,0.197876,0.254081,0.3123
structured_Beer,0.337662,0.903226,0.373333,0.848485,0.615677
structured_Fodors_Zagats,0.38835,0.816327,0.20979,0.878049,0.573129
structured_Amazon_Google,0.383167,0.279461,0.264463,0.383315,0.327601
textual_Abt_Buy,0.570978,0.410811,0.306452,0.397104,0.421336
