In [1]:
import pandas as pd
import numpy as np
# Raise pandas' 'display.max_colwidth' option to 100 so long cell text is not cut off early when frames are shown
pd.set_option('display.max_colwidth', 100)

In [2]:
# Training set: ground-truth labels plus three stored prediction runs per model
train_labels = pd.read_csv("../radnlp_2024_train_val_20240731/en/main_task/train/label.csv")
train_path = "./prediction/ensemble/train/"

# deepseek-reasoner (r1) predictions, runs 01-03
r1_train_1, r1_train_2, r1_train_3 = (
    pd.read_csv(train_path + file_name)
    for file_name in (
        "deepseek-reasoner_train_01.csv",
        "deepseek-reasoner_train_02.csv",
        "deepseek-reasoner_train_03.csv",
    )
)

# o1-mini (o1) predictions, runs 01-03
o1_train_1, o1_train_2, o1_train_3 = (
    pd.read_csv(train_path + file_name)
    for file_name in (
        "o1-mini_train_01.csv",
        "o1-mini_train_02.csv",
        "o1-mini_train_03.csv",
    )
)



In [3]:
# The working names below are plain aliases of the training frames
# (same objects; nothing is copied or concatenated).
labels = train_labels

# deepseek-reasoner (r1) runs
r1_pred_1, r1_pred_2, r1_pred_3 = r1_train_1, r1_train_2, r1_train_3

# o1-mini (o1) runs
o1_pred_1, o1_pred_2, o1_pred_3 = o1_train_1, o1_train_2, o1_train_3

In [4]:
# Assemble one wide table: ground truth plus the t/n/m predictions of all six runs,
# each run's columns prefixed with pred_<run number>_ and matched to the labels on 'id'
prediction_runs = [r1_pred_1, r1_pred_2, r1_pred_3, o1_pred_1, o1_pred_2, o1_pred_3]
analysis_df = labels.copy()
for run_number, run_df in enumerate(prediction_runs, start=1):
    run_columns = run_df.set_index('id')[['t', 'n', 'm']].add_prefix(f'pred_{run_number}_')
    analysis_df = analysis_df.join(run_columns, on='id')

# For each component, flag the samples on which every one of the six runs disagrees with the label
unanimous_miss = {}
for component in ('t', 'n', 'm'):
    truth = analysis_df[component]
    missed_by_all = analysis_df[f'pred_1_{component}'] != truth
    for run_number in range(2, 7):
        missed_by_all = missed_by_all & (analysis_df[f'pred_{run_number}_{component}'] != truth)
    unanimous_miss[component] = missed_by_all

mask_t = unanimous_miss['t']
mask_n = unanimous_miss['n']
mask_m = unanimous_miss['m']

# Keep only the id, the true label and the six predictions for the flagged samples
run_ids = range(1, 7)
errors_t = analysis_df.loc[mask_t, ['id', 't'] + [f'pred_{k}_t' for k in run_ids]]
errors_n = analysis_df.loc[mask_n, ['id', 'n'] + [f'pred_{k}_n' for k in run_ids]]
errors_m = analysis_df.loc[mask_m, ['id', 'm'] + [f'pred_{k}_m' for k in run_ids]]

# Counts first ...
for component, flagged in (('t', errors_t), ('n', errors_n), ('m', errors_m)):
    print(f"Samples with all {component} predictions wrong: {len(flagged)}")

# ... then a preview of each non-empty table (preview length differs per component)
for component, flagged, preview_rows in (('t', errors_t, 11), ('n', errors_n, 6), ('m', errors_m, 4)):
    if len(flagged) > 0:
        print(f"\nExample {component} errors:")
        print(flagged.head(preview_rows))

Samples with all t predictions wrong: 5
Samples with all n predictions wrong: 4
Samples with all m predictions wrong: 4

Example t errors:
          id    t pred_1_t pred_2_t pred_3_t pred_4_t pred_5_t pred_6_t
22   2343928   T4      T2b      T2b      T2b      T2b      T2b      T2b
44   6363776   T3       T4       T4       T4       T4       T4       T4
77  10320785   T3       T4       T4       T4       T4       T4       T0
97  15045923  T2b       T3       T3       T3       T3       T3       T3
99  15095613   T4      T2b      T2b      T2b      T2b      T2b      T2b

Example n errors:
           id   n pred_1_n pred_2_n pred_3_n pred_4_n pred_5_n pred_6_n
1      133166  N0       N1       N1       N1       N1       N1       N1
4      463397  N0       N1       N1       N1       N1       N1       N1
27    3072861  N0       N1       N1       N1       N1       N1       N1
100  15410359  N0       N2       N2       N2       N2       N2       N1

Example m errors:
          id   m pred_1_m pred_

In [5]:
# Start again from the labels and attach each run's t/n/m predictions, matched on 'id'.
# Run k contributes the columns pred_k_t, pred_k_n and pred_k_m (runs 1-3: r1, runs 4-6: o1).
all_runs = (r1_pred_1, r1_pred_2, r1_pred_3, o1_pred_1, o1_pred_2, o1_pred_3)
analysis_df = labels.copy()
for run_number, run_df in enumerate(all_runs, start=1):
    prefix = f'pred_{run_number}_'
    prefixed = run_df.set_index('id')[['t', 'n', 'm']].add_prefix(prefix)
    analysis_df = analysis_df.join(prefixed, on='id')

# Function to display component errors
def show_component_errors(component, df=None, n_models=6):
    """Return the samples on which some, but not all, prediction runs got `component` wrong.

    Parameters
    ----------
    component : str
        Name of the ground-truth column (the notebook calls this with 't', 'n' and 'm').
        Predictions are read from the columns 'pred_1_<component>' through
        'pred_<n_models>_<component>'.
    df : pandas.DataFrame, optional
        Table holding 'id', the ground-truth column and the prediction columns.
        When omitted, the notebook-level `analysis_df` is used (looked up at call time),
        which is what the function always did before this parameter existed.
    n_models : int, default 6
        Number of prediction runs stored in `df`.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', '<component>_true', 'model_1' ... 'model_<n_models>' and
        'num_wrong', limited to rows where between 1 and n_models - 1 runs are wrong,
        sorted by 'num_wrong' in descending order.
    """
    if df is None:
        # Backward-compatible default: fall back to the notebook global
        df = analysis_df

    pred_cols = [f'pred_{i}_{component}' for i in range(1, n_models + 1)]

    # Compare the truth column (as an n x 1 array) against every prediction column at once.
    # A missing prediction (NaN) compares unequal to any label and is therefore counted as wrong.
    truth = df[component].to_numpy()[:, None]
    error_count = (df[pred_cols] != truth).sum(axis=1)

    # Partial errors only: at least one run wrong, at least one run right
    mask = error_count.between(1, n_models - 1)

    # Friendlier column names for display
    renamed = {component: f'{component}_true'}
    renamed.update({col: f'model_{i}' for i, col in enumerate(pred_cols, start=1)})

    errors = (
        df.loc[mask, ['id', component] + pred_cols]
        .rename(columns=renamed)
        .assign(num_wrong=error_count[mask].to_numpy())
    )

    return errors.sort_values('num_wrong', ascending=False)

# Print one report section per component: a banner, a preview of the partial-error
# table and its total row count (or a note when the table is empty).
banner = "=" * 50
component_reports = {}
for position, (comp, preview_rows) in enumerate((('t', 10), ('n', 13), ('m', 10))):
    tag = comp.upper()

    # Every section after the first is preceded by a blank line
    print(banner if position == 0 else "\n" + banner)
    print(f"{tag} Component Errors (1-5 models wrong)")
    print(banner)

    report = show_component_errors(comp)
    component_reports[comp] = report
    if report.empty:
        print(f"No partial errors found for {comp}")
    else:
        display(report.head(preview_rows))
        print(f"Total({tag}): ", report.shape[0])

# Expose each table under its own name for later cells
t_errors = component_reports['t']
n_errors = component_reports['n']
m_errors = component_reports['m']

T Component Errors (1-5 models wrong)


Unnamed: 0,id,t_true,model_1,model_2,model_3,model_4,model_5,model_6,num_wrong
9,1185427,T1c,T2a,T2a,T2a,T2a,T1c,T2a,5
11,1679413,T2b,T4,T4,T4,T4,T4,T2b,5
12,1736655,T3,T0,T4,T4,T3,T0,T2a,5
18,2195733,T3,T2a,T3,T2a,T2a,T2a,T3,4
86,11650032,T3,T3,T3,T3,T2a,T2a,T2a,3
76,10223615,T1c,T1c,T1c,T1c,T2a,T2a,T2a,3
19,2240304,T3,T3,T3,T3,T2b,T2b,T2b,3
54,7516987,T0,T0,T0,T0,T1a,T1a,T1a,3
36,4760374,T2a,T4,T4,T4,T2a,T2a,T2a,3
47,6778649,T4,T4,T4,T4,T2a,T2a,T2a,3


Total(T):  29

N Component Errors (1-5 models wrong)


Unnamed: 0,id,n_true,model_1,model_2,model_3,model_4,model_5,model_6,num_wrong
42,6231900,N2,N2,N2,N2,N2,N3,N3,2
50,6861476,N2,N2,N2,N2,N3,N3,N2,2
77,10320785,N2,N2,N2,N2,N0,N2,N0,2
19,2240304,N2,N2,N2,N2,N2,N2,N3,1
20,2254357,N3,N3,N3,N3,N2,N3,N3,1
33,4624494,N0,N0,N0,N0,N0,N2,N0,1
43,6232953,N2,N2,N2,N2,N1,N2,N2,1
81,11079957,N0,N0,N0,N0,N0,N0,N3,1


Total(N):  8

M Component Errors (1-5 models wrong)


Unnamed: 0,id,m_true,model_1,model_2,model_3,model_4,model_5,model_6,num_wrong
100,15410359,M0,M1b,M1b,M1b,M0,M1b,M1b,5
13,1856176,M0,M0,M1a,M1a,M1a,M1a,M1a,5
83,11328933,M0,M1a,M1a,M1a,M1a,M1a,M0,5
37,4844803,M0,M1a,M0,M1a,M1a,M1a,M1a,5
38,4850448,M0,M0,M1a,M1a,M1a,M1a,M1a,5
12,1736655,M0,M1a,M0,M0,M1c,M1c,M1c,4
78,10405379,M0,M1a,M0,M0,M1a,M1a,M1a,4
11,1679413,M0,M1a,M1a,M1a,M0,M1a,M0,4
47,6778649,M0,M0,M0,M0,M1a,M1a,M1a,3
67,9175672,M0,M1b,M1b,M1b,M0,M0,M0,3


Total(M):  17


In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Expects these frames from earlier cells:
# labels, o1_pred_1, o1_pred_2, o1_pred_3, r1_pred_1, r1_pred_2, r1_pred_3

# Prediction frames, keyed by the name used as the column header of the result table
pred_dfs = {
    'o1_pred_1': o1_pred_1,
    'o1_pred_2': o1_pred_2,
    'o1_pred_3': o1_pred_3,
    'r1_pred_1': r1_pred_1,
    'r1_pred_2': r1_pred_2,
    'r1_pred_3': r1_pred_3
}

# Label columns to score
classes = ['t', 'n', 'm']

# Stand-in for a prediction that is absent for a labelled id, so that it is scored as wrong
MISSING_PREDICTION = '<missing>'

# One record per (class, prediction frame)
results = []

# Calculate accuracy for each class and each prediction frame
for cls in classes:
    for df_name, df in pred_dfs.items():
        # Pair every label with the prediction that carries the same 'id' (as the
        # error-analysis cells do) instead of assuming that the label file and the
        # prediction file list the samples in the same row order.
        aligned = labels[['id', cls]].merge(
            df[['id', cls]], on='id', how='left', suffixes=('_true', '_pred')
        )
        y_true = aligned[f'{cls}_true'].tolist()
        y_pred = aligned[f'{cls}_pred'].fillna(MISSING_PREDICTION).tolist()
        accuracy = accuracy_score(y_true, y_pred)
        results.append({
            'Class': cls,
            'DataFrame': df_name,
            'Accuracy': accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Pivot so that prediction frames become columns and classes become rows
pivot_df = results_df.pivot(index='Class', columns='DataFrame', values='Accuracy')

# Display the pivoted DataFrame
pivot_df

DataFrame,o1_pred_1,o1_pred_2,o1_pred_3,r1_pred_1,r1_pred_2,r1_pred_3
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
m,0.898148,0.833333,0.898148,0.888889,0.898148,0.87963
n,0.925926,0.935185,0.925926,0.962963,0.962963,0.962963
t,0.787037,0.851852,0.833333,0.888889,0.898148,0.861111


In [7]:
# Render the whole t_errors table (the result of show_component_errors('t')), not just a head() preview
display(t_errors)

Unnamed: 0,id,t_true,model_1,model_2,model_3,model_4,model_5,model_6,num_wrong
9,1185427,T1c,T2a,T2a,T2a,T2a,T1c,T2a,5
11,1679413,T2b,T4,T4,T4,T4,T4,T2b,5
12,1736655,T3,T0,T4,T4,T3,T0,T2a,5
18,2195733,T3,T2a,T3,T2a,T2a,T2a,T3,4
86,11650032,T3,T3,T3,T3,T2a,T2a,T2a,3
76,10223615,T1c,T1c,T1c,T1c,T2a,T2a,T2a,3
19,2240304,T3,T3,T3,T3,T2b,T2b,T2b,3
54,7516987,T0,T0,T0,T0,T1a,T1a,T1a,3
36,4760374,T2a,T4,T4,T4,T2a,T2a,T2a,3
47,6778649,T4,T4,T4,T4,T2a,T2a,T2a,3
