# KEGG Evaluation Metrics Analysis

This notebook loads the most recent evaluation results file from the `eval_results` directory and calculates per-class metrics: F1 score, precision, recall, frequency, and accuracy.


In [1]:
import pandas as pd
import json
import os
import glob
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


In [3]:
def load_eval_results(results_dir: str = "eval_results"):
    """
    Load the most recently modified evaluation results file.

    Searches ``results_dir`` for files matching
    ``*_kegg_eval_results_*.json``, selects the newest one and parses it
    as JSON.

    Args:
        results_dir: Directory containing evaluation results

    Returns:
        The parsed JSON content of the newest results file (the rest of
        this notebook treats it as a list of per-example result dicts)

    Raises:
        FileNotFoundError: If no matching result file exists in ``results_dir``
    """
    # Find all JSON result files
    json_files = glob.glob(os.path.join(results_dir, "*_kegg_eval_results_*.json"))

    if not json_files:
        raise FileNotFoundError(f"No evaluation result files found in {results_dir}")

    # Pick the newest file by modification time. getctime is avoided because
    # on Unix it reports the inode-change time (updated by chmod, rename, ...),
    # not when the content was written. The path is the tie-breaker so the
    # choice does not depend on the arbitrary order glob returns.
    latest_file = max(json_files, key=lambda path: (os.path.getmtime(path), path))
    print(f"Loading results from: {latest_file}")

    # JSON is UTF-8 by specification; do not rely on the platform default
    with open(latest_file, 'r', encoding='utf-8') as f:
        results = json.load(f)

    print(f"Loaded {len(results)} evaluation results")
    return results


In [74]:
def _match_predicted_disease(predicted_answer: str, ground_truth: str, candidates: list) -> str:
    """Map a free-text model answer onto one of the known disease labels.

    Args:
        predicted_answer: Raw answer text produced by the model
        ground_truth: True label of the example
        candidates: Known labels, in the order they should be tried

    Returns:
        ``ground_truth`` if it occurs (case-insensitively) in the answer,
        otherwise the first candidate that occurs in the answer,
        otherwise the sentinel string ``"no"``.
    """
    answer = predicted_answer.lower()

    # Lenient by design: an answer that mentions the true label anywhere is
    # credited with it. Empty labels are skipped because "" is a substring of
    # every string and would otherwise always match.
    if ground_truth and ground_truth.lower() in answer:
        return ground_truth

    for disease in candidates:
        if disease and disease.lower() in answer:
            return disease
    return "no"


def calculate_per_class_metrics(results: list) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Calculate per-class (one-vs-rest) metrics from evaluation results.

    Each result's ``predicted_answer`` is mapped to a disease label by
    case-insensitive substring matching against the set of ground-truth
    labels; answers that mention no known label are assigned ``"no"``.
    For every ground-truth label, binary true/predicted vectors over all
    examples are scored with accuracy, precision, recall and F1.

    Args:
        results: List of evaluation result dicts. The keys read are
            ``question``, ``predicted_answer``, ``ground_truth``,
            ``is_correct`` and ``generated_text``; missing keys fall back
            to ``""`` (``False`` for ``is_correct``).

    Returns:
        A ``(df, results_df)`` tuple:
            * ``df`` - one row per example, with an added
              ``predicted_disease`` column holding the matched label
            * ``results_df`` - one row per ground-truth label with columns
              ``Disease`` (lower-cased), ``Frequency``, ``Accuracy``,
              ``Precision``, ``Recall`` and ``F1-Score`` (rounded to 3
              decimals), sorted by ``Frequency`` descending. Empty when
              ``results`` is empty.
    """
    example_columns = [
        "example_id", "question", "predicted_answer",
        "ground_truth", "is_correct", "generated_text",
    ]
    metric_columns = ['Disease', 'Frequency', 'Accuracy', 'Precision', 'Recall', 'F1-Score']

    def as_text(value) -> str:
        # A JSON null (or a non-string label) must not break the .lower() matching
        return "" if value is None else str(value)

    # Convert results to DataFrame
    df_data = []
    for i, result in enumerate(results):
        df_data.append({
            "example_id": i,
            "question": result.get("question", ""),
            "predicted_answer": as_text(result.get("predicted_answer", "")),
            "ground_truth": as_text(result.get("ground_truth", "")),
            "is_correct": result.get("is_correct", False),
            "generated_text": result.get("generated_text", "")
        })

    # Explicit columns keep the frame well-formed even when there are no results
    df = pd.DataFrame(df_data, columns=example_columns)

    all_disease = df['ground_truth'].unique().tolist()
    # Try longer labels first so that a specific name such as
    # "papillary renal cell carcinoma" is not shadowed by a label that is a
    # substring of it ("renal cell carcinoma"). sorted() is stable, so labels
    # of equal length keep their first-appearance order.
    candidates = sorted(all_disease, key=len, reverse=True)

    df['predicted_disease'] = [
        _match_predicted_disease(answer, truth, candidates)
        for answer, truth in zip(df['predicted_answer'], df['ground_truth'])
    ]

    # Get disease frequency
    disease_counts = df['ground_truth'].value_counts()

    # Calculate per-disease metrics
    results_list = []

    for disease, frequency in disease_counts.items():
        # Binary one-vs-rest labels for this disease over all examples
        true_label = (df['ground_truth'] == disease).astype(int)
        pred_label = (df['predicted_disease'] == disease).astype(int)

        results_list.append({
            'Disease': disease,
            'Frequency': frequency,
            'Accuracy': accuracy_score(true_label, pred_label),
            'Precision': precision_score(true_label, pred_label, zero_division=0),
            'Recall': recall_score(true_label, pred_label, zero_division=0),
            'F1-Score': f1_score(true_label, pred_label, zero_division=0)
        })

    results_df = pd.DataFrame(results_list, columns=metric_columns)
    if results_df.empty:
        # Nothing to sort or format; return the empty but correctly-shaped table
        return df, results_df

    # Sort by frequency
    results_df = results_df.sort_values('Frequency', ascending=False)

    # Format the table nicely
    results_df['Disease'] = results_df['Disease'].str.lower()
    score_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    results_df[score_columns] = results_df[score_columns].round(3)

    return df, results_df


In [75]:
# Read the newest results file, then build the per-example and per-class tables
results = load_eval_results("eval_results")
df, metrics_df = calculate_per_class_metrics(results)

# Headline numbers for the whole run, based on the evaluator's is_correct flag
overall_accuracy = df['is_correct'].mean()
total_examples = len(df)
correct_predictions = df['is_correct'].sum()
disease_class_count = len(metrics_df)

print(f"Overall Accuracy: {overall_accuracy:.4f}")
print(f"Total Examples: {total_examples}")
print(f"Correct Predictions: {correct_predictions}")
print(f"Number of Disease Classes: {disease_class_count}")
print("\n" + "=" * 80)


Loading results from: eval_results/dna_vllm_kegg_eval_results_20251016_213918.json
Loaded 290 evaluation results
Overall Accuracy: 0.9414
Total Examples: 290
Correct Predictions: 273
Number of Disease Classes: 33



In [76]:
# Display per-class metrics.
# metrics_df was sorted by 'Frequency' (descending) when it was built, so the
# most common diseases come first. The bare expression on the last line is
# what makes the notebook render the table as the cell's output.
print("Per-Class Metrics (sorted by frequency):")
metrics_df


Per-Class Metrics (sorted by frequency):


Unnamed: 0,Disease,Frequency,Accuracy,Precision,Recall,F1-Score
0,parkinsons disease,47,0.997,0.979,1.0,0.989
1,alzheimers disease,40,1.0,1.0,1.0,1.0
2,spinocerebellar ataxia,36,1.0,1.0,1.0,1.0
3,amyotrophic lateral sclerosis,35,0.993,1.0,0.943,0.971
4,melanoma,17,0.99,0.889,0.941,0.914
5,prion disease,15,0.997,0.938,1.0,0.968
6,colorectal cancer,12,0.983,0.769,0.833,0.8
7,huntingtons disease,10,1.0,1.0,1.0,1.0
8,gaucher disease,7,1.0,1.0,1.0,1.0
9,acute myeloid leukemia,7,1.0,1.0,1.0,1.0


In [77]:
# Inspect the examples that the evaluator flagged as wrong
print("Incorrect Predictions:")
incorrect_df = df[df['is_correct'].eq(False)]
incorrect_count = len(incorrect_df)
print(f"Number of incorrect predictions: {incorrect_count}")

if incorrect_count:
    # Preview up to ten misses next to their true labels
    display_cols = ['example_id', 'ground_truth', 'predicted_answer', 'is_correct']
    print("\nFirst 10 incorrect predictions:")
    print(incorrect_df.loc[:, display_cols].head(10))

    # Tally the raw answers given on the missed examples
    print("\nPrediction distribution for incorrect cases:")
    print(incorrect_df['predicted_answer'].value_counts())


Incorrect Predictions:
Number of incorrect predictions: 17

First 10 incorrect predictions:
     example_id                      ground_truth  \
36           36         creutzfeldt-jakob disease   
60           60     amyotrophic lateral sclerosis   
70           70                 colorectal cancer   
80           80  pancreatic ductal adenocarcinoma   
86           86                  robinow syndrome   
90           90                 colorectal cancer   
108         108                          melanoma   
109         109     amyotrophic lateral sclerosis   
124         124                    thyroid cancer   
129         129                    thyroid cancer   

                                      predicted_answer  is_correct  
36                                       prion disease       False  
60         charcot-marie-tooth disease type 2x (cmt2x)       False  
70                                       breast cancer       False  
80                                            me

In [79]:
# List every class whose F1 falls short of 1.0, worst first
print("Diseases with lowest F1-Score:")
below_perfect = metrics_df['F1-Score'].lt(1.0)
low_f1 = metrics_df.loc[below_perfect].sort_values('F1-Score')
if low_f1.empty:
    print("All diseases have perfect F1-Score!")
else:
    print(low_f1[['Disease', 'Frequency', 'F1-Score', 'Precision', 'Recall']])


Diseases with lowest F1-Score:
                                      Disease  Frequency  F1-Score  Precision  \
30                           robinow syndrome          1     0.000      0.000   
31           pancreatic ductal adenocarcinoma          1     0.000      0.000   
32  methylmalonic aciduria and homocystinuria          1     0.000      0.000   
17                             thyroid cancer          4     0.000      0.000   
22                  creutzfeldt-jakob disease          2     0.667      1.000   
19                       basal cell carcinoma          3     0.667      0.667   
6                           colorectal cancer         12     0.800      0.769   
23                   hepatocellular carcinoma          2     0.800      0.667   
14                       renal cell carcinoma          4     0.857      1.000   
13                           cushing syndrome          4     0.857      1.000   
15                       urothelial carcinoma          4     0.857      1.000 

In [80]:
# Classes backed by at most three examples: their scores rest on very little
# data (potential data imbalance issues)
print("Diseases with lowest frequency (potential data imbalance):")
is_rare = metrics_df['Frequency'].le(3)
low_freq = metrics_df.loc[is_rare].sort_values('Frequency')
if low_freq.empty:
    print("All diseases have sufficient frequency!")
else:
    print(low_freq[['Disease', 'Frequency', 'F1-Score', 'Precision', 'Recall']])


Diseases with lowest frequency (potential data imbalance):
                                         Disease  Frequency  F1-Score  \
30                              robinow syndrome          1     0.000   
31              pancreatic ductal adenocarcinoma          1     0.000   
28                               prostate cancer          1     1.000   
32     methylmalonic aciduria and homocystinuria          1     0.000   
29           multiple endocrine neoplasia type 1          1     1.000   
21                     thyroid dyshormonogenesis          2     1.000   
23                      hepatocellular carcinoma          2     0.800   
24  adenine phosphoribosyltransferase deficiency          2     1.000   
22                     creutzfeldt-jakob disease          2     0.667   
27                papillary renal cell carcinoma          2     1.000   
26         n-acetylglutamate synthase deficiency          2     1.000   
25                          lesch-nyhan syndrome          2     1

In [82]:
# Aggregate the per-class scores (unweighted: every class counts equally)
print("Summary Statistics:")
f1_scores = metrics_df['F1-Score']
print(f"Mean F1-Score: {f1_scores.mean():.3f}")
print(f"Median F1-Score: {f1_scores.median():.3f}")
print(f"Min F1-Score: {f1_scores.min():.3f}")
print(f"Max F1-Score: {f1_scores.max():.3f}")
print(f"Std F1-Score: {f1_scores.std():.3f}")

mean_precision = metrics_df['Precision'].mean()
mean_recall = metrics_df['Recall'].mean()
print(f"\nMean Precision: {mean_precision:.3f}")
print(f"Mean Recall: {mean_recall:.3f}")

# How many classes reach an F1 of exactly 1.0, and what share of all classes
perfect_f1 = f1_scores.eq(1.0).sum()
class_total = len(metrics_df)
print(f"\nDiseases with perfect F1-Score: {perfect_f1}/{class_total} ({perfect_f1 / class_total * 100:.1f}%)")


Summary Statistics:
Mean F1-Score: 0.829
Median F1-Score: 1.000
Min F1-Score: 0.000
Max F1-Score: 1.000
Std F1-Score: 0.326

Mean Precision: 0.846
Mean Recall: 0.822

Diseases with perfect F1-Score: 18/33 (54.5%)


In [83]:
# Optional: persist both tables as CSV in the working directory,
# leaving the DataFrame index out of the files
output_file = "kegg_eval_analysis_results.csv"
detailed_file = "kegg_eval_detailed_results.csv"

# Per-class metrics table
metrics_df.to_csv(output_file, index=False)
print(f"Analysis results saved to: {output_file}")

# Per-example table
df.to_csv(detailed_file, index=False)
print(f"Detailed results saved to: {detailed_file}")


Analysis results saved to: kegg_eval_analysis_results.csv
Detailed results saved to: kegg_eval_detailed_results.csv
