# TrustyAI LM-Eval Results

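This notebook fetches evaluation results from TrustyAI LM-Eval jobs (`LMEvalJob` resources, read with the `oc` CLI) and visualizes them. Running it requires `oc` on the `PATH` and a session that is logged in to the cluster where the jobs ran.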

## Models
- Full Precision (4 GPUs)
- Quantized W4A16 (1 GPU)

## Benchmarks
- ARC-Easy, HellaSwag, GSM8K, TruthfulQA MC2

In [None]:
import json, subprocess, pandas as pd, matplotlib.pyplot as plt
import seaborn as sns, numpy as np
from IPython.display import display
%matplotlib inline

## Load Results

In [None]:
def get_eval_results(job_name, namespace='private-ai-demo'):
    """Fetch the ``.status`` of an ``lmevaljob`` resource through the ``oc`` CLI.

    Args:
        job_name: Name of the ``lmevaljob`` resource to query.
        namespace: Namespace passed to ``oc -n``. Defaults to ``'private-ai-demo'``.

    Returns:
        The JSON value printed by ``oc`` (parsed with ``json.loads``), or
        ``None`` when ``oc`` cannot be started, exits with a non-zero status,
        or prints something that is not valid JSON. The reason is printed in
        every failure case so an unavailable result can be diagnosed.
    """
    cmd = ['oc', 'get', 'lmevaljob', job_name, '-n', namespace, '-o', 'jsonpath={.status}']
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as err:
        # check=True turns a non-zero exit status into this exception.
        detail = (err.stderr or '').strip() or f'exit code {err.returncode}'
        print(f"Could not fetch '{job_name}': {detail}")
        return None
    except OSError as err:
        # Covers a missing or non-executable `oc` binary.
        print(f"Could not run 'oc' for '{job_name}': {err}")
        return None

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as err:
        # Empty output also lands here, since '' is not valid JSON.
        print(f"Status of '{job_name}' is not valid JSON: {err}")
        return None

print('Fetching results from latest evaluation jobs...\n')

# Query the status of both evaluation jobs (the latest runs, covering all 4 benchmarks).
full_status = get_eval_results('eval-mistral-full')
quant_status = get_eval_results('eval-mistral-quantized')

# The 'results' entry of a status is itself a JSON-encoded string, so it is
# decoded a second time here; a missing status or entry yields None.
if full_status and 'results' in full_status:
    full_results = json.loads(full_status['results'])
else:
    full_results = None

if quant_status and 'results' in quant_status:
    quant_results = json.loads(quant_status['results'])
else:
    quant_results = None

# Report which of the two result sets could be loaded.
print(
    'Full Precision (eval-mistral-full):',
    'Available' if full_results else 'Not available',
)
print(
    'Quantized (eval-mistral-quantized):',
    'Available' if quant_results else 'Not available',
)

# List the benchmark names found in the full-precision results, if any.
if full_results and 'results' in full_results:
    print(f'Benchmarks: {", ".join(full_results["results"].keys())}')

## Compare Scores

In [None]:
# (key under 'results', metric key inside that entry, label used in tables and plots)
BENCHMARK_METRICS = (
    ('arc_easy', 'acc_norm,none', 'ARC-Easy'),
    ('hellaswag', 'acc_norm,none', 'HellaSwag'),
    ('gsm8k', 'exact_match,flexible-extract', 'GSM8K'),
    ('truthfulqa_mc2', 'acc,none', 'TruthfulQA'),
)


def extract_scores(r):
    """Turn parsed evaluation results into ``{benchmark label: score in percent}``.

    Args:
        r: Parsed results mapping with a ``'results'`` entry keyed by benchmark
            name, or a falsy value when no results are available.

    Returns:
        A dict with one entry per benchmark in ``BENCHMARK_METRICS`` whose
        metric is present and numeric, scaled by 100. Benchmarks that are
        absent, or whose metric is missing, are left out.
    """
    if not r:
        return {}
    res = r.get('results') or {}
    scores = {}
    for task, metric, label in BENCHMARK_METRICS:
        value = (res.get(task) or {}).get(metric)
        # A missing metric is skipped instead of defaulting to 0, so it is not
        # presented as a genuine 0% accuracy in the comparison.
        if isinstance(value, (int, float)):
            scores[label] = value * 100
    return scores

# Reduce each parsed result set to a {benchmark label: score} mapping.
full_scores = extract_scores(full_results)
quant_scores = extract_scores(quant_results)

if not (full_scores or quant_scores):
    print('No results available')
else:
    # One row per benchmark, one column per model.
    df = pd.DataFrame({'Full (4 GPUs)': full_scores, 'Quantized (1 GPU)': quant_scores})
    # The delta column is only added when both models produced scores.
    if full_scores and quant_scores:
        df['Delta (%)'] = df['Full (4 GPUs)'].sub(df['Quantized (1 GPU)'])
    print('\nComparison:\n')
    display(df.round(2))

## Visualization

In [None]:
# Draw the chart only when the comparison table exists and has at least one row.
if 'df' in locals() and not df.empty:
    fig, ax = plt.subplots(figsize=(10, 5))

    # Grouped bars: one group per row of the table, one bar per model column.
    df[['Full (4 GPUs)', 'Quantized (1 GPU)']].plot.bar(ax=ax, color=['#1f77b4', '#ff7f0e'])

    # Axis styling; the y-axis is pinned to the 0-100 range.
    ax.set_ylim(0, 100)
    ax.set_ylabel('Accuracy (%)', fontsize=12)
    ax.set_title('Model Quality Comparison', fontsize=14, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    ax.set_xticklabels(df.index, rotation=45, ha='right')

    # Write each bar's value above it.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f%%')

    plt.tight_layout()
    plt.show()

## Summary

In [None]:
def summarize_quantization_impact(deltas, minimal_loss=2, moderate_loss=5):
    """Build the quantization-impact report for a series of score deltas.

    Args:
        deltas: pandas Series of (full - quantized) score differences, indexed
            by benchmark label. NaN entries are ignored.
        minimal_loss: Average delta below which the quantized model is
            recommended.
        moderate_loss: Average delta below which the trade-off is reported as
            moderate; at or above it, full precision is recommended.

    Returns:
        The report as a list of lines, ready to be joined with newlines.
    """
    # Only benchmarks scored for both models have a usable delta.
    valid = deltas.dropna()
    lines = ['\nQuantization Impact:']
    if valid.empty:
        # Without this guard a NaN average fails both comparisons below and
        # falls through to the "Use Full Precision" recommendation.
        lines.append('  No benchmark has scores for both models; cannot compare')
        return lines

    avg_delta = valid.mean()
    lines.append(f'  Average Delta: {avg_delta:.2f}%')
    lines.append(f'  Max Delta: {valid.max():.2f}% ({valid.idxmax()})')
    lines.append(f'  Min Delta: {valid.min():.2f}% ({valid.idxmin()})')
    lines.append('\nRecommendation:')
    if avg_delta < minimal_loss:
        lines.append('  Use Quantized: Minimal quality loss, 75% GPU cost savings')
    elif avg_delta < moderate_loss:
        lines.append('  Evaluate per use case: Moderate quality-cost trade-off')
    else:
        lines.append('  Use Full Precision: Significant quality loss from quantization')
    return lines


# Report only when the comparison table exists and contains a delta column.
if 'df' in locals() and 'Delta (%)' in df.columns:
    print('\n'.join(summarize_quantization_impact(df['Delta (%)'])))