In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [2]:
# Benchmark scores in wide form: one row per benchmark, one column per model.
# Each list is positionally aligned with the 'Benchmark' list.
data = {
    'Benchmark': [
        "AIME 2024 (Pass@1)",
        "Codeforces (Percentile)",
        "GPQA Diamond (Pass@1)",  # was misspelled "GPOA" and showed up on every chart
        "MATH-500 (Pass@1)",
        "MMLU (Pass@1)",
        "SWE-bench Verified (Resolved)",
    ],
    'DeepSeek-R1': [79.8, 96.3, 71.5, 97.3, 90.8, 49.2],
    'OpenAI-o1-1217': [79.2, 96.6, 75.7, 96.4, 91.8, 48.9],
    'DeepSeek-R1-32B': [72.6, 90.6, 62.1, 94.3, 87.4, 41.6],
    'OpenAI-o1-mini': [63.6, 93.4, 60.0, 90.0, 85.2, 36.8],
    'DeepSeek-V3': [39.2, 58.7, 59.1, 90.2, 88.5, 42.0],
}

df = pd.DataFrame(data)

df

Unnamed: 0,Benchmark,DeepSeek-R1,OpenAI-o1-1217,DeepSeek-R1-32B,OpenAI-o1-mini,DeepSeek-V3
0,AIME 2024 (Pass@1),79.8,79.2,72.6,63.6,39.2
1,Codeforces (Percentile),96.3,96.6,90.6,93.4,58.7
2,GPOA Diamond (Pass@1),71.5,75.7,62.1,60.0,59.1
3,MATH-500 (Pass@1),97.3,96.4,94.3,90.0,90.2
4,MMLU (Pass@1),90.8,91.8,87.4,85.2,88.5
5,SWE-bench Verified (Resolved),49.2,48.9,41.6,36.8,42.0


In [3]:
# Reshape to long form: one row per (benchmark, model) pair with its score,
# which is the layout the plotly express cells below consume.
# The frame is rebuilt from `data` instead of melting `df` in place so the
# cell can be re-run safely: melting the already-long `df` a second time
# fails because the 'Score' value column already exists.
df = pd.DataFrame(data).melt(id_vars='Benchmark', var_name='Model', value_name='Score')
df

Unnamed: 0,Benchmark,Model,Score
0,AIME 2024 (Pass@1),DeepSeek-R1,79.8
1,Codeforces (Percentile),DeepSeek-R1,96.3
2,GPOA Diamond (Pass@1),DeepSeek-R1,71.5
3,MATH-500 (Pass@1),DeepSeek-R1,97.3
4,MMLU (Pass@1),DeepSeek-R1,90.8
5,SWE-bench Verified (Resolved),DeepSeek-R1,49.2
6,AIME 2024 (Pass@1),OpenAI-o1-1217,79.2
7,Codeforces (Percentile),OpenAI-o1-1217,96.6
8,GPOA Diamond (Pass@1),OpenAI-o1-1217,75.7
9,MATH-500 (Pass@1),OpenAI-o1-1217,96.4


In [4]:
# Grouped bar chart: benchmarks along x, one coloured bar per model.
fig = px.bar(
    df,
    x='Benchmark',
    y='Score',
    color='Model',
    barmode='group',
)
# Blank both axis titles, fix the canvas size, then render.
(
    fig.update_yaxes(title='')
    .update_xaxes(title='')
    .update_layout(width=800, height=600)
)
fig.show()

In [5]:
# Bubble matrix: models along x, benchmarks along y; both marker colour and
# marker size encode the score.
fig = px.scatter(
    df, y='Benchmark', x='Model', color='Score', size='Score',
    color_continuous_scale=px.colors.sequential.RdBu,
)
fig.update_layout(
    paper_bgcolor="white",
    plot_bgcolor="white",
)
# Light grid lines so each bubble can be traced back to its row and column.
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
# The colour axis shows benchmark scores; the previous 'Metric Tonnes' title
# was a leftover from an unrelated chart.
fig.update_coloraxes(colorbar=dict(title='Score'))
fig.update_yaxes(title="")
fig.update_xaxes(title='')
fig.update_layout(width=1000, height=600)
fig.show()

In [None]:
# Benchmark scores in wide form: one row per benchmark, one column per model.
data = {
    'Benchmark': ["AIME 2024 (Pass@1)", "Codeforces (Percentile)", "GPQA Diamond (Pass@1)",
                  "MATH-500 (Pass@1)", "MMLU (Pass@1)", "SWE-bench Verified (Resolved)"],
    'DeepSeek-R1': [79.8, 96.3, 71.5, 97.3, 90.8, 49.2],
    'OpenAI-o1-1217': [79.2, 96.6, 75.7, 96.4, 91.8, 48.9],
    'DeepSeek-R1-32B': [72.6, 90.6, 62.1, 94.3, 87.4, 41.6],
    'OpenAI-o1-mini': [63.6, 93.4, 60.0, 90.0, 85.2, 36.8],
    'DeepSeek-V3': [39.2, 58.7, 59.1, 90.2, 88.5, 42.0],
}

df = pd.DataFrame(data)
# Benchmarks become the row index so df.values is a pure numeric matrix
# (rows = benchmarks, columns = models).
df = df.set_index('Benchmark')

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=df.values,
    x=df.columns,
    y=df.index,
    colorscale='RdYlBu_r',
    zmin=35,
    zmax=100,
    text=[[f'{val:.1f}%' for val in row] for row in df.values],  # Show percentages
    texttemplate='%{text}',
    textfont={"size": 12},
    hoverongaps=False,
    hovertemplate='Model: %{x}<br>Benchmark: %{y}<br>Score: %{text}<extra></extra>',
    # The colorbar is configured on the trace itself: this heatmap does not
    # reference a shared layout coloraxis, so `layout.coloraxis.colorbar`
    # settings would be silently ignored.
    colorbar={
        'title': {'text': 'Performance Score (%)'},
        'ticksuffix': '%'
    }
))

# Update layout
fig.update_layout(
    title={
        'text': 'AI Model Performance Heatmap',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20}
    },
    width=1000,
    height=600,
    xaxis={
        'title': 'Model',
        'side': 'bottom',
        'tickangle': 45
    },
    yaxis={
        'title': 'Benchmark',
        'tickmode': 'array',
        'tickvals': df.index
    }
)

# Show the plot
fig.show()

In [8]:
# Scores per benchmark, listed in the same order as `models`.
models = ['DeepSeek R1', 'GPT-4o', 'Claude-3.5-Sonnet']
scores_by_benchmark = {
    'GAIA': [40, 25, 28],
    'MATH': [90, 68, 68],
    'SimpleQA': [75, 83, 40],
}

# Flatten into long-form columns: the model list repeats once per benchmark.
# NOTE(review): the 'LLM' column holds benchmark names, not model names; the
# key is kept because the plotting cell below reads it.
data = {
    'Model': models * len(scores_by_benchmark),
    'LLM': [benchmark for benchmark in scores_by_benchmark for _ in models],
    'Score': [score for scores in scores_by_benchmark.values() for score in scores],
}

# Long-form frame: one row per (model, benchmark) pair.
df = pd.DataFrame(data)

df

Unnamed: 0,Model,LLM,Score
0,DeepSeek R1,GAIA,40
1,GPT-4o,GAIA,25
2,Claude-3.5-Sonnet,GAIA,28
3,DeepSeek R1,MATH,90
4,GPT-4o,MATH,68
5,Claude-3.5-Sonnet,MATH,68
6,DeepSeek R1,SimpleQA,75
7,GPT-4o,SimpleQA,83
8,Claude-3.5-Sonnet,SimpleQA,40


In [21]:
# Bubble matrix: models along x, the 'LLM' column along y; marker colour and
# size both encode the score, which is also printed next to each marker.
score_labels = df['Score'].round(2)
fig = px.scatter(
    df,
    x='Model',
    y='LLM',
    color='Score',
    size='Score',
    text=score_labels,
    color_continuous_scale=px.colors.sequential.RdBu_r,
)

# Score labels sit to the upper right of each marker, in bold.
fig.update_traces(
    textposition='top right',
    textfont=dict(size=14, weight='bold'),
)

# Both axes share the same look: light gray grid, bold tick labels, no title.
axis_style = dict(
    showgrid=True,
    gridwidth=1,
    gridcolor='LightGray',
    tickfont=dict(size=16, weight='bold'),
    title='',
)
fig.update_yaxes(**axis_style)
fig.update_xaxes(**axis_style)

# Colorbar is pushed past the right edge of the plotting area.
fig.update_coloraxes(
    colorbar=dict(
        title='Score',
        tickfont=dict(size=14),
        title_font=dict(size=14),
        x=1.15,
    )
)

# White canvas, centred title and fixed figure size.
fig.update_layout(
    paper_bgcolor="white",
    plot_bgcolor="white",
    title={
        'text': 'Performance of LLMs as CodeAgent',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20},
    },
    width=600,
    height=500,
)
fig.show()