## Data Visualisation for the Baseline-LLM experiment
We make both a Seaborn heatmap (akin to BABILong's own) and a line chart. Both show accuracy, grouped by QA task.

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd
import sys

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Have the inline backend emit figures in PDF format.
%config InlineBackend.figure_format = 'pdf'

In [4]:
# Load the raw results of the experiment from CSV.
output_filename = "baseline_accuracy"
df = pd.read_csv(f"{output_filename}.csv")

# Mean of `correct_guess` for every (qa, token_size) pair. The task label is
# upper-cased with the vectorised `.str` accessor inside the same chain, so the
# aggregated frame is built once rather than mutated after creation.
accuracy_df = (
    df.groupby(['qa', 'token_size'])['correct_guess']
    .mean()
    .reset_index()
    .assign(qa=lambda frame: frame['qa'].str.upper())
)
print(accuracy_df)

     qa  token_size  correct_guess
0   QA1           0           0.98
1   QA1           1           0.54
2   QA1           2           0.64
3   QA1           4           0.54
4   QA1           8           0.44
5   QA1          16           0.30
6   QA1          32           0.36
7   QA1          64           0.36
8   QA1         128           0.32
9   QA2           0           0.48
10  QA2           1           0.22
11  QA2           2           0.22
12  QA2           4           0.12
13  QA2           8           0.08
14  QA2          16           0.16
15  QA2          32           0.08
16  QA2          64           0.08
17  QA2         128           0.12
18  QA3           0           0.28
19  QA3           1           0.26
20  QA3           2           0.42
21  QA3           4           0.24
22  QA3           8           0.14
23  QA3          16           0.24
24  QA3          32           0.04
25  QA3          64           0.16
26  QA3         128           0.10
27  QA4           0 

In [5]:
# Cost of the experiment: summed token counts scaled by a per-token rate.
# NOTE(review): 0.1 and 0.4 look like prices per million input/output tokens —
# confirm against the provider's pricing.
INPUT_RATE = 0.1
OUTPUT_RATE = 0.4
TOKENS_PER_UNIT = 1e6

input_cost = df["input_tokens"].sum() * INPUT_RATE / TOKENS_PER_UNIT
output_cost = df["output_tokens"].sum() * OUTPUT_RATE / TOKENS_PER_UNIT
input_cost + output_cost

np.float64(6.047769400000001)

#### Heatmap

In [6]:
# Reshape the long accuracy table into a grid: one row per QA label, one
# column per token size, each cell holding the mean `correct_guess`.
heatmap_data = pd.pivot(
    accuracy_df,
    index='qa',
    columns='token_size',
    values='correct_guess',
)
heatmap_data

token_size,0,1,2,4,8,16,32,64,128
qa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
QA1,0.98,0.54,0.64,0.54,0.44,0.3,0.36,0.36,0.32
QA2,0.48,0.22,0.22,0.12,0.08,0.16,0.08,0.08,0.12
QA3,0.28,0.26,0.42,0.24,0.14,0.24,0.04,0.16,0.1
QA4,0.78,0.58,0.72,0.6,0.68,0.6,0.46,0.38,0.14
QA5,0.92,0.72,0.68,0.86,0.76,0.64,0.76,0.82,0.56


In [7]:
# Draw the accuracy heatmap on an explicitly created figure/axes pair and
# save it as a PDF.
fig, ax = plt.subplots(figsize=(10, 6))

# Sequential yellow -> green palette, interpolated to 256 levels.
# (Alternative yellow -> green -> blue palette:
#  '#FFFFCC', '#a1dab4', '#41b6c4', '#2c7fb8', '#253494'.)
colors = ['#ffffcc', '#c2e699', '#78c679', '#31a354', '#006837']
colormap = LinearSegmentedColormap.from_list('YlGn', colors, N=256)

sns.heatmap(
    heatmap_data,
    ax=ax,  # bind to our axes instead of relying on pyplot's current axes
    vmin=0,
    vmax=1,  # pin the colour scale to the full [0, 1] range
    cmap=colormap,
    annot=True,
    annot_kws={"fontsize": 14},
    fmt=".2f",
    linewidths=0.5,
    linecolor='white',
)

# The column labels are integer token sizes; render them as "0k", "1k", ...
xtick_labels = [
    f"{int(label.get_text())}k" for label in ax.get_xticklabels(minor=False)
]
ax.set_xticklabels(xtick_labels)

ax.set_title('Baseline LLM Accuracy', fontsize=18, pad=12)
ax.set_xlabel('Token Size', fontsize=16)
# The rows are the QA tasks; accuracy is what the cell colour and annotation
# encode, so it must not be used as the y-axis label.
ax.set_ylabel('Task', fontsize=16)
ax.tick_params(axis='both', labelsize=13)
fig.tight_layout()
fig.savefig('baseline-llm-heatmap.pdf')
plt.show()

<Figure size 1000x600 with 2 Axes>

#### Line Chart

In [8]:
# Display the per-(qa, token_size) accuracy table computed earlier.
accuracy_df

Unnamed: 0,qa,token_size,correct_guess
0,QA1,0,0.98
1,QA1,1,0.54
2,QA1,2,0.64
3,QA1,4,0.54
4,QA1,8,0.44
5,QA1,16,0.3
6,QA1,32,0.36
7,QA1,64,0.36
8,QA1,128,0.32
9,QA2,0,0.48


In [9]:
# Add the parent directory to the module search path; this has to run before
# the import below (presumably `visualization` lives there — confirm).
sys.path.append("..")
import visualization

# Pass the raw results frame and a figure title to the shared plotting helper.
visualization.create_accuracy_figures(df, "Reproduced Baseline LLM Accuracy")
# NOTE(review): commented-out alternative call path; remove if no longer needed.
# visualization.experiment_util.create_accuracy_figures(df, "Baseline LLM Accuracy")

<Figure size 1000x600 with 2 Axes>

<Figure size 1000x600 with 1 Axes>

#### Mean Accuracy

In [12]:
# Unweighted average of the per-(qa, token_size) accuracies.
baseline_mean = accuracy_df.loc[:, 'correct_guess'].mean()
baseline_mean

np.float64(0.43511111111111117)

We now take the mean accuracies from the RAG-S and ReES experiments and compute how far each lies from the baseline mean. See their respective notebooks for how those means were calculated.

In [11]:
# Mean accuracies of the RAG-S and ReES experiments, hard-coded here; the
# calculations live in those experiments' own notebooks.
rag_s_mean = 0.3973333333333333
rees_mean = 0.3595555555555556

In [13]:
# Difference in mean accuracy: baseline minus RAG-S (positive means the
# baseline scored higher).
rag_s_difference = baseline_mean - rag_s_mean
rag_s_difference

np.float64(0.037777777777777855)

In [14]:
# Difference in mean accuracy: baseline minus ReES (positive means the
# baseline scored higher).
# Stored under its own name: assigning the result back to `rees_mean` would
# overwrite the ReES mean, so re-running this cell would give a different
# value each time and the name would no longer describe its contents.
rees_difference = baseline_mean - rees_mean
rees_difference

np.float64(0.0755555555555556)