# Analyse WandB Pretraining Data
Resources used for this notebook:
* https://docs.wandb.ai/guides/track/public-api-guide/
* https://docs.wandb.ai/ref/python/public-api/runs/

In [1]:
import wandb
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

In [2]:
# Client for the W&B public API; the cells below query runs through it.
api = wandb.Api()

# Finetuning Results
Adapted from: https://docs.wandb.ai/guides/track/public-api-guide/ (22.07.2025)

## Metrics

In [3]:
# W&B entity and project that hold the fine-tuning runs.
entity = "josefine-busch-htw-berlin"
project = "ma_finetuning"

# Query the runs stored under "<entity>/<project>".
runs = api.runs("/".join((entity, project)))

In [4]:
# Per-run accumulators; every list receives exactly one entry per run, in iteration order.
names, tags = [], []
test_accuracies, test_losses, test_f1 = [], [], []
test_precision, test_recall, test_mcc = [], [], []

# (W&B summary key, runs_df column, accumulator list) for every test metric.
metric_specs = [
	("test/accuracy", "test_accuracy", test_accuracies),
	("test/loss", "test_loss", test_losses),
	("test/f1", "test_f1", test_f1),
	("test/precision", "test_precision", test_precision),
	("test/recall", "test_recall", test_recall),
	("test/matthews_correlation", "test_mcc", test_mcc),
]

for run in runs:
	names.append(run.name)
	tags.append(run.tags)
	for summary_key, column, values in metric_specs:
		# A metric that is missing from the run summary is recorded as NaN.
		values.append(run.summary.get(summary_key, np.nan))

# One row per run: the run name followed by its test metrics.
metric_data = {column: values for summary_key, column, values in metric_specs}
runs_df = pd.DataFrame({"run_name": names, **metric_data})
runs_df

Unnamed: 0,run_name,test_accuracy,test_loss,test_f1,test_precision,test_recall,test_mcc
0,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_al...,0.781757,0.472779,0.781689,0.782212,0.781810,0.564022
1,HF_HS_5M_NRC_150K_42_v6_3e-5_prom_prom_core_al...,0.761486,0.491612,0.761060,0.763607,0.761606,0.525209
2,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_no...,0.790089,0.469136,0.789909,0.790954,0.790031,0.580984
3,HF_HS_5M_NRC_150K_42_v6_3e-5_prom_prom_core_no...,0.788958,0.482180,0.788673,0.790696,0.789038,0.579731
4,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_ta...,0.676998,0.587859,0.672606,0.688836,0.677988,0.366663
...,...,...,...,...,...,...,...
310,HF_HF_5M_RC_150K_3_v11_3e-5_prom_prom_300_all_...,0.883277,0.373002,0.882915,0.887143,0.882942,0.770074
311,HF_HF_5M_RC_150K_42_v11_3e-5_prom_prom_300_tat...,0.621533,0.678970,0.619100,0.628663,0.624230,0.252854
312,HF_HF_5M_RC_150K_199_v11_3e-5_prom_prom_300_ta...,0.639478,0.674505,0.638230,0.644538,0.641600,0.286123
313,HF_HF_5M_RC_150K_3_v11_3e-5_prom_prom_300_nota...,0.918787,0.270507,0.918784,0.919091,0.918949,0.838040


In [5]:
# Column names given to the tag values, by position within each run's tag list.
# NOTE(review): assumes every run carries exactly these five tags in this order — confirm.
tag_columns = ['job_id', 'tokenizer', 'task', 'subtask', 'scenario']
tags_df = pd.DataFrame(tags, columns=tag_columns)

# Place the tag columns next to the metric columns, aligned on the shared row index.
tagged_runs = pd.concat([runs_df, tags_df], axis=1)
tagged_runs.head()

Unnamed: 0,run_name,test_accuracy,test_loss,test_f1,test_precision,test_recall,test_mcc,job_id,tokenizer,task,subtask,scenario
0,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_al...,0.781757,0.472779,0.781689,0.782212,0.78181,0.564022,14290,hf_5M_tokenizer,prom_core,prom_core_all,scenario1
1,HF_HS_5M_NRC_150K_42_v6_3e-5_prom_prom_core_al...,0.761486,0.491612,0.76106,0.763607,0.761606,0.525209,14293,hs_5M_tokenizer,prom_core,prom_core_all,scenario1
2,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_no...,0.790089,0.469136,0.789909,0.790954,0.790031,0.580984,14290,hf_5M_tokenizer,prom_core,prom_core_notata,scenario1
3,HF_HS_5M_NRC_150K_42_v6_3e-5_prom_prom_core_no...,0.788958,0.48218,0.788673,0.790696,0.789038,0.579731,14293,hs_5M_tokenizer,prom_core,prom_core_notata,scenario1
4,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_ta...,0.676998,0.587859,0.672606,0.688836,0.677988,0.366663,14290,hf_5M_tokenizer,prom_core,prom_core_tata,scenario1


In [7]:
# Test metrics that are averaged within each group.
subtask_metric_columns = [
	"test_accuracy",
	"test_loss",
	"test_f1",
	"test_precision",
	"test_recall",
	"test_mcc",
]

# Mean of every test metric per (task, subtask, tokenizer) combination.
mean_subtask_metrics = (
	tagged_runs
	.groupby(['task', 'subtask', 'tokenizer'])
	.agg(dict.fromkeys(subtask_metric_columns, "mean"))
)
mean_subtask_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,test_accuracy,test_loss,test_f1,test_precision,test_recall,test_mcc
task,subtask,tokenizer,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
p_covid,p_covid_sub,hf_2_5M_tokenizer,0.725907,0.742033,0.728841,0.737226,0.724557,0.690772
p_covid,p_covid_sub,hf_5M_rc_tokenizer,0.72667,0.745418,0.730243,0.73979,0.725364,0.691687
p_covid,p_covid_sub,hf_5M_tokenizer,0.726403,0.742635,0.729169,0.737896,0.725061,0.691395
p_covid,p_covid_sub,hs_2_5M_tokenizer,0.716138,0.758272,0.719503,0.730047,0.714839,0.679951
p_covid,p_covid_sub,hs_5M_tokenizer,0.714854,0.75853,0.718047,0.728335,0.713597,0.678488
prom_300,prom_300_all,hf_2_5M_tokenizer,0.871378,0.352145,0.871338,0.871839,0.871411,0.743249
prom_300,prom_300_all,hf_5M_rc_tokenizer,0.87851,0.338742,0.878417,0.87933,0.878394,0.757722
prom_300,prom_300_all,hf_5M_tokenizer,0.873874,0.34904,0.873853,0.874168,0.873923,0.748091
prom_300,prom_300_all,hs_2_5M_tokenizer,0.869163,0.338676,0.86914,0.869476,0.869212,0.738688
prom_300,prom_300_all,hs_5M_tokenizer,0.868919,0.339153,0.868884,0.869356,0.868965,0.738321


In [23]:
# Task names; each one is also the prefix of its subtask labels.
tasks = [
	"prom_300",
	"prom_core",
]

# Suffixes appended to a task name to form a subtask label (e.g. "prom_300" + "_all").
subtasks = [
	"_all",
	"_notata",
	"_tata",
]

# Metric columns for which a delta is reported.
columns = ["test_" + metric for metric in ("accuracy", "loss", "f1", "precision", "recall", "mcc")]

In [33]:
# For every prom task/subtask, report per metric the gap between the largest
# and the smallest tokenizer mean (max minus min over the tokenizer rows).
for task in tasks:
	print(f"Task: {task}")
	for subtask in subtasks:
		print(f" Subtask: {subtask}")
		# Rows of this subtask: one per tokenizer.
		tokenizer_means = mean_subtask_metrics.loc[(task, f'{task}{subtask}'), :]
		for col in columns:
			metric_values = tokenizer_means[col]
			delta = metric_values.max() - metric_values.min()
			print(f"Delta {col}: {delta:.4f}")

# Same report for the covid subtask, whose label is not built from task + suffix.
print('Covid')
covid_means = mean_subtask_metrics.loc[('p_covid', 'p_covid_sub'), :]
for col in columns:
	metric_values = covid_means[col]
	delta = metric_values.max() - metric_values.min()
	print(f"Delta {col}: {delta:.4f}")

Task: prom_300
 Subtask: _all
Delta test_accuracy: 0.0096
Delta test_loss: 0.0135
Delta test_f1: 0.0095
Delta test_precision: 0.0100
Delta test_recall: 0.0094
Delta test_mcc: 0.0194
 Subtask: _notata
Delta test_accuracy: 0.0064
Delta test_loss: 0.0072
Delta test_f1: 0.0065
Delta test_precision: 0.0063
Delta test_recall: 0.0064
Delta test_mcc: 0.0127
 Subtask: _tata
Delta test_accuracy: 0.0154
Delta test_loss: 0.0101
Delta test_f1: 0.0310
Delta test_precision: 0.0080
Delta test_recall: 0.0132
Delta test_mcc: 0.0169
Task: prom_core
 Subtask: _all
Delta test_accuracy: 0.0130
Delta test_loss: 0.0164
Delta test_f1: 0.0136
Delta test_precision: 0.0116
Delta test_recall: 0.0129
Delta test_mcc: 0.0245
 Subtask: _notata
Delta test_accuracy: 0.0055
Delta test_loss: 0.0101
Delta test_f1: 0.0054
Delta test_precision: 0.0057
Delta test_recall: 0.0055
Delta test_mcc: 0.0112
 Subtask: _tata
Delta test_accuracy: 0.0089
Delta test_loss: 0.0396
Delta test_f1: 0.0071
Delta test_precision: 0.0148
Delta te

In [31]:
# Plotly's bundled iris sample frame, bound to the shared name `df`.
# NOTE(review): its columns (see the output) contain no W&B metric such as
# "test_accuracy"; this looks like leftover scratch data — confirm whether the
# cell is still needed.
df = px.data.iris()
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


In [29]:
# Box plot of the per-tokenizer mean test accuracy for the p_covid subtask.
# The data is selected here explicitly instead of being read from the shared
# name `df`: on a top-to-bottom run `df` holds the iris sample frame at this
# point, which has no "test_accuracy" column, so the plot only worked after
# out-of-order execution.
covid_subtask_means = mean_subtask_metrics.loc[('p_covid', 'p_covid_sub'), :]
fig = px.box(covid_subtask_means, y="test_accuracy")

fig.show()

In [28]:
# Per-tokenizer mean metrics of the p_covid subtask.
df = mean_subtask_metrics.loc[('p_covid', 'p_covid_sub'), :]

# Violin plot of the mean test accuracy across tokenizers.
fig = px.violin(data_frame=df, y="test_accuracy")
fig.show()

In [15]:
# Smallest per-tokenizer mean test accuracy for the prom_300_all subtask.
mean_subtask_metrics.loc[('prom_300', 'prom_300_all'), 'test_accuracy'].min()

np.float64(0.8689189189189189)

In [None]:
# Write the per-subtask means to a CSV file (relative path).
mean_subtask_metrics.to_csv("mean_finetuning_subtask_metrics.csv")

In [14]:
# Test metrics that are averaged within each group.
task_metric_columns = [
	"test_accuracy",
	"test_loss",
	"test_f1",
	"test_precision",
	"test_recall",
	"test_mcc",
]

# Mean of every test metric per (scenario, task, tokenizer) combination.
mean_task_metrics = (
	tagged_runs
	.groupby(['scenario', 'task', 'tokenizer'])
	.agg(dict.fromkeys(task_metric_columns, "mean"))
)
mean_task_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,test_accuracy,test_loss,test_f1,test_precision,test_recall,test_mcc
scenario,task,tokenizer,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
scenario1,p_covid,hf_5M_tokenizer,0.726403,0.742635,0.729169,0.737896,0.725061,0.691395
scenario1,p_covid,hs_5M_tokenizer,0.714854,0.75853,0.718047,0.728335,0.713597,0.678488
scenario1,prom_300,hf_5M_tokenizer,0.804985,0.424265,0.802873,0.810448,0.806285,0.616613
scenario1,prom_300,hs_5M_tokenizer,0.798901,0.42,0.793259,0.807682,0.800278,0.607208
scenario1,prom_core,hf_5M_tokenizer,0.754954,0.508039,0.754328,0.757239,0.755116,0.512338
scenario1,prom_core,hs_5M_tokenizer,0.750997,0.528146,0.750395,0.752878,0.751019,0.503885
scenario2,p_covid,hf_2_5M_tokenizer,0.725907,0.742033,0.728841,0.737226,0.724557,0.690772
scenario2,p_covid,hf_5M_rc_tokenizer,0.72667,0.745418,0.730243,0.73979,0.725364,0.691687
scenario2,p_covid,hs_2_5M_tokenizer,0.716138,0.758272,0.719503,0.730047,0.714839,0.679951
scenario2,prom_300,hf_2_5M_tokenizer,0.804981,0.424662,0.803344,0.809089,0.806014,0.615019


In [15]:
# Write the per-task means to a CSV file (relative path).
mean_task_metrics.to_csv("mean_finetuning_task_metrics.csv")

In [9]:
# Load a metrics table from CSV, using its first four columns as the row MultiIndex.
# NOTE(review): no cell visible here writes "mean_metrics.csv" (the exports above use
# other file names) and `mean_metrics` is not used in the visible cells — confirm
# where this file comes from.
mean_metrics = pd.read_csv("mean_metrics.csv", index_col=[0, 1, 2, 3])

## Runtime

In [3]:
# W&B entity and project that hold the fine-tuning runs.
entity = "josefine-busch-htw-berlin"
project = "ma_finetuning"

# Query the runs stored under "<entity>/<project>".
runs = api.runs("/".join((entity, project)))

In [4]:
# Per-run accumulators; every list receives exactly one entry per run, in iteration order.
names, tags = [], []
eval_runtime, train_runtime = [], []

# (W&B summary key, runs_df column, accumulator list) for each runtime value.
runtime_specs = [
	("eval/runtime", "eval_runtime", eval_runtime),
	("train_runtime", "train_runtime", train_runtime),
]

for run in runs:
	names.append(run.name)
	tags.append(run.tags)
	for summary_key, column, values in runtime_specs:
		# A runtime that is missing from the run summary is recorded as NaN.
		values.append(run.summary.get(summary_key, np.nan))

# One row per run: the run name followed by its runtimes.
runtime_data = {column: values for summary_key, column, values in runtime_specs}
runs_df = pd.DataFrame({"run_name": names, **runtime_data})

In [5]:
# Column names given to the tag values, by position within each run's tag list.
# NOTE(review): assumes every run carries exactly these five tags in this order — confirm.
tag_columns = ['job_id', 'tokenizer', 'task', 'subtask', 'scenario']
tags_df = pd.DataFrame(tags, columns=tag_columns)

# Place the tag columns next to the runtime columns, aligned on the shared row index.
tagged_runs = pd.concat([runs_df, tags_df], axis=1)
tagged_runs.head()

Unnamed: 0,run_name,eval_runtime,train_runtime,job_id,tokenizer,task,subtask,scenario
0,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_al...,4.2485,829.0819,14290,hf_5M_tokenizer,prom_core,prom_core_all,scenario1
1,HF_HS_5M_NRC_150K_42_v6_3e-5_prom_prom_core_al...,4.1702,828.885,14293,hs_5M_tokenizer,prom_core,prom_core_all,scenario1
2,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_no...,3.9872,755.6423,14290,hf_5M_tokenizer,prom_core,prom_core_notata,scenario1
3,HF_HS_5M_NRC_150K_42_v6_3e-5_prom_prom_core_no...,3.764,746.8029,14293,hs_5M_tokenizer,prom_core,prom_core_notata,scenario1
4,HF_HF_5M_NRC_150K_42_v9_3e-5_prom_prom_core_ta...,0.462,254.2423,14290,hf_5M_tokenizer,prom_core,prom_core_tata,scenario1


In [6]:
# Runtime columns that are averaged within each group.
runtime_columns = ["eval_runtime", "train_runtime"]

# Mean runtimes per (scenario, task, subtask, tokenizer) combination.
mean_runtimes = (
	tagged_runs
	.groupby(['scenario', 'task', 'subtask', 'tokenizer'])
	.agg(dict.fromkeys(runtime_columns, "mean"))
)

mean_runtimes

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,eval_runtime,train_runtime
scenario,task,subtask,tokenizer,Unnamed: 4_level_1,Unnamed: 5_level_1
scenario1,prom_300,prom_300_all,hf_5M_tokenizer,4.371867,852.207767
scenario1,prom_300,prom_300_all,hs_5M_tokenizer,4.216033,840.3945
scenario1,prom_300,prom_300_notata,hf_5M_tokenizer,3.8418,751.5518
scenario1,prom_300,prom_300_notata,hs_5M_tokenizer,3.808033,758.543167
scenario1,prom_300,prom_300_tata,hf_5M_tokenizer,0.4599,252.795867
scenario1,prom_300,prom_300_tata,hs_5M_tokenizer,0.457133,251.419033
scenario1,prom_core,prom_core_all,hf_5M_tokenizer,4.1919,830.518833
scenario1,prom_core,prom_core_all,hs_5M_tokenizer,4.178767,825.373133
scenario1,prom_core,prom_core_notata,hf_5M_tokenizer,3.852,751.5977
scenario1,prom_core,prom_core_notata,hs_5M_tokenizer,3.8052,745.836033
