In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score


# Calculate weighted F1 score, macro F1 score, and accuracy (each with a bootstrap standard deviation) using the 'true_label_str' and 'predicted_label' columns
def calculate_metrics_with_bootstrap(df, n_bootstrap=1000, random_state=42):
    """Estimate weighted F1, macro F1 and accuracy with a bootstrap spread.

    The rows of ``df`` are resampled with replacement ``n_bootstrap`` times
    (every resample has as many rows as ``df``), and the three metrics are
    computed on each resample.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'true_label_str'`` (reference labels) and
        ``'predicted_label'`` (predictions).
    n_bootstrap : int, default 1000
        Number of bootstrap resamples to draw.
    random_state : int or None, default 42
        Seed for the resampling.

    Returns
    -------
    dict
        ``{'weighted_f1': (mean, std), 'macro_f1': (mean, std),
        'accuracy': (mean, std)}`` where ``mean`` and ``std`` are ``np.mean``
        and ``np.std`` (population form, ``ddof=0``) over the resamples.

    Raises
    ------
    KeyError
        If either required column is missing from ``df``.
    """
    # Use a private generator instead of np.random.seed(): the resampling stays
    # seeded, but NumPy's process-wide RNG is no longer reset as a side effect.
    # The legacy RandomState type is kept (rather than default_rng) because it
    # is the generator np.random.seed() reseeds, so seeded runs stay reproducible.
    rng = np.random.RandomState(random_state)

    # Only the two label columns are needed, so avoid copying every other
    # column of the frame on each resample.
    label_columns = df[['true_label_str', 'predicted_label']]

    scorers = {
        'weighted_f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted'),
        'macro_f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score,
    }
    scores = {name: [] for name in scorers}

    for _ in range(n_bootstrap):
        sample = label_columns.sample(frac=1, replace=True, random_state=rng)
        y_true = sample['true_label_str']
        y_pred = sample['predicted_label']
        for name, scorer in scorers.items():
            scores[name].append(scorer(y_true, y_pred))

    return {name: (np.mean(values), np.std(values)) for name, values in scores.items()}



***BASE PROMPT***

In [4]:

df = pd.read_csv(r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/non-thinking/validation/gemini-2.5-flash-preview-05-20_base_results-results.csv')
# Calculate metrics
metrics = calculate_metrics_with_bootstrap(df)
# Print results as percentages in "mean \pm std" form.
# The backslash is doubled because "\p" is not a valid escape sequence in a
# normal (non-raw) string literal and makes Python emit a warning.
for metric, (mean, std) in metrics.items():
    print(f"{metric}: {mean*100:.1f} \\pm {std*100:.1f}")


  print(f"{metric}: {mean*100:.1f} \pm {std*100:.1f}")


weighted_f1: 59.3 \pm 2.3
macro_f1: 62.4 \pm 2.2
accuracy: 60.6 \pm 2.1


In [None]:

df = pd.read_csv(r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/thinking/validation/gemini-2.5-flash-preview-05-20_base_results-temp=0.5.csv')
# Calculate metrics
metrics = calculate_metrics_with_bootstrap(df)
# Print results as percentages in "mean \pm std" form.
# The backslash is doubled because "\p" is not a valid escape sequence in a
# normal (non-raw) string literal and makes Python emit a warning.
for metric, (mean, std) in metrics.items():
    print(f"{metric}: {mean*100:.1f} \\pm {std*100:.1f}")

  print(f"{metric}: {mean*100:.1f} \pm {std*100:.1f}")


weighted_f1: 68.1 \pm 2.1
macro_f1: 69.2 \pm 2.1
accuracy: 69.2 \pm 2.0


***NON THINKING***

In [14]:
# Bootstrap metrics for the gemini-2.0-flash-lite "calculators" results file
# (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/non-thinking/validation/gemini-2.0-flash-lite_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.6763 ± 0.0210
macro_f1: 0.6625 ± 0.0245
accuracy: 0.6827 ± 0.0201


In [21]:
# Bootstrap metrics for the gemini-2.0-flash "calculators" results file
# (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/non-thinking/validation/gemini-2.0-flash_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.6427 ± 0.0227
macro_f1: 0.6279 ± 0.0255
accuracy: 0.6610 ± 0.0207


In [20]:
# Bootstrap metrics for the gemini-2.5-flash-preview-05-20 "calculators"
# results file (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/non-thinking/validation/gemini-2.5-flash-preview-05-20_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7435 ± 0.0202
macro_f1: 0.7429 ± 0.0211
accuracy: 0.7542 ± 0.0188


In [15]:
# Bootstrap metrics for the gemini-2.5-pro "calculators" results file
# (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/non-thinking/validation/gemini-2.5-pro_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7306 ± 0.0209
macro_f1: 0.7295 ± 0.0217
accuracy: 0.7445 ± 0.0190


In [16]:
# Bootstrap metrics for the deepseek-chat "calculators" results file
# (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/DeepSeek/non-thinking/validation/deepseek-chat_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.6457 ± 0.0234
macro_f1: 0.6234 ± 0.0268
accuracy: 0.6693 ± 0.0208


In [17]:
# Bootstrap metrics for the deepseek-reasoner "calculators" results file
# (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/DeepSeek/non-thinking/validation/deepseek-reasoner_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.6607 ± 0.0210
macro_f1: 0.6555 ± 0.0247
accuracy: 0.6608 ± 0.0210


In [18]:
# Bootstrap metrics for the grok-3-mini-latest "calculators" results file
# (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Grok/non-thinking/validation/grok-3-mini-latest_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7503 ± 0.0199
macro_f1: 0.7482 ± 0.0211
accuracy: 0.7564 ± 0.0190


In [19]:
# Bootstrap metrics for the grok-3-latest "calculators" results file
# (non-thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Grok/non-thinking/validation/grok-3-latest_calculators_results.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7143 ± 0.0208
macro_f1: 0.6966 ± 0.0243
accuracy: 0.7246 ± 0.0195


***THINKING***

In [3]:
# Bootstrap metrics for the gemini-2.5-flash-preview-05-20 "calculators"
# results file (thinking / validation folder, temp=0.5 in the file name).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/thinking/validation/gemini-2.5-flash-preview-05-20_calculators_results-temp=0.5.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7272 ± 0.0206
macro_f1: 0.7273 ± 0.0218
accuracy: 0.7339 ± 0.0197


In [4]:
# Bootstrap metrics for the gemini-2.5-pro-preview-05-06 "calculators"
# results file (thinking / validation folder, temp=0.5 in the file name).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/thinking/validation/gemini-2.5-pro-preview-05-06_calculators_results-temp=0.5.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7135 ± 0.0217
macro_f1: 0.6264 ± 0.0719
accuracy: 0.7297 ± 0.0197


In [5]:

# Bootstrap metrics for the deepseek-reasoner "calculators" results file
# (thinking / validation folder, temp=0.5 in the file name).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/DeepSeek/thinking/validation/deepseek-reasoner_calculators_results-temp=0.5.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7346 ± 0.0199
macro_f1: 0.7373 ± 0.0219
accuracy: 0.7379 ± 0.0194


In [9]:
# Bootstrap metrics for the grok-3-mini-latest "calculators" results file
# (thinking / validation folder, temp=0.4 in the file name).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Grok/thinking/validation/grok-3-mini-latest_calculators_results-temp=0.4.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7449 ± 0.0199
macro_f1: 0.7394 ± 0.0214
accuracy: 0.7477 ± 0.0194


In [None]:
# Bootstrap metrics for the grok-3-latest "calculators" results file
# (thinking / validation folder, temp=0.5 in the file name).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Grok/thinking/validation/grok-3-latest_calculators_results-temp=0.5.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric: bootstrap mean ± standard deviation, four decimals.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

weighted_f1: 0.7113 ± 0.0210
macro_f1: 0.7091 ± 0.0226
accuracy: 0.7159 ± 0.0203


*** ENHANCED ***

In [7]:
# Bootstrap metrics for the gemini-2.5-flash-preview-05-20
# "calculators_ev_gemini_1.1" results file (thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/thinking/validation/gemini-2.5-flash-preview-05-20_calculators_ev_gemini_1.1_results-temp=0.5.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric, scaled to percent: mean ± standard deviation, one decimal.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean*100:.1f} ± {std*100:.1f}")

weighted_f1: 72.1 ± 2.0
macro_f1: 71.0 ± 2.2
accuracy: 72.4 ± 2.0


In [6]:
# Bootstrap metrics for the gemini-2.5-flash-preview-05-20
# "calculators_ev_gemini_2.0" results file (thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Google/thinking/validation/gemini-2.5-flash-preview-05-20_calculators_ev_gemini_2.0_results-temp=0.5.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric, scaled to percent: mean ± standard deviation, one decimal.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean*100:.1f} ± {std*100:.1f}")

weighted_f1: 71.7 ± 2.0
macro_f1: 71.8 ± 2.1
accuracy: 72.2 ± 1.9


In [8]:
# Bootstrap metrics for the deepseek-reasoner "calculators_ev_gemini_1.1"
# results file (thinking / validation folder).
df = pd.read_csv(
    r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/DeepSeek/thinking/validation/deepseek-reasoner_calculators_ev_gemini_1.1_results-temp=0.5.csv'
)
metrics = calculate_metrics_with_bootstrap(df)
# One line per metric, scaled to percent: mean ± standard deviation, one decimal.
for metric in metrics:
    mean, std = metrics[metric]
    print(f"{metric}: {mean*100:.1f} ± {std*100:.1f}")

weighted_f1: 77.4 ± 2.0
macro_f1: 76.8 ± 2.2
accuracy: 77.4 ± 2.0


In [10]:
df = pd.read_csv(r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/DeepSeek/thinking/validation/deepseek-reasoner_calculators_ev_gemini_2.0_results-temp=0.5.csv')
# Calculate metrics
metrics = calculate_metrics_with_bootstrap(df)
# Print results as percentages in "mean \pm std" form.
# The backslash is doubled because "\p" is not a valid escape sequence in a
# normal (non-raw) string literal and makes Python emit a warning.
for metric, (mean, std) in metrics.items():
    print(f"{metric}: {mean*100:.1f} \\pm {std*100:.1f}")

  print(f"{metric}: {mean*100:.1f} \pm {std*100:.1f}")


weighted_f1: 76.5 \pm 1.9
macro_f1: 75.5 \pm 2.2
accuracy: 76.4 \pm 1.9


In [11]:


df = pd.read_csv(r'/home/noxiusk/Desktop/data_science/dissertation/main/NO-fold/Grok/thinking/validation/grok-3-mini-latest_grok3_from_grok3_1-0_results-temp=0.4.csv')
# Calculate metrics
metrics = calculate_metrics_with_bootstrap(df)
# Print results as percentages in "mean \pm std" form.
# The backslash is doubled because "\p" is not a valid escape sequence in a
# normal (non-raw) string literal and makes Python emit a warning.
for metric, (mean, std) in metrics.items():
    print(f"{metric}: {mean*100:.1f} \\pm {std*100:.1f}")

  print(f"{metric}: {mean*100:.1f} \pm {std*100:.1f}")


weighted_f1: 74.3 \pm 2.0
macro_f1: 74.1 \pm 2.1
accuracy: 74.6 \pm 2.0


In [26]:
# 0.2 expressed as a percentage of 73.9.
print(0.2 / 73.9 * 100)

0.2706359945872801
