### Importing dataset

#### Question prompt set

In [2]:
with open("../dataset/temperature_questions_pisa_prompt.txt", "r") as f:
    questions_prompt = f.readlines()


## printing out all question prompts
print("Total number of questions: ", len(questions_prompt))
print("=" * 50)
print("Question prompt set :")
print("=" * 50)
for q in questions_prompt:
    print(q.strip())

Total number of questions:  313
Question prompt set :
From January 01, 2010, Friday to January 15, 2010, Friday, the average temperature was 56, 51, 53, 51, 53, 54, 53, 50, 52, 47, 48, 50, 51, 50, 52 degree on each day. What is the temperature going to be on January 16, 2010, Saturday?
From January 17, 2010, Sunday to January 31, 2010, Sunday, the average temperature was 51, 53, 53, 51, 52, 54, 58, 61, 64, 56, 60, 67, 66, 61, 62 degree on each day. What is the temperature going to be on February 01, 2010, Monday?
From February 02, 2010, Tuesday to February 16, 2010, Tuesday, the average temperature was 61, 61, 60, 62, 67, 73, 67, 63, 61, 61, 62, 64, 62, 61, 65 degree on each day. What is the temperature going to be on February 17, 2010, Wednesday?
From February 18, 2010, Thursday to March 04, 2010, Thursday, the average temperature was 62, 63, 64, 67, 72, 71, 70, 69, 72, 74, 77, 79, 76, 76, 76 degree on each day. What is the temperature going to be on March 05, 2010, Friday?
From March

#### Answers set

In [3]:
with open("../dataset/temperature_answers_pisa_prompt.txt", "r") as f:
    validation_answers = f.readlines()

print("Total number of answers: ", len(validation_answers))
print("=" * 50)
print("Answer prompt set :")
print("=" * 50)
for a in validation_answers:
    print(a.strip())

Total number of answers:  313
Answer prompt set :
54
61
59
72
86
85
94
89
100
89
89
91
81
84
86
80
81
82
75
72
61
56
50
52
66
62
69
79
83
92
98
88
89
86
85
87
81
90
83
82
78
75
70
68
58
57
51
56
67
67
82
76
82
87
98
99
102
91
82
87
84
81
83
78
68
63
62
52
45
53
59
64
77
72
82
94
94
89
80
87
87
88
81
86
80
84
77
66
66
60
54
52
58
55
61
73
77
82
88
90
97
100
86
89
86
91
87
85
81
78
68
67
61
50
52
53
66
62
70
70
92
95
98
98
88
84
88
87
89
88
84
85
72
73
70
57
58
54
58
64
71
77
86
94
93
88
94
94
88
82
82
88
86
87
80
76
66
62
58
59
54
66
76
71
82
81
90
96
94
97
82
89
87
89
81
84
85
82
76
66
65
62
52
62
59
76
76
84
82
96
91
101
96
89
90
88
84
82
87
82
75
66
67
61
52
56
54
60
59
66
85
76
90
82
95
89
86
85
81
84
85
81
80
76
64
72
62
45
60
59
63
69
70
76
83
84
93
84
90
85
86
85
84
87
84
81
72
67
61
54
50
50
63
65
72
80
88
82
90
87
98
90
91
82
89
87
81
85
80
74
66
61
51
58
54
55
64
74
83
94
92
95
90
101
96
92
85
83
89
84
84
81
77
66
60
53
47
63
63
70
79
76
88
82
93
78
89
88
84
87
86
89
88


#### LLM setup

##### Initilization

In [4]:
from dotenv import load_dotenv

load_dotenv()

import os


# Safely retrieve environment variables with a default value or raise an error
GOOGLE_API_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
if not GOOGLE_API_KEY:
    raise KeyError("Environment variable 'GOOGLE_GEMINI_API_KEY' is not set.")

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise KeyError("Environment variable 'GROQ_API_KEY' is not set.")

In [5]:
from groq import Groq
from google import genai

# googlegenai.configure(api_key=GOOGLE_API_KEY)
groq_client = Groq(api_key=GROQ_API_KEY)

##### Utils

In [6]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings

warnings.filterwarnings("ignore")


def calculate_mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


def extract_number(text):
    """Extract all numbers from text and return as a list"""
    import re

    if isinstance(text, list):
        # If input is a list, join all elements with newlines
        text = "\n".join(str(item) for item in text)
    numbers = re.findall(r"[-+]?\d*\.?\d+", str(text))
    return [float(num) for num in numbers] if numbers else None


# Test the function
test_numbers = """54
61
59
72
86
85
94
89
100"""

result = extract_number(test_numbers)
print("Extracted numbers:", result)

Extracted numbers: [54.0, 61.0, 59.0, 72.0, 86.0, 85.0, 94.0, 89.0, 100.0]


In [7]:
extract_number(validation_answers)

[54.0,
 61.0,
 59.0,
 72.0,
 86.0,
 85.0,
 94.0,
 89.0,
 100.0,
 89.0,
 89.0,
 91.0,
 81.0,
 84.0,
 86.0,
 80.0,
 81.0,
 82.0,
 75.0,
 72.0,
 61.0,
 56.0,
 50.0,
 52.0,
 66.0,
 62.0,
 69.0,
 79.0,
 83.0,
 92.0,
 98.0,
 88.0,
 89.0,
 86.0,
 85.0,
 87.0,
 81.0,
 90.0,
 83.0,
 82.0,
 78.0,
 75.0,
 70.0,
 68.0,
 58.0,
 57.0,
 51.0,
 56.0,
 67.0,
 67.0,
 82.0,
 76.0,
 82.0,
 87.0,
 98.0,
 99.0,
 102.0,
 91.0,
 82.0,
 87.0,
 84.0,
 81.0,
 83.0,
 78.0,
 68.0,
 63.0,
 62.0,
 52.0,
 45.0,
 53.0,
 59.0,
 64.0,
 77.0,
 72.0,
 82.0,
 94.0,
 94.0,
 89.0,
 80.0,
 87.0,
 87.0,
 88.0,
 81.0,
 86.0,
 80.0,
 84.0,
 77.0,
 66.0,
 66.0,
 60.0,
 54.0,
 52.0,
 58.0,
 55.0,
 61.0,
 73.0,
 77.0,
 82.0,
 88.0,
 90.0,
 97.0,
 100.0,
 86.0,
 89.0,
 86.0,
 91.0,
 87.0,
 85.0,
 81.0,
 78.0,
 68.0,
 67.0,
 61.0,
 50.0,
 52.0,
 53.0,
 66.0,
 62.0,
 70.0,
 70.0,
 92.0,
 95.0,
 98.0,
 98.0,
 88.0,
 84.0,
 88.0,
 87.0,
 89.0,
 88.0,
 84.0,
 85.0,
 72.0,
 73.0,
 70.0,
 57.0,
 58.0,
 54.0,
 58.0,
 64.0,
 71.0,
 77.0,
 86

#### Models initilize 

In [10]:
# Initialize Google model
from google.genai import types

client = genai.Client(api_key=GOOGLE_API_KEY)

In [11]:
from textwrap import dedent

system_prompt = dedent(
    """
    You are a data analyst specialized in sequence prediction. 
    Based on the provided daily temperature data, predict the temperature for each asked date. 
    Output only the predicted temperature values, one per line, in the order the questions are asked. 
    Do not include any other text, explanation, or formatting—only the numeric value of each predicted temperature on a new line.
    """
)

### Google preds

##### Google gemini flash 1.5

In [None]:
import time
from math import ceil

# ...existing code...

# Generate predictions
predictions = []
true_values = []
BATCH_SIZE = 10  # Process 10 questions at a time
DELAY_SECONDS = 90  # Wait 1.5 minutes between batches

true_value = extract_number(validation_answers)
if true_value is not None:
    true_values.extend(true_value)

# Calculate number of batches
num_batches = ceil(len(questions_prompt) / BATCH_SIZE)

for batch in range(num_batches):
    start_idx = batch * BATCH_SIZE
    end_idx = min((batch + 1) * BATCH_SIZE, len(questions_prompt))

    print(f"\nProcessing batch {batch + 1}/{num_batches}")

    # Process current batch
    for i in range(start_idx, end_idx):
        try:
            response = client.models.generate_content(
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt,
                ),
                model="gemini-1.5-flash",
                contents=questions_prompt[i],
            )
            pred_value = extract_number(response.text)
            print(f"Question {i + 1}: Predicted value: {pred_value}")

            if pred_value is not None:
                predictions.extend(pred_value)

        except Exception as e:
            print(f"Error processing question {i + 1}: {e}")

    # Wait between batches, except for the last batch
    if batch < num_batches - 1:
        print(f"\nWaiting {DELAY_SECONDS} seconds before next batch...")
        time.sleep(DELAY_SECONDS)

print("\nPreds value len: ", len(predictions))
print("True value len: ", len(true_values))

# Convert to numpy arrays
y_true = np.array(true_values)
y_pred = np.array(predictions)

# Calculate metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
mape = calculate_mape(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("\nEvaluation Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R2 Score: {r2:.4f}")


Processing batch 1/32
Question 1: Predicted value: [51.0]
Question 2: Predicted value: [63.0]
Question 3: Predicted value: [63.0]
Question 4: Predicted value: [75.0]
Question 5: Predicted value: [83.0]
Question 6: Predicted value: [89.0]
Question 7: Predicted value: [95.0]
Question 8: Predicted value: [89.0]
Question 9: Predicted value: [97.0]
Question 10: Predicted value: [80.0]

Waiting 90 seconds before next batch...

Processing batch 2/32
Question 11: Predicted value: [93.0]
Question 12: Predicted value: [92.0]
Question 13: Predicted value: [87.0]
Question 14: Predicted value: [87.0]
Question 15: Predicted value: [83.0]
Question 16: Predicted value: [81.0]
Question 17: Predicted value: [78.0]
Question 18: Predicted value: [81.0]
Question 19: Predicted value: [73.0]
Question 20: Predicted value: [73.0]

Waiting 90 seconds before next batch...

Processing batch 3/32
Question 21: Predicted value: [61.0]
Question 22: Predicted value: [60.0]
Question 23: Predicted value: [52.0]
Questio

##### Google gemini-flash-2.0

In [13]:
import time
from math import ceil

# ...existing code...

# Generate predictions
predictions = []
true_values = []
BATCH_SIZE = 10  # Process 10 questions at a time
DELAY_SECONDS = 90  # Wait 1.5 minutes between batches

true_value = extract_number(validation_answers)
if true_value is not None:
    true_values.extend(true_value)

# Calculate number of batches
num_batches = ceil(len(questions_prompt) / BATCH_SIZE)

for batch in range(num_batches):
    start_idx = batch * BATCH_SIZE
    end_idx = min((batch + 1) * BATCH_SIZE, len(questions_prompt))

    print(f"\nProcessing batch {batch + 1}/{num_batches}")

    # Process current batch
    for i in range(start_idx, end_idx):
        try:
            response = client.models.generate_content(
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt,
                ),
                model="gemini-2.0-flash",
                contents=questions_prompt[i],
            )
            pred_value = extract_number(response.text)
            print(f"Question {i + 1}: Predicted value: {pred_value}")

            if pred_value is not None:
                predictions.extend(pred_value)

        except Exception as e:
            print(f"Error processing question {i + 1}: {e}")

    # Wait between batches, except for the last batch
    if batch < num_batches - 1:
        print(f"\nWaiting {DELAY_SECONDS} seconds before next batch...")
        time.sleep(DELAY_SECONDS)

print("\nPreds value len: ", len(predictions))
print("True value len: ", len(true_values))

# Convert to numpy arrays
y_true = np.array(true_values)
y_pred = np.array(predictions)

# Calculate metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
mape = calculate_mape(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("\nEvaluation Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R2 Score: {r2:.4f}")


Processing batch 1/32
Question 1: Predicted value: [51.0]
Question 2: Predicted value: [64.0]
Question 3: Predicted value: [63.0]
Question 4: Predicted value: [77.0]
Question 5: Predicted value: [82.0]
Question 6: Predicted value: [86.0]
Question 7: Predicted value: [97.0]
Question 8: Predicted value: [93.0]
Question 9: Predicted value: [99.0]
Question 10: Predicted value: [79.0]

Waiting 90 seconds before next batch...

Processing batch 2/32
Question 11: Predicted value: [94.0]
Question 12: Predicted value: [88.0]
Question 13: Predicted value: [87.0]
Question 14: Predicted value: [87.0]
Question 15: Predicted value: [82.0]
Question 16: Predicted value: [83.0]
Question 17: Predicted value: [79.0]
Question 18: Predicted value: [82.0]
Question 19: Predicted value: [74.0]
Question 20: Predicted value: [72.0]

Waiting 90 seconds before next batch...

Processing batch 3/32
Question 21: Predicted value: [62.0]
Question 22: Predicted value: [58.0]
Question 23: Predicted value: [53.0]
Questio

##### Google gemini-2.5-flash-preview-04-17

In [12]:
import time
from math import ceil

# ...existing code...

# Generate predictions
predictions = []
true_values = []
BATCH_SIZE = 10  # Process 10 questions at a time
DELAY_SECONDS = 90  # Wait 1.5 minutes between batches

true_value = extract_number(validation_answers)
if true_value is not None:
    true_values.extend(true_value)

# Calculate number of batches
num_batches = ceil(len(questions_prompt) / BATCH_SIZE)

for batch in range(num_batches):
    start_idx = batch * BATCH_SIZE
    end_idx = min((batch + 1) * BATCH_SIZE, len(questions_prompt))

    print(f"\nProcessing batch {batch + 1}/{num_batches}")

    # Process current batch
    for i in range(start_idx, end_idx):
        try:
            response = client.models.generate_content(
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt,
                ),
                model="gemini-2.5-flash-preview-04-17",
                contents=questions_prompt[i],
            )
            pred_value = extract_number(response.text)
            print(f"Question {i + 1}: Predicted value: {pred_value}")

            if pred_value is not None:
                predictions.extend(pred_value)

        except Exception as e:
            print(f"Error processing question {i + 1}: {e}")

    # Wait between batches, except for the last batch
    if batch < num_batches - 1:
        print(f"\nWaiting {DELAY_SECONDS} seconds before next batch...")
        time.sleep(DELAY_SECONDS)

print("\nPreds value len: ", len(predictions))
print("True value len: ", len(true_values))

# Convert to numpy arrays
y_true = np.array(true_values)
y_pred = np.array(predictions)

# Calculate metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
mape = calculate_mape(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("\nEvaluation Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R2 Score: {r2:.4f}")


Processing batch 1/32
Question 1: Predicted value: [52.0]
Question 2: Predicted value: [63.0]
Question 3: Predicted value: [63.0]
Question 4: Predicted value: [76.0]
Question 5: Predicted value: [81.0]
Question 6: Predicted value: [87.0]
Question 7: Predicted value: [97.0]
Question 8: Predicted value: [84.0]
Question 9: Predicted value: [100.0]
Question 10: Predicted value: [70.0]

Waiting 90 seconds before next batch...

Processing batch 2/32
Question 11: Predicted value: [98.0]
Question 12: Predicted value: [91.0]
Question 13: Predicted value: [89.0]
Question 14: Predicted value: [83.0]
Question 15: Predicted value: [82.0]
Question 16: Predicted value: [81.0]
Question 17: Predicted value: [80.0]
Question 18: Predicted value: [82.0]
Question 19: Predicted value: [74.0]
Question 20: Predicted value: [74.0]

Waiting 90 seconds before next batch...

Processing batch 3/32
Question 21: Predicted value: [62.0]
Question 22: Predicted value: [60.0]
Question 23: Predicted value: [53.0]
Questi

ValueError: Found input variables with inconsistent numbers of samples: [313, 314]

### Groq preds

##### llama-3.3-70b-versatile

In [12]:
import time
from math import ceil

# ...existing code...

# Generate predictions
predictions = []
true_values = []
BATCH_SIZE = 15  # Process 10 questions at a time
DELAY_SECONDS = 10  # Wait 1.5 minutes between batches

true_value = extract_number(validation_answers)
if true_value is not None:
    true_values.extend(true_value)

# Calculate number of batches
num_batches = ceil(len(questions_prompt) / BATCH_SIZE)

for batch in range(num_batches):
    start_idx = batch * BATCH_SIZE
    end_idx = min((batch + 1) * BATCH_SIZE, len(questions_prompt))

    print(f"\nProcessing batch {batch + 1}/{num_batches}")

    # Process current batch
    for i in range(start_idx, end_idx):
        try:
            response = groq_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": questions_prompt[i],
                    },
                ],
                model="llama-3.3-70b-versatile",
            )
            pred_value = extract_number(response.choices[0].message.content)
            print(f"Question {i + 1}: Predicted value: {pred_value}")

            if pred_value is not None:
                predictions.extend(pred_value)

        except Exception as e:
            print(f"Error processing question {i + 1}: {e}")

    # Wait between batches, except for the last batch
    if batch < num_batches - 1:
        print(f"\nWaiting {DELAY_SECONDS} seconds before next batch...")
        time.sleep(DELAY_SECONDS)

print("\nPreds value len: ", len(predictions))
print("True value len: ", len(true_values))

# Convert to numpy arrays
y_true = np.array(true_values)
y_pred = np.array(predictions)

# Calculate metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
mape = calculate_mape(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("\nEvaluation Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R2 Score: {r2:.4f}")


Processing batch 1/21
Question 1: Predicted value: [51.0]
Question 2: Predicted value: [60.0]
Question 3: Predicted value: [64.0]
Question 4: Predicted value: [76.0]
Question 5: Predicted value: [82.0]
Question 6: Predicted value: [89.0]
Question 7: Predicted value: [95.0]
Question 8: Predicted value: [89.0]
Question 9: Predicted value: [96.0]
Question 10: Predicted value: [79.0]
Question 11: Predicted value: [95.0]
Question 12: Predicted value: [93.0]
Question 13: Predicted value: [89.0]
Question 14: Predicted value: [86.0]
Question 15: Predicted value: [82.0]

Waiting 10 seconds before next batch...

Processing batch 2/21
Question 16: Predicted value: [80.0]
Question 17: Predicted value: [78.0]
Question 18: Predicted value: [82.0]
Question 19: Predicted value: [74.0]
Question 20: Predicted value: [74.0]
Question 21: Predicted value: [60.0]
Question 22: Predicted value: [61.0]
Question 23: Predicted value: [51.0]
Question 24: Predicted value: [56.0]
Question 25: Predicted value: [59.

##### llama3-70b-8192

In [12]:
import time
from math import ceil

# ...existing code...

# Generate predictions
predictions = []
true_values = []
BATCH_SIZE = 30  # Process 10 questions at a time
DELAY_SECONDS = 90  # Wait 1.5 minutes between batches

true_value = extract_number(validation_answers)
if true_value is not None:
    true_values.extend(true_value)

# Calculate number of batches
num_batches = ceil(len(questions_prompt) / BATCH_SIZE)

for batch in range(num_batches):
    start_idx = batch * BATCH_SIZE
    end_idx = min((batch + 1) * BATCH_SIZE, len(questions_prompt))

    print(f"\nProcessing batch {batch + 1}/{num_batches}")

    # Process current batch
    for i in range(start_idx, end_idx):
        try:
            response = groq_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": questions_prompt[i],
                    },
                ],
                model="llama3-70b-8192",
            )
            pred_value = extract_number(response.choices[0].message.content)
            print(f"Question {i + 1}: Predicted value: {pred_value}")

            if pred_value is not None:
                predictions.extend(pred_value)

        except Exception as e:
            print(f"Error processing question {i + 1}: {e}")

    # Wait between batches, except for the last batch
    if batch < num_batches - 1:
        print(f"\nWaiting {DELAY_SECONDS} seconds before next batch...")
        time.sleep(DELAY_SECONDS)

print("\nPreds value len: ", len(predictions))
print("True value len: ", len(true_values))

# Convert to numpy arrays
y_true = np.array(true_values)
y_pred = np.array(predictions)

# Calculate metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
mape = calculate_mape(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("\nEvaluation Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R2 Score: {r2:.4f}")


Processing batch 1/11
Question 1: Predicted value: [49.0]
Question 2: Predicted value: [64.0]
Question 3: Predicted value: [63.0]
Question 4: Predicted value: [77.0]
Question 5: Predicted value: [82.0]
Question 6: Predicted value: [89.0]
Question 7: Predicted value: [97.0]
Question 8: Predicted value: [87.0]
Question 9: Predicted value: [96.0]
Question 10: Predicted value: [85.0]
Question 11: Predicted value: [94.0]
Question 12: Predicted value: [92.0]
Question 13: Predicted value: [89.0]
Question 14: Predicted value: [84.0]
Question 15: Predicted value: [80.0]
Question 16: Predicted value: [81.0]
Question 17: Predicted value: [80.0]
Question 18: Predicted value: [82.0]
Question 19: Predicted value: [74.0]
Question 20: Predicted value: [73.0]
Question 21: Predicted value: [60.0]
Question 22: Predicted value: [59.0]
Question 23: Predicted value: [53.0]
Question 24: Predicted value: [60.0]
Question 25: Predicted value: [61.0]
Question 26: Predicted value: [64.0]
Question 27: Predicted v