# Setup

In [1]:
!pip install ratelimit

Collecting ratelimit
  Downloading ratelimit-2.2.1.tar.gz (5.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: ratelimit
  Building wheel for ratelimit (setup.py) ... [?25ldone
[?25h  Created wheel for ratelimit: filename=ratelimit-2.2.1-py3-none-any.whl size=5895 sha256=23af6817aef02a9ce0c31bb2d6da13dad9f9ed661396952335db8360a1657ec2
  Stored in directory: /Users/idhibhatpankam/Library/Caches/pip/wheels/69/bd/e0/4a5dee2a1bfbc8e258f543f92940e2b494d63b5be8144ec8c4
Successfully built ratelimit
Installing collected packages: ratelimit
Successfully installed ratelimit-2.2.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Translating
the data will be translated from TH -> EN to improve the accuracy when used on LLMs.

In [None]:
# import os
# from google.cloud import translate_v2 as translate

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./gcp-key.json"
# client = translate.Client()

In [None]:
# train_df = pd.read_csv('train.csv')
# test_df = pd.read_csv('test.csv')

# answers_train = train_df[['ID', 'answer']]
# answers_test = test_df[['ID', 'answer']]
# answers_train.to_csv('answers_train_TH.csv', index=False)
# answers_test.to_csv('answers_test_TH.csv', index=False)

In [None]:
# answers_train['answer'] = answers_train['answer'].apply(lambda x: client.translate(x, source_language='th', target_language='en')['translatedText'])
# answers_test['answer'] = answers_test['answer'].apply(lambda x: client.translate(x, source_language='th', target_language='en')['translatedText'])

In [None]:
# answers_train.to_csv('answers_train_EN.csv', index=False)
# answers_test.to_csv('answers_test_EN.csv', index=False)

# Prompting

In [2]:
# from google.colab import userdata
# api_key = userdata.get('gemini_api_key')

import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

In [58]:
import google.generativeai as genai
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import pandas as pd
import re

In [None]:
genai.configure(api_key=api_key)
generation_config = genai.GenerationConfig(
        temperature=0,
)
model = genai.GenerativeModel("gemini-2.0-flash-001", generation_config=generation_config)

In [40]:
@sleep_and_retry
@limits(calls=15, period=60)
def generate_content_with_rate_limit(prompt):
  return model.generate_content(prompt).text

In [None]:
train_df = pd.read_csv('train_EN.csv')
test_df = pd.read_csv('test_EN.csv')

In [7]:
train_df.head()

Unnamed: 0,ID,set,question,answer,score
0,0,Q2,Hamtube is an online video viewing platform th...,The granularity should be low because we want ...,0.0
1,1,Q3,Hamtube is an online video viewing platform th...,Agree because it is collecting data from multi...,5.0
2,2,Q2,Hamtube is an online video viewing platform th...,Granularity should be #checkout events/ #cooki...,5.0
3,3,Q3,Hamtube is an online video viewing platform th...,"Agree. Let X~Binomial(N,p), where p is the rat...",2.0
4,4,Q1,Hamtube is an online video viewing platform th...,"Agreed, because A/B Testing is an effective wa...",4.5


In [8]:
test_df.head()

Unnamed: 0,ID,set,question,answer
0,362,Q3,Hamtube is an online video viewing platform th...,"Yes, because in this problem, the variable is ..."
1,363,Q4,Hamtube is an online video viewing platform th...,50/50 has the advantage of being able to colle...
2,364,Q2,Hamtube is an online video viewing platform th...,1. Number of views 2. Number of ad clicks 3. T...
3,365,Q3,Hamtube is an online video viewing platform th...,"Agree, because a click can have two possible o..."
4,366,Q4,Hamtube is an online video viewing platform th...,50/50 - It will take less time to collect data...


In [9]:
train_df['score'].value_counts()

score
5.00    150
1.00     58
0.00     50
3.00     26
2.00     20
4.00     15
0.50     10
4.50      9
3.50      8
1.50      8
2.50      5
4.25      1
0.75      1
4.75      1
Name: count, dtype: int64

In [12]:
Q1_train = train_df[train_df['set'] == 'Q1']
Q2_train = train_df[train_df['set'] == 'Q2']
Q3_train = train_df[train_df['set'] == 'Q3']
Q4_train = train_df[train_df['set'] == 'Q4']

Q1_test = test_df[test_df['set'] == 'Q1']
Q2_test = test_df[test_df['set'] == 'Q2']
Q3_test = test_df[test_df['set'] == 'Q3']
Q4_test = test_df[test_df['set'] == 'Q4']

In [35]:
Q1_train['question'].iloc[0]

'Hamtube is an online video viewing platform that allows users to upload, share, and watch videos. Hamtaro is the head of Hamtube’s marketing team, and he wants to know if moving the placement of ads will increase sales (more users clicking on ads). So he decides to conduct A/B testing. Explain why Hamtaro should use A/B testing in his experiments, or offer a counter-argument if you disagree, and explain why.'

In [59]:
def extract_valid_score(response: str) -> float:
    allowed_scores = {i * 0.5 for i in range(11)}  # {0, 0.5, 1, ..., 5}
    # extract the first valid number from the response
    match = re.search(r"\d+(\.\d+)?", response.strip())
    
    if match:
        num = float(match.group())
        
        # round to nearest 0.5
        rounded_num = round(num * 2) / 2  # ensure 0.5 intervals
        if rounded_num in allowed_scores:
            return rounded_num
    
    return 0.0

In [60]:
extract_valid_score("The answer is 3.5")

3.5

In [61]:
def generate_predictions(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    prompt = f"""
You are a text regression model. Given an answer to a question, predict a numerical score between 0 and 5, with intervals of 0.5.
The question is: "{train_df['question'].iloc[0]}".
Here are some examples to help you get started:\n\n"""
    for i, row in train_df.iterrows():
        prompt += f"Answer: {row['answer']}\nScore: {row['score']}\n\n"
    
    # print(prompt)
    predictions = pd.DataFrame(columns=['ID', 'score'])
    # limit = 30
    # count = 0
    for i, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        prompt_with_test = f"{prompt}What score would you give to the following answer?\n\n{row['answer']}\n\n"
        response: str = generate_content_with_rate_limit(prompt_with_test)
        # print(response)
        score = extract_valid_score(response)
        predictions.loc[len(predictions)] = [row['ID'], score]
        # count += 1
        # if count == limit:
        #     break

    return predictions


In [53]:
pred_Q1 = generate_predictions(Q1_train, Q1_test)

100%|██████████| 22/22 [01:05<00:00,  2.99s/it]


In [55]:
pred_Q2 = generate_predictions(Q2_train, Q2_test)

100%|██████████| 23/23 [00:39<00:00,  1.73s/it]


In [56]:
pred_Q3 = generate_predictions(Q3_train, Q3_test)

100%|██████████| 23/23 [01:52<00:00,  4.87s/it]


In [62]:
pred_Q4 = generate_predictions(Q4_train, Q4_test)

100%|██████████| 22/22 [01:06<00:00,  3.01s/it]


In [None]:
pred_df = pd.concat([pred_Q1, pred_Q2, pred_Q3, pred_Q4])
pred_df = pred_df.sort_values(by='ID')
# remove .0 from ID
pred_df['ID'] = pred_df['ID'].astype(int)
pred_df.to_csv('predictions.csv', index=False)

In [66]:
pred_df

Unnamed: 0,ID,score
0,362,5.0
0,363,5.0
0,364,3.0
1,365,5.0
1,366,5.0
...,...,...
22,447,2.0
19,448,1.0
20,449,1.0
21,450,1.0
