In [15]:
%pip install pandas google-generativeai

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import time
import re
import pandas as pd
import json
from datetime import datetime
import google.generativeai as genai

# === CONFIG ===
input_csv = r"gemini.csv"
output_csv = r"gemini_results.csv"

# One JSON object per line. Questions are later looked up positionally
# (data[test_id]["input"]), so the list must keep the file's line order.
clapnq_file = r"clapnq_train_answerable.jsonl"
with open(clapnq_file, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Prefer a key supplied through the environment so the secret never has to be
# saved inside the notebook; the literal stays as a fallback so the cell behaves
# as before when GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "GEMINI_KEY"))
model = genai.GenerativeModel("gemini-2.5-flash")

# === DATA LOADING ===
df = pd.read_csv(input_csv, encoding="utf-8")

def parse_gemini_output(raw_output):
    """Split a batched model reply into a ``{id: answer}`` dictionary.

    The reply is expected to contain entries shaped like ``ID <id>: <answer>``.
    An answer extends up to the next line beginning with ``ID <n>:`` or to the
    end of the text. Its non-empty lines are stripped and joined with single
    spaces.

    Args:
        raw_output: Text returned by the model. ``None`` or an empty string
            yields an empty dict.

    Returns:
        dict mapping each integer ID to its cleaned answer. An ID with a blank
        answer maps to ``""``. If an ID occurs more than once, the last
        occurrence wins.
    """
    if not raw_output:
        return {}

    # After the colon only spaces/tabs are skipped. Skipping any whitespace would
    # also eat the newline of a blank answer, the look-ahead for the next "ID n:"
    # line could then never match, and the blank entry would swallow the next one.
    # The leading \b keeps words that merely end in "ID" (e.g. "COVID 19:") from
    # being read as an entry.
    pattern = r"\bID\s+(\d+)\s*:[ \t]*(.*?)(?=\nID\s+\d+\s*:|$)"

    answers = {}
    for tid_str, ans in re.findall(pattern, raw_output, flags=re.DOTALL):
        stripped_lines = (line.strip() for line in ans.splitlines())
        answers[int(tid_str)] = " ".join(line for line in stripped_lines if line)
    return answers


# === FUNCTION ===
def query_batch(df_batch):
    """Send one batch of context/question pairs to the model and parse its reply.

    For each row of ``df_batch`` the ``test_id`` and ``modified_text`` columns are
    read; the question comes from the module-level ``data`` list at index
    ``test_id`` (key ``"input"``). All pairs are packed into a single prompt that
    is sent through the module-level ``model``.

    Prints the first 1000 characters of the raw reply and a short report about
    IDs of the batch that are absent from the parsed reply.

    Returns:
        The ``{id: answer}`` dict produced by ``parse_gemini_output``.
    """
    instructions = (
        "You must answer the question only and exclusively based on the provided contexts, "
        "ignoring any prior knowledge or assumptions. Your answers must be detailed, exhaustive and complete. "
        "Do not explicitly cite the text.\n\n"
        "You will be given multiple pairs of context and question. "
        "For each pair, answer ONLY based on its context. "
        "Return the answers in the format:\n\n"
        "ID <id>: <answer>\n\n"
        "Do not skip any ID, and do not merge answers.\n\n"
    )

    # One "ID / Context / Question" block per row
    blocks = []
    for _, record in df_batch.iterrows():
        test_id = int(record["test_id"])
        passage = record["modified_text"]
        query = data[test_id]["input"]
        blocks.append(f"ID {test_id}\nContext: {passage}\nQuestion: {query}")

    prompt = instructions + "\n\n".join(blocks)
    response = model.generate_content(prompt)

    # Concatenate every non-empty text part of every candidate
    text_chunks = [
        part.text
        for cand in response.candidates
        for part in cand.content.parts
        if hasattr(part, "text") and part.text
    ]
    raw_output = "".join(text_chunks).strip()

    print("=== RAW OUTPUT GEMINI ===")
    print(raw_output[:1000])
    print("=========================")

    answers = parse_gemini_output(raw_output)

    if not answers:
        print("No IDs")
        return answers

    max_id = max(answers.keys())
    missing_ids = set(df_batch["test_id"]) - set(answers.keys())
    print(f"Output generated: {max_id}")
    if missing_ids:
        preview = sorted(missing_ids)[:10]
        ellipsis = "..." if len(missing_ids) > 10 else ""
        print(f"{len(missing_ids)} ID are missing in this batch: {preview}{ellipsis}")

    return answers

all_results = []
batch_size = 250
# First row of df to process. A non-zero value resumes a previous run;
# set it to 0 to process the whole file.
start_row = 15000

try:
    for start in range(start_row, len(df), batch_size):
        end = min(start + batch_size, len(df))
        df_batch = df.iloc[start:end].copy()

        print(f"‚û°Ô∏è Processing rows {start}‚Äì{end-1} ({len(df_batch)} righe)")

        answers = query_batch(df_batch)

        # IDs absent from `answers` become NaN in the new column
        df_batch["gemini_answer"] = df_batch["test_id"].map(answers)
        df_batch["timestamp_gemini"] = datetime.now().isoformat()

        all_results.append(df_batch)

        print(f"‚úÖ Finished rows {start}‚Äì{end-1}, sleeping 60s...")
        time.sleep(60)
finally:
    # Runs on success and on failure, so batches that already completed are
    # written out even if a later API call raises.
    if all_results:
        final_df = pd.concat(all_results, ignore_index=True)
        final_df.to_csv(output_csv, index=False, encoding="utf-8")
        print(f"Saved: {output_csv} with {len(final_df)} rows")
    else:
        # pd.concat raises ValueError on an empty list; also avoid overwriting
        # an existing results file with nothing.
        print(f"No batch completed (start_row={start_row}, rows in input={len(df)}); nothing saved")

➡️ Processing rows 15000–15249 (250 righe)
=== RAW OUTPUT GEMINI ===
ID 1395: The Jacobins were the most influential political club during the French Revolution, initially founded in 1789 by anti-Royalist deputies from Brittany. After 1792, they were renamed the Society of the Jacobins, Friends of Freedom and Equality. The Club grew into a nationwide movement that favored a Republican form of government. The Jacobin Club was heterogeneous, including prominent parliamentary factions of the early 1790s such as the Mountain and the Girondins.

ID 1396: The average rate of return (ARR) should be high. When evaluating a project, if the ARR is equal to or greater than the required rate of return, the project is acceptable. If it is less than the desired rate, it should be rejected. When comparing different investments, a higher ARR indicates a more attractive investment.

ID 1397: Tara dies after being attacked by Gemma. Gemma initially hit Tara with an iron. Tara then struggled agains

In [None]:
import pandas as pd

# === CONFIG ===
file_csv = r"gemini_results.csv"

df = pd.read_csv(file_csv, encoding="utf-8")

# Flag answers whose trimmed text ends with a colon.
# astype(str) turns missing values into the text "nan", which never matches.
trimmed_answers = df["gemini_answer"].astype(str).str.strip()
colon_mask = trimmed_answers.str.endswith(":")
colon_rows = df.loc[colon_mask]

# Report how many were found and show up to 20 of them
print(f" Num with ':' at the end of the response: {len(colon_rows)}")
print(colon_rows[["test_id","gemini_answer"]].head(20))


⚠ Numero di risposte che finiscono con ':': 0
Empty DataFrame
Columns: [test_id, gemini_answer]
Index: []


In [None]:
import pandas as pd

# === CONFIG ===
# Input and output point at the same file: it is rewritten in place.
file_csv = r"gemini_results.csv"
output_csv = r"gemini_results.csv"

df = pd.read_csv(file_csv, encoding="utf-8")

# True where the trimmed answer text ends with a colon
trimmed_answers = df["gemini_answer"].astype(str).str.strip()
mask_colon = trimmed_answers.str.endswith(":")

num_eliminate = mask_colon.sum()
print(f"Deleted words: {num_eliminate}")

# Blank out the flagged answers, then write the frame back to disk
df.loc[mask_colon, "gemini_answer"] = ""
df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"üòç File salvato in: {output_csv}")

⚠ Risposte svuotate perché terminavano con ':': 0
😍 File salvato in: C:\Users\Elisa\Desktop\TESI\righe_mancanti_FIXED.csv


In [None]:
import pandas as pd
import json
from datetime import datetime
import google.generativeai as genai
import re

import os  # imported here because this cell's import block does not provide it

input_csv = r"gemini_results.csv"
output_csv = r"gemini_results.csv"

# One JSON object per line. Questions are later looked up positionally
# (data[test_id]["input"]), so the list must keep the file's line order.
clapnq_file = r"clapnq_train_answerable.jsonl"
with open(clapnq_file, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Prefer a key supplied through the environment so the secret never has to be
# saved inside the notebook; the literal stays as a fallback so the cell behaves
# as before when GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "GEMINI_KEY"))
model = genai.GenerativeModel("gemini-2.5-flash")

df = pd.read_csv(input_csv, encoding="utf-8")

# A row counts as unanswered when its answer is NaN or whitespace-only
missing_mask = df["gemini_answer"].isna() | (df["gemini_answer"].astype(str).str.strip() == "")
missing_df = df[missing_mask].copy()
print(f"Rows with no response: {len(missing_df)}")


def parse_gemini_output(raw_output):
    """Split a batched model reply into a ``{id: answer}`` dictionary.

    The reply is expected to contain entries shaped like ``ID <id>: <answer>``.
    An answer extends up to the next line beginning with ``ID <n>:`` or to the
    end of the text. Its non-empty lines are stripped and joined with single
    spaces.

    Args:
        raw_output: Text returned by the model. ``None`` or an empty string
            yields an empty dict.

    Returns:
        dict mapping each integer ID to its cleaned answer. An ID with a blank
        answer maps to ``""``. If an ID occurs more than once, the last
        occurrence wins.
    """
    if not raw_output:
        return {}

    # After the colon only spaces/tabs are skipped. Skipping any whitespace would
    # also eat the newline of a blank answer, the look-ahead for the next "ID n:"
    # line could then never match, and the blank entry would swallow the next one.
    # The leading \b keeps words that merely end in "ID" (e.g. "COVID 19:") from
    # being read as an entry.
    pattern = r"\bID\s+(\d+)\s*:[ \t]*(.*?)(?=\nID\s+\d+\s*:|$)"

    answers = {}
    for tid_str, ans in re.findall(pattern, raw_output, flags=re.DOTALL):
        stripped_lines = (line.strip() for line in ans.splitlines())
        answers[int(tid_str)] = " ".join(line for line in stripped_lines if line)
    return answers


def query_batch(df_batch, max_retries=3):
    """Query the model for a batch, re-asking for IDs that are still unanswered.

    Each attempt builds one prompt from the rows that still lack an answer
    (columns ``test_id`` and ``modified_text``; the question is taken from the
    module-level ``data`` list at index ``test_id``, key ``"input"``), sends it
    through the module-level ``model`` and parses the reply with
    ``parse_gemini_output``.

    Only non-blank answers whose ID was part of the current request are kept.
    A blank answer, or an answer for an ID that was never asked about, does not
    count as success: the former is retried, the latter is discarded so it can
    never be handed back to the caller.

    Args:
        df_batch: DataFrame holding the rows to query.
        max_retries: Maximum number of requests sent for this batch.

    Returns:
        dict mapping integer IDs to answers, accumulated over all attempts.
        IDs still unanswered after ``max_retries`` attempts are absent.
    """
    all_answers = {}
    attempt = 0
    remaining = df_batch.copy()

    while attempt < max_retries and not remaining.empty:
        attempt += 1
        print(f"üîÑ Attempt {attempt}, rows: {len(remaining)}")

        pairs = []
        for _, row in remaining.iterrows():
            tid = int(row["test_id"])
            context = row["modified_text"]
            question = data[tid]["input"]
            pairs.append(f"ID {tid}\nContext: {context}\nQuestion: {question}")

        prompt = (
            "You must answer the question only and exclusively based on the provided contexts, "
            "ignoring any prior knowledge or assumptions. Your answers must be detailed, exhaustive and complete. "
            "Do not explicitly cite the text.\n\n"
            "You will be given multiple pairs of context and question. "
            "For each pair, answer ONLY based on its context. "
            "Return the answers in the format:\n\n"
            "ID <id>: <answer>\n\n"
            "Do not skip any ID, and do not merge answers.\n\n"
            + "\n\n".join(pairs)
        )

        response = model.generate_content(prompt)

        # Collect all the text parts of the reply
        raw_output = ""
        for cand in response.candidates:
            for part in cand.content.parts:
                if hasattr(part, "text") and part.text:
                    raw_output += part.text
        raw_output = raw_output.strip()

        # Plain ints so membership tests agree with the parser's int keys
        expected_ids = {int(tid) for tid in remaining["test_id"]}

        # Keep only usable answers: requested ID and non-blank text
        answers = {
            tid: ans
            for tid, ans in parse_gemini_output(raw_output).items()
            if tid in expected_ids and ans
        }
        all_answers.update(answers)

        missing_ids = expected_ids - set(answers)
        if missing_ids:
            print(f"‚ö† Mancano ancora {len(missing_ids)} ID dopo tentativo {attempt}")
            remaining = remaining[remaining["test_id"].isin(missing_ids)]
        else:
            remaining = pd.DataFrame()

    return all_answers

batch_size = 100
# Offset into missing_df at which to start. A non-zero value resumes a
# previous run; set it to 0 to go through every unanswered row.
start_row = 4200
updated_ids = set()

try:
    for start in range(start_row, len(missing_df), batch_size):
        end = min(start + batch_size, len(missing_df))
        df_batch = missing_df.iloc[start:end].drop_duplicates(subset="test_id").copy()
        print(f"‚û° Batch {start}-{end-1}, {len(df_batch)} unique rows")

        answers = query_batch(df_batch)

        updated_in_batch = 0
        for tid, ans in answers.items():
            if not ans:
                # Writing a blank answer would leave the row unanswered
                continue
            # NOTE(review): this selects every still-unanswered row of df with
            # this test_id, not only the row that was in the batch. If rows that
            # share a test_id can carry different modified_text, they all get
            # the answer produced for one context - confirm this is intended.
            mask = (df["test_id"] == tid) & (
                df["gemini_answer"].isna() | (df["gemini_answer"].astype(str).str.strip() == "")
            )
            if mask.any():
                df.loc[mask, "gemini_answer"] = ans
                df.loc[mask, "timestamp_gemini"] = datetime.now().isoformat()
                updated_ids.add(tid)
                updated_in_batch += 1

        # Count the IDs actually written, not the number of answers returned
        print(f"Updated {updated_in_batch} IDs in this batch")
finally:
    # Runs on success and on failure, so answers already written into df are
    # not lost when a later API call raises.
    df.to_csv(output_csv, index=False, encoding="utf-8")


⚠ Righe senza risposta: 6129
➡ Recupero batch 4200-4299, 100 righe uniche
🔄 Attempt 1, righe da chiedere: 100

ID 1496: The provided context does not state when Baron Pierre de Coubertin brought back the Olympics. It states that the congress was held on 23 June 1894, during which the commission's proposals for the Olympic Games were accepted unanimously, and the modern Olympic movement was officially born. Coubertin, along with Demetrius Vikelas, C. Herbert, and W.M. Sloane, helped lead the efforts of the commission on reviving the Olympics.

ID 1497: The provided context states that the contracted forms "Dr" or "Dr." are used as a designation for a person who has obtained a Doctorate (e.g., PhD). It does not provide any information about a difference between "Dr." and "Dr".

ID 1498: The provided context states that standing House committees such as the Ways and Means Committee are important with respect to impact on policy due to their wide jurisdiction. They are also seen as 