In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data paths used later in this notebook are relative
# ("drive/MyDrive/..."), so they only resolve to this mount when the working
# directory is /content — confirm the runtime's cwd before relying on it.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import time
from sklearn.metrics import classification_report, confusion_matrix
from google.colab import ai

In [None]:
# --- Run configuration -------------------------------------------------------

# Input CSV with the labelled abstracts, and the folder where outputs are saved.
DATA_PATH = "drive/MyDrive/eq_5d/eq-5d-200-records.csv"
SAVE_DIR = "drive/MyDrive/eq_5d"

# True  -> prompts are built with labelled examples (few-shot).
# False -> prompts contain only the task description (zero-shot).
USE_FEW_SHOT = True

# Model identifiers queried in turn; each one yields its own predictions file.
models = [
    # Gemini family
    "google/gemini-2.0-flash",
    "google/gemini-2.0-flash-lite",
    "google/gemini-2.5-flash",
    "google/gemini-2.5-flash-lite",
    "google/gemini-2.5-pro",
    # Gemma family
    "google/gemma-3-12b",
    "google/gemma-3-1b",
    "google/gemma-3-27b",
    "google/gemma-3-4b",
]

In [None]:
# Load the labelled abstracts. The rest of the notebook reads three columns:
# "No" (record id), "Abstract" (text sent to the models) and "Label" (1/0 truth).
df = pd.read_csv(DATA_PATH)

# Fail fast on schema problems instead of part-way through the long API loop.
_required_columns = {"No", "Abstract", "Label"}
_missing_columns = _required_columns - set(df.columns)
if _missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing required column(s): {sorted(_missing_columns)}"
    )

# An empty Abstract cell is read as NaN (a float); prompt building calls
# .strip() on it and would abort the run with AttributeError at that record.
_n_missing_abstracts = int(df["Abstract"].isna().sum())
if _n_missing_abstracts:
    raise ValueError(
        f"{DATA_PATH} has {_n_missing_abstracts} record(s) with a missing Abstract"
    )


In [None]:
def make_zero_shot_prompt(abstract):
    """Build the zero-shot classification prompt for a single abstract.

    The prompt states the task, prescribes a two-line answer format
    ("Prediction: ..." then "Confidence: ...") and appends the abstract,
    stripped of surrounding whitespace, inside triple double quotes.

    Args:
        abstract: Abstract text as a ``str``; ``.strip()`` is called on it.

    Returns:
        The prompt string (it begins and ends with a newline).
    """
    # The header used to say "exactly one line" while the format right below it
    # spans two lines; the wording now matches the format the parser expects.
    return f"""
You are a biomedical text classification expert.

Task: Determine whether the following clinical study abstract provides **explicit evidence** that the EQ-5D instrument (or a variant like EQ-5D-3L, EQ-5D-5L, EuroQol-5D) was actually **used** in the study's methods or results.

Answer format (exactly two lines):
Prediction: [Yes/No]
Confidence: [number between 0 and 100]

Abstract:
\"\"\"{abstract.strip()}\"\"\"
"""

In [None]:
def make_few_shot_prompt(abstract, examples):
    """Build the few-shot classification prompt for a single abstract.

    The prompt contains the task description, a two-line answer format, one
    worked example per row of ``examples``, and finally the abstract to
    classify (stripped, wrapped in triple double quotes).

    Args:
        abstract: Abstract text to classify; ``.strip()`` is called on it.
        examples: DataFrame iterated row by row; each row must provide an
            "Abstract" string and a "Label" value. A label equal to 1 is shown
            as "Yes" with confidence 90, any other label as "No" with
            confidence 85 (fixed illustrative values, not measured ones).

    Returns:
        The prompt string, ending right after the quoted target abstract.
    """
    # The header used to say "exactly one line" while the format right below it
    # spans two lines; the wording now matches the format the parser expects.
    prompt = """You are a biomedical text classification expert.

Task: Determine whether the following clinical study abstract provides **explicit evidence** that the EQ-5D instrument (or a variant like EQ-5D-3L, EQ-5D-5L, EuroQol-5D) was actually **used** in the study's methods or results.

Answer format (exactly two lines):
Prediction: [Yes/No]
Confidence: [number between 0 and 100]

Here are examples:

"""
    for _, ex in examples.iterrows():
        is_positive = ex["Label"] == 1
        label = "Yes" if is_positive else "No"
        conf = "90" if is_positive else "85"
        # Each example is rendered in the same shape the model must answer in.
        prompt += f"""Abstract:
\"\"\"{ex['Abstract'].strip()}\"\"\"
Prediction: {label}
Confidence: {conf}

"""
    prompt += f"""\nNow classify this new abstract:

Abstract:
\"\"\"{abstract.strip()}\"\"\""""
    return prompt

def parse_prediction(response):
    """Extract the Yes/No prediction and the confidence from a model reply.

    Both fields are matched case-insensitively, and light decoration around
    the value is tolerated: markdown bold (``**Prediction:** Yes``) and the
    square brackets shown in the prompt's answer template
    (``Prediction: [Yes]``).

    Args:
        response: Raw text returned by the model. Anything that is not a
            ``str`` (for example ``None``) yields the defaults.

    Returns:
        ``(pred, conf)`` where ``pred`` is "Yes" or "No" and ``conf`` is an
        ``int`` clamped to 0..100. A field that cannot be found keeps its
        default: "No" for the prediction, 50 for the confidence.
    """
    pred, conf = "No", 50
    # Explicit type guard instead of a bare ``except`` that hid every error.
    if not isinstance(response, str):
        return pred, conf

    # After the label: optional bold markers, the colon, then any mix of
    # whitespace, "*" and "[" before the actual value.
    match_pred = re.search(r"Prediction\**\s*:[\s*\[]*(Yes|No)\b", response, re.I)
    match_conf = re.search(r"Confidence\**\s*:[\s*\[]*(\d+)", response, re.I)

    if match_pred:
        pred = match_pred.group(1).capitalize()
    if match_conf:
        # Clamp: the value is later divided by 100 and used as a vote weight,
        # so an out-of-range number must not dominate the ensemble.
        conf = max(0, min(100, int(match_conf.group(1))))
    return pred, conf

In [None]:
# Number of labelled examples drawn per class for the few-shot prompt.
N_FEW_SHOT_PER_CLASS = 20

if USE_FEW_SHOT:
    pos_pool = df[df["Label"] == 1]
    neg_pool = df[df["Label"] == 0]

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the size of each class. The fixed random_state
    # keeps the selection reproducible across runs.
    pos_examples = pos_pool.sample(
        min(N_FEW_SHOT_PER_CLASS, len(pos_pool)), random_state=42
    )
    neg_examples = neg_pool.sample(
        min(N_FEW_SHOT_PER_CLASS, len(neg_pool)), random_state=42
    )

    # Positives first, then negatives; the original df index labels are kept.
    # NOTE(review): these examples come from the same `df` that is scored
    # later, so a sampled row is evaluated with its own label in the prompt
    # unless the code building the prompt excludes it.
    few_shot_examples = pd.concat([pos_examples, neg_examples])


In [None]:
# Query every model on every record and collect one predictions frame per model.
all_results = []

for model_name in models:
    model_results = []
    print(f"\nRunning predictions for {model_name}")

    for idx, row in df.iterrows():
        if USE_FEW_SHOT:
            # The few-shot examples are sampled from `df` itself. Drop the
            # record being classified from the examples so its abstract and
            # gold label never appear in its own prompt (label leakage).
            # errors="ignore" makes this a no-op for rows that are not examples.
            examples = few_shot_examples.drop(index=idx, errors="ignore")
            prompt = make_few_shot_prompt(row["Abstract"], examples)
        else:
            prompt = make_zero_shot_prompt(row["Abstract"])

        try:
            response = ai.generate_text(
                prompt=prompt,
                model_name=model_name
            )
            pred, conf = parse_prediction(response)
        except Exception as e:
            # Best effort: log the failure and record the same neutral default
            # the parser uses, so one failed call does not abort the whole run.
            print(f"Error with {model_name}, doc {row['No']}: {e}")
            pred, conf = "No", 50

        model_results.append({
            "No": row["No"],
            "True_Label": row["Label"],
            "Prediction": 1 if pred == "Yes" else 0,
            "Confidence": conf
        })

        # Fixed one-second pause between requests — presumably rate limiting.
        time.sleep(1.0)

    # Save per-model predictions ('/' in the model id is not valid in a file name)
    model_df = pd.DataFrame(model_results)
    model_df.to_csv(f"{SAVE_DIR}/preds_{model_name.replace('/','_')}.csv", index=False)
    all_results.append((model_name, model_df))

In [None]:
# Per-model ensemble weight: the weighted-average F1 of that model's predictions.
# NOTE(review): the weights are computed from the same labels the ensemble is
# scored against afterwards — confirm this is acceptable for the evaluation.
model_weights = {}

for model_name, model_df in all_results:
    y_true = model_df["True_Label"]
    y_pred = model_df["Prediction"]

    # Human-readable report for the notebook output.
    print(f"\n Results for {model_name}")
    print(classification_report(y_true, y_pred))

    # Same report as a dict, used only to pull out the weighted-average F1.
    report = classification_report(y_true, y_pred, output_dict=True)
    f1 = report["weighted avg"]["f1-score"]
    model_weights[model_name] = f1

In [None]:
def weighted_vote(row, model_dfs, model_weights):
    """Combine the per-model predictions for one record into a single 0/1 vote.

    Each model contributes ``weight * confidence / 100`` to the side it
    predicted (1 = yes, anything else = no). The side with the larger total
    wins; a tie, including the case of no models at all, resolves to 1.

    Args:
        row: Mapping-like record; only ``row["No"]`` (the record id) is read.
        model_dfs: Iterable of ``(model_name, DataFrame)`` pairs whose frames
            have "No", "Prediction" and "Confidence" columns.
        model_weights: Mapping of model name to weight; a model without an
            entry gets weight 1.0.

    Returns:
        1 if the "yes" total is at least the "no" total, otherwise 0.

    Raises:
        IndexError: If a model's frame has no row for this record id.
    """
    record_id = row["No"]
    totals = {"yes": 0, "no": 0}

    for name, preds in model_dfs:
        # First row matching this record id in the model's frame.
        hits = preds.loc[preds["No"] == record_id]
        vote = hits["Prediction"].values[0]
        confidence = hits["Confidence"].values[0]

        contribution = model_weights.get(name, 1.0) * (confidence / 100)
        side = "yes" if vote == 1 else "no"
        totals[side] += contribution

    return int(totals["yes"] >= totals["no"])

# One ensemble decision per record: id, ground truth, and the weighted vote
# across all models' saved predictions.
ensemble_preds = [
    {
        "No": row["No"],
        "True_Label": row["Label"],
        "Ensemble_Pred": weighted_vote(row, all_results, model_weights),
    }
    for _, row in df.iterrows()
]

# Persist the combined predictions next to the per-model files.
ensemble_df = pd.DataFrame(ensemble_preds)
ensemble_df.to_csv(f"{SAVE_DIR}/ensemble_predictions.csv", index=False)

In [None]:
# Score the ensemble against the ground-truth labels.
y_true = ensemble_df["True_Label"]
y_pred = ensemble_df["Ensemble_Pred"]

print("\n Ensemble Results")
print(classification_report(y_true, y_pred))

# Rows are true classes, columns are predicted classes.
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))