In [None]:
import pandas as pd
import numpy as np
import time
import asyncio
import nest_asyncio
# Cell 1 (Modified): Install and Configure API
!pip install -q -U google-genai pandas  # <-- Added -U to upgrade

from google import genai
import pandas as pd
import random
import time
from google.colab import userdata
from tqdm.auto import tqdm # For a nice progress bar

In [None]:
# --- Configure the API ---
# Builds the module-level `client` that the scoring cells call.
try:
    # Get the key from Colab secrets
    GOOGLE_API_KEY = userdata.get('GEMINI_API_KEY')
    client = genai.Client(api_key=GOOGLE_API_KEY)
except Exception as e:
    print("Error: Could not configure Gemini API. Make sure 'GEMINI_API_KEY' is set in Colab secrets.")
    print(e)
    # Stop here: without `client` every later API call would fail one by one
    # instead of failing once with a clear cause.
    raise
else:
    print("Gemini API configured successfully.")

In [None]:
# Load the arXiv metadata (JSON Lines) and derive an integer `id` per paper.
# For quick experiments, point this at
# "/content/drive/MyDrive/google_colab/arxiv/sample-data-5000.json" instead.
ARXIV_METADATA_PATH = "/content/drive/MyDrive/google_colab/arxiv/arxiv-metadata-oai-snapshot.json"

# Read `id` as text. Left to dtype inference, identifiers such as "0704.0010"
# can be coerced to floats, which loses their leading/trailing zeros.
all_papers = pd.read_json(ARXIV_METADATA_PATH, lines=True, dtype={'id': str})

# Only identifiers of the form YYMM.NNNN / YYMM.NNNNN turn into an integer by
# dropping the dot. Anything else (e.g. old-style "hep-th/9901001") would make
# astype(int) raise, so those rows are skipped explicitly and reported.
has_numeric_id = all_papers['id'].str.match(r"\d{4}\.\d{4,5}$", na=False)
skipped_count = int((~has_numeric_id).sum())
if skipped_count:
    print(f"Skipping {skipped_count} papers whose arXiv id is not in YYMM.NNNNN form.")
    # Reset the index so positional and label-based access stay in sync.
    all_papers = all_papers.loc[has_numeric_id].reset_index(drop=True)

# Keep the original identifier as `arxiv_id`; `id` becomes the integer key
# used by the later cells. regex=False makes '.' a literal dot.
all_papers = all_papers.rename(columns={'id': 'arxiv_id'})
all_papers['id'] = all_papers['arxiv_id'].str.replace('.', '', regex=False).astype(int)

# Later cells look papers up by `id`, so it must identify exactly one row.
assert all_papers['id'].is_unique, "integer paper ids are not unique"

all_papers.head(5)


In [None]:
# Cell 4: Define the Async "Teacher" Function

# Metadata fields of each paper that are interpolated into the prompt.
_PROMPT_FIELDS = ('title', 'submitter', 'authors', 'categories', 'abstract')


def _paper_field(paper, key):
    """Return paper[key] as a scalar, unwrapping a one-element pandas Series."""
    value = paper[key]
    # Indexing a one-row DataFrame yields a one-element Series; formatting that
    # into the prompt would insert its repr (index, dtype footer) instead of
    # the text itself.
    if hasattr(value, "iloc") and len(value) == 1:
        value = value.iloc[0]
    return value


def _build_prompt(paper1, paper2):
    """Render the scoring prompt for the pair (Paper A = paper1, Paper B = paper2)."""
    a = {field: _paper_field(paper1, field) for field in _PROMPT_FIELDS}
    b = {field: _paper_field(paper2, field) for field in _PROMPT_FIELDS}
    return f"""
    You are a research assistant. On a scale of 0.0 to 1.0, how relevant is Paper B as a recommendation for someone who just finished reading Paper A?
    0.8 <=recommendation<= 1.0 is best recommendation
    0.5 <=recommendation < 0.8 is medium recommendation
    0.0 <= recommendation < 0.5 is negative recommendation
    Do not explain your reasoning. Respond ONLY with the numerical score (e.g., 0.85).

    Paper A:
    Title: {a['title']}
    Submitter: {a['submitter']}
    Authors: {a['authors']}
    Categories: {a['categories']}
    Abstract: {a['abstract']}

    Paper B:
    Title: {b['title']}
    Submitter: {b['submitter']}
    Authors: {b['authors']}
    Categories: {b['categories']}
    Abstract: {b['abstract']}

    Score:
    """


def _parse_score(text):
    """Turn the model's reply into a float clamped to [0.0, 1.0]; raise ValueError if impossible."""
    import math
    import re

    if text is None:
        raise ValueError("Model returned no text.")

    cleaned = text.strip().replace(",", ".")
    try:
        score = float(cleaned)
    except ValueError:
        # Tolerate decoration such as "Score: 0.85" or "**0.85**", but only when
        # the reply contains exactly one number, so ambiguous text is rejected.
        numbers = re.findall(r"[-+]?\d*\.?\d+", cleaned)
        if len(numbers) != 1:
            raise ValueError(f"Could not parse a single score from response: {text!r}")
        score = float(numbers[0])

    # float() accepts "nan"/"inf"; clamping NaN with min/max would yield 1.0.
    if not math.isfinite(score):
        raise ValueError(f"Non-finite score in response: {text!r}")

    return max(0.0, min(1.0, score))


async def get_similarity_score_async(paper1, paper2, model='gemini-2.5-flash', delay_seconds=1):
    """
    Asks Gemini for a similarity score between two papers, asynchronously.

    Args:
        paper1: Mapping-like record for Paper A (dict, pandas row, or one-row
            DataFrame) with 'title', 'submitter', 'authors', 'categories'
            and 'abstract'.
        paper2: Same kind of record for Paper B, the candidate recommendation.
        model: Model name passed to the API.
        delay_seconds: Pause awaited after every call, successful or not, to
            respect rate limits.

    Returns:
        A float in [0.0, 1.0] on success. On any failure (missing field, API
        error, unparseable reply) the Exception instance is returned rather
        than raised, so the caller can handle it per pair.
    """
    try:
        prompt = _build_prompt(paper1, paper2)

        # Call the API using the async client surface
        response = await client.aio.models.generate_content(
            model=model,
            contents=prompt
        )

        return _parse_score(response.text)

    except Exception as e:
        # Return the exception instead of raising so one bad pair does not
        # abort the whole batch.
        return e
    finally:
        # A small non-blocking sleep to respect rate limits
        await asyncio.sleep(delay_seconds)

In [None]:

# Sample distinct unordered pairs of paper ids to be scored.
# Each pair is stored as a sorted tuple (id1, id2) so (A,B) is the same as (B,A).
NUM_OF_PAIRS = 50000
# Seed for the pair sampler; set to None to draw a different sample on each run.
PAIR_SAMPLING_SEED = 42

# Draw from a plain list of unique ids rather than indexing the Series on
# every iteration.
candidate_ids = all_papers['id'].drop_duplicates().tolist()

# n distinct ids give only n*(n-1)/2 distinct pairs; cap the target so the
# loop below always terminates, even on a small dataset.
max_distinct_pairs = len(candidate_ids) * (len(candidate_ids) - 1) // 2
target_pairs = min(NUM_OF_PAIRS, max_distinct_pairs)
if target_pairs < NUM_OF_PAIRS:
    print(f"Only {max_distinct_pairs} distinct pairs exist; sampling {target_pairs} instead of {NUM_OF_PAIRS}.")

pair_rng = random.Random(PAIR_SAMPLING_SEED)
seen_pairs = set()
while len(seen_pairs) < target_pairs:
    # sample() returns two different list entries, so a paper is never paired
    # with itself; the set silently ignores pairs that were already drawn.
    p1i, p2i = pair_rng.sample(candidate_ids, 2)
    seen_pairs.add(tuple(sorted((p1i, p2i))))

# Report once instead of printing on every loop iteration.
print(len(seen_pairs))


In [None]:
# Spot-check: display the row(s) whose integer id equals 800.
all_papers.loc[all_papers['id'].eq(800)]

In [None]:
# Cell 6: Run in Parallel (Batches of 10)

# Number of pairs scored concurrently in each batch.
BATCH_SIZE = 10

# Freeze the sampled pairs into a list so they can be sliced into batches.
pairs_to_process = [*seen_pairs]

# One record is appended here for every pair that was scored successfully.
labeled_pairs = []

# Running totals for the three score buckets.
positive = medium = negative = 0

print(f"Starting parallel processing for {len(pairs_to_process)} pairs in batches of {BATCH_SIZE}...")

async def main():
    """
    Score every pair in `pairs_to_process`, BATCH_SIZE pairs at a time.

    Each successfully scored pair is appended to the module-level
    `labeled_pairs` list as a dict with keys "p1_id", "p2_id", "score" and
    "pair_type", and the matching global bucket counter (`positive`, `medium`
    or `negative`) is incremented. Pairs whose scoring failed are reported
    with a printed message and skipped.
    """
    # Grant access to the global counters
    global positive, medium, negative

    # Build the id -> row lookup once. Filtering the full frame with a boolean
    # mask for every pair would rescan every row each time.
    needed_ids = {paper_id for pair in pairs_to_process for paper_id in pair}
    papers_by_id = (
        all_papers[all_papers['id'].isin(list(needed_ids))]
        .drop_duplicates(subset='id')
        .set_index('id')
    )

    # Use tqdm for a progress bar
    for start in tqdm(range(0, len(pairs_to_process), BATCH_SIZE)):

        batch_keys = pairs_to_process[start:start + BATCH_SIZE]

        # .loc on the unique index yields a single row (a Series), so the
        # scorer receives scalar field values rather than one-row DataFrames.
        tasks = [
            get_similarity_score_async(papers_by_id.loc[p1i], papers_by_id.loc[p2i])
            for p1i, p2i in batch_keys
        ]

        # Run the batch concurrently. return_exceptions=True keeps one raised
        # error from aborting the run and discarding the rest of the batch.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results from the batch
        for (p1i, p2i), score in zip(batch_keys, results):

            # Anything that is not a float score is a failure for this pair.
            if not isinstance(score, float):
                print(f"  [Error processing pair ({p1i}, {p2i})]: {score}")
                continue

            # Classify the score into buckets
            if score >= 0.8:
                pair_type = "positive"
                positive += 1
            elif score >= 0.5:
                pair_type = "medium"
                medium += 1
            else:
                pair_type = "negative"
                negative += 1

            # The ids come straight from the pair key; int() turns numpy
            # integers into plain Python ints for the output records.
            labeled_pairs.append({
                "p1_id": int(p1i),
                "p2_id": int(p2i),
                "score": score,
                "pair_type": pair_type
            })

# --- Run the main async function ---
await main()

print(f"\nProcessing complete.")
print(f"Successfully generated {len(labeled_pairs)} labels.")
print(f"Buckets: {positive} Positive, {medium} Medium, {negative} Negative")

In [None]:
# Cell 5: Review and Save
# Tabulate the labeled pairs, show the bucket distribution, and write a CSV.
if not labeled_pairs:
    # NOTE(review): MAX_ATTEMPTS is not defined in the visible cells — confirm
    # this hint still matches the notebook.
    print("No labeled pairs were generated. Try increasing MAX_ATTEMPTS or check your sampling logic.")
else:
    output_csv = "my_labeled_training_data.csv"
    df = pd.DataFrame(labeled_pairs)

    print("\n--- Pairs per Type ---")
    type_counts = df['pair_type'].value_counts()
    print(type_counts)

    # Save to a file for the fine-tuning notebook (row index left out).
    df.to_csv(output_csv, index=False)

    print("\nSuccessfully saved to 'my_labeled_training_data.csv'")