In [None]:
# Cell 1: Setup Environment
!pip install -q -U sentence-transformers pandas scikit-learn

import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Cell 2: Load Your Labeled Dataset
# NOTE: Make sure to upload your CSV file to Colab or mount your Google Drive
# before running this cell; the path below points into the mounted Drive.

try:
    labeled_raw = pd.read_csv("/content/drive/MyDrive/google_colab/arxiv/my_labeled_training_data.csv")
except FileNotFoundError:
    print("Please upload your CSV file or update the path.")
    # Re-raise so the notebook stops here instead of failing later with a
    # NameError on `df` in the cells that depend on it.
    raise

# Drop any rows where the API might have failed.
# Presumably a failed call leaves a missing id or score; the three columns below are
# the ones the later cells read, so rows lacking any of them cannot be used.
df = labeled_raw.dropna(subset=["p1_id", "p2_id", "score"])
print(f"Loaded {len(df)} labeled pairs.")
print(df.head())

Loaded 1500 labeled pairs.
     p1_id    p2_id  score pair_type
0  7040019  7040037   0.10  negative
1  7040039  7040052   0.15  negative
2  7040042  7040057   0.05  negative
3  7040001  7040025   0.05  negative
4   704002  7040042   0.05  negative


In [None]:
# Cell 3: Split Data into Train & Validation

# Hold out 20% of the labeled pairs for validation; the remaining 80% is used for training.
# A fixed random_state is passed so the split is seeded.
split_frames = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = split_frames

# Report the size of each split.
for split_label, split_frame in (("Training", df_train), ("Validation", df_val)):
    print(f"{split_label} samples: {len(split_frame)}")

Training samples: 1200
Validation samples: 300


In [None]:
# Load the paper metadata (one JSON object per line).
papers_raw = pd.read_json("/content/drive/MyDrive/google_colab/arxiv/sample-data-5000.json", lines=True)

# Build an integer key by removing the dot from the original id, keep the original
# value under 'arxiv_id', and expose the integer key as 'id' (the column the labeled
# pairs are matched against).
# regex=False makes '.' a literal dot; interpreted as a regular expression it would
# match every character, and the default for `regex` differs between pandas versions.
# NOTE(review): the labeled data contains keys of differing lengths (e.g. 7040019 and
# 704002), which suggests 'id' may be parsed as a number before this step — confirm
# before changing how ids are read, because the labeled pairs rely on this exact key.
all_papers = (
    papers_raw
    .assign(new_id=lambda frame: frame['id'].astype(str).str.replace('.', '', regex=False).astype(int))
    .rename(columns={'id': 'arxiv_id', 'new_id': 'id'})
)

In [None]:
# Cell 4: Prepare Data for sentence-transformers
print("Converting data to InputExample format...")


def _text_by_paper_id(papers):
    """Return a dict mapping each paper 'id' to its title followed by its abstract.

    Args:
        papers: DataFrame with 'id', 'title' and 'abstract' columns.

    Returns:
        dict from paper id to the concatenated text. When an id occurs more
        than once, the first occurrence is used.
    """
    unique_papers = papers.drop_duplicates(subset="id", keep="first")
    # Title and abstract are joined with no separator between them.
    combined_texts = (
        title + abstract
        for title, abstract in zip(unique_papers["title"], unique_papers["abstract"])
    )
    return dict(zip(unique_papers["id"], combined_texts))


def _build_examples(pairs, text_by_id):
    """Convert labeled paper pairs into a list of InputExample objects.

    Args:
        pairs: DataFrame with 'p1_id', 'p2_id' and 'score' columns.
        text_by_id: dict from paper id to text, as built by _text_by_paper_id.

    Returns:
        One InputExample per row of `pairs`, in row order, with the two paper
        texts as `texts` and the score converted to float as `label`.

    Raises:
        KeyError: if any referenced paper id is absent from `text_by_id`.
    """
    referenced_ids = set(pairs["p1_id"]).union(pairs["p2_id"])
    missing_ids = sorted(referenced_ids.difference(text_by_id))
    if missing_ids:
        # Fail with the offending ids instead of an opaque positional-index error.
        raise KeyError(
            f"{len(missing_ids)} paper id(s) not found in all_papers, e.g. {missing_ids[:5]}"
        )
    return [
        InputExample(texts=[text_by_id[first_id], text_by_id[second_id]], label=float(score))
        for first_id, second_id, score in zip(pairs["p1_id"], pairs["p2_id"], pairs["score"])
    ]


# Build the id -> text lookup once so each pair is resolved without scanning all_papers.
paper_texts = _text_by_paper_id(all_papers)
train_examples = _build_examples(df_train, paper_texts)
val_examples = _build_examples(df_val, paper_texts)

print(f"Created {len(train_examples)} training examples.")
print(f"Created {len(val_examples)} validation examples.")
print("\nSample Training Example:")
print(f"  Texts: {train_examples[0].texts[0][:50]}...")
print(f"  Label: {train_examples[0].label}")

Converting data to InputExample format...
Created 1200 training examples.
Created 300 validation examples.

Sample Training Example:
  Texts: Measurement of the Hadronic Form Factor in D0 --> ...
  Label: 0.05


In [None]:
# Cell 5: Define Model, Loss Function, and DataLoader

# Base ("student") model that will be fine-tuned.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# DataLoader: shuffles the training examples and groups them into batches.
train_batch_size = 16
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Loss: CosineSimilarityLoss built around the model, used for (text, text, score) data.
train_loss = losses.CosineSimilarityLoss(model=model)

for status_line in (f"Loaded base model: {model_name}", "Using: CosineSimilarityLoss"):
    print(status_line)

Loaded base model: all-MiniLM-L6-v2
Using: CosineSimilarityLoss


In [None]:
# Cell 6: Define the Evaluator

# Build the evaluator from the held-out validation examples.
evaluator_name = 'sts-validation'
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name=evaluator_name)

print("Evaluator configured.")

Evaluator configured.


In [None]:
# Cell 7: Start Fine-Tuning!

num_epochs = 4  # number of passes over the training data
output_path = "finetuned-arxiv-recommender"  # directory handed to fit() for saving the model

# Warm up over the first 10% of all training steps (batches per epoch x epochs).
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_training_steps * 0.1)

print("Starting model fine-tuning...")

fit_options = dict(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluation_steps=100,  # run the evaluator every 100 training steps
    output_path=output_path,
    save_best_model=True,  # request saving of the best-scoring model on the validation set
    show_progress_bar=True,
)
model.fit(**fit_options)

print(f"Fine-tuning complete. Best model saved to: {output_path}")

Starting model fine-tuning...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts-validation Pearson Cosine,Sts-validation Spearman Cosine
75,No log,No log,0.617656,0.493314
100,No log,No log,0.652311,0.514999
150,No log,No log,0.681381,0.53075
200,No log,No log,0.695617,0.532417
225,No log,No log,0.708145,0.52624
300,No log,No log,0.71863,0.536912


Fine-tuning complete. Best model saved to: finetuned-arxiv-recommender


In [None]:
# Cell 8: Test the "Before" vs. "After" Model
from sentence_transformers.util import cos_sim

# --- 1. Load your newly fine-tuned model ---
finetuned_model = SentenceTransformer(output_path)

# --- 2. Load the ORIGINAL base model ---
original_model = SentenceTransformer(model_name)

# --- 3. Define some test pairs (replace with real titles/abstracts from your data) ---
cnn_title = "Advances in Convolutional Neural Networks for Image Recognition"

# A positive pair (e.g., two papers on CNNs)
pair_positive = [cnn_title, "Understanding Deep Learning and CNNs"]

# A negative pair (e.g., CNN vs. Physics)
pair_negative = [cnn_title, "A Study on Quantum Entanglement and Spacetime"]


def _pair_similarity(pair_embeddings):
    """Return the cosine similarity between the two embeddings of an encoded pair."""
    first_embedding, second_embedding = pair_embeddings[0], pair_embeddings[1]
    # cos_sim returns a matrix; the single entry is the score for this pair.
    return cos_sim(first_embedding, second_embedding)[0][0]


# --- 4. Get Scores from ORIGINAL Model ---
print(f"\n--- Testing ORIGINAL Model ({model_name}) ---")
emb_orig_pos = original_model.encode(pair_positive)
emb_orig_neg = original_model.encode(pair_negative)

print(f"Positive Pair Score: {_pair_similarity(emb_orig_pos):.4f}")
print(f"Negative Pair Score: {_pair_similarity(emb_orig_neg):.4f}")


# --- 5. Get Scores from FINE-TUNED Model ---
print(f"\n--- Testing FINE-TUNED Model ({output_path}) ---")
emb_tuned_pos = finetuned_model.encode(pair_positive)
emb_tuned_neg = finetuned_model.encode(pair_negative)

print(f"Positive Pair Score: {_pair_similarity(emb_tuned_pos):.4f}")
print(f"Negative Pair Score: {_pair_similarity(emb_tuned_neg):.4f}")


--- Testing ORIGINAL Model (all-MiniLM-L6-v2) ---
Positive Pair Score: 0.4633
Negative Pair Score: 0.1215

--- Testing FINE-TUNED Model (finetuned-arxiv-recommender) ---
Positive Pair Score: 0.4726
Negative Pair Score: 0.1297


In [None]:
# Cell 12: Mount Google Drive
# Mounts Google Drive at /content/drive so that files under that path can be read and written.
# NOTE(review): earlier cells already read from /content/drive, so on a fresh runtime the
# mount has to happen before them — consider moving this cell to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

print("Google Drive mounted!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted!


In [None]:
# Cell 13: Copy Model Folder to Drive

import shutil

# Folder produced by model.fit()
source_folder = "finetuned-arxiv-recommender"

# Permanent location in Google Drive
# ("my_models" can be replaced with any folder name you want)
destination_folder = "/content/drive/MyDrive/my_models/finetuned-arxiv-recommender"


def _copy_model_dir(src, dst):
    """Copy the directory tree `src` to `dst` and return a status message.

    An already-existing destination is reported rather than overwritten; any
    other exception is reported with its message instead of being raised.
    """
    try:
        shutil.copytree(src, dst)
    except FileExistsError:
        return f"Model folder already exists at '{dst}'. No need to copy again."
    except Exception as e:
        return f"An error occurred: {e}"
    return f"Successfully copied model from '{src}' to '{dst}'"


print(_copy_model_dir(source_folder, destination_folder))

Successfully copied model from 'finetuned-arxiv-recommender' to '/content/drive/MyDrive/my_models/finetuned-arxiv-recommender'
