In [5]:
from google.colab import drive
import os

# Mount your Google Drive
drive.mount('/content/drive')

# Install required libraries
!pip install sentence-transformers pandas tqdm pyarrow -q

print("✅ Drive mounted and libraries installed.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Drive mounted and libraries installed.


In [1]:
# Sanity check: list the embeddings parquet on Drive to confirm it exists and to
# see its size before attempting to load it.
!ls -lh /content/drive/MyDrive/psychology_tutor_engine/data/processed/questions_with_embeddings.parquet


-rw------- 1 root root 1.7G Jun 27 02:57 /content/drive/MyDrive/psychology_tutor_engine/data/processed/questions_with_embeddings.parquet


In [2]:
import os

# --- ⚠️ EDIT ME --------------------------------------------------------------------
# Root of the project folder inside your Google Drive.
GDRIVE_PROJECT_PATH = '/content/drive/MyDrive/psychology_tutor_engine'
# -----------------------------------------------------------------------------------

# --- Every other location is derived from the project root ---
DATA_FILE = os.path.join(GDRIVE_PROJECT_PATH, 'data/processed/questions_with_embeddings.parquet')
OUTPUT_DIR = os.path.join(GDRIVE_PROJECT_PATH, 'data/training_sets')
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "distractor_generation_training_data_DOMAIN_AWARE.parquet")

# --- Prepare the output folder when the input is present; otherwise fail fast ---
if os.path.exists(DATA_FILE):
    print(f"✅ Input file found at: {DATA_FILE}")
    # The output directory may not exist yet on a fresh Drive layout.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"✅ Output will be saved to: {OUTPUT_FILE}")
else:
    raise FileNotFoundError(f"FATAL: Input file not found at '{DATA_FILE}'. Please check your GDRIVE_PROJECT_PATH and ensure the file is uploaded correctly.")

✅ Input file found at: /content/drive/MyDrive/psychology_tutor_engine/data/processed/questions_with_embeddings.parquet
✅ Output will be saved to: /content/drive/MyDrive/psychology_tutor_engine/data/training_sets/distractor_generation_training_data_DOMAIN_AWARE.parquet


In [3]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm
import pyarrow.parquet as pq
import gc

# --- Configuration ---
# Sources that are pooled together as one "psychology" domain when picking distractors.
PSYCHOLOGY_SOURCES = [
    'boltmonkey',
    'gragroo',
    'mentat',
]
# Number of questions sampled from the dataset to build training rows for.
NUM_SAMPLES_TO_GENERATE = 50_000
# A candidate is preferred when its similarity lies strictly between these bounds.
SIMILARITY_MIN, SIMILARITY_MAX = 0.4, 0.8
# Questions scored per matrix multiplication; a smaller batch is gentler on memory.
BATCH_SIZE = 512
# Reference embeddings are compared in slices of this many rows.
CHUNK_SIZE = 100_000
# Only the TOP_K most similar candidates are retained per question.
TOP_K = 200

# --- Main Logic ---
print("--- Starting GPU Generation (v4 - Your Chunking Method) ---")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device.upper()}")

# --- STEP 1: LOAD EMBEDDINGS EFFICIENTLY ---
# Opening the ParquetFile gives access to the metadata (row count, column names);
# the embedding columns themselves are read one row group at a time below.
parquet_file = pq.ParquetFile(DATA_FILE)
num_rows = parquet_file.metadata.num_rows
embedding_cols = [col.name for col in parquet_file.metadata.schema if col.name.startswith('embed_')]
num_embeddings = len(embedding_cols)
if num_embeddings == 0:
    # Without embedding columns every similarity would be meaningless; fail early.
    raise ValueError(f"No 'embed_*' columns found in '{DATA_FILE}'.")

print(f"Loading {num_rows} embeddings to GPU...")
# The full matrix is stored as float16 to halve its memory footprint.
all_embeddings_gpu = torch.empty((num_rows, num_embeddings), dtype=torch.float16, device=device)

with torch.no_grad():
    offset = 0
    for i in tqdm(range(parquet_file.num_row_groups), desc="Loading & Normalizing Embeddings"):
        row_group_table = parquet_file.read_row_group(i, columns=embedding_cols)
        row_group_cpu = torch.from_numpy(row_group_table.to_pandas().values)

        # Transfer and normalize in slices of CHUNK_SIZE rows so a large row group
        # never sits on the device in full next to all_embeddings_gpu.
        # The L2 normalization is done in float32 and only the result is downcast:
        # float16 cannot represent F.normalize's default eps (1e-12), so an
        # all-zero row would otherwise risk a 0/0 = NaN that poisons every top-k.
        for slice_start in range(0, len(row_group_cpu), CHUNK_SIZE):
            chunk_gpu = row_group_cpu[slice_start : slice_start + CHUNK_SIZE].to(device, dtype=torch.float32)
            chunk_gpu = torch.nn.functional.normalize(chunk_gpu, p=2, dim=1)
            all_embeddings_gpu[offset : offset + len(chunk_gpu)] = chunk_gpu.to(torch.float16)
            offset += len(chunk_gpu)
            del chunk_gpu

        # Drop the per-row-group temporaries so they do not stay alive (and pin
        # memory) as notebook globals after the loop.
        del row_group_table, row_group_cpu

# Every preallocated row must have been filled, otherwise torch.empty garbage remains.
assert offset == num_rows, f"Loaded {offset} rows but parquet metadata reports {num_rows}."

print("✅ All embeddings loaded and normalized on GPU.")

# --- STEP 2: LOAD LOGIC DATAFRAME ---
# reset_index(drop=True) yields a 0..N-1 index; the batch loop uses these index
# labels directly as row positions into all_embeddings_gpu.
# NOTE(review): this assumes pd.read_parquet returns rows in the same order in which
# the row groups were read for the embeddings — confirm.
df_logic = pd.read_parquet(DATA_FILE, columns=['question', 'answer', 'source']).reset_index(drop=True)
assert len(df_logic) == num_rows, (
    f"Row count mismatch: {len(df_logic)} text rows vs {num_rows} embedding rows."
)

# DataFrame.sample raises when asked for more rows than exist (without replacement),
# so cap the request at the dataset size.
sample_size = min(NUM_SAMPLES_TO_GENERATE, len(df_logic))
df_sample = df_logic.sample(n=sample_size, random_state=42)

# Boolean flag per row: does the row come from one of the psychology sources?
# NOTE(review): not referenced by the rest of this cell as written.
psychology_mask = df_logic['source'].isin(PSYCHOLOGY_SOURCES)

# Output rows are accumulated here as dicts and turned into a DataFrame at the end.
training_records = []
num_batches = int(np.ceil(len(df_sample) / BATCH_SIZE))

print(f"Processing {len(df_sample)} samples in {num_batches} batches...")

# --- STEP 3: PROCESS BATCHES USING THE CHUNKING STRATEGY ---
# For every sampled question: score it against all reference embeddings chunk by
# chunk, keep a running top-TOP_K of the most similar rows, then pick one of those
# rows as the distractor and record (question, correct answer, distractor).
#
# NOTE(review): known limitations of this version, visible in the code below:
#   * answer texts are never compared, so a distractor can be textually identical
#     to the correct answer;
#   * the fallback branch ignores both the domain rule and the similarity band;
#   * np.random.choice draws from NumPy's global RNG, which is not seeded in this
#     cell, so the chosen distractors differ between runs.
with torch.no_grad():
    for i in tqdm(range(num_batches), desc="Generating Distractors"):
        # Slice out this batch of sampled questions; their index labels are used as
        # row positions into the embedding matrix.
        batch_df = df_sample.iloc[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
        batch_indices = torch.as_tensor(batch_df.index.values, device=device)
        query_embeddings = all_embeddings_gpu[batch_indices]

        # Running top-k state per question, initialised with placeholder entries
        # (score -1, id -1) that real candidates are expected to displace.
        best_scores_cpu = torch.full((len(query_embeddings), TOP_K), -1., device='cpu')
        best_ids_cpu = torch.full((len(query_embeddings), TOP_K), -1, dtype=torch.long, device='cpu')

        # Walk over all num_rows reference embeddings in slices of CHUNK_SIZE rows
        for start in range(0, num_rows, CHUNK_SIZE):
            end = min(start + CHUNK_SIZE, num_rows)
            ref_chunk_gpu = all_embeddings_gpu[start:end]

            # Dot products of the batch against this chunk only (batch x chunk).
            # The rows were L2-normalized at load time, so these are cosine similarities.
            sim_chunk_gpu = query_embeddings @ ref_chunk_gpu.T

            # Append the chunk's scores/ids to the running best and re-select the top k
            merged_scores = torch.cat([best_scores_cpu.to(device), sim_chunk_gpu], dim=1)
            # Global row ids of this chunk, broadcast to one row per query
            ref_indices = torch.arange(start, end, device=device).expand(len(query_embeddings), -1)
            merged_ids = torch.cat([best_ids_cpu.to(device), ref_indices], dim=1)

            best_scores_gpu, top_k_indices = torch.topk(merged_scores, k=TOP_K, dim=1)
            # Translate positions within the merged tensor back into global row ids
            best_ids_gpu = torch.gather(merged_ids, 1, top_k_indices)

            # Copy the running best back to the CPU; it is moved to the device again
            # at the start of the next chunk iteration
            best_scores_cpu = best_scores_gpu.to('cpu')
            best_ids_cpu = best_ids_gpu.to('cpu')

        # --- Now, perform the filtering logic on the small (BATCH_SIZE x TOP_K) set on the CPU ---
        for row_idx, original_df_idx in enumerate(batch_indices.cpu().numpy()):
            original_row = df_logic.loc[original_df_idx]
            original_source = original_row.source

            candidate_ids = best_ids_cpu[row_idx].numpy()
            candidate_scores = best_scores_cpu[row_idx].numpy()

            # Preferred candidates: similarity strictly inside (SIMILARITY_MIN, SIMILARITY_MAX)
            mask = (candidate_scores > SIMILARITY_MIN) & (candidate_scores < SIMILARITY_MAX)
            mask &= (candidate_ids != original_df_idx) # Exclude self

            # Domain rule: a psychology question may draw from any psychology source;
            # any other question only from its own source.
            # NOTE(review): a leftover placeholder id (-1) would be looked up as a label here.
            cand_sources = df_logic.loc[candidate_ids, 'source']
            if original_source in PSYCHOLOGY_SOURCES:
                mask &= cand_sources.isin(PSYCHOLOGY_SOURCES).values
            else:
                mask &= (cand_sources == original_source).values

            valid_indices = candidate_ids[mask]

            if len(valid_indices) > 0:
                # Uniform random pick among the preferred candidates (global NumPy RNG)
                distractor_idx = np.random.choice(valid_indices)
            else: # Fallback: if no ideal candidate, pick the single best non-self candidate
                # Only "self" is excluded here: the domain rule and the similarity
                # band do not apply to the fallback.
                fallback_mask = (candidate_ids != original_df_idx)
                if fallback_mask.any():
                    distractor_idx = candidate_ids[fallback_mask][np.argmax(candidate_scores[fallback_mask])]
                else:
                    distractor_idx = -1 # Should not happen

            # -1 marks "no distractor found"; such questions are silently skipped
            if distractor_idx != -1:
                training_records.append({
                    'question': original_row.question,
                    'correct_answer': original_row.answer,
                    'distractor': df_logic.at[distractor_idx, 'answer'],
                    'question_source': original_source,
                    'distractor_source': df_logic.at[distractor_idx, 'source']
                })

        # Drop this batch's tensors and ask the allocator to release cached blocks
        del best_scores_cpu, best_ids_cpu, best_scores_gpu, best_ids_gpu, query_embeddings
        gc.collect()
        torch.cuda.empty_cache()

# Materialise the accumulated records and write them to OUTPUT_FILE (overwriting
# whatever a previous run stored there).
print("\nConstructing final training set...")
training_data = pd.DataFrame(training_records)
training_data.to_parquet(OUTPUT_FILE, index=False)

print("\n--- ✅ FINAL SUCCESS ---")
print(f"Memory-safe, GPU-accelerated generation complete. Training set saved to: '{OUTPUT_FILE}'")

--- Starting GPU Generation (v4 - Your Chunking Method) ---
Using device: CUDA
Loading 900488 embeddings to GPU...


Loading & Normalizing Embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

✅ All embeddings loaded and normalized on GPU.
Processing 50000 samples in 98 batches...


Generating Distractors:   0%|          | 0/98 [00:00<?, ?it/s]


Constructing final training set...

--- ✅ FINAL SUCCESS ---
Memory-safe, GPU-accelerated generation complete. Training set saved to: '/content/drive/MyDrive/psychology_tutor_engine/data/training_sets/distractor_generation_training_data_DOMAIN_AWARE.parquet'


In [4]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm
import pyarrow.parquet as pq
import gc

# --- Configuration (same values as the previous version of this cell) ---
# Sources that are pooled together as one "psychology" domain when picking distractors.
PSYCHOLOGY_SOURCES = [
    'boltmonkey',
    'gragroo',
    'mentat',
]
# Number of questions sampled from the dataset to build training rows for.
NUM_SAMPLES_TO_GENERATE = 50_000
# A candidate is preferred when its similarity lies strictly between these bounds.
SIMILARITY_MIN, SIMILARITY_MAX = 0.4, 0.8
# Questions scored per matrix multiplication.
BATCH_SIZE = 512
# Reference embeddings are compared in slices of this many rows.
CHUNK_SIZE = 100_000
# Only the TOP_K most similar candidates are retained per question.
TOP_K = 200

# --- Main Logic ---
print("--- Starting GPU Generation (v5 - Bug Fixes) ---")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device.upper()}")

# --- STEP 1: LOAD EMBEDDINGS EFFICIENTLY ---
# Opening the ParquetFile gives access to the metadata (row count, column names);
# the embedding columns themselves are read one row group at a time below.
parquet_file = pq.ParquetFile(DATA_FILE)
num_rows = parquet_file.metadata.num_rows
embedding_cols = [col.name for col in parquet_file.metadata.schema if col.name.startswith('embed_')]
num_embeddings = len(embedding_cols)
if num_embeddings == 0:
    # Without embedding columns every similarity would be meaningless; fail early.
    raise ValueError(f"No 'embed_*' columns found in '{DATA_FILE}'.")

print(f"Loading {num_rows} embeddings to GPU...")
# The full matrix is stored as float16 to halve its memory footprint.
all_embeddings_gpu = torch.empty((num_rows, num_embeddings), dtype=torch.float16, device=device)

with torch.no_grad():
    offset = 0
    for i in tqdm(range(parquet_file.num_row_groups), desc="Loading & Normalizing Embeddings"):
        row_group_table = parquet_file.read_row_group(i, columns=embedding_cols)
        row_group_cpu = torch.from_numpy(row_group_table.to_pandas().values)

        # Transfer and normalize in slices of CHUNK_SIZE rows so a large row group
        # never sits on the device in full next to all_embeddings_gpu.
        # The L2 normalization is done in float32 and only the result is downcast:
        # float16 cannot represent F.normalize's default eps (1e-12), so an
        # all-zero row would otherwise risk a 0/0 = NaN that poisons every top-k.
        for slice_start in range(0, len(row_group_cpu), CHUNK_SIZE):
            chunk_gpu = row_group_cpu[slice_start : slice_start + CHUNK_SIZE].to(device, dtype=torch.float32)
            chunk_gpu = torch.nn.functional.normalize(chunk_gpu, p=2, dim=1)
            all_embeddings_gpu[offset : offset + len(chunk_gpu)] = chunk_gpu.to(torch.float16)
            offset += len(chunk_gpu)
            del chunk_gpu

        # Drop the per-row-group temporaries so they do not stay alive (and pin
        # memory) as notebook globals after the loop.
        del row_group_table, row_group_cpu

# Every preallocated row must have been filled, otherwise torch.empty garbage remains.
assert offset == num_rows, f"Loaded {offset} rows but parquet metadata reports {num_rows}."

print("✅ All embeddings loaded and normalized on GPU.")

# --- STEP 2: LOAD LOGIC DATAFRAME ---
# reset_index(drop=True) yields a 0..N-1 index; the batch loop uses these index
# labels directly as row positions into all_embeddings_gpu.
# NOTE(review): this assumes pd.read_parquet returns rows in the same order in which
# the row groups were read for the embeddings — confirm.
df_logic = pd.read_parquet(DATA_FILE, columns=['question', 'answer', 'source']).reset_index(drop=True)
assert len(df_logic) == num_rows, (
    f"Row count mismatch: {len(df_logic)} text rows vs {num_rows} embedding rows."
)

# DataFrame.sample raises when asked for more rows than exist (without replacement),
# so cap the request at the dataset size.
sample_size = min(NUM_SAMPLES_TO_GENERATE, len(df_logic))
df_sample = df_logic.sample(n=sample_size, random_state=42)

# Output rows are accumulated here as dicts and turned into a DataFrame at the end.
training_records = []
num_batches = int(np.ceil(len(df_sample) / BATCH_SIZE))
print(f"Processing {len(df_sample)} samples in {num_batches} batches...")

# --- STEP 3: PROCESS BATCHES WITH BUG FIXES ---
# For every sampled question: score it against all reference embeddings chunk by
# chunk, keep a running top-TOP_K of the most similar rows, then pick one eligible
# row as the distractor and record (question, correct answer, distractor).
#
# A candidate is *eligible* when it is not the question itself, its answer text
# differs from the correct answer, and it satisfies the domain rule (psychology
# questions draw from any psychology source, others only from their own source).
# Among eligible candidates, one inside the (SIMILARITY_MIN, SIMILARITY_MAX) band is
# drawn at random; if none is in the band, the highest-scoring eligible one is used.

# Seeded generator so the randomly drawn distractors are reproducible between runs
# (the question sample is already pinned with random_state=42).
rng = np.random.default_rng(42)

# Column snapshots as NumPy arrays. df_logic carries a 0..N-1 index, so a row id can
# index these arrays positionally instead of a label-based .loc lookup per question.
all_questions = df_logic['question'].to_numpy()
all_answers = df_logic['answer'].to_numpy()
all_sources = df_logic['source'].to_numpy()
is_psychology = df_logic['source'].isin(PSYCHOLOGY_SOURCES).to_numpy()

with torch.no_grad():
    for i in tqdm(range(num_batches), desc="Generating Distractors"):
        batch_df = df_sample.iloc[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
        batch_ids = batch_df.index.to_numpy()
        batch_indices = torch.as_tensor(batch_ids, device=device)
        query_embeddings = all_embeddings_gpu[batch_indices]
        batch_len = len(query_embeddings)

        # Running top-k state, kept on the compute device for the whole chunk loop
        # (it is only batch_len x TOP_K, so shuttling it to the CPU per chunk saved
        # nothing). Placeholders use -inf so that any real score displaces them,
        # and id -1 so that unfilled slots can be recognised afterwards.
        best_scores_gpu = torch.full((batch_len, TOP_K), float('-inf'), dtype=torch.float32, device=device)
        best_ids_gpu = torch.full((batch_len, TOP_K), -1, dtype=torch.long, device=device)

        for start in range(0, num_rows, CHUNK_SIZE):
            end = min(start + CHUNK_SIZE, num_rows)
            # Rows are L2-normalized at load time, so the dot product is the cosine
            # similarity. Scores are upcast to float32 to match the running state.
            sim_chunk_gpu = (query_embeddings @ all_embeddings_gpu[start:end].T).float()

            # Append this chunk's scores/ids to the running best and re-select the top k.
            merged_scores = torch.cat([best_scores_gpu, sim_chunk_gpu], dim=1)
            ref_indices = torch.arange(start, end, device=device).expand(batch_len, -1)
            merged_ids = torch.cat([best_ids_gpu, ref_indices], dim=1)
            best_scores_gpu, top_k_indices = torch.topk(merged_scores, k=TOP_K, dim=1)
            # Translate positions within the merged tensor back into global row ids.
            best_ids_gpu = torch.gather(merged_ids, 1, top_k_indices)

        # One device -> host transfer per batch for the small (batch_len x TOP_K) result.
        best_scores = best_scores_gpu.cpu().numpy()
        best_ids = best_ids_gpu.cpu().numpy()

        for row_idx, original_df_idx in enumerate(batch_ids):
            original_source = all_sources[original_df_idx]
            original_answer = all_answers[original_df_idx]

            candidate_ids = best_ids[row_idx]
            candidate_scores = best_scores[row_idx]

            # Discard unfilled placeholder slots (only possible when the dataset has
            # fewer than TOP_K rows); an id of -1 must never be used as a row index.
            filled = candidate_ids >= 0
            candidate_ids = candidate_ids[filled]
            candidate_scores = candidate_scores[filled]

            # Domain rule for the current question.
            if is_psychology[original_df_idx]:
                domain_mask = is_psychology[candidate_ids]
            else:
                domain_mask = all_sources[candidate_ids] == original_source

            # FIX 1: identical answer text is excluded; FIX 2: the same eligibility
            # rule (including the domain rule) also governs the fallback.
            eligible = (
                (candidate_ids != original_df_idx)
                & (all_answers[candidate_ids] != original_answer)
                & domain_mask
            )
            in_band = (
                eligible
                & (candidate_scores > SIMILARITY_MIN)
                & (candidate_scores < SIMILARITY_MAX)
            )

            if in_band.any():
                distractor_idx = rng.choice(candidate_ids[in_band])
            elif eligible.any():
                # Fallback: highest-scoring candidate among the eligible ones.
                distractor_idx = candidate_ids[eligible][np.argmax(candidate_scores[eligible])]
            else:
                # No acceptable distractor in the top-k; skip this question.
                continue

            training_records.append({
                'question': all_questions[original_df_idx],
                'correct_answer': original_answer,
                'distractor': all_answers[distractor_idx],
                'question_source': original_source,
                'distractor_source': all_sources[distractor_idx]
            })

# Release cached allocator blocks once, after all batches are done.
gc.collect()
torch.cuda.empty_cache()

print("\nConstructing final training set...")
# Explicit column list keeps the schema stable even if no record was produced.
training_data = pd.DataFrame(
    training_records,
    columns=['question', 'correct_answer', 'distractor', 'question_source', 'distractor_source'],
)
# Save with a new name to avoid confusion
FIXED_OUTPUT_FILE = OUTPUT_FILE.replace(".parquet", "_FIXED.parquet")
training_data.to_parquet(FIXED_OUTPUT_FILE, index=False)

print("\n--- ✅ FINAL SUCCESS ---")
print(f"Bug-fixed, GPU-accelerated generation complete. Training set saved to: '{FIXED_OUTPUT_FILE}'")

--- Starting GPU Generation (v5 - Bug Fixes) ---
Using device: CUDA
Loading 900488 embeddings to GPU...


Loading & Normalizing Embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

✅ All embeddings loaded and normalized on GPU.
Processing 50000 samples in 98 batches...


Generating Distractors:   0%|          | 0/98 [00:00<?, ?it/s]


Constructing final training set...

--- ✅ FINAL SUCCESS ---
Bug-fixed, GPU-accelerated generation complete. Training set saved to: '/content/drive/MyDrive/psychology_tutor_engine/data/training_sets/distractor_generation_training_data_DOMAIN_AWARE_FIXED.parquet'
