In [1]:
# Cell 1: Setup Environment

# 1. Clone your public GitHub repository
!git clone https://github.com/adfras/psychology-tutor-engine.git

# 2. Change the working directory into your project's root
%cd psychology-tutor-engine

# 3. Install the specific libraries required for the embedding script
# This is more robust than relying on a missing requirements.txt
!pip install sentence-transformers pandas pyarrow torch

print("\n\n✅ Repository cloned and essential libraries installed.")
print("--> IMPORTANT: Now, upload 'normalized_questions.parquet' to 'data/2_processed_data/' using the file browser on the left.")

Cloning into 'psychology-tutor-engine'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 40 (delta 7), reused 38 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (40/40), 212.25 KiB | 1.04 MiB/s, done.
Resolving deltas: 100% (7/7), done.
/content/psychology-tutor-engine
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1

In [4]:
# Cell 2: Run the Script

from pathlib import Path

# Check that the data file was uploaded correctly before running
data_file = Path("data/2_processed_data/normalized_questions.parquet")
if not data_file.exists():
    print("❌ ERROR: Data file not found!")
    print(f"Please make sure you have uploaded 'normalized_questions.parquet' to the '{data_file.parent}' folder.")
else:
    print("✅ Data file found. Starting embedding computation...")
    # Execute the script from your cloned repository.
    # It will use the libraries we just installed and the GPU.
    !python src/reasoning_engine/build_rag_index.py

    output_file = Path("data/2_processed_data/questions_with_embeddings.parquet")
    if output_file.exists():
        print(f"\n\n✅ SUCCESS! Embeddings computed.")
        print(f"--> You can now download '{output_file.name}' from the '{output_file.parent}' folder in the file browser.")

✅ Data file found. Starting embedding computation...
2025-06-27 02:42:19.001941: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750992139.021502    1220 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750992139.027508    1220 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-27 02:42:19.046544: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
--- Starting Embedding Computation ---
Using device: cuda
Loadin

In [3]:
!ls -lh data/2_processed_data/normalized_questions.parquet

-rw-r--r-- 1 root root 130M Jun 27 02:42 data/2_processed_data/normalized_questions.parquet


In [None]:
# Run the embedding script directly; this cell does not first check that the
# input parquet file exists.
!python src/reasoning_engine/build_rag_index.py

2025-06-26 08:05:03.910730: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750925103.944661    3021 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750925103.956754    3021 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-26 08:05:03.987927: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
--- Starting Embedding Computation ---
Using device: cuda
Loading sentence-transformer model: 'all-MiniLM-L6-v2'...
m

In [5]:
# Cell for validating the output file before download
#
# Runs five checks on the generated parquet file (existence, readability,
# schema, nulls in 'question', one sample embedding value) and prints a final
# verdict. `is_valid` ends up True only if every check passed.

from pathlib import Path

import numpy as np
import pandas as pd

# Define the path to the file we want to test
FILE_TO_TEST = Path("data/2_processed_data/questions_with_embeddings.parquet")
# Number of embedding columns the file must contain (embed_0 .. embed_383).
EMBEDDING_DIM = 384


def find_embedding_columns(columns):
    """Return the column names that start with 'embed_', in their original order."""
    return [c for c in columns if isinstance(c, str) and c.startswith('embed_')]


def embedding_columns_match(columns, embedding_dim=EMBEDDING_DIM):
    """Return True when `columns` is exactly embed_0 .. embed_{embedding_dim - 1}.

    Both the count and the names are checked, so duplicated or oddly named
    'embed_*' columns are rejected even when their count happens to match.
    """
    expected = {f"embed_{i}" for i in range(embedding_dim)}
    return len(columns) == embedding_dim and set(columns) == expected


def is_valid_embedding_value(value):
    """Return True when `value` is a finite NumPy floating-point scalar.

    NaN, +/-inf, None, pd.NA and non-float values all return False.
    """
    return isinstance(value, np.floating) and bool(np.isfinite(value))


print("--- Starting Validation of questions_with_embeddings.parquet ---")
is_valid = True

# 1. Check for File Existence
print("\n[1/5] Checking for file existence...")
if not FILE_TO_TEST.exists():
    print(f"❌ [FAIL] File not found at: {FILE_TO_TEST}")
    is_valid = False
else:
    print("✅ [PASS] File found.")

# Proceed only if the file exists
if is_valid:
    try:
        # 2. Check if the file is readable (not corrupted)
        print("\n[2/5] Checking if file is readable by pandas/pyarrow...")
        df = pd.read_parquet(FILE_TO_TEST)
        print("✅ [PASS] File is a valid Parquet file and was read successfully.")
        print(f"      - Shape of the dataframe: {df.shape}")

        # 3. Check for correct schema (core columns + embeddings)
        print("\n[3/5] Verifying schema...")
        expected_core_cols = {'question', 'answer', 'source'}
        actual_cols = set(df.columns)

        if expected_core_cols.issubset(actual_cols):
            print("✅ [PASS] Core columns ('question', 'answer', 'source') are present.")
        else:
            print(f"❌ [FAIL] Missing core columns: {expected_core_cols - actual_cols}")
            is_valid = False

        embedding_cols = find_embedding_columns(df.columns)
        if len(embedding_cols) != EMBEDDING_DIM:
            print(f"❌ [FAIL] Expected {EMBEDDING_DIM} embedding columns, but found {len(embedding_cols)}.")
            is_valid = False
        elif not embedding_columns_match(embedding_cols):
            # The count is right but the names are not, so the PASS message
            # below ("embed_0 to embed_383") would be untrue.
            print(f"❌ [FAIL] Found {EMBEDDING_DIM} 'embed_*' columns, but they are not named embed_0 to embed_{EMBEDDING_DIM - 1}.")
            is_valid = False
        else:
            print(f"✅ [PASS] Found exactly {EMBEDDING_DIM} embedding columns (embed_0 to embed_{EMBEDDING_DIM - 1}).")

        # 4. Check for null values in critical columns
        print("\n[4/5] Checking for null values...")
        if 'question' not in actual_cols:
            # Report the real cause instead of letting a KeyError fall through
            # to the generic handler below.
            print("❌ [FAIL] Cannot check for nulls: the 'question' column is missing.")
            is_valid = False
        elif df['question'].isnull().any():
            print("❌ [FAIL] Null values found in the 'question' column.")
            is_valid = False
        else:
            print("✅ [PASS] 'question' column is free of nulls.")

        # 5. Check a sample embedding for valid data
        print("\n[5/5] Checking a sample embedding...")
        if 'embed_0' not in actual_cols:
            print("❌ [FAIL] Cannot sample an embedding: the 'embed_0' column is missing.")
            is_valid = False
        elif df.empty:
            print("❌ [FAIL] The file contains no rows, so there is no embedding to sample.")
            is_valid = False
        else:
            # Check that the first value of the first embedding column is a valid number
            sample_value = df['embed_0'].iloc[0]
            if is_valid_embedding_value(sample_value):
                print(f"✅ [PASS] Sample embedding value is a valid number (e.g., {sample_value:.4f}).")
            else:
                print("❌ [FAIL] Sample embedding value is null or not a number.")
                is_valid = False

    except Exception as e:
        # Safety net for unreadable/corrupted files and anything unexpected:
        # the error is shown and the file is treated as invalid.
        print(f"\n❌ [FAIL] An error occurred while reading or testing the file: {e}")
        is_valid = False

# --- Final Conclusion ---
print("\n" + "="*50)
print("--- VALIDATION SUMMARY ---")
if is_valid:
    print("✅ SUCCESS: The file 'questions_with_embeddings.parquet' is valid.")
    print("It is safe to download and stop the runtime.")
else:
    print("❌ FAILURE: The file is invalid or corrupted. DO NOT download it.")
    print("Please review the errors above.")
print("="*50)

--- Starting Validation of questions_with_embeddings.parquet ---

[1/5] Checking for file existence...
✅ [PASS] File found.

[2/5] Checking if file is readable by pandas/pyarrow...
✅ [PASS] File is a valid Parquet file and was read successfully.
      - Shape of the dataframe: (900488, 387)

[3/5] Verifying schema...
✅ [PASS] Core columns ('question', 'answer', 'source') are present.
✅ [PASS] Found exactly 384 embedding columns (embed_0 to embed_383).

[4/5] Checking for null values...
✅ [PASS] 'question' column is free of nulls.

[5/5] Checking a sample embedding...
✅ [PASS] Sample embedding value is a valid number (e.g., 0.1041).

--- VALIDATION SUMMARY ---
✅ SUCCESS: The file 'questions_with_embeddings.parquet' is valid.
It is safe to download and stop the runtime.
