In [1]:
!pip install -q sentence-transformers pinecone-client pandas tqdm

In [2]:
from google.colab import userdata
import os

In [10]:
# Load the Pinecone credentials from the Colab Secrets panel rather than
# hardcoding them in the notebook (which would leak the key when shared).
# NOTE(review): the secret is assumed to be stored under the name
# "PINECONE_API_KEY" — adjust if the Secrets panel uses a different name.
try:
    PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
    PINECONE_INDEX_NAME = "cheesespoon"

except Exception:
    print("⚠️  Error loading secrets from Colab. Make sure you've added them in the Secrets panel.")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

# An empty key would only fail later, inside the Pinecone client, with a less
# obvious error — stop here instead.
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY is empty. Add it in the Colab Secrets panel.")

print("✅ API keys loaded successfully from Colab Secrets")

# ============================================================================
# STEP 3: Check GPU Availability
# ============================================================================

import torch

# Pick the compute device once; later cells pass `device` to the embedding model.
has_cuda = torch.cuda.is_available()
device = 'cuda' if has_cuda else 'cpu'
print(f"🖥️  Using device: {device}")

if not has_cuda:
    print("   ⚠️  No GPU detected. Consider enabling GPU in Runtime -> Change runtime type")
else:
    # Report the name and total memory (in GB) of the first CUDA device.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {gpu_props.total_memory / 1e9:.2f} GB")


✅ API keys loaded successfully from Colab Secrets
🖥️  Using device: cuda
   GPU: Tesla T4
   Memory: 15.83 GB


In [4]:
from sentence_transformers import SentenceTransformer

# Embedding model: intfloat/multilingual-e5-large (multilingual, 1024-dim vectors).
# E5 convention: prefix search queries with "query: " and indexed documents
# with "passage: " before encoding.
MODEL_NAME = 'intfloat/multilingual-e5-large'

print(f"📥 Loading embedding model: {MODEL_NAME}")
print("   Note: This is a large model (~2.24GB), download may take a moment...")

# Load the model onto the device selected earlier, then record the vector size
# so the Pinecone index can be created with a matching dimension.
model = SentenceTransformer(MODEL_NAME, device=device)
embedding_dim = model.get_sentence_embedding_dimension()

print(f"✅ Model loaded. Embedding dimension: {embedding_dim}")


📥 Loading embedding model: intfloat/multilingual-e5-large
   Note: This is a large model (~2.24GB), download may take a moment...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

✅ Model loaded. Embedding dimension: 1024


In [15]:
from google.colab import files
import pandas as pd
import io

# Prompt for a CSV through the Colab upload widget and load it into `df`.
print("\n📤 Please upload your CSV file:")
uploaded = files.upload()

# Without this guard an empty upload result (e.g. a cancelled dialog) would
# surface as an opaque IndexError/StopIteration when picking the first file.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose a CSV file.")

# `uploaded` maps filename -> raw bytes; only the first uploaded file is used.
csv_filename = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[csv_filename]))

print(f"\n✅ CSV loaded: {len(df)} courses found")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())


📤 Please upload your CSV file:


Saving courses_data_before_llm.csv to courses_data_before_llm (2).csv

✅ CSV loaded: 73 courses found

Columns: ['course_id', 'title', 'description', 'credits', 'prerequisites', 'moed_a', 'moed_b', 'general_rating', 'workload_rating', 'all_reviews', 'avg_grades']

First few rows:
   course_id                                title  \
0      94101  00940101 - מבוא להנדסת תעשיה וניהול   
1      94142    00940142 - תפעול מער' ייצור ושרות   
2      94170        00940170 - שיטות בהנדסת תעשיה   
3      94198      00940198 - אירועים בהנדסת תעשיה   
4      94219               00940219 - הנדסת תוכנה   

                                         description  credits  \
0  חיפוש קבוצה בפייסבוק\nסריקות מבחנים\nקבוצת ווא...      2.5   
1  חיפוש קבוצה בפייסבוק\nסריקות מבחנים\nקבוצת ווא...      3.5   
2  חיפוש קבוצה בפייסבוק\nסריקות מבחנים\nקבוצת ווא...      3.5   
3  חיפוש קבוצה בפייסבוק\nסריקות מבחנים\nקבוצת ווא...      3.5   
4  חיפוש קבוצה בפייסבוק\nסריקות מבחנים\nקבוצת ווא...      3.5   

         

In [8]:
# Remove any installed `pinecone-client` distribution, then install `pinecone`,
# the package the following cells import `Pinecone` and `ServerlessSpec` from.
!pip uninstall -y pinecone-client
!pip install -q pinecone

Found existing installation: pinecone-client 6.0.0
Uninstalling pinecone-client-6.0.0:
  Successfully uninstalled pinecone-client-6.0.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m745.9/745.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.9/280.9 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the API key loaded earlier.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Reuse the index when it already exists; otherwise create a serverless cosine
# index whose dimension matches the embedding model's output size.
existing_index_names = pc.list_indexes().names()
if PINECONE_INDEX_NAME in existing_index_names:
    print(f"\n✅ Using existing Pinecone index: {PINECONE_INDEX_NAME}")
else:
    print(f"\n🔨 Creating new Pinecone index: {PINECONE_INDEX_NAME}")
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=embedding_dim,
        metric='cosine',
        spec=serverless_spec,
    )
    print("✅ Index created")

# Handle used by later cells for upserts and stats queries.
index = pc.Index(PINECONE_INDEX_NAME)
print(f"📊 Index stats: {index.describe_index_stats()}")



🔨 Creating new Pinecone index: cheesespoon
✅ Index created
📊 Index stats: {'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '151',
                                    'content-type': 'application/json',
                                    'date': 'Tue, 20 Jan 2026 17:22:11 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '36',
                                    'x-pinecone-request-id': '3630728306282533286',
                                    'x-pinecone-request-latency-ms': '35',
                                    'x-pinecone-response-duration-ms': '37'}},
 'dimension': 1024,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'storageFullness': 0.0,
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [16]:
from tqdm import tqdm
import numpy as np

# Missing descriptions become empty strings so every row can still be prefixed
# and embedded.
df['description'] = df['description'].fillna('')

# course_id is used as the Pinecone vector ID and an upsert keeps one record per
# ID, so rows that repeat an ID would silently replace each other. Surface that
# up front instead of discovering it from a lower-than-expected vector count.
duplicate_id_mask = df['course_id'].astype(str).duplicated()
if duplicate_id_mask.any():
    print(f"⚠️  {int(duplicate_id_mask.sum())} rows repeat an earlier course_id; "
          "rows sharing an ID overwrite each other in Pinecone.")

# Generate embeddings in batches
BATCH_SIZE = 32
vectors_to_upsert = []

print("\n🔄 Generating embeddings and preparing for upload...")

for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Processing batches"):
    batch_df = df.iloc[i:i+BATCH_SIZE]

    # Generate embeddings for batch
    # E5 models work best with "passage: " prefix for documents.
    # The f-string also stringifies any non-string description value.
    descriptions = [f"passage: {desc}" for desc in batch_df['description'].tolist()]
    embeddings = model.encode(descriptions,
                             convert_to_numpy=True,
                             show_progress_bar=False,
                             device=device,
                             normalize_embeddings=True)  # E5 models benefit from normalization

    # Prepare vectors for Pinecone.
    # Rows and embeddings are paired by position: the previous
    # `embeddings[idx - i]` lookup used the DataFrame index *label*, which is
    # only correct while `df` has a default 0..n-1 RangeIndex.
    for (_, row), embedding_vector in zip(batch_df.iterrows(), embeddings):
        vector_id = str(row['course_id'])
        embedding = embedding_vector.tolist()

        # Metadata stored next to the vector: every CSV column, including the
        # description text that was embedded.
        metadata = {
            'course_id': str(row['course_id']),
            'title': str(row['title']),
            'description': str(row['description']),  # Stored in full (not truncated)
            'credits': float(row['credits']) if pd.notna(row['credits']) else None,
            'prerequisites': str(row['prerequisites']) if pd.notna(row['prerequisites']) else '',
            'moed_a': str(row['moed_a']) if pd.notna(row['moed_a']) else '',
            'moed_b': str(row['moed_b']) if pd.notna(row['moed_b']) else '',
            'general_rating': float(row['general_rating']) if pd.notna(row['general_rating']) else None,
            'workload_rating': float(row['workload_rating']) if pd.notna(row['workload_rating']) else None,
            'all_reviews': str(row['all_reviews'])[:1000] if pd.notna(row['all_reviews']) else '',  # Truncated to 1000 chars
            'avg_grades': str(row['avg_grades']) if pd.notna(row['avg_grades']) else '{}'  # Stored as a string
        }

        # Drop the numeric fields that were missing in the CSV (set to None above).
        metadata = {k: v for k, v in metadata.items() if v is not None}

        vectors_to_upsert.append({
            'id': vector_id,
            'values': embedding,
            'metadata': metadata
        })

total_vectors = len(vectors_to_upsert)
print(f"\n📤 Uploading {total_vectors} vectors to Pinecone...")

# Send the prepared vectors to the index in fixed-size slices.
UPSERT_BATCH_SIZE = 100
batch_starts = range(0, total_vectors, UPSERT_BATCH_SIZE)
for start in tqdm(batch_starts, desc="Uploading"):
    index.upsert(vectors=vectors_to_upsert[start:start + UPSERT_BATCH_SIZE])

print("\n✅ Upload complete!")
# NOTE(review): the recorded run reports 30 vectors right after uploading 73.
# Either the stats lag behind the upsert or course_id values repeat — confirm.
print(f"📊 Final index stats: {index.describe_index_stats()}")



🔄 Generating embeddings and preparing for upload...


Processing batches: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]



📤 Uploading 73 vectors to Pinecone...


Uploading: 100%|██████████| 1/1 [00:00<00:00,  1.25it/s]


✅ Upload complete!
📊 Final index stats: {'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '184',
                                    'content-type': 'application/json',
                                    'date': 'Tue, 20 Jan 2026 17:29:19 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '3',
                                    'x-pinecone-request-id': '1851336644790870393',
                                    'x-pinecone-request-latency-ms': '3',
                                    'x-pinecone-response-duration-ms': '5'}},
 'dimension': 1024,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 30}},
 'storageFullness': 0.0,
 'total_vector_count': 30,
 'vector_type': 'dense'}



