In [None]:
import sys, subprocess, os
from pathlib import Path

# Colab Setup
# Runs only inside Google Colab: installs the notebook's dependencies and, when no
# local ``data`` directory exists, fetches data and scripts from the project repository.
if "google.colab" in sys.modules:
    print("Running in Google Colab. Installing dependencies...")
    # Call pip through the running interpreter so packages are installed into the
    # same environment the kernel imports from, not whichever ``pip`` is first on PATH.
    pip_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests",
            "pydantic", "jsonschema", "plotly", "tqdm",
        ]
    )
    if pip_result.returncode != 0:
        print(f"Warning: pip install exited with code {pip_result.returncode}.")

    # Check for data
    if not (Path.cwd() / "data").exists():
        print("Data directory not found. Cloning repository...")
        # A leftover checkout from an interrupted earlier run would make
        # ``git clone`` fail because the destination is not empty, so clear it first.
        subprocess.run(["rm", "-rf", "_repo"])
        # Shallow clone: the checkout is deleted right after the files are moved,
        # so the full history is never needed.
        clone_result = subprocess.run(
            [
                "git", "clone", "--depth", "1",
                "https://github.com/aire-program/aire-researcher-sandbox.git",
                "_repo",
            ]
        )
        if clone_result.returncode != 0:
            print(f"Warning: git clone exited with code {clone_result.returncode}.")

        # Move data and scripts to current directory
        if Path("_repo/data").exists():
            print("Moving data and scripts...")
            subprocess.run(["mv", "_repo/data", "."])
            # Only move scripts when the clone has them and no local ``scripts``
            # directory would be in the way.
            if Path("_repo/scripts").exists() and not Path("scripts").exists():
                subprocess.run(["mv", "_repo/scripts", "."])
        else:
            print("Warning: Data not found in cloned repo.")

        # Remove the temporary checkout whether or not the data was found.
        subprocess.run(["rm", "-rf", "_repo"])
    else:
        print("Data directory found.")


# Prototype: Semantic Search Demo

**What**: Visualize and explore the similarity between queries and documents in the vector space.

**Why**: Visualizing search results helps build intuition about how the model "sees" text similarity.

**How**:
1. **Input a query**.
2. **Compute similarity scores** against the corpus.
3. **Display results** ranked by relevance.

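**Key Concept**: **Semantic Search** attempts to find results based on the *meaning* of the query, rather than just matching keywords. This notebook uses TF-IDF, which scores texts by their statistically weighted shared terms; it is a lightweight approximation of semantic similarity, not a true model of meaning.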

By the end of this notebook, you will have computed pairwise TF-IDF cosine similarity across the sample abstracts and identified the most similar pair of articles, as listed in the success criteria below.

### Success criteria
- You computed similarity across abstracts.
- You inspected the closest pairs.
- You understand how preprocessing affects similarity.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory that holds the sample articles.

    Looks for a ``data`` folder in the current working directory, then its
    parent, then its grandparent, and returns the first one that contains
    ``sample_texts/articles_sample.csv``.

    Returns:
        Path to the matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three locations contains the sample CSV.
    """
    here = Path.cwd()
    search_roots = (here, here.parent, here.parent.parent)
    marker = Path("sample_texts") / "articles_sample.csv"

    # First ``data`` folder (nearest to the working directory) that has the marker file.
    match = next(
        (root / "data" for root in search_roots if (root / "data" / marker).exists()),
        None,
    )
    if match is None:
        raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")
    return match

# Resolve the data location once so later cells can build paths from DATA_DIR.
# find_data_dir() raises FileNotFoundError when the sample CSV cannot be located.
DATA_DIR = find_data_dir()


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the sample articles and turn each abstract into a TF-IDF vector.
# Missing abstracts are replaced with empty strings before vectorizing.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
vectorizer = TfidfVectorizer(stop_words="english")
embeddings = vectorizer.fit_transform(articles["abstract"].fillna(""))

# Cosine similarity between every pair of abstracts (one row and column per article).
similarity_matrix = cosine_similarity(embeddings)

# Show the most similar pair (excluding diagonal)
np.fill_diagonal(similarity_matrix, 0)

if similarity_matrix.shape[0] < 2:
    # With fewer than two articles there is no pair to report.
    print("Need at least two articles to find a most similar pair.")
else:
    # Rank on a copy whose diagonal is -inf, so an article can never be paired
    # with itself, even when every off-diagonal score is 0.
    ranking = similarity_matrix.copy()
    np.fill_diagonal(ranking, -np.inf)
    max_idx = ranking.argmax()
    # unravel_index converts the flat argmax position into (row, col) using the
    # matrix's actual shape instead of assuming a particular divisor.
    row, col = np.unravel_index(max_idx, ranking.shape)

    print("Most similar pair:")
    print(articles.iloc[row]["title"])
    print(articles.iloc[col]["title"])
    print("Similarity score", similarity_matrix[row, col])


### If you get stuck / What to try next

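If you get stuck: rerun the setup cell to reinstall the dependencies, and confirm that `data/sample_texts/articles_sample.csv` exists (run `scripts/generate_synthetic_data.py` if it does not). What to try next: adjust the preprocessing (for example, the `stop_words` setting of `TfidfVectorizer`) and compare the resulting similarity scores, or move on to the retrieval notebooks to test how such changes affect ranking.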