# In [None]:
# Skill Search and Matching (with skill expansion)

import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd
from tqdm import tqdm
from rapidfuzz import process, fuzz

# -----------------------------
# Load the dataset
# -----------------------------
# Cleaned resume table produced by Phase 3; the "document" column is what
# gets embedded and searched further down.
df = pd.read_csv(filepath_or_buffer="resumes_cleaned.csv")

# -----------------------------
# Load embeddings model
# -----------------------------
# The same model instance encodes resumes, taxonomy skills and queries, so
# all vectors live in one embedding space.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name_or_path=model_name)

# -----------------------------
# Create resume embeddings
# -----------------------------
batch_size = 64
documents = df["document"].tolist()
if not documents:
    # Fail early with a clear message; an empty matrix would only surface
    # later as an opaque shape error when the index is built.
    raise ValueError("No documents to embed: the 'document' column is empty.")

# encode() batches internally and can show its own progress bar, so no manual
# slicing/stacking is needed. Vectors are L2-normalized so that an
# inner-product search over them behaves as cosine similarity.
embeddings = model.encode(
    documents,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# -----------------------------
# Build FAISS index
# -----------------------------
# `embeddings` is the matrix of resume vectors built above (one row per
# entry of `documents`); its second axis is the size the index is created with.
d = embeddings.shape[1]  # embedding dimension
# Inner-product index: the resume vectors were encoded with
# normalize_embeddings=True, so inner product equals cosine similarity.
index = faiss.IndexFlatIP(d)  # cosine similarity (with normalized vectors)
# Vectors are added in the order of df["document"], so a hit's position in the
# index is also its row position in `df` (search_resumes relies on this via df.iloc).
index.add(embeddings)

# -----------------------------
# Curated Skills Taxonomy
# -----------------------------
# Closed vocabulary that queries are expanded against; one entry per line
# so additions and removals show up as single-line diffs.
curated_skills = [
    "Python",
    "Machine Learning",
    "Deep Learning",
    "NLP",
    "Computer Vision",
    "TensorFlow",
    "PyTorch",
    "Scikit-learn",
    "Data Analysis",
    "SQL",
    "Spark",
    "Hadoop",
    "Data Engineering",
    "MLOps",
    "Docker",
    "Kubernetes",
]

# -----------------------------
# Expand query using fuzzy matching + embeddings
# -----------------------------
# Cache of taxonomy embeddings, keyed by (model identity, taxonomy contents)
# so that swapping the model or editing curated_skills triggers a re-encode.
_skill_vec_cache = {}


def _curated_skill_vectors():
    """Return normalized embeddings for ``curated_skills``, encoding them at most once."""
    key = (id(model), tuple(curated_skills))
    if key not in _skill_vec_cache:
        _skill_vec_cache.clear()  # keep only the current taxonomy in memory
        _skill_vec_cache[key] = model.encode(
            list(curated_skills), normalize_embeddings=True
        )
    return _skill_vec_cache[key]


def expand_skills(query, top_n=3, similarity_threshold=70):
    """
    Expand a query skill with related terms from the curated taxonomy.

    Two signals are combined:

    * fuzzy string matching (rapidfuzz ``WRatio``) to catch typos and close
      spellings; only matches scoring at least ``similarity_threshold`` are kept;
    * embedding similarity between the query and every curated skill; the
      ``top_n`` most similar skills are always added (no score cut-off).

    Args:
        query: The skill text to expand.
        top_n: Maximum number of candidates taken from each signal. A
            non-positive value disables expansion.
        similarity_threshold: Minimum fuzzy score (0-100) for a fuzzy match
            to be included.

    Returns:
        A list of unique terms in a deterministic order: the original query
        first, then fuzzy matches, then semantic matches (each group in
        descending score order).
    """
    if top_n <= 0:
        return [query]

    expanded = [query]

    # --- Fuzzy matching (catch typos, close matches) ---
    fuzzy_hits = process.extract(
        query, curated_skills, scorer=fuzz.WRatio, limit=top_n
    )
    expanded.extend(
        match for match, score, _ in fuzzy_hits if score >= similarity_threshold
    )

    # --- Embedding similarity (semantic expansion) ---
    # Both sides are L2-normalized, so the dot product is cosine similarity.
    query_vec = model.encode([query], normalize_embeddings=True)
    sims = np.dot(_curated_skill_vectors(), query_vec.T).flatten()
    best_idx = np.argsort(sims)[::-1][:top_n]
    expanded.extend(curated_skills[i] for i in best_idx)

    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(expanded))

# -----------------------------
# Search function with expansion
# -----------------------------
def search_resumes(query, k=5):
    """
    Find the resumes most similar to a skill query and its expansions.

    The query is expanded via ``expand_skills``; every expanded term is
    searched against the FAISS index, and the per-term hits are merged by
    keeping each resume's best score across all terms.

    Args:
        query: Skill text to search for.
        k: Maximum number of resumes to return. Also used as the number of
            neighbours requested per expanded term.

    Returns:
        A list of at most ``k`` ``(row_index, {"doc": ..., "score": ...})``
        tuples sorted by descending score, where ``row_index`` is the
        positional row in ``df``. Empty when ``k`` is not positive.
    """
    if k <= 0:
        # Nothing requested; avoid asking the index for a non-positive k.
        return []

    expanded_skills = expand_skills(query)
    print(f"Expanded skills for '{query}': {expanded_skills}")

    # One query vector per expanded term, searched in a single batched call.
    query_vecs = model.encode(expanded_skills, normalize_embeddings=True)
    scores, indices = index.search(query_vecs, k)

    # Merge hits across terms, keeping the best score seen for each resume.
    best_scores = {}
    for term_scores, term_indices in zip(scores, indices):
        for score, idx in zip(term_scores, term_indices):
            if idx == -1:
                # The index pads with -1 when it holds fewer than k vectors.
                continue
            row = int(idx)
            value = float(score)
            if row not in best_scores or value > best_scores[row]:
                best_scores[row] = value

    top_hits = sorted(best_scores.items(), key=lambda hit: hit[1], reverse=True)[:k]

    # Fetch document text only for the rows that are actually returned.
    return [
        (row, {"doc": df.iloc[row]["document"], "score": value})
        for row, value in top_hits
    ]

# -----------------------------
# Example usage
# -----------------------------
# Run one sample search and print a ranked preview of each hit
# (first 300 characters of the resume text).
query = "PyTorch"
results = search_resumes(query, k=5)

for position, (_, hit) in enumerate(results, start=1):
    score_text = f"{hit['score']:.4f}"
    preview = hit["doc"][:300]
    print(f"\nRank {position} | Score: {score_text}")
    print(preview, "...")
