<a href="https://colab.research.google.com/github/amitaipat-create/Session2/blob/main/Copy_of_Mission_3_Practice_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
%pip install langchain==0.3.27 langchain-openai==0.3.29 langchain-community==0.3.27 sentence-transformers qdrant-client langchain-qdrant python-dotenv --quiet

In [17]:
import os
import getpass
from dotenv import load_dotenv

# Read variables from a .env file into the process environment.
load_dotenv()

# The OpenAI key is only needed for chat models (embeddings run locally).
# Keep a non-empty key that is already set; otherwise prompt for one so the
# secret never has to be written into the notebook.
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY") or getpass.getpass(
    "Enter your OpenAI API key: "
)

In [18]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from typing import List
import numpy as np

print("=" * 60)
print("PRACTICE 1: EMBEDDINGS & VECTOR STORES")
print("=" * 60)

# Local embedding model used by the exercises and by the vector store below.
# It has to be created here: later cells reference `embeddings`, and without
# this definition the notebook fails with NameError after Restart & Run All.
# all-mpnet-base-v2 matches the 768-dimension collection configured later.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

PRACTICE 1: EMBEDDINGS & VECTOR STORES


In [19]:
# EXERCISE 1: Generate embeddings for multiple texts
# Task: Create a list of 3 different job titles/descriptions
# Use embeddings.embed_documents() to generate embeddings for all texts
# Print the number of embeddings generated



In [20]:
# ============================================================================
# CONCEPT 2: Qdrant Vector Store Setup
# Learning: Initialize and connect to Qdrant vector database
# ============================================================================
print("\nCONCEPT 2: Qdrant Vector Store Setup")
print("-" * 60)

# Create in-memory Qdrant client (for practice - use cloud in production)
client = QdrantClient(":memory:")

collection_name = "resumes"
# Vector size must equal the embedding model's output dimension
EMBEDDING_DIM = 768  # all-mpnet-base-v2 dimension

# Check for the collection explicitly instead of catching every exception:
# a blanket `except Exception` would report any failure (bad config, client
# error, ...) as "may already exist" and let the notebook continue broken.
if client.collection_exists(collection_name):
    print(f"Collection already exists: {collection_name}")
else:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=EMBEDDING_DIM,
            distance=Distance.COSINE
        )
    )
    print(f"Created collection: {collection_name}")


CONCEPT 2: Qdrant Vector Store Setup
------------------------------------------------------------
Created collection: resumes


In [21]:
# ============================================================================
# CONCEPT 3: Document Chunking and Embedding Storage
# Learning: Chunk documents and store with embeddings in Qdrant
# ============================================================================
print("\nCONCEPT 3: Document Chunking and Embedding Storage")
print("-" * 60)

import uuid

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Sample resume text
resume_text = """
John Doe - Software Engineer
5 years experience in Python, FastAPI, PostgreSQL
Worked on microservices architecture
Expert in REST APIs and database design
"""

# Chunk the resume. With the default length function, chunk_size and
# chunk_overlap are measured in characters (not tokens): 500 chars, 50 overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
chunks = text_splitter.split_text(resume_text)

print(f"Original text length: {len(resume_text)}")
print(f"Number of chunks: {len(chunks)}")
print(f"First chunk: {chunks[0][:100]}...")

# Wrap the Qdrant collection so texts are embedded with `embeddings` on insert
vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings
)

# Add documents with metadata (must be Document objects, not dicts)
candidate_id = "john_doe"
documents = [
    Document(page_content=chunk, metadata={"candidate_id": candidate_id, "chunk_id": i})
    for i, chunk in enumerate(chunks)
]

# Deterministic point IDs derived from collection / candidate / chunk index.
# Without explicit IDs every run of this cell gets fresh random IDs and stores
# the same chunks again; with stable IDs a re-run overwrites them (upsert),
# so the cell is idempotent and search results contain no duplicates.
document_ids = [
    uuid.uuid5(uuid.NAMESPACE_URL, f"{collection_name}/{candidate_id}/{i}").hex
    for i in range(len(documents))
]

vector_store.add_documents(documents, ids=document_ids)
print(f"Stored {len(documents)} chunks in Qdrant")


CONCEPT 3: Document Chunking and Embedding Storage
------------------------------------------------------------
Original text length: 157
Number of chunks: 1
First chunk: John Doe - Software Engineer
5 years experience in Python, FastAPI, PostgreSQL
Worked on microservic...
Stored 1 chunks in Qdrant


In [22]:
# EXERCISE 2: Chunk and store a different resume
# Task: Create a new resume text for a different candidate (e.g., "Jane Smith - Data Scientist")
# Use text_splitter to chunk it, add metadata with candidate_id="jane_smith"
# Store the chunks in vector_store using add_documents()




In [23]:
# ============================================================================
# CONCEPT 4: Semantic Search Implementation
# Learning: Retrieve the stored chunks closest in meaning to a text query
# ============================================================================

print("\nCONCEPT 4: Semantic Search Implementation")
print("-" * 60)

# Free-text query; `query` is reused by the scoring cell that follows
query = "Python developer with API experience"
print(f"Search query: {query}")

# Ask the vector store for at most two nearest chunks
results = vector_store.similarity_search(query, k=2)

print(f"\nFound {len(results)} similar documents:")
for i, doc in enumerate(results, start=1):
    # One print call per hit: a 100-character preview, then its metadata
    print(
        f"\n{i}. {doc.page_content[:100]}...",
        f"   Metadata: {doc.metadata}",
        sep="\n",
    )


CONCEPT 4: Semantic Search Implementation
------------------------------------------------------------
Search query: Python developer with API experience

Found 1 similar documents:

1. John Doe - Software Engineer
5 years experience in Python, FastAPI, PostgreSQL
Worked on microservic...
   Metadata: {'candidate_id': 'john_doe', 'chunk_id': 0, '_id': '934b4a772d6c4a5597e6f28f862fcf86', '_collection_name': 'resumes'}


In [24]:
# ============================================================================
# CONCEPT 5: Similarity Matching and Scoring
# Learning: Get similarity scores with search results
# ============================================================================
print("\nCONCEPT 5: Similarity Matching and Scoring")
print("-" * 60)

# Search with scores: each hit comes back as a (Document, score) pair
results_with_scores = vector_store.similarity_search_with_score(query, k=2)

print("Results with similarity scores:")
for i, (doc, score) in enumerate(results_with_scores, 1):
    print(f"\n{i}. Score: {score:.4f}")
    print(f"   Content: {doc.page_content[:80]}...")
    # The collection uses Distance.COSINE, for which Qdrant reports cosine
    # similarity (not a distance): HIGHER score = more similar, and results
    # are returned best match first.


CONCEPT 5: Similarity Matching and Scoring
------------------------------------------------------------
Results with similarity scores:

1. Score: 0.5519
   Content: John Doe - Software Engineer
5 years experience in Python, FastAPI, PostgreSQL
W...


In [29]:
# EXERCISE 3: Search for "machine learning" and compare scores
# Task: Use similarity_search_with_score() to search for "machine learning"
# Print each result with its similarity score
# Compare the scores - higher score means more similar (Qdrant returns cosine similarity for this collection)



In [26]:
# ============================================================================
# CONCEPT 6: Metadata Filtering in Vector Search
# Learning: Filter search results by metadata (e.g., candidate_id)
# ============================================================================
print("\nCONCEPT 6: Metadata Filtering in Vector Search")
print("-" * 60)

from qdrant_client.models import FieldCondition, Filter, MatchValue

# Filter inside Qdrant rather than in Python after the search. Filtering the
# top-k afterwards can silently drop matches: if the k nearest chunks belong
# to other candidates, john_doe's chunks never reach the Python-side check.
# LangChain stores Document.metadata under the "metadata" payload key, so the
# field is addressed as "metadata.candidate_id".
candidate_filter = Filter(
    must=[
        FieldCondition(
            key="metadata.candidate_id",
            match=MatchValue(value="john_doe"),
        )
    ]
)

filtered_results = vector_store.similarity_search(
    query="Python developer",
    k=5,
    filter=candidate_filter,
)

print(f"Filtered results (only john_doe): {len(filtered_results)}")
for doc in filtered_results:
    print(f"  - {doc.page_content[:60]}... (ID: {doc.metadata['candidate_id']})")




CONCEPT 6: Metadata Filtering in Vector Search
------------------------------------------------------------
Filtered results (only john_doe): 1
  - John Doe - Software Engineer
5 years experience in Python, F... (ID: john_doe)


In [28]:
# EXERCISE 4: Search for "data science" filtered to jane_smith
# Task: Use similarity_search() with filter parameter
# Search for "data science" but only return results where candidate_id="jane_smith"
# Print the number of filtered results


