# Tutorial 21: Case Study - Search Ranking System

## End-to-End ML System Design for Search Relevance

---

## Learning Objectives

By the end of this tutorial, you will be able to:

1. **Design a search ranking system** using the 7-step ML framework
2. **Implement Learning-to-Rank models** (pointwise, pairwise, listwise)
3. **Engineer query and document features** for search relevance
4. **Apply ranking metrics** (NDCG, MRR, Precision@K)
5. **Build a two-stage retrieval and ranking pipeline**
6. **Design A/B testing** for search quality evaluation

## Setup and Imports

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
from collections import defaultdict

# Seed NumPy's global random state before any cell draws random numbers.
np.random.seed(42)
print('All imports successful!')

---

# 1. Problem Statement and Requirements

In [None]:
class SearchRequirements:
    """Container for the headline business, scale and latency targets."""

    def __init__(self):
        # Each group is a plain dict so display() can print it verbatim.
        self.business = {'primary': 'Maximize relevant product discovery'}
        self.scale = {'documents': '100M', 'qps': '100K'}
        self.latency = {'total': 'p99 < 200ms'}

    def display(self):
        """Print a banner followed by one line per requirement group."""
        report = [
            'SEARCH RANKING REQUIREMENTS',
            '=' * 40,
            f"Business: {self.business}",
            f"Scale: {self.scale}",
            f"Latency: {self.latency}",
        ]
        print('\n'.join(report))

# Instantiate the requirements holder and print its contents.
req = SearchRequirements()
req.display()

## 1.1 Two-Stage Architecture

```
Query -> Retrieval (100M -> 1000) -> Ranking (1000 -> 20) -> Results
```

In [None]:
# Draw the search pipeline as a row of labelled boxes joined by arrows.
# Each entry is (x, y, width, height, label, colour) in data coordinates.
boxes = [
    (0.5, 1, 1.5, 2, 'Query', '#3498db'),
    (2.5, 0.5, 2.5, 3, 'Retrieval\n100M->1K', '#2ecc71'),
    (5.5, 0.5, 2.5, 3, 'Ranking\n1K->20', '#e74c3c'),
    (8.5, 0.5, 2.5, 3, 'Re-rank\nBusiness', '#f39c12'),
    (11.5, 1, 1, 2, 'Results', '#9b59b6')
]

# Size the x-axis from the boxes themselves. The last box spans 11.5-12.5, so a
# fixed upper limit of 12 would cut off its right half.
right_edge = max(box[0] + box[2] for box in boxes)

fig, ax = plt.subplots(figsize=(12, 4))
ax.set_xlim(0, right_edge + 0.5)
ax.set_ylim(0, 4)
ax.axis('off')

for left, bottom, width, height, label, color in boxes:
    rect = plt.Rectangle((left, bottom), width, height, facecolor=color, alpha=0.3,
                         edgecolor=color, lw=2)
    ax.add_patch(rect)
    ax.text(left + width / 2, bottom + height / 2, label, ha='center', va='center',
            fontsize=9, fontweight='bold')

# One arrow per gap: from a box's right edge to the next box's left edge, at mid-height.
for current_box, next_box in zip(boxes, boxes[1:]):
    arrow_start = current_box[0] + current_box[2]
    arrow_end = next_box[0]
    ax.annotate('', xy=(arrow_end, 2), xytext=(arrow_start, 2),
                arrowprops=dict(arrowstyle='->', lw=2))

ax.set_title('Two-Stage Search Architecture', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---

# 2. Data Generation

In [None]:
class SearchDataGenerator:
    """Synthesise a toy product catalogue, queries and graded query-document pairs.

    Every generate_* method reseeds NumPy's global RNG with a fixed seed, so
    repeated calls return identical frames. As a side effect the global random
    state is reset each time one of them runs.
    """

    def __init__(self, n_queries=2000, n_docs=3000):
        self.n_queries = n_queries
        self.n_docs = n_docs
        self.categories = ['electronics', 'clothing', 'home', 'sports', 'books']
        # Three keywords per category; titles and query texts are drawn from these.
        self.keywords = {
            'electronics': ['laptop', 'phone', 'tablet'],
            'clothing': ['shirt', 'pants', 'shoes'],
            'home': ['furniture', 'lamp', 'rug'],
            'sports': ['running', 'yoga', 'weights'],
            'books': ['fiction', 'science', 'history']
        }

    def generate_documents(self):
        """Return a DataFrame of ``n_docs`` products with one keyword each.

        Columns: doc_id, title, category, price, rating, num_reviews, keywords.
        """
        np.random.seed(42)
        docs = []
        for i in range(self.n_docs):
            cat = np.random.choice(self.categories)
            kw = np.random.choice(self.keywords[cat])
            docs.append({
                'doc_id': f'doc_{i}',
                'title': f'{kw.title()} Product {i}',
                'category': cat,
                'price': round(np.random.exponential(50) + 10, 2),
                'rating': round(np.clip(np.random.normal(4.0, 0.5), 1, 5), 1),
                'num_reviews': int(np.random.exponential(100)),
                'keywords': [kw]
            })
        return pd.DataFrame(docs)

    def generate_queries(self):
        """Return a DataFrame of ``n_queries`` single-keyword queries.

        Columns: query_id, query_text, category_intent.
        """
        np.random.seed(43)
        queries = []
        for i in range(self.n_queries):
            cat = np.random.choice(self.categories)
            kw = np.random.choice(self.keywords[cat])
            queries.append({
                'query_id': f'q_{i}',
                'query_text': kw,
                'category_intent': cat
            })
        return pd.DataFrame(queries)

    def generate_pairs(self, queries, docs):
        """Sample candidate documents per query and label them.

        For each query, up to 20 documents are drawn from the query's category
        and up to 10 from all other categories. Positions are 1-based in that
        order, so in-category documents always occupy the top positions.

        Args:
            queries: frame with query_id, query_text and category_intent.
            docs: frame with doc_id, title and category.

        Returns:
            DataFrame with columns query_id, doc_id, relevance (int 0-4),
            clicked, position, keyword_match and category_match.
        """
        np.random.seed(44)
        doc_by_cat = docs.groupby('category')['doc_id'].apply(list).to_dict()

        # Build the per-document lookup once. Scanning the whole frame for every
        # pair is O(pairs * docs) and dominates the runtime at the default sizes.
        doc_info = {}
        for doc_id, title, category in zip(docs['doc_id'], docs['title'], docs['category']):
            # setdefault keeps the first row for a duplicated doc_id.
            doc_info.setdefault(doc_id, (set(title.lower().split()), category))

        # The out-of-category pool depends only on the category, so cache it.
        others_by_cat = {}

        pairs = []
        for query_id, query_text, cat in zip(queries['query_id'],
                                             queries['query_text'],
                                             queries['category_intent']):
            qwords = set(query_text.lower().split())

            relevant = doc_by_cat.get(cat, [])
            if cat not in others_by_cat:
                others_by_cat[cat] = [d for c, dl in doc_by_cat.items() if c != cat for d in dl]
            others = others_by_cat[cat]

            selected = list(np.random.choice(relevant, min(20, len(relevant)), replace=False))
            selected += list(np.random.choice(others, min(10, len(others)), replace=False))

            for pos, doc_id in enumerate(selected, 1):
                dwords, doc_cat = doc_info[doc_id]

                kw_match = len(qwords & dwords) / len(qwords) if qwords else 0
                cat_match = 1 if doc_cat == cat else 0
                # int() truncates, so the noisy score is floored rather than
                # rounded to the nearest grade.
                relevance = int(np.clip(kw_match * 2 + cat_match * 2 + np.random.normal(0, 0.3), 0, 4))
                # Click probability grows with relevance and decays with log2 of position.
                clicked = np.random.random() < (relevance / 4) * (1 / np.log2(pos + 1))

                pairs.append({
                    'query_id': query_id,
                    'doc_id': doc_id,
                    'relevance': relevance,
                    'clicked': clicked,
                    'position': pos,
                    'keyword_match': kw_match,
                    'category_match': cat_match
                })

        return pd.DataFrame(pairs)

    def generate_all(self):
        """Generate documents, queries and pairs, print their sizes, return all three."""
        docs = self.generate_documents()
        queries = self.generate_queries()
        pairs = self.generate_pairs(queries, docs)
        print(f'Generated: {len(docs)} docs, {len(queries)} queries, {len(pairs)} pairs')
        return docs, queries, pairs

# Build the synthetic corpus with the default sizes (2000 queries, 3000 documents).
gen = SearchDataGenerator()
docs_df, queries_df, pairs_df = gen.generate_all()

In [None]:
# Show the first rows of each generated table under its own heading.
for heading, frame in (('Documents:', docs_df),
                       ('\nQueries:', queries_df),
                       ('\nPairs:', pairs_df)):
    print(heading)
    print(frame.head())

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

# Relevance grades are the integers 0-4, so centre one unit-wide bin on each grade.
# With bins=5 the range [0, 4] is cut into 0.8-wide bins whose bars sit off the ticks.
grade_edges = np.arange(-0.5, 5.0, 1.0)
axes[0].hist(pairs_df['relevance'], bins=grade_edges, edgecolor='black', alpha=0.7)
axes[0].set_xticks(range(5))
axes[0].set_xlabel('Relevance grade')
axes[0].set_ylabel('Pairs')
axes[0].set_title('Relevance Distribution')

# Mean of the boolean click flag per grade is the click-through rate.
ctr = pairs_df.groupby('relevance')['clicked'].mean()
axes[1].bar(ctr.index, ctr.values, color='green', alpha=0.7)
axes[1].set_xlabel('Relevance grade')
axes[1].set_ylabel('CTR')
axes[1].set_title('CTR by Relevance')

# Only the first 15 positions are plotted.
pos_ctr = pairs_df.groupby('position')['clicked'].mean()
axes[2].plot(pos_ctr.index[:15], pos_ctr.values[:15], marker='o')
axes[2].set_xlabel('Position')
axes[2].set_ylabel('CTR')
axes[2].set_title('Position Bias')

plt.tight_layout()
plt.show()

## 2.1 Feature Engineering

In [None]:
class FeatureEngineering:
    def __init__(self):
        self.tfidf = TfidfVectorizer(max_features=500)
        
    def extract(self, pairs, queries, docs):
        data = pairs.merge(queries, on='query_id')
        data = data.merge(docs, on='doc_id')
        
        features = pd.DataFrame()
        features['keyword_match'] = data['keyword_match']
        features['category_match'] = data['category_match']
        features['rating'] = data['rating']
        features['log_reviews'] = np.log1p(data['num_reviews'])
        features['log_price'] = np.log1p(data['price'])
        features['query_len'] = data['query_text'].apply(lambda x: len(x.split()))
        
        return features, data['relevance'], data['query_id']

# Build the feature matrix X, relevance labels y and per-row query ids qids.
fe = FeatureEngineering()
X, y, qids = fe.extract(pairs_df, queries_df, docs_df)
print(f'Features: {X.shape}')
print(f'Columns: {list(X.columns)}')

## 2.2 Train/Test Split

In [None]:
# Split by query id rather than by row, so all of a query's documents land on the same side.
unique_q = qids.unique()
np.random.shuffle(unique_q)
split = int(len(unique_q) * 0.8)  # 80% of the queries go to training

train_q = set(unique_q[:split])
test_q = set(unique_q[split:])

train_mask = qids.isin(train_q)
test_mask = qids.isin(test_q)

X_train = X[train_mask]
X_test = X[test_mask]
# Labels and query ids are kept as plain arrays.
y_train = y[train_mask].values
y_test = y[test_mask].values
qids_train = qids[train_mask].values
qids_test = qids[test_mask].values

print(f'Train: {len(X_train)}, Test: {len(X_test)}')

---

# 3. Model Development

## 3.1 BM25 Baseline (approximated with TF-IDF cosine similarity)

In [None]:
class BM25:
    """Lexical retriever over document titles.

    NOTE: despite the name, scoring is cosine similarity between TF-IDF
    vectors, not the Okapi BM25 formula.
    """

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        # Both are populated by fit().
        self.doc_vecs = None
        self.doc_ids = None

    def fit(self, docs):
        """Vectorise the 'title' column and remember the matching 'doc_id' values."""
        print('Fitting BM25...')
        self.doc_ids = docs['doc_id'].values
        self.doc_vecs = self.tfidf.fit_transform(docs['title'])

    def rank(self, query, k=100):
        """Return up to ``k`` (doc_id, score) tuples, highest similarity first."""
        query_vec = self.tfidf.transform([query])
        sims = cosine_similarity(query_vec, self.doc_vecs)[0]
        # Ascending argsort, reversed, gives indices from best to worst.
        best_first = np.argsort(sims)[::-1]
        hits = []
        for idx in best_first[:k]:
            hits.append((self.doc_ids[idx], sims[idx]))
        return hits

# Fit the retriever on the catalogue titles.
bm25 = BM25()
bm25.fit(docs_df)

# Show the five best-scoring titles for a sample query.
print('Top results for "laptop":')
for doc_id, score in bm25.rank('laptop', 5):
    title = docs_df[docs_df['doc_id'] == doc_id]['title'].values[0]
    print(f'  {title}: {score:.4f}')

## 3.2 Pointwise Ranker

In [None]:
class PointwiseRanker:
    """Pointwise ranker: a binary relevant/not-relevant classifier whose
    positive-class probability serves as the ranking score."""

    def __init__(self):
        self.model = GradientBoostingClassifier(n_estimators=100, max_depth=5)
        self.scaler = StandardScaler()

    def fit(self, X, y):
        """Fit the scaler and the classifier; ``y`` holds graded relevance labels."""
        print('Training Pointwise Ranker...')
        scaled = self.scaler.fit_transform(X)
        # Grades of 2 or more count as the positive class.
        is_relevant = (y >= 2).astype(int)
        self.model.fit(scaled, is_relevant)
        # Accuracy is measured on the training data itself.
        train_accuracy = self.model.score(scaled, is_relevant)
        print(f'  Accuracy: {train_accuracy:.4f}')

    def predict(self, X):
        """Return the predicted probability of the positive class for each row."""
        scaled = self.scaler.transform(X)
        probabilities = self.model.predict_proba(scaled)
        return probabilities[:, 1]

# Train the pointwise ranker on the training queries.
pointwise = PointwiseRanker()
pointwise.fit(X_train, y_train)

## 3.3 LambdaMART-style Ranker (random-forest regression stand-in)

In [None]:
class LambdaMARTRanker:
    """Ranker that regresses the graded relevance label directly.

    NOTE: despite the name, the underlying model is a RandomForestRegressor
    fitted on individual rows, not a LambdaMART implementation.
    """

    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=100, max_depth=10)
        self.scaler = StandardScaler()

    def fit(self, X, y):
        """Fit the scaler and the regressor on features ``X`` and labels ``y``."""
        print('Training LambdaMART Ranker...')
        scaled = self.scaler.fit_transform(X)
        self.model.fit(scaled, y)
        # R2 is measured on the training data itself.
        train_r2 = self.model.score(scaled, y)
        print(f'  R2: {train_r2:.4f}')

    def predict(self, X):
        """Return the predicted relevance for each row of ``X``."""
        scaled = self.scaler.transform(X)
        return self.model.predict(scaled)

    def feature_importance(self, names):
        """Return (name, importance) tuples, most important first."""
        ranked = list(zip(names, self.model.feature_importances_))
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked

# Train the second ranker on the same training split.
lambdamart = LambdaMARTRanker()
lambdamart.fit(X_train, y_train)

# List the five features with the highest importance.
print('\nFeature Importance:')
for n, i in lambdamart.feature_importance(X.columns)[:5]:
    print(f'  {n}: {i:.4f}')

---

# 4. Evaluation

## 4.1 Ranking Metrics

In [None]:
class RankingMetrics:
    """Ranking-quality metrics over a list of relevance grades in ranked order."""

    @staticmethod
    def dcg(rels, k):
        """Discounted cumulative gain of the first ``k`` grades (linear gain)."""
        top = np.array(rels)[:k]
        # Rank r (1-based) is discounted by log2(r + 1).
        discounts = np.log2(np.arange(len(top)) + 2)
        return (top / discounts).sum()

    @staticmethod
    def ndcg(rels, k):
        """DCG divided by the DCG of the ideal ordering; 0 when the ideal is not positive."""
        actual = RankingMetrics.dcg(rels, k)
        ideal = RankingMetrics.dcg(sorted(rels, reverse=True), k)
        if ideal > 0:
            return actual / ideal
        return 0

    @staticmethod
    def mrr(rels, threshold=2):
        """Reciprocal rank of the first grade >= ``threshold``; 0 if there is none."""
        hit_ranks = (rank for rank, rel in enumerate(rels, 1) if rel >= threshold)
        first_hit = next(hit_ranks, None)
        return 0 if first_hit is None else 1 / first_hit

    @staticmethod
    def precision_at_k(rels, k, threshold=2):
        """Count of the first ``k`` grades >= ``threshold``, divided by ``k``."""
        top = np.array(rels)[:k]
        return (top >= threshold).sum() / k

# Sanity-check the metrics on a small hand-written ranking.
m = RankingMetrics()
test_rels = [4, 3, 0, 1, 2]
print(f'NDCG@5: {m.ndcg(test_rels, 5):.4f}')
print(f'MRR: {m.mrr(test_rels):.4f}')
print(f'P@5: {m.precision_at_k(test_rels, 5):.4f}')

In [None]:
def evaluate(ranker, X, y, qids, ks=(5, 10)):
    """Score ``ranker`` per query and average the ranking metrics.

    Args:
        ranker: object exposing ``predict(features)`` returning one score per row.
        X: feature DataFrame; rows are selected positionally with ``iloc``.
        y: relevance labels aligned row-for-row with ``X``.
        qids: query id for each row, used to group rows into result lists.
        ks: cutoffs for NDCG@k and Precision@k. A tuple is used as the default
            so it cannot be mutated across calls.

    Returns:
        ``{k: {'ndcg': mean, 'mrr': mean, 'prec': mean}}`` averaged over queries.
    """
    # Coerce to an array so y[rows] is always positional, matching X.iloc.
    y = np.asarray(y)

    rows_by_query = defaultdict(list)
    for row, qid in enumerate(qids):
        rows_by_query[qid].append(row)

    per_k = {k: {'ndcg': [], 'mrr': [], 'prec': []} for k in ks}
    for rows in rows_by_query.values():
        scores = ranker.predict(X.iloc[rows])
        # Relevance labels reordered from highest to lowest predicted score.
        ranked_rels = y[rows][np.argsort(scores)[::-1]]
        # MRR does not depend on the cutoff, so compute it once per query.
        reciprocal_rank = RankingMetrics.mrr(ranked_rels)

        for k in ks:
            per_k[k]['ndcg'].append(RankingMetrics.ndcg(ranked_rels, k))
            per_k[k]['mrr'].append(reciprocal_rank)
            per_k[k]['prec'].append(RankingMetrics.precision_at_k(ranked_rels, k))

    return {k: {name: np.mean(values) for name, values in by_metric.items()}
            for k, by_metric in per_k.items()}

# Evaluate both rankers on the held-out queries and print NDCG and MRR per cutoff.
for header, model in (('Pointwise Results:', pointwise),
                      ('\nLambdaMART Results:', lambdamart)):
    print(header)
    report = evaluate(model, X_test, y_test, qids_test)
    for k, v in report.items():
        print(f'  @{k}: NDCG={v["ndcg"]:.4f}, MRR={v["mrr"]:.4f}')

---

# 5. Deployment

In [None]:
class SearchService:
    """Two-stage search: a retriever proposes candidates, a ranker re-orders them.

    Collaborators are duck-typed:
      * ``retriever.rank(query, k)`` returns an iterable of (doc_id, score).
      * ``fe.extract(pairs, queries, docs)`` returns (features, labels, query_ids).
      * ``ranker.predict(features)`` returns one score per feature row.
      * ``docs`` is a DataFrame with at least 'doc_id' and 'title' columns.
    """

    def __init__(self, retriever, ranker, docs, fe):
        self.retriever = retriever
        self.ranker = ranker
        self.docs = docs
        self.fe = fe

    def _titles_for(self, doc_ids):
        """Map each requested doc_id to its title with a single scan of the catalogue."""
        subset = self.docs[self.docs['doc_id'].isin(doc_ids)]
        titles = {}
        for did, title in zip(subset['doc_id'], subset['title']):
            # Keep the first row if a doc_id is duplicated.
            titles.setdefault(did, title)
        return titles

    def search(self, query, k=10, n_candidates=50):
        """Retrieve, re-rank and return the top ``k`` documents for ``query``.

        Args:
            query: raw query text.
            k: number of results to return.
            n_candidates: how many documents to request from the retriever.

        Returns:
            ``{'query', 'results', 'latency_ms'}``. ``results`` is a list of
            ``{'doc_id', 'title', 'score'}`` dicts, best first. It is empty when
            the retriever returns no candidates.

        Raises:
            KeyError: if the retriever returns a doc_id that is absent from ``docs``.
        """
        import time  # kept local to the method, as in the original cell

        # perf_counter is monotonic, unlike wall-clock time.time().
        start = time.perf_counter()

        # Retrieval
        candidates = self.retriever.rank(query, n_candidates)
        cand_ids = [c[0] for c in candidates]
        if not cand_ids:
            # Nothing to rank; an empty pair frame would have no columns to join on.
            latency = (time.perf_counter() - start) * 1000
            return {'query': query, 'results': [], 'latency_ms': round(latency, 2)}

        title_by_id = self._titles_for(cand_ids)

        # Create features
        qwords = set(query.lower().split())
        pair_rows = []
        for pos, did in enumerate(cand_ids, 1):
            dwords = set(title_by_id[did].lower().split())
            # Computed up front as a float so the column never needs upcasting.
            kw_match = len(qwords & dwords) / len(qwords) if qwords else 0.0
            pair_rows.append({
                'query_id': 'q', 'doc_id': did, 'relevance': 0, 'clicked': False,
                'position': pos, 'keyword_match': kw_match,
                # NOTE(review): no category intent is known at query time, so this
                # feature is a constant 0 here, unlike in the generated training pairs.
                'category_match': 0,
            })
        pdf = pd.DataFrame(pair_rows)
        qdf = pd.DataFrame([{'query_id': 'q', 'query_text': query, 'category_intent': 'unknown'}])

        Xr, _, _ = self.fe.extract(pdf, qdf, self.docs)
        scores = self.ranker.predict(Xr)

        ranked = sorted(zip(cand_ids, scores), key=lambda x: x[1], reverse=True)[:k]
        # Latency covers retrieval and ranking, not the result formatting below.
        latency = (time.perf_counter() - start) * 1000

        results = [
            {'doc_id': did, 'title': title_by_id[did], 'score': float(score)}
            for did, score in ranked
        ]

        return {'query': query, 'results': results, 'latency_ms': round(latency, 2)}

# Wire the retriever, the ranker and the feature extractor into one service.
service = SearchService(bm25, lambdamart, docs_df, fe)

# Run a sample query and print the ranked titles with their scores.
response = service.search('laptop', 5)
print(f'Search: "{response["query"]}" ({response["latency_ms"]}ms)')
for i, r in enumerate(response['results'], 1):
    print(f"  {i}. {r['title']} ({r['score']:.3f})")

## 5.1 A/B Testing

In [None]:
class ABTest:
    """Simulated A/B experiment comparing per-request NDCG between two arms."""

    def __init__(self, control_ndcg=0.6, lift=0.05):
        self.control = control_ndcg
        # The treatment mean is the control mean scaled by the relative lift.
        self.treatment = control_ndcg * (1 + lift)

    def simulate(self, n=5000):
        """Draw ``n`` requests; return a frame with 'group' and 'ndcg' columns.

        Reseeds NumPy's global RNG with 42, so repeated calls give the same frame.
        """
        np.random.seed(42)
        mean_by_arm = {'control': self.control, 'treatment': self.treatment}
        records = []
        for _ in range(n):
            # Arm first, then the score, in that order for every request.
            arm = np.random.choice(['control', 'treatment'])
            score = np.clip(np.random.normal(mean_by_arm[arm], 0.15), 0, 1)
            records.append({'group': arm, 'ndcg': score})
        return pd.DataFrame(records)

    def analyze(self, df):
        """Return the arm means, relative lift in percent, t-test p-value and a 5% flag."""
        from scipy.stats import ttest_ind

        ctrl = df.loc[df['group'] == 'control', 'ndcg']
        treat = df.loc[df['group'] == 'treatment', 'ndcg']
        ctrl_mean = ctrl.mean()
        treat_mean = treat.mean()
        relative_lift = (treat_mean - ctrl_mean) / ctrl_mean * 100
        p_value = ttest_ind(ctrl, treat).pvalue
        return {
            'control': ctrl_mean,
            'treatment': treat_mean,
            'lift': relative_lift,
            'p': p_value,
            'sig': p_value < 0.05,
        }

# Simulate an experiment with the default settings and summarise the outcome.
ab = ABTest()
res = ab.simulate()
analysis = ab.analyze(res)

print('A/B Test:')
print(f"  Control NDCG: {analysis['control']:.4f}")
print(f"  Treatment NDCG: {analysis['treatment']:.4f}")
print(f"  Lift: {analysis['lift']:.2f}%")
print(f"  Significant: {analysis['sig']}")

---

# 6. Monitoring

In [None]:
class SearchMonitor:
    """In-memory log of served searches with simple aggregate metrics."""

    def __init__(self):
        self.logs = []

    def log(self, query, n_results, latency, click_pos=None):
        """Record one search.

        Args:
            query: query text.
            n_results: number of results returned.
            latency: observed latency for the request.
            click_pos: position of the clicked result, or None when nothing was clicked.
        """
        self.logs.append({
            'ts': datetime.now(),
            'query': query,
            'n_results': n_results,
            'latency': latency,
            'click_pos': click_pos
        })

    def metrics(self):
        """Aggregate the log into query count, latency quantiles, CTR and MRR.

        Returns:
            Dict with keys 'queries', 'latency_p50', 'latency_p99', 'ctr' and 'mrr'.
            With nothing logged the latencies are NaN and the rates are zero.
        """
        if not self.logs:
            # An empty log would build a frame with no columns to read from.
            nan = float('nan')
            return {'queries': 0, 'latency_p50': nan, 'latency_p99': nan,
                    'ctr': 0.0, 'mrr': 0}

        df = pd.DataFrame(self.logs)
        # Mixed int/None values can arrive as an object column; coerce it so the
        # reciprocal below is ordinary float arithmetic.
        click_positions = pd.to_numeric(df['click_pos']).dropna()
        mrr = (1 / click_positions).mean() if len(click_positions) > 0 else 0
        return {
            'queries': len(df),
            'latency_p50': df['latency'].quantile(0.5),
            'latency_p99': df['latency'].quantile(0.99),
            'ctr': len(click_positions) / len(df),
            'mrr': mrr
        }

monitor = SearchMonitor()

# Replay 50 random queries through the service with simulated clicks.
# 'book' is not one of the generator's keywords, so no generated title contains it.
for _ in range(50):
    q = np.random.choice(['laptop', 'shoes', 'phone', 'book'])
    r = service.search(q, 5)
    # A click on position 1, 2 or 3, or None (probability 0.65) for no click.
    click = np.random.choice([1, 2, 3, None], p=[0.2, 0.1, 0.05, 0.65])
    monitor.log(q, len(r['results']), r['latency_ms'], click)

met = monitor.metrics()
print('Monitoring:')
print(f"  Queries: {met['queries']}")
print(f"  Latency p50: {met['latency_p50']:.1f}ms")
print(f"  CTR: {met['ctr']:.2%}")
print(f"  MRR: {met['mrr']:.4f}")

---

# 7. Summary

In [None]:
# Print a static recap of the seven framework steps covered above.
print("""
7-Step Framework for Search Ranking
====================================

Step 1: Requirements
  - Scale: 100M docs, 100K QPS
  - Latency: p99 < 200ms

Step 2: Problem Framing  
  - Two-stage: Retrieval + Ranking
  - Learning-to-Rank approaches

Step 3: Data
  - Query-doc pairs with relevance
  - Implicit feedback (clicks)

Step 4: Models
  - BM25 baseline
  - Pointwise/Pairwise/Listwise
  - LambdaMART

Step 5: Evaluation
  - NDCG, MRR, Precision@K

Step 6: Deployment
  - Retrieval -> Ranking pipeline
  - A/B testing

Step 7: Monitoring
  - Latency, CTR, MRR tracking
""")

print('Tutorial 21 Complete!')