In [1]:
# =========================
# DATA LOADING
# =========================
import pandas as pd
from tqdm import tqdm
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast # For safely evaluating string representations of Python literals

tqdm.pandas()

from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# This is a great balance of speed and quality (multilingual too)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# =========================
# DATA LOADING
# =========================
PATH = "/Users/amanjaiswal/Work/hop_v3/backend/combined_results.csv"
df = pd.read_csv(PATH)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import json

def safe_literal_eval(s):
    try:
        formatted_data = json.loads(s)
        if isinstance(formatted_data, list):
            return formatted_data
        elif isinstance(formatted_data, str):
            return list(formatted_data)
    except (json.JSONDecodeError, TypeError):
        return [] # Return empty list for invalid data
    raise Exception("error for row", s)

df['detailed_reviews'] = df['detailed_reviews'].apply(safe_literal_eval)
df_reviews_exploded = df[['place_id', 'detailed_reviews']].explode('detailed_reviews')
df_reviews_exploded_filtered = df_reviews_exploded[
    df_reviews_exploded['detailed_reviews'].apply(lambda x: isinstance(x, dict) and bool(x))
]
df_reviews = pd.json_normalize(df_reviews_exploded_filtered['detailed_reviews'])
df_reviews['place_id'] = df_reviews_exploded_filtered['place_id'].values
place_features = ['place_id', 'name', 'main_category', 'rating', 'address', 'reviews']
df_places = df[place_features]

flat_df = df_reviews.merge(df_places, on='place_id', how='left', suffixes=('_review', '_place'))

In [None]:
# Feature Engineering
filtered_df = flat_df[flat_df['rating_review'] >= 3]
vibe_df = filtered_df.groupby(
    ['place_id', 'name_place', 'main_category', 'rating_place', 'address', 'reviews']
)['review_text'].apply(lambda texts: " ".join([str(t) for t in texts if pd.notna(t)])).reset_index()

vibe_df.rename(columns={
    'name_place': 'place_name',
    'rating_place': 'avg_place_rating',
    'review_text': 'combined_reviews'
}, inplace=True)

def safe_encode(text):
    if isinstance(text, str) and text.strip():
        return model.encode(text)
    return model.encode("")

vibe_df['embedding'] = vibe_df['combined_reviews'].progress_apply(safe_encode)

100%|██████████| 328/328 [00:09<00:00, 35.85it/s]


In [5]:
# City level indexing
import re

def extract_city_from_query(query):
    if pd.isna(query):
        return "unknown"
    
    query = query.lower().strip()
    match = re.search(r'in\s+([a-z\s]+)$', query)
    if match:
        return match.group(1).strip()
    
    return "unknown"

df['city'] = df['query'].apply(extract_city_from_query)

vibe_df = vibe_df.merge(df[['place_id', 'city']].drop_duplicates(), on='place_id', how='left')

import faiss
city_indices = {}
for city, group in vibe_df.groupby('city'):
    if city == 'unknown': continue
    embeddings = np.vstack(group['embedding'].values).astype('float32')
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    city_indices[city] = {
        'index': index,
        'df': group.reset_index(drop=True)
    }

with open("city_faiss_indices.pkl", "wb") as f:
    pickle.dump(city_indices, f)

In [10]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def search_places_by_city_advanced(query, city, top_k=5, weights={'similarity': 0.6, 'rating': 0.2, 'popularity': 0.2}):
    city = city.lower()
    if city not in city_indices:
        print(f"No data for this city: {city}")
        return pd.DataFrame()
    
    city_data = city_indices[city]
    index = city_data['index']
    df = city_data['df']

    # 1. Get initial candidates from FAISS
    k_for_retrieval = 25
    query_emb = model.encode(query).astype('float32')
    faiss.normalize_L2(query_emb.reshape(1, -1))
    distances, indices = index.search(query_emb.reshape(1, -1), k_for_retrieval)


    # 2. create df with candidates
    results = df.iloc[indices[0]].copy()
    results['similarity_score'] = distances[0]
    results['rating_score'] = results['avg_place_rating'] / 5.0

    reviews_scaler = MinMaxScaler()
    city_reviews_log = np.log1p(df['reviews'].values.reshape(-1, 1))
    reviews_scaler.fit(city_reviews_log)
    results['popularity_score'] = reviews_scaler.transform(np.log1p(results['reviews'].values.reshape(-1, 1)))

    results['combined_score'] = (
        weights['similarity'] * results['similarity_score'] +
        weights['rating'] * results['rating_score'] +
        weights['popularity'] * results['popularity_score']
    )

    final_results = results.sort_values(by='combined_score', ascending=False)
    return final_results[['place_name', 'main_category', 'avg_place_rating', 'reviews', 'combined_score', 'address']]

In [11]:
search_results = search_places_by_city_advanced('instagrammable cafes with good coffee', 'new delhi')
search_results

Unnamed: 0,place_name,main_category,avg_place_rating,reviews,combined_score,address
67,Waste to Wonder Theme Park,Theme park,4.1,28086,0.485811,"Block A, Ganga Vihar, Sarai Kale Khan, New Del..."
78,Khan Market,Market,4.4,63335,0.484679,"Khan Market, Rabindra Nagar, New Delhi, Delhi ..."
45,Hauz Khas District Park,Park,4.4,17565,0.479251,"Africa Avenue, Hauz Khas, New Delhi, Delhi 110016"
33,Talkatora Garden,Garden,4.5,4376,0.476067,"J5FW+82R, Rashtrapati Bhawan, President's Esta..."
28,Mumtaz Mahal,Historical landmark,4.2,465,0.469363,"M63V+R93, Red Fort, Old Delhi, New Delhi, Delh..."
3,Paschim Vihar District Park,Park,4.3,4678,0.466253,"M3CW+FCF, Outer Ring Rd, A-2 Paschim Vihar, Po..."
43,Bharat Darshan Park,Park,4.3,9149,0.451141,"Punjabi Bagh Xing, Ring Rd, JJ Colony, Moti Na..."
72,Cannaught place,Tourist attraction,4.7,2296,0.450353,"J6M9+PMF, Block B, Connaught Place, New Delhi,..."
41,Mehrauli Archeological Park Walk,Historical landmark,4.3,3394,0.444336,"Anuvrat Marg, opposite Qutab Minar Metro Stati..."
14,Indraprastha Park,Park,4.2,9017,0.439004,"Ring Road, Grand Trunk Road, Block A, Ganga Vi..."


### New features added
Combined text similarity with place rankings and popularity


### Algorithm explained
**Initialization**:

a. Select the correct FAISS index and place data D from CityIndices based on the input city.
b. If the city does not exist, terminate and return an empty list.

**Query Encoding**:

a. Convert the input query into a numerical vector q_vec using the Encoder.
b. Normalize q_vec to have a unit length (L2 normalization).

**Candidate Retrieval (The "Get" Step)**:

a. Use the FAISS index for the selected city to find the k_retrieval places whose embeddings are most similar to q_vec.
b. This returns a list of k_retrieval candidate places, C, along with their raw similarity_scores.

**Feature Normalization & Re-ranking (The "Re-rank" Step)**:

a. For each candidate place c in C:
i. Normalize Rating: Calculate rating_score = c.avg_place_rating / 5.0.
ii. Normalize Popularity: Calculate popularity_score by applying a log transformation to c.reviews and then scaling the result to a [0, 1] range using a Min-Max scaler fitted on the entire city's data D.
iii. Calculate Combined Score: Compute combined_score = (W_sim * c.similarity_score) + (W_rat * c.rating_score) + (W_pop * c.popularity_score).
iv. Store the combined_score for candidate c

**Final Selection**:

a. Sort the list of candidates C in descending order based on their combined_score.
b. Return the first top_k places from the sorted list.

