In [9]:
# =========================
# IMPORTS AND MODEL SETUP
# =========================
import ast # For safely evaluating string representations of Python literals
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

tqdm.pandas()

from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model used for both the combined review text and the search
# queries; chosen by the author as a balance of speed and quality.
# NOTE(review): the original comment also called this model "multilingual". The
# published model card for all-MiniLM-L6-v2 appears to describe English training
# data -- confirm before relying on it for non-English reviews.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# =========================
# DATA LOADING
# =========================
# Location of the input CSV. Set the COMBINED_RESULTS_CSV environment variable to
# point the notebook at a different file; when it is unset the original
# machine-specific path is used, so existing runs behave as before.
PATH = os.environ.get(
    "COMBINED_RESULTS_CSV",
    "/Users/amanjaiswal/Work/hop_v3/backend/combined_results.csv",
)
df = pd.read_csv(PATH)

# =========================
# DATA PREPROCESSING
# =========================
# Parse the JSON-encoded list of review dictionaries stored in each row
import json

def safe_literal_eval(s):
    """Decode one JSON-encoded cell into a list.

    Args:
        s: Raw cell value, expected to be a JSON document in a string.
            ``None`` and float NaN (e.g. an empty CSV cell) mean "no data".

    Returns:
        list: The decoded JSON array. A decoded JSON string is wrapped as a
        single-element list. Missing input yields an empty list.

    Raises:
        ValueError: If the decoded value is neither a list nor a string.
            Malformed JSON raises ``json.JSONDecodeError``, which is itself a
            ``ValueError`` subclass.
    """
    # Missing cell: None, or float NaN (NaN is the only float unequal to itself).
    if s is None or (isinstance(s, float) and s != s):
        return []

    formatted_data = json.loads(s)
    if isinstance(formatted_data, list):
        return formatted_data
    if isinstance(formatted_data, str):
        # Keep the string whole; list(str) would split it into single characters
        # and later explode into one row per character.
        return [formatted_data]
    raise ValueError("error for row", s)

# Decode every place's serialized review list into Python objects.
df['detailed_reviews'] = df['detailed_reviews'].apply(safe_literal_eval)

# One row per (place, review entry); only non-empty dict entries are kept.
df_reviews_exploded = df[['place_id', 'detailed_reviews']].explode('detailed_reviews')
is_nonempty_review = df_reviews_exploded['detailed_reviews'].apply(
    lambda entry: isinstance(entry, dict) and bool(entry)
)
df_reviews_exploded_filtered = df_reviews_exploded[is_nonempty_review]

# Flatten the review dicts into columns, then re-attach place_id by position
# (`.values` drops the index, so rows are matched in order).
df_reviews = pd.json_normalize(
    df_reviews_exploded_filtered['detailed_reviews']
).assign(place_id=df_reviews_exploded_filtered['place_id'].values)

# Place-level attributes to carry alongside each review.
place_features = ['place_id', 'name', 'main_category', 'rating', 'address', 'reviews']
df_places = df[place_features]

# Columns present on both sides get the '_review' / '_place' suffixes.
flat_df = df_reviews.merge(
    df_places,
    on='place_id',
    how='left',
    suffixes=('_review', '_place'),
)
flat_df.head()

# =========================
# FEATURE ENGINEERING
# =========================
# Filter and group reviews
filtered_df = flat_df[flat_df['rating_review'] >= 3]  # Optional: use only positive reviews

# Combine all non-null review texts of a place into one space-separated string.
# dropna=False keeps places whose category, address, rating or review count is
# missing; the groupby default would silently drop those places altogether.
# The rename is chained instead of using inplace=True so the cell builds vibe_df
# in a single expression.
vibe_df = (
    filtered_df
    .groupby(
        ['place_id', 'name_place', 'main_category', 'rating_place', 'address', 'reviews'],
        dropna=False,
    )['review_text']
    .apply(lambda texts: ' '.join(str(t) for t in texts if pd.notna(t)))
    .reset_index()
    .rename(columns={
        'name_place': 'place_name',
        'rating_place': 'avg_place_rating',
        'review_text': 'combined_reviews',
    })
)

def safe_encode(text):
    """Embed ``text`` with the module-level ``model``.

    Anything that is not a string, or is a string containing only whitespace,
    is embedded as the empty string instead.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    return model.encode(text if has_content else "")

# Embed each place's combined review text (progress bar via tqdm.pandas()).
combined_review_embeddings = vibe_df['combined_reviews'].progress_apply(safe_encode)
vibe_df['embedding'] = combined_review_embeddings

# Column subset kept as the searchable view of each place.
vibe_index_columns = [
    'place_id',
    'place_name',
    'embedding',
    'avg_place_rating',
    'reviews',
    'main_category',
    'address',
]
vibe_index = vibe_df[vibe_index_columns]

100%|██████████| 328/328 [00:07<00:00, 42.25it/s]


In [None]:
# adding city to the vibe df
import re

def extract_city_from_query(query):
    """Extract the trailing ``in <city>`` part of a search query.

    Args:
        query: The search query text. Non-string values (``None``, NaN,
            numbers) are treated as having no city.

    Returns:
        str: The lower-cased, stripped city name, or ``"unknown"`` when the
        query does not end with ``in <letters and spaces>``.
    """
    # Non-strings cannot carry a city and would fail on .lower() below.
    if not isinstance(query, str):
        return "unknown"

    # Normalize text
    query = query.lower().strip()

    # Look for a trailing 'in <city>' pattern. The \b makes "in" a whole word,
    # so a word that merely ends in "in" (e.g. "fountain pen shops") is not
    # mistaken for the preposition.
    match = re.search(r'\bin\s+([a-z\s]+)$', query)
    if match:
        return match.group(1).strip()

    return "unknown"

# Derive a city for every row of the raw frame from its search query.
df['city'] = df['query'].apply(extract_city_from_query)

# One row per distinct (place, city) pair, so repeated rows in df do not
# multiply the rows of vibe_df in the merge below.
place_cities = df[['place_id', 'city']].drop_duplicates()

# Drop any 'city' column left by a previous run of this cell; otherwise the
# merge would produce 'city_x' / 'city_y' and break the groupby('city') later.
vibe_df = (
    vibe_df
    .drop(columns='city', errors='ignore')
    .merge(place_cities, on='place_id', how='left')
)

In [13]:
# building faiss index per city
import faiss
# Maps city name -> {'index': FAISS inner-product index, 'df': that city's rows}.
city_indices = {}
for city, group in vibe_df.groupby('city'):
    # Stack this city's per-place vectors into one float32 matrix.
    embeddings = np.vstack(group['embedding'].to_list()).astype(np.float32)
    # Row-wise L2 normalization, done in place on the matrix.
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # Index positions line up with the reset 0..n-1 row positions of 'df'.
    city_indices[city] = dict(index=index, df=group.reset_index(drop=True))

# Save it all for later
with open("city_faiss_indices.pkl", "wb") as f:
    pickle.dump(city_indices, f)

In [15]:
def search_places_by_city(query, city, top_k=5):
    """Return the places in ``city`` whose review embedding best matches ``query``.

    Args:
        query: Free-text description to search for.
        city: City name; matched case-insensitively, ignoring surrounding
            whitespace, against the keys of ``city_indices``.
        top_k: Maximum number of places to return.

    Returns:
        pandas.DataFrame: Up to ``top_k`` rows with the columns ``place_name``,
        ``main_category``, ``avg_place_rating``, ``score`` and ``address``, in
        the order returned by the index search. An empty, column-less frame is
        returned (after printing a message) when the city has no index.
    """
    result_columns = ['place_name', 'main_category', 'avg_place_rating', 'score', 'address']

    # City keys are built lower-cased and stripped, so normalize the same way.
    city = city.lower().strip()
    if city not in city_indices:
        print("No data for this city")
        return pd.DataFrame()

    city_data = city_indices[city]
    index = city_data['index']
    places = city_data['df']

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, which iloc would read as "the last row".
    k = min(int(top_k), index.ntotal)
    if k < 1:
        return pd.DataFrame(columns=result_columns)

    # Encode the query as a (1, dim) float32 matrix and L2-normalize it in
    # place, as was done for the stored vectors.
    query_emb = model.encode(query).astype('float32').reshape(1, -1)
    faiss.normalize_L2(query_emb)
    distances, indices = index.search(query_emb, k)

    # Defensive: ignore any padded (-1) labels that still come back.
    valid = indices[0] >= 0
    results = places.iloc[indices[0][valid]].copy()
    results['score'] = distances[0][valid]
    return results[result_columns]


In [17]:
# Example lookup: best matches for a free-text query among the places indexed
# under 'new delhi', using the default top_k of 5.
search_places_by_city('very cool place', 'new delhi')

Unnamed: 0,place_name,main_category,avg_place_rating,score,address
85,I Love Delhi installation selfie point by DSA,Tourist attraction,4.5,0.388395,"Plot No 3397, 2nd Floor, Pusa, Chowk, Patel Na..."
67,Waste to Wonder Theme Park,Theme park,4.1,0.379414,"Block A, Ganga Vihar, Sarai Kale Khan, New Del..."
34,The Garden of Five Senses,Garden,4.1,0.378027,"Westend Marg, Saidulajab, Saiyad ul Ajaib, Sak..."
15,Sunset Point - Akshardham,Tourist attraction,4.7,0.37801,"J79H+4MJ, Pandav Nagar, New Delhi, Delhi, 110092"
28,Mumtaz Mahal,Historical landmark,4.2,0.376917,"M63V+R93, Red Fort, Old Delhi, New Delhi, Delh..."


In [None]:
#