### Data Loading, Cleaning, Preprocessing, Feature Engineering, Model Training

In [27]:
# =========================
# IMPORTS AND MODEL SETUP
# =========================
import pandas as pd
from tqdm import tqdm
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast # For safely evaluating string representations of Python literals

tqdm.pandas()

from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# all-MiniLM-L6-v2 is a compact sentence-embedding checkpoint (good speed/quality balance).
# NOTE(review): this checkpoint is not a multilingual model -- per its model card it is
# trained on English data. If non-English review text matters, evaluate a multilingual
# checkpoint (e.g. 'paraphrase-multilingual-MiniLM-L12-v2') -- confirm before switching.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# =========================
# DATA LOADING
# =========================
import os  # only needed for the path override below

# Location of the input CSV (one row per place, with a `detailed_reviews` column).
# Set the COMBINED_RESULTS_CSV environment variable to run this notebook on another
# machine without editing the cell; the original local path remains the default.
PATH = os.environ.get(
    "COMBINED_RESULTS_CSV",
    "/Users/amanjaiswal/Work/hop_v3/backend/combined_results.csv",
)
df = pd.read_csv(PATH)

In [28]:
# =========================
# DATA PREPROCESSING
# =========================
# Safely evaluate the string representation of the list of dictionaries
import json

def safe_literal_eval(s):
    """Parse one raw ``detailed_reviews`` cell into a Python list.

    The cell is decoded as JSON first; if that fails, it is parsed as a Python
    literal with ``ast.literal_eval`` (covers single-quoted ``repr`` output).

    Args:
        s: Raw cell value. Normally a string; ``None``, float NaN, blank strings
            and already-parsed lists are accepted too.

    Returns:
        list: The parsed list. Missing/blank cells and a parsed ``null`` give ``[]``.
        A value that decodes to a bare string is returned as a list of its
        characters (behaviour kept from the original implementation).

    Raises:
        ValueError: If the text cannot be parsed, or parses to an unsupported type.
    """
    # Already parsed (e.g. the cell was re-run on the mutated column): pass through,
    # which makes applying this function to the column idempotent.
    if isinstance(s, list):
        return s
    # Missing cell: None or float NaN (NaN is the only float for which s != s).
    if s is None or (isinstance(s, float) and s != s):
        return []
    if isinstance(s, str) and not s.strip():
        return []

    try:
        formatted_data = json.loads(s)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        try:
            formatted_data = ast.literal_eval(s)
        except (ValueError, SyntaxError) as exc:
            raise ValueError("error for row", s) from exc

    if formatted_data is None:
        return []
    if isinstance(formatted_data, list):
        return formatted_data
    if isinstance(formatted_data, str):
        # NOTE(review): splitting a bare string into characters is kept for backward
        # compatibility; it looks unintended -- confirm whether [] would be better.
        return list(formatted_data)
    raise ValueError("error for row", s)

# Parse every raw cell of the column with safe_literal_eval, store the result back on
# the frame, then display the parsed column.
parsed_reviews = df['detailed_reviews'].apply(safe_literal_eval)
df['detailed_reviews'] = parsed_reviews
df['detailed_reviews']

0      [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT2xRM1...
1      [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT25Cb2...
2      [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT25sU2...
3      [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT2xSSF...
4      [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT21sTW...
                             ...                        
324    [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT2s5VG...
325    [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT2tONl...
326    [{'review_id': 'ChdDSUhNMG9nS0VJQ0FnSURkOTkzYT...
327    [{'review_id': 'Ci9DQUlRQUNvZENodHljRjlvT2w4MV...
328    [{'review_id': 'ChdDSUhNMG9nS0VJanh0YXFhOEliRn...
Name: detailed_reviews, Length: 329, dtype: object

In [10]:
# One row per (place_id, review): expand each place's review list into separate rows.
df_reviews_exploded = df[['place_id', 'detailed_reviews']].explode('detailed_reviews')

# Keep only rows whose exploded value is a non-empty dict; anything else is discarded.
is_nonempty_dict = df_reviews_exploded['detailed_reviews'].apply(
    lambda review: bool(review) if isinstance(review, dict) else False
)
df_reviews_exploded_filtered = df_reviews_exploded[is_nonempty_dict]

In [24]:
# Flatten each review dict into its own columns.
df_reviews = pd.json_normalize(df_reviews_exploded_filtered['detailed_reviews'])
# Attach place_id positionally: a raw array bypasses index alignment with df_reviews.
df_reviews['place_id'] = df_reviews_exploded_filtered['place_id'].to_numpy()

# Place-level columns to carry onto every review row.
place_features = ['place_id', 'name', 'main_category', 'rating', 'address', 'reviews']
df_places = df[place_features]

# Left join keeps every review; columns present on both sides get _review / _place suffixes.
flat_df = pd.merge(
    df_reviews,
    df_places,
    how='left',
    on='place_id',
    suffixes=('_review', '_place'),
)
flat_df.head()

Unnamed: 0,review_id,review_link,name_review,reviewer_id,reviewer_profile,rating_review,review_text,published_at,published_at_date,response_from_owner_text,response_from_owner_ago,response_from_owner_date,total_number_of_reviews_by_reviewer,total_number_of_photos_by_reviewer,is_local_guide,review_translated_text,response_from_owner_translated_text,experience_details,review_photos,place_id,name_place,main_category,rating_place,address,reviews
0,Ci9DQUlRQUNvZENodHljRjlvT2xRM1ZtdExURkJ1T0hKQm...,https://www.google.com/maps/reviews/data=!4m8!...,Deblina Pandit,103667994135310950438,https://www.google.com/maps/contrib/1036679941...,1,,17 minutes ago,2025-06-22T16:22:31,,,,5,0,False,,,[],[],ChIJN1ZKKUkWrjsRzxIVM363-LE,Bengaluru Palace,Historical place museum,4.2,"Bengaluru, Karnataka",95506
1,Ci9DQUlRQUNvZENodHljRjlvT21wTlJGWnBiVzU1VURob2...,https://www.google.com/maps/reviews/data=!4m8!...,Shiv Wanjare,106388887931751951842,https://www.google.com/maps/contrib/1063888879...,5,Good place,44 minutes ago,2025-06-22T15:55:14,,,,2,28,True,,,[],[],ChIJN1ZKKUkWrjsRzxIVM363-LE,Bengaluru Palace,Historical place museum,4.2,"Bengaluru, Karnataka",95506
2,Ci9DQUlRQUNvZENodHljRjlvT2xvMU5rWnBWVEEwY1RGSG...,https://www.google.com/maps/reviews/data=!4m8!...,Bhadra Balraj,100542547900331997225,https://www.google.com/maps/contrib/1005425479...,5,Famous attraction in Bangalore known for it's ...,an hour ago,2025-06-19T17:35:14,,,,148,1073,True,,,[],"[{'id': 'CIABIhCuj9u1yT9KdrSQIUNiSBBK', 'url':...",ChIJN1ZKKUkWrjsRzxIVM363-LE,Bengaluru Palace,Historical place museum,4.2,"Bengaluru, Karnataka",95506
3,ChdDSUhNMG9nS0VJQ0FnSUM3OTl2ZXlRRRAB,https://www.google.com/maps/reviews/data=!4m8!...,Mukesh Sahu,114626764562783545060,https://www.google.com/maps/contrib/1146267645...,1,,an hour ago,2024-08-19T05:10:49,,,,0,0,False,,,[],[],ChIJN1ZKKUkWrjsRzxIVM363-LE,Bengaluru Palace,Historical place museum,4.2,"Bengaluru, Karnataka",95506
4,Ci9DQUlRQUNvZENodHljRjlvT2tZM1VEQTRUV3N5VWpScl...,https://www.google.com/maps/reviews/data=!4m8!...,Chandra sekhar Nayak,116956927774497165661,https://www.google.com/maps/contrib/1169569277...,3,Everything things was good but the ticket pric...,an hour ago,2025-06-22T14:51:03,,,,1,13,True,,,[],[],ChIJN1ZKKUkWrjsRzxIVM363-LE,Bengaluru Palace,Historical place museum,4.2,"Bengaluru, Karnataka",95506


In [23]:
# =========================
# FEATURE ENGINEERING
# =========================
# Filter and group reviews
# Optional filter: keep reviews rated 3 or higher (neutral and positive); the text of
# lower-rated reviews is left out of the combined text.
filtered_df = flat_df[flat_df['rating_review'] >= 3]

# Group and combine reviews per place, then rename to the index's column names.
# dropna=False: the place metadata columns are only carried along as group keys, so a
# place with a missing value in any of them (e.g. no address) must not be silently
# dropped -- groupby discards NaN keys by default.
vibe_df = (
    filtered_df
    .groupby(
        ['place_id', 'name_place', 'main_category', 'rating_place', 'address', 'reviews'],
        dropna=False,
    )['review_text']
    # Join the non-missing review texts of a place into one space-separated string.
    .apply(lambda texts: ' '.join(str(t) for t in texts if pd.notna(t)))
    .reset_index()
    .rename(columns={
        'name_place': 'place_name',
        'rating_place': 'avg_place_rating',
        'review_text': 'combined_reviews',
    })
)

vibe_df.head()

Unnamed: 0,place_id,place_name,main_category,avg_place_rating,address,reviews,combined_reviews
0,ChIJ-2lgoG0XrjsRjQvrxjfPtY8,The MadLabs: Where Fun Meets Fascination,Tourist attraction,4.7,"1st floor, Snow city, Jayamahal Main Rd, oppos...",2133,The experience for me was really fun and infor...
1,ChIJ-8Jcp109rjsRe0QXmcbU8Ic,Jaya Prakash Narayana Park,Park,4.3,"22/1, Tank Bund Road, Brindavan Nagar, Bandapp...",23295,Good Nature with three parks It's have wide a...
2,ChIJ-WAdnZAVrjsRThpPifox97Q,Yediyuru Lake Park,Park,4.4,"Shop No. 1, 22nd Cross Rd, Yadiyur, Jayanagar,...",3838,Visited last week. It was relaxing. Birds rabb...
3,ChIJ-aC0xKbJ5zsR5UvyiXt5m6E,Juhu Garden,Garden,4.1,"22, Juhu Tara Rd, Hasmukh Nagar, Santacruz (We...",2745,Good place for evening walk Well Maintained It...
4,ChIJ-yYkJh4dDTkRE6uykaS2da0,The Square,Tourist attraction,4.7,"Aerocity, New Delhi, Delhi 110037",64,Great ambiance!!! Mini Europe in Delhi Aeroci...


In [19]:
# =========================
# EMBEDDING GENERATION (FEATURE TRANSFORMATION)
# =========================
# Generate embeddings and store them
def safe_encode(text):
    """Embed ``text`` with the notebook-level ``model``.

    Any value that is not a non-blank string (non-strings, empty or
    whitespace-only strings) is embedded as the empty string instead.
    """
    usable = isinstance(text, str) and bool(text.strip())
    return model.encode(text if usable else "")

# Embed each place's combined review text; progress_apply is the tqdm-enabled apply
# registered by tqdm.pandas().
review_embeddings = vibe_df['combined_reviews'].progress_apply(safe_encode)
vibe_df['embedding'] = review_embeddings

# Columns that make up the search index.
index_columns = [
    'place_id',
    'place_name',
    'embedding',
    'avg_place_rating',
    'reviews',
    'main_category',
    'address',
]
vibe_index = vibe_df[index_columns]

vibe_index.head()

100%|██████████| 328/328 [00:07<00:00, 43.68it/s]


Unnamed: 0,place_id,place_name,embedding,avg_place_rating,reviews,main_category,address
0,ChIJ-2lgoG0XrjsRjQvrxjfPtY8,The MadLabs: Where Fun Meets Fascination,"[-0.017609939, 0.0003872272, 0.07355433, 0.041...",4.7,2133,Tourist attraction,"1st floor, Snow city, Jayamahal Main Rd, oppos..."
1,ChIJ-8Jcp109rjsRe0QXmcbU8Ic,Jaya Prakash Narayana Park,"[0.10411609, -0.034203626, 0.11249071, 0.06483...",4.3,23295,Park,"22/1, Tank Bund Road, Brindavan Nagar, Bandapp..."
2,ChIJ-WAdnZAVrjsRThpPifox97Q,Yediyuru Lake Park,"[0.05275708, 0.014429987, 0.05567679, 0.053803...",4.4,3838,Park,"Shop No. 1, 22nd Cross Rd, Yadiyur, Jayanagar,..."
3,ChIJ-aC0xKbJ5zsR5UvyiXt5m6E,Juhu Garden,"[0.054459933, 0.06808016, 0.015984707, 0.04141...",4.1,2745,Garden,"22, Juhu Tara Rd, Hasmukh Nagar, Santacruz (We..."
4,ChIJ-yYkJh4dDTkRE6uykaS2da0,The Square,"[0.123570435, 0.04291612, 0.034333203, 0.02992...",4.7,64,Tourist attraction,"Aerocity, New Delhi, Delhi 110037"


In [20]:
# =========================
# ARTIFACT PERSISTENCE
# =========================
# Persist the index DataFrame to vibe_index.pkl in the working directory.
with open("vibe_index.pkl", "wb") as index_file:
    pickle.dump(vibe_index, index_file)

In [22]:
# =========================
# MODEL LOADING AND INFERENCE
# =========================
import pickle
import numpy as np

# =========================
# MODEL LOADING
# =========================
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


# Instantiate the encoder again with the same checkpoint name as in the first cell, so
# that `model` is bound in this cell before it is passed to search_places below.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def search_places(query, vibe_index_df, model, top_k=5):
    """Rank places by cosine similarity between their embedding and the query's.

    Args:
        query: Free-text search string passed to ``model.encode``.
        vibe_index_df: DataFrame with an ``embedding`` column holding one numpy
            vector per row. Rows whose embedding is not a non-empty
            ``np.ndarray`` are ignored.
        model: Object exposing ``encode(text)`` that returns a 1-D vector with
            the same length as the stored embeddings.
        top_k: Maximum number of rows to return.

    Returns:
        DataFrame: The ``top_k`` best-matching rows (a copy) with an extra
        ``score`` column, sorted by descending score. If no row has a valid
        embedding, a message is printed and an empty frame with the input's
        columns plus ``score`` is returned, so column selection on the result
        still works.
    """
    query_vec = np.asarray(model.encode(query)).reshape(-1)

    # astype(bool) keeps this a boolean mask even when the input frame has no rows.
    has_embedding = vibe_index_df['embedding'].apply(
        lambda x: isinstance(x, np.ndarray) and x.size > 0
    ).astype(bool)
    valid_embeddings_df = vibe_index_df[has_embedding].copy()
    if valid_embeddings_df.empty:
        print("No valid embeddings found for similarity calculation")
        empty_result = vibe_index_df.iloc[0:0].copy()
        empty_result['score'] = np.array([], dtype=float)
        return empty_result

    # Score all rows in one vectorized pass instead of one similarity call per row.
    matrix = np.vstack(valid_embeddings_df['embedding'].to_list())
    row_norms = np.linalg.norm(matrix, axis=1)
    query_norm = np.linalg.norm(query_vec)
    # Guard against division by zero: an all-zero vector scores 0.0 rather than NaN.
    row_norms[row_norms == 0] = 1.0
    if query_norm == 0:
        query_norm = 1.0

    valid_embeddings_df['score'] = (matrix @ query_vec) / (row_norms * query_norm)
    return valid_embeddings_df.sort_values('score', ascending=False).head(top_k)


# Read the persisted index back from vibe_index.pkl.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only load
# a vibe_index.pkl that was written by this notebook.
with open("vibe_index.pkl", "rb") as index_file:
    vibe_index = pickle.load(index_file)

# Run an example query and print the main columns of the best matches.
result_columns = ['place_name', 'main_category', 'avg_place_rating', 'score', 'address']
results = search_places("good crowd", vibe_index, model)
print(results[result_columns])

                         place_name  ...                                            address
247                I LOVE NEW DELHI  ...  J6M9+4XP, Connaught Place, New Delhi, Delhi 11...
77              #Orlem Selfie Point  ...  5RWQ+52R, near Our Lady of Lourdes Church, Mal...
239         TIB Pickup Point Mumbai  ...  Bus Stand, next to Kopri, Kopri, Thane East, T...
192                 Cutu View Point  ...  Aishwarya apartment, 4th A Main Road, Thayappa...
156  Yellow line metro, Rajiv Chowk  ...  J6M9+6X3, Inner Cir, Connaught Place, New Delh...

[5 rows x 5 columns]
