In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp310-cp310-macosx_14_0_arm64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp310-cp310-macosx_14_0_arm64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [7]:
# =========================
# IMPORTS AND MODEL SETUP
# =========================
import pandas as pd
from tqdm import tqdm
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast # For safely evaluating string representations of Python literals

tqdm.pandas()

from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model used to vectorize review text: small and fast with good quality.
# NOTE: per its model card, all-MiniLM-L6-v2 is an English model, not a multilingual one;
# a multilingual checkpoint (e.g. paraphrase-multilingual-MiniLM-L12-v2) would be needed
# for non-English reviews.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# =========================
# DATA LOADING
# =========================
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
PATH = "/Users/amanjaiswal/Work/hop_v3/backend/combined_results.csv"
df = pd.read_csv(PATH)

# =========================
# DATA PREPROCESSING
# =========================
# Parse the JSON-encoded list of review dictionaries stored in each row
import json

def safe_literal_eval(s):
    """Decode the JSON text of one ``detailed_reviews`` cell into a list.

    Despite the name, the value is parsed with ``json.loads``, not
    ``ast.literal_eval``.

    Args:
        s: JSON text to decode. ``None`` and NaN (how pandas represents an
            empty CSV cell) are treated as "no reviews".

    Returns:
        list: The decoded JSON array; an empty list for a missing value; or,
        when the JSON decodes to a string, the list of its characters
        (original behaviour, kept as-is).

    Raises:
        ValueError: If ``s`` is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError``) or decodes to something other than a list or string.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids passing a float to json.loads, which raises TypeError.
    if s is None or (isinstance(s, float) and s != s):
        return []

    formatted_data = json.loads(s)
    if isinstance(formatted_data, list):
        return formatted_data
    if isinstance(formatted_data, str):
        # NOTE(review): splitting a string into characters looks unintentional;
        # the downstream filter keeps only dict elements, so these are dropped — confirm.
        return list(formatted_data)
    raise ValueError(
        "detailed_reviews must decode to a list or string, got "
        f"{type(formatted_data).__name__}: {s!r}"
    )

# Decode each row's JSON-encoded review payload into a Python list (see safe_literal_eval).
df['detailed_reviews'] = df['detailed_reviews'].apply(safe_literal_eval)


# One row per (place, review): explode the per-place list so each element gets its own row.
df_reviews_exploded = df[['place_id', 'detailed_reviews']].explode('detailed_reviews')
# Keep only non-empty dict entries; this drops the NaN rows that explode() produces
# for empty lists as well as any non-dict elements.
df_reviews_exploded_filtered = df_reviews_exploded[
    df_reviews_exploded['detailed_reviews'].apply(lambda x: isinstance(x, dict) and bool(x))
]

# Flatten each review dict into columns. json_normalize returns a fresh 0..n-1 index,
# so place_id is re-attached positionally via .values rather than by index alignment.
df_reviews = pd.json_normalize(df_reviews_exploded_filtered['detailed_reviews'])
df_reviews['place_id'] = df_reviews_exploded_filtered['place_id'].values
# Place-level columns carried alongside each review.
place_features = ['place_id', 'name', 'main_category', 'rating', 'address', 'reviews']
df_places = df[place_features]

# Left-join place metadata onto each review; columns present in both frames are
# disambiguated with the _review / _place suffixes.
# NOTE(review): assumes place_id is unique in df; duplicate ids would duplicate review rows — confirm.
flat_df = df_reviews.merge(df_places, on='place_id', how='left', suffixes=('_review', '_place'))
# Not the last expression of the cell, so this preview is not displayed.
flat_df.head()

# =========================
# FEATURE ENGINEERING
# =========================
# Filter and group reviews
# Keep reviews rated 3 stars or higher (neutral-to-positive). Rows with a missing
# review rating are excluded as well, because NaN >= 3 evaluates to False.
filtered_df = flat_df[flat_df['rating_review'] >= 3]

# Group and combine reviews per place
# dropna=False: by default groupby discards every group whose key contains NaN, which
# would silently remove places that lack a category, address, rating or review count.
vibe_df = filtered_df.groupby(
    ['place_id', 'name_place', 'main_category', 'rating_place', 'address', 'reviews'],
    dropna=False,
)['review_text'].apply(lambda texts: ' '.join(str(t) for t in texts if pd.notna(t))).reset_index()

# Rename columns (reassign instead of inplace=True so the step reads as a plain transform)
vibe_df = vibe_df.rename(columns={
    'name_place': 'place_name',
    'rating_place': 'avg_place_rating',
    'review_text': 'combined_reviews'
})

def safe_encode(text):
    """Embed ``text`` with the module-level ``model``.

    Anything that is not a string, and any string that is empty or only
    whitespace, is embedded as the empty string instead.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    return model.encode(text if has_content else "")

# Embed each place's combined review text one row at a time (progress bar comes from tqdm.pandas()).
# NOTE(review): the model truncates long inputs (256 word pieces per the all-MiniLM-L6-v2
# model card — confirm), so for places with many reviews only the leading text may
# influence the embedding.
vibe_df['embedding'] = vibe_df['combined_reviews'].progress_apply(safe_encode)
# Column subset of vibe_df holding the embedding together with the place metadata.
vibe_index = vibe_df[['place_id', 'place_name', 'embedding', 'avg_place_rating', 'reviews', 'main_category', 'address']]


100%|██████████| 328/328 [00:09<00:00, 35.05it/s]


In [8]:
import faiss
import numpy as np

# Build the FAISS index
# Stack the per-place embedding vectors into a single float32 matrix, one row per place.
embedding_matrix = np.vstack(vibe_df['embedding'].to_numpy()).astype(np.float32)
# Scale every row to unit L2 norm in place, so inner product equals cosine similarity.
faiss.normalize_L2(embedding_matrix)

# Exact (brute-force) inner-product index sized to the embedding dimension
embedding_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embedding_matrix)

# Persist the index and the place metadata side by side
faiss.write_index(index, "vibe_faiss.index")
vibe_df.to_pickle("vibe_df.pkl")


# Search function using FAISS
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load model, vibe metadata, and FAISS index
# These rebind model, vibe_df and index from the persisted artifacts, so the search
# functions below run against what was written to disk.
# NOTE: read_pickle can execute arbitrary code; only load pickle files produced by this notebook.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
vibe_df = pd.read_pickle("vibe_df.pkl")
index = faiss.read_index("vibe_faiss.index")

def encode_and_normalize(query):
    """Embed ``query`` and scale the result to unit L2 norm.

    Returns a single-row, 2-D float32 array, which is the shape passed to
    ``index.search`` by the caller.
    """
    query_row = model.encode(query).astype('float32').reshape(1, -1)
    # normalize_L2 works in place on the array it is given.
    faiss.normalize_L2(query_row)
    return query_row

def search_places_faiss(query, top_k=5):
    """Return the places whose review embeddings best match ``query``.

    Args:
        query: Free-text description to search for.
        top_k: Maximum number of places to return.

    Returns:
        pandas.DataFrame: Up to ``top_k`` rows of ``vibe_df`` in the order
        FAISS returned them, with the columns ``place_name``,
        ``main_category``, ``avg_place_rating``, ``score`` and ``address``.
        ``score`` is the value FAISS reports for each hit; for the
        inner-product index over L2-normalized vectors built in this
        notebook that is the cosine similarity.
    """
    query_emb = encode_and_normalize(query)
    distances, indices = index.search(query_emb, top_k)

    # FAISS pads the result with label -1 when the index holds fewer than top_k
    # vectors; iloc[-1] would silently return the last row as a bogus hit, so
    # the padding is removed before the metadata lookup.
    found = indices[0] >= 0
    results = vibe_df.iloc[indices[0][found]].copy()
    results['score'] = distances[0][found]
    return results[['place_name', 'main_category', 'avg_place_rating', 'score', 'address']]


In [12]:
# Example query: fetch the best-matching places (default top_k=5) for a free-text vibe description.
results = search_places_faiss("great vibes for gen z")
print(results)

                                   place_name       main_category  \
243                          I love ic heart   Tourist attraction   
197               Indian Flag Tower Bengaluru  Tourist attraction   
0    The MadLabs: Where Fun Meets Fascination  Tourist attraction   
46        Indira Gandhi Musical Fountain Park                Park   
177                Waste to Wonder Theme Park          Theme park   

     avg_place_rating     score  \
243               4.4  0.266294   
197               5.0  0.248915   
0                 4.7  0.242141   
46                4.1  0.237324   
177               4.1  0.226851   

                                               address  
243  II, Road No. 2, I C Colony, Borivali West, Mum...  
197  National Military Memorial Park, 2, Millers Rd...  
0    1st floor, Snow city, Jayamahal Main Rd, oppos...  
46   XHPR+4MV, Raj Bhavan Rd, Opposite to Jawahar L...  
177  Block A, Ganga Vihar, Sarai Kale Khan, New Del...  


**Benefits of using FAISS**
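1. Scales to millions of vectors
2. Supports cosine similarity: `IndexFlatIP` computes inner products, which equal cosine similarity because the vectors are L2-normalized before indexing and querying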