In [None]:
import ast
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the reviews export into the notebook-wide working frame `df`;
# every later cell reads from (and adds columns to) this frame.
df = pd.read_csv(filepath_or_buffer="reviews_8_2_2025.csv")

In [None]:
def _parse_categories(raw):
    """Turn a stringified Python list from the CSV back into a real list.

    ``ast.literal_eval`` only accepts Python literals, so, unlike ``eval``,
    a crafted cell in the CSV cannot execute arbitrary code. Non-string
    values (already-parsed lists, NaN for missing cells) pass through unchanged.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


def _join_categories(categories):
    """Space-join a collection of category names; non-collections (e.g. NaN) become ""."""
    if isinstance(categories, (list, tuple, set)):
        return " ".join(categories)
    return ""


# Non-numeric ratings become NaN instead of raising.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Missing review text becomes "" so the text vectoriser never sees NaN.
df["review_text"] = df["review_text"].fillna("")

# Keep one row per (place, review text).
# NOTE(review): because missing text was just filled with "", all text-less
# reviews of a place collapse into a single row here. Confirm this is intended,
# since it affects the per-place mean rating.
df = df.drop_duplicates(subset=["place_id", "review_text"])

df["all_categories"] = df["all_categories"].apply(_parse_categories)
df["published_at_date"] = pd.to_datetime(df["published_at_date"])

# NOTE(review): the age is measured against the wall clock, so this column
# (and `adjusted_rating`) changes every time the notebook is re-run.
df["days_since_review"] = (datetime.now() - df["published_at_date"]).dt.days

# Exponential recency decay: a review's weight shrinks by a factor of e every 30 days.
df["adjusted_rating"] = df["rating"] * np.exp(-df["days_since_review"] / 30)

# Flat text form of the category list; rows without categories get "".
df["categories_text"] = df["all_categories"].apply(_join_categories)

df.head()

Unnamed: 0,city,place_id,place_name,rating,review_text,published_at,published_at_date,review_likes_count,is_local_guide,main_category,all_categories,days_since_review,adjusted_rating,categories_text
0,Bangalore,ChIJW0EJK3YWrjsRpa39RV0Fy-8,"Lotus Pavilion, ITC Gardenia - Restaurants In ...",5,,3 hours ago,2025-02-08 11:16:07.534479,0,False,European restaurant,"[European restaurant, Fine dining restaurant]",0,5.0,European restaurant Fine dining restaurant
10,Bangalore,ChIJW0EJK3YWrjsRpa39RV0Fy-8,"Lotus Pavilion, ITC Gardenia - Restaurants In ...",5,Only view is good but if you want to enjoy foo...,a month ago,2025-01-08 14:16:11.711088,0,True,European restaurant,"[European restaurant, Fine dining restaurant]",31,1.779095,European restaurant Fine dining restaurant
12,Bangalore,ChIJW0EJK3YWrjsRpa39RV0Fy-8,"Lotus Pavilion, ITC Gardenia - Restaurants In ...",5,Good,2 months ago,2024-12-08 14:16:12.129799,0,True,European restaurant,"[European restaurant, Fine dining restaurant]",62,0.633036,European restaurant Fine dining restaurant
16,Bangalore,ChIJW0EJK3YWrjsRpa39RV0Fy-8,"Lotus Pavilion, ITC Gardenia - Restaurants In ...",4,Food is great. Hospitality is good. I felt lob...,3 months ago,2024-11-08 14:16:12.514801,0,True,European restaurant,"[European restaurant, Fine dining restaurant]",92,0.186305,European restaurant Fine dining restaurant
22,Bangalore,ChIJW0EJK3YWrjsRpa39RV0Fy-8,"Lotus Pavilion, ITC Gardenia - Restaurants In ...",5,Excellent,6 months ago,2024-08-08 14:16:16.914646,0,True,European restaurant,"[European restaurant, Fine dining restaurant]",184,0.010847,European restaurant Fine dining restaurant


In [3]:
"""
Example:
Suppose we have the following review texts:
df["review_text"] = ["The food was great", "The service was terrible", "Great food and service"]
The TF-IDF vectorizer will convert these texts into a matrix of TF-IDF features, where each row represents a review
and each column represents a term (word) from the reviews. The values in the matrix indicate the importance of each term
in the corresponding review.

Reason:
We use TF-IDF (Term Frequency-Inverse Document Frequency) to convert the textual data into numerical features.
This helps in quantifying the importance of words in the reviews, which can be used for various machine learning tasks
such as clustering, classification, or sentiment analysis.
"""

# Learn the vocabulary from the full review corpus and vectorise it in one pass.
# English stop words are dropped and the vocabulary is capped at 5000 terms.
# The result has one row per row of `df` and one column per kept term.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df["review_text"])


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def recommend_by_text(query, top_n=5, sample_reviews_per_restaurant=3):
    """Rank restaurants by how well their reviews match a free-text query.

    The query is projected into the TF-IDF space of the notebook-level
    ``tfidf_vectorizer`` and compared, by cosine similarity, with every row of
    ``tfidf_matrix`` (one row per review in ``df``). Each restaurant is scored
    by its single best-matching review; ties are broken by the restaurant's
    mean rating.

    Illustrative example with a toy vocabulary ``[italian, pasta, mexican, food]``
    and unit-length vectors (for unit vectors cosine similarity is the dot product):

        query    "italian pasta"                    [0.6, 0.8, 0.0, 0.0]
        review 0 "Best Italian pasta in town"       [0.8, 0.6, 0.0, 0.0]
        review 1 "Great place for Mexican food"     [0.0, 0.0, 0.6, 0.8]
        review 2 "Italian restaurant, fresh pizza"  [1.0, 0.0, 0.0, 0.0]

        query vs review 0: 0.6*0.8 + 0.8*0.6 = 0.96
        query vs review 1: 0.0
        query vs review 2: 0.6*1.0           = 0.60

    Review 0 is the best match, review 2 a partial match, review 1 unrelated.

    Args:
        query: Free-text description of what the user is looking for.
        top_n: Maximum number of restaurants to return.
        sample_reviews_per_restaurant: Maximum number of matching reviews to
            attach to each returned restaurant.

    Returns:
        A list of dicts, best match first, each with the keys ``"name"``,
        ``"matching_score"`` (best review similarity), ``"rating"`` (mean
        rating) and ``"sample_reviews"`` (review texts, most similar first).
        Only reviews with a similarity above zero are used as samples, so the
        list may be shorter than requested, or empty.

    Side effects:
        Overwrites the ``"similarity_score"`` column of the notebook-level ``df``.
    """
    query_vector = tfidf_vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # NOTE(review): this mutates the shared `df` on every call. It is kept so
    # that any other cell reading df["similarity_score"] keeps working.
    df["similarity_score"] = similarity_scores

    grouped_df = (
        df.groupby(["place_id", "place_name"])
        .agg({
            "similarity_score": "max",  # best-matching review represents the place
            "rating": "mean",  # average rating, used as the tie-breaker
        })
        .reset_index()
    )

    top_recommendations = grouped_df.sort_values(
        by=["similarity_score", "rating"], ascending=[False, False]
    ).head(top_n)

    # Only reviews that share at least one term with the query count as
    # "matching". This also excludes empty reviews, which always score zero.
    matched_reviews = df[
        (df["similarity_score"] > 0)
        & df["place_id"].isin(top_recommendations["place_id"])
    ]

    results = []
    for place in top_recommendations.itertuples(index=False):
        sample_reviews = (
            matched_reviews[matched_reviews["place_id"] == place.place_id]
            .nlargest(sample_reviews_per_restaurant, "similarity_score")["review_text"]
            .dropna()
            .tolist()
        )
        results.append({
            "name": place.place_name,
            "matching_score": place.similarity_score,
            "rating": place.rating,
            "sample_reviews": sample_reviews,
        })

    return results


In [None]:
# Demo: top five restaurants for a sample free-text request.
query_input = "Best place to host a birthday party"

output = recommend_by_text(query=query_input, top_n=5)
output

[{'name': 'Just BLR - The Best Resto Bar In Brigade Road',
  'matching_score': 0.39934832199195247,
  'rating': 4.826086956521739,
  'sample_reviews': ['best place for freak out',
   'good place to visit, ambiance are very good and nice placeto party here..... have fun with musics',
   'Such a wonderful place to party and hang out with friends... The staff are so welcoming and friendly.']},
 {'name': 'ALBA',
  'matching_score': 0.39934832199195247,
  'rating': 4.379310344827586,
  'sample_reviews': ['Best place',
   'Suber place and very good food',
   'Nothing great about the place']},
 {'name': 'Truffles',
  'matching_score': 0.39934832199195247,
  'rating': 4.205882352941177,
  'sample_reviews': ['Best place',
   'Best Place for Burgers Continental Main Course Salads The BEST',
   'The best and affordable']},
 {'name': 'Ssaffron',
  'matching_score': 0.37885155736992454,
  'rating': 4.793478260869565,
  'sample_reviews': ['Best in food Best in Ambience Best in Music Best in peace Sa