In [3]:
# Import necessary libraries

import json
import joblib
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pandas as pd
import scipy.sparse
from math import radians, cos, sin, asin, sqrt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/agastimhatre/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Step 1: Join business dataset with reviews dataset. Tokenize reviews and convert to lowercase

In [21]:
# Store reviews for each business
business_information = defaultdict(lambda: {"reviews": []})

with open("yelp_academic_dataset_review.json", "r", encoding="utf-8") as file:

    for line in file: 

        curr = json.loads(line)
        business_information[curr["business_id"]]["reviews"].append(curr["text"])

In [22]:
# Extract business-specific data and store in dictionary
with open("yelp_academic_dataset_business.json", "r", encoding="utf-8") as file:

    for line in file:

        curr = json.loads(line)
        business_information[curr["business_id"]]["name"] = curr["name"]
        business_information[curr["business_id"]]["address"] = curr["address"]
        business_information[curr["business_id"]]["city"] = curr["city"]
        business_information[curr["business_id"]]["state"] = curr["state"]
        business_information[curr["business_id"]]["latitude"] = curr["latitude"]
        business_information[curr["business_id"]]["longitude"] = curr["longitude"]
        business_information[curr["business_id"]]["stars"] = curr["stars"]

In [4]:
# Tokenize reviews and convert all tokens to lowercase
tokenized_business_information = defaultdict(lambda: {"reviews": []})

for id_ in business_information.keys():

    for review in business_information[id_]["reviews"]:

        tokenized_business_information[id_]["reviews"].append([word.lower() for word in word_tokenize(review)])

In [5]:
# Write joined, tokenized business/review data to a file
with open('business_with_reviews.json', 'w') as f:
    for key, value in tokenized_business_information.items():
        json.dump({key: value}, f)
        f.write('\n')

## Step 2: Output scores for TFIDF ranker

In [4]:
# Read in business reviews from JSON file
business_reviews = dict()

with open('business_with_reviews.json', 'r') as f:

    for line in f:

        record = json.loads(line)
        business_reviews.update(record)

In [5]:
# Test one query on the entire dataset, check results
business_ids = list(business_reviews.keys())
corpus = []
for id_ in business_ids:

    # Each business review is condensed down into a single, long sentence
    # so that TFIDF and other embedding techniques could be applied
    corpus.append(" ".join(" ".join(tokens) for tokens in business_reviews[id_]["reviews"]))

In [6]:
# Create TFIDF vector and train vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Persist vector and model, and ids
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
scipy.sparse.save_npz("tfidf_matrix.npz", tfidf_matrix)
with open("business_ids.json", "w") as f: json.dump(business_ids, f)

In [7]:
# Reinitialize vectorizer and tfidf_matrix if session is lost
vectorizer = joblib.load('tfidf_vectorizer.joblib')
tfidf_matrix = scipy.sparse.load_npz("tfidf_matrix.npz")
with open("business_ids.json") as f: business_ids = json.load(f)

In [8]:
# Transform query so that it fits into TFIDF vector space
query = "cafe with great lattes in san francisco"
query_vec = vectorizer.transform([query])

# Use cosine similarity to find most similar document for the query
similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

In [9]:
# Match up business ids with similarity scores
scores = []

n = len(business_ids)
for i in range(n): scores.append((business_ids[i], similarities[i]))

# Sort based on score; sort in descending order
scores.sort(key = lambda x: x[1], reverse=True)

In [23]:
# Print out top 10 results
for id, _ in scores[:10]:

    print(business_information[id]["name"])

Vico's Mobile Auto Detailing
Francisco's Mobile Auto Detailing
GET Cafe
HOME Page Cafe
Grand Cafe & Bistro
La Mademoiselle Marvelous Pastries and Cafe
The Grind House
Cafe Lavi
Black Press Coffee Shop #003
T cafe


## Step 3: Create/Store Embeddings and Use FAISS to generate scores, Compare to token-matching ranker

In [11]:
# Use transformer architecture to create dense vector embeddings
# from query and restaurant reviews 
# all = general purpose transformer
# MiniLM = lightweight transformer similar to BERT
# L6 = six layers
# v2 = version two
model = SentenceTransformer("all-MiniLM-L6-v2")

# Create embeddings from the reviews
embeddings = model.encode(corpus, convert_to_numpy=True)
dimension = embeddings.shape[1]

# Create FAISS index for similarity search
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [12]:
# Persist embeddings to file
np.savez("transformer_embeddings.npz", ids=business_ids, vectors=embeddings)

In [13]:
# Read embeddings back into matrix if session is lost

data = np.load("transformer_embeddings.npz", allow_pickle=True)
business_ids = data["ids"].tolist()
embeddings = data["vectors"]

In [14]:
# Persist index to file
faiss.write_index(index, "business_index.faiss")

In [15]:
# Read index back from file if session is lost
index = faiss.read_index("business_index.faiss")

In [24]:
# Check results of test query
query = "cafe with great lattes in san francisco"
query_vec = model.encode([query], convert_to_numpy=True)

# Get top k results
k = 50
D, I = index.search(query_vec, k)
top_businesses = [business_ids[i] for i in I[0]]

for id in top_businesses:

    print(business_information[id]["name"])

Cajé Coffee - Arlington St
Caje
Foundry Cafe & Market
Dune Coffee Roasters - State Street
Cafe Lavi
Bee Coffee Roasters
Caffe Sorrentino
LAVAZIO Cafe
Breaking Grounds Coffee & Cafe
Blossom Cafe
Jiggy Coffee
Spinelli's Bar Italia
Red Kettle Coffee
Saxbys
Kahwa Coffee
Cajé Coffee Roasters - Haley St
Coffee and Supply
Ch. Cafeteria
Presta Coffee Roasters
Brew Haha
Bay Coffee & Tea Company
Peet's Coffee
Reinette Cafe & Patisserie
Forin Cafe
The Chelsea St Pete
Kahwa St Pete Drive Thru
The Buzz
Broadway Bean Coffee
Darkshot Coffee
Buddy Brew - Hyde Park
Good Karma Cafe
Jitters Coffee
Dilworth Park Café
22nd Street Coffee
La Mancha Coffeehouse
Cherry Coffee Roasters
Rabbit Hole Cafe
Royal Cafe
Starbucks
Commissary Barber & Barista
Brick And Bell
Caffeine Roasters
The Painted Bridge Espresso Bar
Lambertville Trading Company
Nameless Coffee & Tea House
Tout De Suite Cafe
Cajé Coffee Roasters
Catalyst Coffee Bar
Foundation Coffee Co
The Blend Coffee & Wine


## Batuhan's part

In [25]:
bordering_states = {
    'AL': ['FL', 'GA', 'MS', 'TN'],
    'AK': [],
    'AZ': ['CA', 'CO', 'NM', 'NV', 'UT'],
    'AR': ['LA', 'MO', 'MS', 'OK', 'TN', 'TX'],
    'CA': ['AZ', 'NV', 'OR'],
    'CO': ['AZ', 'KS', 'NE', 'NM', 'OK', 'UT', 'WY'],
    'CT': ['MA', 'NY', 'RI'],
    'DE': ['MD', 'NJ', 'PA'],
    'FL': ['AL', 'GA'],
    'GA': ['AL', 'FL', 'NC', 'SC', 'TN'],
    'HI': [],
    'ID': ['MT', 'NV', 'OR', 'UT', 'WA', 'WY'],
    'IL': ['IA', 'IN', 'KY', 'MO', 'WI'],
    'IN': ['IL', 'KY', 'MI', 'OH'],
    'IA': ['IL', 'MN', 'MO', 'NE', 'SD', 'WI'],
    'KS': ['CO', 'MO', 'NE', 'OK'],
    'KY': ['IL', 'IN', 'MO', 'OH', 'TN', 'VA', 'WV'],
    'LA': ['AR', 'MS', 'TX'],
    'ME': ['NH'],
    'MD': ['DE', 'PA', 'VA', 'WV'],
    'MA': ['CT', 'NH', 'NY', 'RI', 'VT'],
    'MI': ['IN', 'OH', 'WI'],
    'MN': ['IA', 'ND', 'SD', 'WI'],
    'MS': ['AL', 'AR', 'LA', 'TN'],
    'MO': ['AR', 'IA', 'IL', 'KS', 'KY', 'NE', 'OK', 'TN'],
    'MT': ['ID', 'ND', 'SD', 'WY'],
    'NE': ['CO', 'IA', 'KS', 'MO', 'SD', 'WY'],
    'NV': ['AZ', 'CA', 'ID', 'OR', 'UT'],
    'NH': ['MA', 'ME', 'VT'],
    'NJ': ['DE', 'NY', 'PA'],
    'NM': ['AZ', 'CO', 'OK', 'TX', 'UT'],
    'NY': ['CT', 'MA', 'NJ', 'PA', 'VT'],
    'NC': ['GA', 'SC', 'TN', 'VA'],
    'ND': ['MN', 'MT', 'SD'],
    'OH': ['IN', 'KY', 'MI', 'PA', 'WV'],
    'OK': ['AR', 'CO', 'KS', 'MO', 'NM', 'TX'],
    'OR': ['CA', 'ID', 'NV', 'WA'],
    'PA': ['DE', 'MD', 'NJ', 'NY', 'OH', 'WV'],
    'RI': ['CT', 'MA'],
    'SC': ['GA', 'NC'],
    'SD': ['IA', 'MN', 'MT', 'ND', 'NE', 'WY'],
    'TN': ['AL', 'AR', 'GA', 'KY', 'MO', 'MS', 'NC', 'VA'],
    'TX': ['AR', 'LA', 'NM', 'OK'],
    'UT': ['AZ', 'CO', 'ID', 'NV', 'NM', 'WY'],
    'VT': ['MA', 'NH', 'NY'],
    'VA': ['KY', 'MD', 'NC', 'TN', 'WV'],
    'WA': ['ID', 'OR'],
    'WV': ['KY', 'MD', 'OH', 'PA', 'VA'],
    'WI': ['IA', 'IL', 'MI', 'MN'],
    'WY': ['CO', 'ID', 'MT', 'NE', 'SD', 'UT']
}

In [18]:
def haversine(lat1, lon1, lat2, lon2):
    
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    angle = sin((lat2 - lat1)/2)**2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1)/2)**2
    dist = 2 * asin(sqrt(angle))
    return dist * 6371

def extract_2nd(a):
    return a[1]

In [27]:
def rank_by_location_with_embeddings(query, state_u, lat_u, lon_u, model, index, business_ids, business_info):
    allowed_states = [state_u] + bordering_states.get(state_u, [])
    
    # Encode query with SentenceTransformer
    query_vec = model.encode([query], convert_to_numpy=True)
    _, I = index.search(query_vec, 100)
    top_ids = [business_ids[i] for i in I[0]]
    
    restaurant_values_embed = []
    for bid in top_ids:
        info = business_info.get(bid, {})
        # Check if any info is empty
        lat, lon = info.get("latitude"), info.get("longitude")
        if not lat or not lon:
            continue
        if not info.get("reviews"):
            continue
        # Check if the state is close, if not skip
        if info.get("state") not in allowed_states:
            continue
        
        # Find distance
        distance = haversine(lat_u, lon_u, lat, lon)
        value = info["stars"]
        
        if distance > 5:
            # Only penalize the portion over 5km
            value = value - 0.005 * (distance - 5)
        
        restaurant_values_embed.append((bid, value))
    

    final_ranking = sorted(restaurant_values_embed, key=extract_2nd, reverse=True)
    lowest_score = final_ranking[-1][1]

    if lowest_score < 0:

        for i in range(len(final_ranking)):

            _, curr_score = final_ranking[i]
            final_ranking[i] = (_, curr_score - lowest_score)

    return final_ranking

In [28]:
# Query and rank

user_state = "CA"
user_lat = 37.77
user_lon = -122.42

results = rank_by_location_with_embeddings(
    "vegan burger", user_state, user_lat, user_lon,
    model, index, business_ids, business_information
)
 
# Print top results
for bid, score in results:
    print(f"{business_information[bid]['name']} ({bid}): score = {score:.2f}")

House of Mexica (bCpdPDm3mnqMtVOZWYjEWw): score = 6.61
Ike's Love & Sandwiches (roy566JpT8RPwKQDQYt-LQ): score = 6.13
Burger Me! (Sc6V7hXU7Ar2_mmRoRYsKw): score = 6.13
Rascal's Vegan Food (UQZe-qWOpGiyEPsTUko5Rg): score = 5.87
Sizl Burger (j-R5Rrink9LxUta6PC9iFw): score = 5.57
The Natural Cafe (lu2UpRHLBZfvLG_caNEHKQ): score = 5.37
Little Love Burger (4xsfnA6lXXa5dgqOxiubdQ): score = 2.05
E. River Bar (HD8QZaZdFkymJwKn8_CVQg): score = 2.03
Divine Bovine Burgers (jEAqMCGw3ocephZJFsl-qw): score = 2.01
Substance Diner (fzt_W-vCaq4iGactb07VwA): score = 1.86
Beaut Burger (zL7wVx6Ihf1VEb9Wi7P0bw): score = 1.55
Lovin' Spoonfuls Vegan Restaurant (szad9yEwckNw0P3L5FGyNQ): score = 1.55
Welcome Diner (DK98TgLNmIguxMMPGlyb5w): score = 1.54
Graze Premium Burgers (NvaLV6FYt0bax2iKvuYYZQ): score = 1.51
Burger King (KL1xQGSxC5OmpZyq0PqjXg): score = 0.00


## Tanmay's Part

In [29]:
def apply_rating_penalty(avg_rating, threshold,curve="linear", param = 1):
    
    diff = max(0.0, threshold - avg_rating)
    #percentage difference
    scale = diff / threshold
 
    scale = scale * param
 
    if curve == "linear":
        return 1.0 - scale
    elif curve == "square":
        return 1.0 - scale**2
    elif curve == "exp":
        return np.exp(-5 * scale)  
    else:
        return 1.0  # No penalty

In [30]:
def rank_with_similarity_and_rankingpenalty(similarities, business_ids, business_information, threshold=3.0, curve="linear",param = 1):
    ranked = []
 
    #for every business id
    for i, bid in enumerate(business_ids):
 
        avg_rating = business_information[bid].get("stars", 0.0)
 
        penalty_weight = apply_rating_penalty(avg_rating, threshold, curve,param)
 
 
        adjusted_score = similarities[i] * penalty_weight
 
    
        ranked.append((bid, adjusted_score))
 
    ranked.sort(key=lambda x: x[1], reverse=True)
 
    return ranked
 
 
def get_map_100(ranked_list, ground_truth, relevant_label="highly"):
    relevant_found = 0
    precision_total = 0.0
 
    for i, (bid, _) in enumerate(ranked_list[:100]):
        if ground_truth.get(bid) == relevant_label:
            relevant_found += 1
            precision_total += relevant_found / (i + 1)
 
    return precision_total / relevant_found if relevant_found > 0 else 0.0

In [31]:
##Finding Optimal Params
  
curves = ["linear", "square", "exp"]
params = [0.2, 0.5, 0.75, 1.0, 1.25, 1.5, 2.0]
 
best_map = 0.0
best_curve = None
best_param = None
 
results = []
 
'''for curve in curves:
    for param in params:
        ranked_results = rank_with_similarity_and_rankingpenalty(
            similarities,
            business_ids,
            business_information,
            threshold=3.0,
            curve=curve,
            param=param
        )
 
        map_score = get_map_100(ranked_results,
                                ground_truth_labels
                                #need gt labels to make sure all results are checked properly
                                )
 
        results.append({
            "curve": curve,
            "param": param,
            "map@100": map_score
        })
 
        print(f"Curve: {curve}, Param: {param}, MAP@100: {map_score:.4f}")
 
        if map_score > best_map:
            best_map = map_score
            best_curve = curve
            best_param = param
 
# Convert results to DataFrame
results_df = pd.DataFrame(results)
 
# Print best config
print("\nBest configuration:")
print(f"Curve: {best_curve}")
print(f"Param: {best_param}")
print(f"MAP@100: {best_map:.4f}")'''

'for curve in curves:\n    for param in params:\n        ranked_results = rank_with_similarity_and_rankingpenalty(\n            similarities,\n            business_ids,\n            business_information,\n            threshold=3.0,\n            curve=curve,\n            param=param\n        )\n\n        map_score = get_map_100(ranked_results,\n                                ground_truth_labels\n                                #need gt labels to make sure all results are checked properly\n                                )\n\n        results.append({\n            "curve": curve,\n            "param": param,\n            "map@100": map_score\n        })\n\n        print(f"Curve: {curve}, Param: {param}, MAP@100: {map_score:.4f}")\n\n        if map_score > best_map:\n            best_map = map_score\n            best_curve = curve\n            best_param = param\n\n# Convert results to DataFrame\nresults_df = pd.DataFrame(results)\n\n# Print best config\nprint("\nBest configuration:")\npr

## Ground Truth (Praveen's Part)

In [None]:
def build_ground_truth_level_B(biz_path, rev_path, query_keywords, location,
                               max_matches=5000, min_stars=0.0):
    biz_meta = {}
    with open(biz_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_matches: break
            b = json.loads(line)
            biz_meta[b["business_id"]] = b
 
    # load up to 10 reviews per business
    reviews = defaultdict(list)
    with open(rev_path, "r", encoding="utf-8") as f:
        for line in f:
            r = json.loads(line)
            bid = r["business_id"]
            if bid in biz_meta and len(reviews[bid]) < 10:
                reviews[bid].append(r["text"].lower())
 
    gt = {}
    for bid, info in biz_meta.items():
        city = info.get("city","").lower().strip()
        cats = (info.get("categories") or "").lower()
        stars = float(info.get("stars",0))
        # metadata filters
        if city != location or stars < min_stars or not any(k in cats for k in ["cafe","coffee","espresso","tea"]):
            gt[bid] = "irrelevant"
        else:
            text = " ".join(reviews.get(bid, []))
            gt[bid] = "somewhat" if any(qk in text for qk in query_keywords) else "irrelevant"
    return gt, biz_meta

In [33]:
def build_ground_truth_level_B_all(biz_path, rev_path, query_keywords, min_stars=0.0):

    # Load all business metadata

    biz_meta = {}

    with open(biz_path, "r", encoding="utf-8") as f:

        for line in f:

            b = json.loads(line)

            biz_meta[b["business_id"]] = b
 
    # Load up to 10 reviews per business

    reviews = defaultdict(list)

    with open(rev_path, "r", encoding="utf-8") as f:

        for line in f:

            r = json.loads(line)

            bid = r["business_id"]

            if bid in biz_meta and len(reviews[bid]) < 10:

                reviews[bid].append(r["text"].lower())
 
    # Assign labels

    gt = {}

    for bid, info in biz_meta.items():

        cats = (info.get("categories") or "").lower()

        stars = float(info.get("stars", 0))

        # metadata filters: only check category and stars

        if stars < min_stars or not any(k in cats for k in ["cafe", "coffee", "espresso", "tea"]):

            gt[bid] = "irrelevant"

        else:

            text = " ".join(reviews.get(bid, []))

            gt[bid] = "somewhat" if any(qk in text for qk in query_keywords) else "irrelevant"
 
    return gt, biz_meta

In [34]:
biz_path = "yelp_academic_dataset_business.json"

rev_path = "yelp_academic_dataset_review.json"

query_keywords = ["latte", "coffee", "espresso", "cappuccino"]
 
gt, meta = build_ground_truth_level_B_all(biz_path, rev_path, query_keywords, min_stars=3.0)
 
# Save to disk

with open("ground_truth_level_B_full.json", "w") as f:

    json.dump(gt, f, indent=2)

In [45]:
import json
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def build_ground_truth_level_A_plus_all(biz_path, rev_path, query, query_keywords,
                                        location, min_stars=3.0,
                                        thresholds=(0.5, 0.3), max_reviews=10):
    """
    """
    # Load business metadata
    biz_meta = {}
    with open(biz_path, "r", encoding="utf-8") as f:
        for line in f:
            b = json.loads(line)
            cats = (b.get("categories") or "").lower()
            stars = float(b.get("stars", 0))
            city = (b.get("city") or "").lower().strip()

            # Metadata filters
            #if location.lower() not in city:
            #    continue
            if stars < min_stars:
                continue
            if not any(k in cats for k in ["cafe", "coffee", "espresso", "tea"]):
                continue

            biz_meta[b["business_id"]] = b

    print(f"Businesses after filtering: {len(biz_meta)}")

    # Load up to N reviews per business
    reviews = defaultdict(list)
    with open(rev_path, "r", encoding="utf-8") as f:
        for line in f:
            r = json.loads(line)
            bid = r["business_id"]
            if bid in biz_meta and len(reviews[bid]) < max_reviews:
                reviews[bid].append(r["text"])

    print(f"Businesses with loaded reviews: {len(reviews)}")

    # Load SBERT model and encode query
    model = SentenceTransformer("all-MiniLM-L6-v2")
    q_emb = model.encode(query)

    # Assign labels
    gt = {}
    for bid, info in biz_meta.items():
        review_texts = reviews.get(bid, [])
        if not review_texts:
            gt[bid] = "irrelevant"
            continue

        combined_text = " ".join(review_texts).lower()
        keyword_flag = any(k in combined_text for k in query_keywords)

        doc_emb = model.encode(combined_text)
        sim = cosine_similarity([q_emb], [doc_emb]).flatten()[0]

        if sim >= thresholds[0]:
            gt[bid] = "highly"
        elif sim >= thresholds[1] or keyword_flag:
            gt[bid] = "somewhat"
        else:
            gt[bid] = "irrelevant"

    print(f"Final labeled businesses: {len(gt)}")
    return gt, biz_meta

In [46]:
query = "Cafe with great lattes"
keywords = ["latte", "coffee", "espresso", "cappuccino"]
location = "san francisco"
 
gt, meta = build_ground_truth_level_A_plus_all(
    "yelp_academic_dataset_business.json",
    "yelp_academic_dataset_review.json",
    query=query,
    query_keywords=keywords,
    location=location,
    min_stars=3.0,
    thresholds=(0.4, 0.1)  # more forgiving if needed
)
 
# Save to file
with open("ground_truth_level_A_plus_all.json", "w") as f:
    json.dump({query: gt}, f, indent=2)

Businesses after filtering: 9350
Businesses with loaded reviews: 9350
Final labeled businesses: 9350


In [52]:
number_of_ids_found = 0
for id, _ in scores[:1000]:

    if id in meta: number_of_ids_found += 1

print(number_of_ids_found)

734
