In [None]:
# Import necessary libraries

import json
import joblib
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import numpy as np
!pip install faiss-cpu
!pip install sentence-transformers
import faiss
import scipy.sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Step 1: Join business dataset with reviews dataset. Tokenize reviews and convert to lowercase

In [None]:
# Group raw review texts by the business they were written for
business_information = defaultdict(lambda: {"reviews": []})

with open("yelp_academic_dataset_review.json", "r", encoding="utf-8") as review_file:
    # The file is read as JSON Lines: one review object per line
    for raw_line in review_file:
        review = json.loads(raw_line)
        business_information[review["business_id"]]["reviews"].append(review["text"])

FileNotFoundError: [Errno 2] No such file or directory: 'yelp_academic_dataset_review.json'

In [None]:
# Copy the descriptive fields of every business into its record
business_fields = ("name", "address", "city", "state", "stars")

with open("yelp_academic_dataset_business.json", "r", encoding="utf-8") as business_file:
    for raw_line in business_file:
        business = json.loads(raw_line)
        details = {field: business[field] for field in business_fields}
        # Indexing the defaultdict creates the record (with an empty review
        # list) when the business has no reviews yet
        business_information[business["business_id"]].update(details)


In [None]:
# Split every review into lowercase word tokens, grouped by business
tokenized_business_information = defaultdict(lambda: {"reviews": []})

for id_, info in business_information.items():
    for review in info["reviews"]:
        lowered_tokens = [token.lower() for token in word_tokenize(review)]
        # A record is only created here, so businesses without reviews are left out
        tokenized_business_information[id_]["reviews"].append(lowered_tokens)

In [None]:
# Save the tokenized reviews as JSON Lines: one {business_id: record} object per line
with open('business_with_reviews.json', 'w') as out_file:
    for business_id, entry in tokenized_business_information.items():
        out_file.write(json.dumps({business_id: entry}) + '\n')

## Step 2: Output scores for TFIDF ranker

In [None]:
# Rebuild the {business_id: record} mapping from the JSON Lines file
business_reviews = {}

with open('business_with_reviews.json', 'r') as in_file:
    for raw_line in in_file:
        # Merge the object parsed from this line into the mapping
        business_reviews.update(json.loads(raw_line))

In [None]:
# Build the document corpus: one document per business, in business_ids order
business_ids = list(business_reviews)

# All tokenized reviews of a business are flattened into a single
# space-separated string so that TFIDF and the embedding model can treat
# each business as one document
corpus = [
    " ".join(" ".join(tokens) for tokens in business_reviews[id_]["reviews"])
    for id_ in business_ids
]

In [None]:
# Fit the TFIDF vocabulary on the corpus and vectorize every business document
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Save the fitted vectorizer, the sparse matrix, and the business ids
# (same order as the corpus the matrix was built from)
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
scipy.sparse.save_npz("tfidf_matrix.npz", tfidf_matrix)
with open("business_ids.json", "w") as ids_file:
    json.dump(business_ids, ids_file)

In [None]:
# Restore the TFIDF artifacts from disk (run this if the session was lost)
with open("business_ids.json") as ids_file:
    business_ids = json.load(ids_file)
tfidf_matrix = scipy.sparse.load_npz("tfidf_matrix.npz")
vectorizer = joblib.load('tfidf_vectorizer.joblib')

In [None]:
# Project the query into the fitted TFIDF vector space
query = "cafe with great lattes in san francisco"
query_vec = vectorizer.transform([query])

# Score every business document against the query with cosine similarity,
# then flatten the result into a 1-D array of scores
similarity_matrix = cosine_similarity(query_vec, tfidf_matrix)
similarities = similarity_matrix.flatten()

In [None]:
# Pair each business id with the similarity score at the same position
n = len(business_ids)
scores = [(business_ids[i], similarities[i]) for i in range(n)]

# Order the pairs from highest to lowest score
scores = sorted(scores, key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the names of the 10 highest-scoring businesses.
# The loop variables are `id_` (as in the earlier cells) and `_score` rather
# than `id` and `_`: at notebook scope `id` would shadow the builtin for the
# rest of the session, and `_` would overwrite IPython's last-output variable.
for id_, _score in scores[:10]:
    print(business_information[id_]["name"])

## Step 3: Create/Store Embeddings and Use FAISS to generate scores, Compare to token-matching ranker

In [None]:
# Load the sentence-embedding model used for both the business documents and
# the queries. Reading the model name:
#   all    = general purpose transformer
#   MiniLM = lightweight transformer similar to BERT
#   L6     = six layers
#   v2     = version two
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every business document into a dense vector (one row per document)
embeddings = model.encode(corpus, convert_to_numpy=True)
dimension = embeddings.shape[1]

# Build a flat L2-distance FAISS index sized to the embedding width and
# load all document vectors into it
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [None]:
# Save the embedding matrix and the business ids together in one .npz archive
np.savez("transformer_embeddings.npz", vectors=embeddings, ids=business_ids)

In [None]:
# Read the business ids and embedding matrix back if the session is lost.
#
# The archive is opened in a `with` block so its file handle is closed once
# both arrays have been read; a bare np.load() keeps .npz files open.
#
# allow_pickle is left at its default (False): the cell that writes this
# archive stores a list of business-id strings and the embedding matrix, which
# NumPy saves as plain arrays, and unpickling could execute arbitrary code if
# the file were ever replaced or tampered with.
with np.load("transformer_embeddings.npz") as data:
    business_ids = data["ids"].tolist()
    embeddings = data["vectors"]

In [None]:
# Save the FAISS index to disk so it can be restored in a later session
index_path = "business_index.faiss"
faiss.write_index(index, index_path)

In [None]:
# Restore the FAISS index from disk (run this if the session was lost)
index_path = "business_index.faiss"
index = faiss.read_index(index_path)

In [None]:
# Embed the test query with the same model used for the business documents
query = "cafe with great lattes in san francisco"
query_vec = model.encode([query], convert_to_numpy=True)

# Retrieve the k nearest businesses; D holds the distances and I the row
# indices into the index, each with one row per query
k = 50
D, I = index.search(query_vec, k)

# FAISS pads the result with -1 when the index holds fewer than k vectors;
# skip those entries so they are not misread as business_ids[-1]
top_businesses = [business_ids[i] for i in I[0] if i >= 0]

# `id_` (as in the earlier cells) avoids shadowing the builtin `id`
for id_ in top_businesses:
    print(business_information[id_]["name"])

**Rating-Based Re-Ranking**

In [None]:
def apply_rating_penalty(avg_rating, threshold,curve="linear", param = 1):
    """Return a multiplicative weight that penalizes ratings below a threshold.

    Args:
        avg_rating: Average rating of the business.
        threshold: Rating at or above which no penalty is applied.
        curve: Shape of the penalty: "linear", "square" or "exp". Any other
            value disables the penalty.
        param: Multiplier applied to the relative shortfall before the curve
            is evaluated; larger values penalize more strongly.

    Returns:
        1.0 when the rating meets the threshold or the curve is unknown.
        Otherwise the curve's weight for the scaled shortfall; the "linear"
        and "square" weights are clamped so they never drop below 0.0.
    """
    diff = max(0.0, threshold - avg_rating)
    if diff == 0.0:
        # No shortfall means no penalty for every curve. Returning here also
        # avoids a 0 / 0 division when threshold is 0.
        return 1.0

    # Shortfall as a fraction of the threshold, scaled by param
    scale = (diff / threshold) * param

    if curve == "linear":
        weight = 1.0 - scale
    elif curve == "square":
        weight = 1.0 - scale**2
    elif curve == "exp":
        # Exponential decay is already positive, so no clamping is needed
        return np.exp(-5 * scale)
    else:
        return 1.0  # No penalty

    # A scaled shortfall above 1 would make the weight negative, which flips
    # the sign of the score it multiplies; floor it at "fully penalized".
    return max(0.0, weight)


In [None]:
def rank_with_similarity_and_rankingpenalty(similarities, business_ids, business_information, threshold=3.0, curve="linear",param = 1):
    """Rank businesses by similarity score after applying the rating penalty.

    Args:
        similarities: Sequence of similarity scores, indexed in the same
            order as business_ids.
        business_ids: Sequence of business ids to rank.
        business_information: Mapping from business id to a record whose
            "stars" entry is the average rating.
        threshold: Rating threshold passed to apply_rating_penalty.
        curve: Penalty curve name passed to apply_rating_penalty.
        param: Penalty strength passed to apply_rating_penalty.

    Returns:
        A list of (business_id, adjusted_score) tuples sorted by adjusted
        score in descending order.
    """
    ranked = []

    for i, bid in enumerate(business_ids):
        # Use .get() rather than [] for the record lookup: indexing a
        # defaultdict would insert an empty record for an unknown id (mutating
        # the caller's data), and a plain dict would raise KeyError. A missing
        # record or missing "stars" entry is rated 0.0.
        avg_rating = business_information.get(bid, {}).get("stars", 0.0)

        penalty_weight = apply_rating_penalty(avg_rating, threshold, curve, param)
        ranked.append((bid, similarities[i] * penalty_weight))

    ranked.sort(key=lambda x: x[1], reverse=True)

    return ranked


def get_map_100(ranked_list, ground_truth, relevant_label="highly", k=100):
    """Compute average precision over the top k entries of a ranked list.

    Precision is taken at each rank where a relevant item appears and the
    values are averaged over the number of relevant items found within the
    top k (not over every relevant item in ground_truth).

    Args:
        ranked_list: Sequence of (business_id, score) pairs, best first.
        ground_truth: Mapping from business id to its relevance label.
        relevant_label: Label that marks an item as relevant.
        k: Number of top-ranked entries to evaluate (default 100).

    Returns:
        The average precision as a float, or 0.0 when no relevant item
        appears in the top k.

    Raises:
        ValueError: If k is negative.
    """
    if k < 0:
        # A negative slice bound would silently drop items from the end
        # of the list instead of limiting the cutoff
        raise ValueError("k must be non-negative")

    relevant_found = 0
    precision_total = 0.0

    for rank, (bid, _score) in enumerate(ranked_list[:k], start=1):
        if ground_truth.get(bid) == relevant_label:
            relevant_found += 1
            precision_total += relevant_found / rank

    return precision_total / relevant_found if relevant_found > 0 else 0.0


In [None]:
## Grid search over penalty curves and strengths

import pandas as pd

curves = ["linear", "square", "exp"]
params = [0.2, 0.5, 0.75, 1.0, 1.25, 1.5, 2.0]

# Best configuration seen so far; it stays (0.0, None, None) unless some
# configuration scores above 0.0
best_map, best_curve, best_param = 0.0, None, None

results = []

# Visit every (curve, param) combination, all params of one curve before the next
for curve, param in [(c, p) for c in curves for p in params]:
    ranked_results = rank_with_similarity_and_rankingpenalty(
        similarities,
        business_ids,
        business_information,
        threshold=3.0,
        curve=curve,
        param=param,
    )

    # NOTE(review): ground_truth_labels is not defined in any cell shown
    # here; it has to exist before this cell runs -- confirm where it
    # comes from.
    map_score = get_map_100(ranked_results, ground_truth_labels)

    results.append({"curve": curve, "param": param, "map@100": map_score})

    print(f"Curve: {curve}, Param: {param}, MAP@100: {map_score:.4f}")

    # Strict comparison: on a tie the earlier configuration is kept
    if map_score > best_map:
        best_map, best_curve, best_param = map_score, curve, param

# Collect the per-configuration scores in a DataFrame
results_df = pd.DataFrame(results)

# Report the winning configuration
print("\nBest configuration:")
print(f"Curve: {best_curve}")
print(f"Param: {best_param}")
print(f"MAP@100: {best_map:.4f}")
