In [55]:
# Import necessary libraries

import json
import joblib
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import scipy.sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt_tab' NLTK resource; presumably this is the tokenizer data
# that word_tokenize (used in Step 1) relies on — TODO confirm
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/agastimhatre/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Step 1: Join business dataset with reviews dataset. Tokenize reviews and convert to lowercase

In [43]:
# Group raw review texts by business id.
# Looking up an unseen id creates a record holding an empty "reviews" list.
business_information = defaultdict(lambda: {"reviews": []})

# The review file is consumed as JSON Lines: one JSON object per line
with open("yelp_academic_dataset_review.json", "r", encoding="utf-8") as review_file:
    for review in map(json.loads, review_file):
        reviews_for_business = business_information[review["business_id"]]["reviews"]
        reviews_for_business.append(review["text"])

In [44]:
# Copy the name and location fields from the business file onto each record.
# An id that collected no reviews gets a fresh record (empty "reviews" list) here.
with open("yelp_academic_dataset_business.json", "r", encoding="utf-8") as business_file:
    for business in map(json.loads, business_file):
        entry = business_information[business["business_id"]]
        for field in ("name", "address", "city", "state"):
            entry[field] = business[field]


In [None]:
# Lowercased NLTK tokens for every review, keyed by business id.
# Only businesses with at least one review end up with an entry.
tokenized_business_information = defaultdict(lambda: {"reviews": []})

for business_id, details in business_information.items():
    for review_text in details["reviews"]:
        lowercase_tokens = [token.lower() for token in word_tokenize(review_text)]
        tokenized_business_information[business_id]["reviews"].append(lowercase_tokens)

In [None]:
# Persist the tokenized reviews as JSON Lines: one {business_id: record} object per line
with open('business_with_reviews.json', 'w') as out_file:
    for business_id, record in tokenized_business_information.items():
        out_file.write(json.dumps({business_id: record}))
        out_file.write('\n')

## Step 2: Output scores for TFIDF ranker

In [None]:
# Rebuild the {business_id: record} mapping from the JSON Lines file
business_reviews = {}

with open('business_with_reviews.json', 'r') as in_file:
    for json_line in in_file:
        # Every line is parsed to a dict and merged into the running mapping
        business_reviews.update(json.loads(json_line))

In [None]:
# Build the document corpus: one string per business, in the same order as business_ids
business_ids = list(business_reviews)

# All tokenized reviews of a business are flattened into one space-separated
# string, so each business becomes a single document for the rankers below
corpus = [
    " ".join(" ".join(tokens) for tokens in business_reviews[business_id]["reviews"])
    for business_id in business_ids
]

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the per-business corpus.
# fit_transform is given `corpus`, so the matrix rows presumably follow the
# order of `corpus` (and therefore of `business_ids`) — the ranking cells rely on that
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Persist the fitted vectorizer and the sparse matrix so they can be reloaded without refitting
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
scipy.sparse.save_npz("tfidf_matrix.npz", tfidf_matrix)

In [None]:
# Restore the fitted vectorizer and TF-IDF matrix from disk if the session is lost.
# NOTE(review): joblib.load unpickles the file — only load artifacts this notebook wrote.
# NOTE(review): the ranking cells below also read `business_ids` and
# `business_information`, which this cell does not restore.
vectorizer = joblib.load('tfidf_vectorizer.joblib')
tfidf_matrix = scipy.sparse.load_npz("tfidf_matrix.npz")

In [None]:
# Project the test query into the vector space of the fitted TF-IDF vectorizer
query = "cafe with great lattes in san francisco"
query_vec = vectorizer.transform([query])

# Cosine similarity between the query and every row of the TF-IDF matrix;
# flatten() collapses the result into a 1-D array of scores
similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

In [None]:
# Pair every business id with the similarity score at the same position,
# then rank the pairs from highest to lowest score
scores = sorted(
    [(business_id, similarities[position]) for position, business_id in enumerate(business_ids)],
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Print the names of the 10 highest-scoring businesses.
# `scores` is already sorted best-first, so the slice is the top of the ranking.
# The loop variables are deliberately not called `id` / `_`: notebook loop
# variables stay in the kernel's global namespace, where they would keep hiding
# the builtin id() and IPython's last-output `_` in every later cell.
for business_id, _score in scores[:10]:

    print(business_information[business_id]["name"])

Vico's Mobile Auto Detailing
Francisco's Mobile Auto Detailing
GET Cafe
HOME Page Cafe
Grand Cafe & Bistro
La Mademoiselle Marvelous Pastries and Cafe
The Grind House
Cafe Lavi
Black Press Coffee Shop #003
T cafe


## Step 3: Create and store embeddings, use FAISS to generate scores, and compare to the token-matching ranker

In [None]:
# Use transformer architecture to create dense vector embeddings
# from query and restaurant reviews
# all = general purpose transformer
# MiniLM = lightweight transformer similar to BERT
# L6 = six layers
# v2 = version two
model = SentenceTransformer("all-MiniLM-L6-v2")

# Create embeddings from the reviews: one vector per corpus entry, i.e. per business.
# NOTE(review): each corpus entry is all of a business's reviews joined together;
# the model presumably truncates input beyond its max_seq_length, so long entries
# may only be partly represented — confirm against model.max_seq_length
embeddings = model.encode(corpus, convert_to_numpy=True)
dimension = embeddings.shape[1]

# Create FAISS index for similarity search (L2 distance).
# Vectors are added in corpus order; the query cell maps result positions
# back to businesses through business_ids, so that order must stay aligned
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
# Persist the FAISS index to disk so the embeddings do not have to be recomputed
faiss.write_index(index, "business_index.faiss")

In [None]:
# Read index back from file if session is lost.
# NOTE(review): the query cell below also needs `model`, `business_ids` and
# `business_information`; this cell restores only `index`.
index = faiss.read_index("business_index.faiss")

In [None]:
# Embed the test query with the same model that produced the corpus embeddings
query = "cafe with great lattes in san francisco"
query_vec = model.encode([query], convert_to_numpy=True)

# Retrieve the k nearest neighbours: D holds the distances, I the matching
# row positions in the index (one row of results per query)
k = 50
D, I = index.search(query_vec, k)

# FAISS pads the result with -1 when the index holds fewer than k vectors.
# Skip those slots: business_ids[-1] would otherwise silently return the
# last business instead of "no result".
top_businesses = [business_ids[i] for i in I[0] if i >= 0]

# The loop variable is not called `id`, so the builtin id() stays usable in
# later cells (notebook loop variables live in the kernel's global namespace)
for business_id in top_businesses:

    print(business_information[business_id]["name"])

Cajé Coffee - Arlington St
Caje
Foundry Cafe & Market
Dune Coffee Roasters - State Street
Cafe Lavi
Bee Coffee Roasters
Caffe Sorrentino
LAVAZIO Cafe
Breaking Grounds Coffee & Cafe
Blossom Cafe
Jiggy Coffee
Spinelli's Bar Italia
Red Kettle Coffee
Saxbys
Kahwa Coffee
Cajé Coffee Roasters - Haley St
Coffee and Supply
Ch. Cafeteria
Presta Coffee Roasters
Brew Haha
Bay Coffee & Tea Company
Peet's Coffee
Reinette Cafe & Patisserie
Forin Cafe
The Chelsea St Pete
Kahwa St Pete Drive Thru
The Buzz
Broadway Bean Coffee
Darkshot Coffee
Buddy Brew - Hyde Park
Good Karma Cafe
Jitters Coffee
Dilworth Park Café
22nd Street Coffee
La Mancha Coffeehouse
Cherry Coffee Roasters
Rabbit Hole Cafe
Royal Cafe
Starbucks
Commissary Barber & Barista
Brick And Bell
Caffeine Roasters
The Painted Bridge Espresso Bar
Lambertville Trading Company
Nameless Coffee & Tea House
Tout De Suite Cafe
Cajé Coffee Roasters
Catalyst Coffee Bar
Foundation Coffee Co
The Blend Coffee & Wine
