# Restaurant Recommendation System Using Embeddings 
### Anirudh Mehrotra

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import json
import folium
from gensim.utils import simple_preprocess

## Read Datasets

In [2]:
# Training reviews: tab-separated file read with the ISO-8859-1 codec
df_train = pd.read_csv(
    "reviews_train.csv",
    sep='\t',
    encoding='ISO-8859-1',
)
df_train.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,A-8BBGnBoz5-K1WaM5Cgaw,DUxCdkXnwYEzHZvT8MyvIw,ju4YP8SLdR_BmWr_-Xh83Q,5,2,0,1,Best pho in Santa Barbara County. Staff are g...,2018-07-27 00:09:23
1,k--beJRNBZzFklRoppa2MA,SAgf1IxxuomOWSIDzy07pQ,uE40984_YDgVvPeRpFcCaQ,5,0,0,0,We came for the hot chicken but were won over ...,2014-02-26 13:05:47
2,qmr304jvtYetK5i_Djrx_A,cMkPQZVDOibs2bz8St7Acg,JvawJ9bSr22xn4R9oLvl_w,3,0,0,0,I really should have used my better judgement ...,2014-02-26 22:33:39
3,KXNdht_of5t-Dh1eoaeYHQ,9m13F_RCcz_r48tQH82I5A,bdfZdB2MTXlT6-RBjSIpQg,3,0,0,0,This place is relatively smaller than other ph...,2013-12-09 08:12:28
4,asJ8k1sm8jO01bi-s5JW8g,goySBsZ3QJfSaElPIDIzLw,14ZGwnDyydXdSBsLXpSUrA,5,0,0,0,Best Southeast restaurant in Philadelphia. The...,2011-06-03 03:54:54


In [3]:
# Held-out reviews: same file format as the training reviews
df_test = pd.read_csv(
    "reviews_test_all.csv",
    sep='\t',
    encoding='ISO-8859-1',
)
df_test.head()

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,37039,8QSAs3yVhcNS0y1fWSn41Q,BY_7xEliSP5iEig9bemaKw,e86IBzGCsrnhJbD_wELj7w,3,1,0,0,The main course was actually very tasty and th...,2011-01-16 20:23:33
1,19778,-lNpxdJNrvMtZ_RTvT2NtA,djOl6zKvKdbt4lNnDKUXJg,z22hSRptt_DS0nWjsIka2A,5,0,0,0,Outback is my favorite steakhouse. I've dined ...,2015-05-13 17:03:27
2,80124,7Ylcy1txacpnY76275KqQw,1iokf9rM43YAwxsa8bp1OQ,jRLskcm_icZIKs81mYC4iQ,4,5,3,3,"I absolutely love Ethiopian, I'd eat it every ...",2015-04-22 17:26:46
3,35507,9JrJF-YHl44WJtZxaArteQ,NbcOw8Scs1AQcRR9uzlCPw,H47H_73y7aZ9KHpzct-xBg,4,0,0,0,"I was pleasantly surprised by this place , I l...",2018-05-20 00:10:30
4,35251,r3GB-Kg5UL_tOKdkM-nZvA,yymYLENYLOHwDRAxZvU3vA,B-DiQpcSTJ7oMMnwzbAGTQ,5,0,0,0,"Omg omg, I would marry the man that would brin...",2018-07-09 23:56:46


In [4]:
# Business metadata; the first CSV column becomes the row index
df_businesses = pd.read_csv(
    "businesses.csv",
    sep='\t',
    encoding='ISO-8859-1',
    index_col=0,
)
print(len(df_businesses))
df_businesses.head()

100000


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [5]:
# User profiles: same tab-separated / ISO-8859-1 format as the other tables
df_users = pd.read_csv(
    "users.csv",
    sep='\t',
    encoding='ISO-8859-1',
)
print(len(df_users))
df_users.head()

100000


Unnamed: 0.1,Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",...,65,55,56,18,232,844,467,467,239,180
1,1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",...,13,10,17,3,66,96,119,119,35,18
3,3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",...,4,1,6,2,12,16,26,26,10,9
4,4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",...,1,0,0,0,1,1,0,0,0,0


## Text Preprocessing
### Light preprocessing: lowercase the words, then remove stopwords and tokens that are too short or too long. It is applied to each 'text' entry in the training data.



In [6]:
# text preprocessing
import re
import nltk
from nltk.corpus import stopwords


In [7]:
# English stopwords held in a set, so the `word not in stop_words` test in
# tokenize_text is a hash lookup rather than a list scan.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))

# tokenize one string text
def tokenize_text(text):
    """Lowercase, strip punctuation from, and tokenize one review string.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (``None``, or the float ``NaN``
        pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    list of str
        Tokens that are not in ``stop_words`` and whose length is between
        2 and 19 characters inclusive, in their original order.
    """
    # A missing review arrives as a non-string when this function is applied
    # over a DataFrame column; calling .lower() on it would raise.
    if not isinstance(text, str):
        return []

    # Characters that are neither word characters nor whitespace are deleted
    # (not replaced by a space), so "don't" becomes "dont" and "35-40" becomes "3540".
    text_str = re.sub(r'[^\w\s]', '', text.lower())

    return [
        word
        for word in nltk.tokenize.word_tokenize(text_str)
        if word not in stop_words and 1 < len(word) < 20
    ]

# Sanity check: run the tokenizer on one sample review and display the tokens.
text1 = "We went for Sunday Brunch, highly recommended. They take reservations  which was great for us since we were a group of 7. This was my favorite restaurant in Nashville. I had the breakfast burrito and I have been craving another one ever since! But really, I don't think you can go wrong with anything on the menu! The service was great, a little slow but it was a busy Sunday morning. I'm looking forward to the next time I am in town so I can try it for dinner!"
tokenize_text(text1)

['went',
 'sunday',
 'brunch',
 'highly',
 'recommended',
 'take',
 'reservations',
 'great',
 'us',
 'since',
 'group',
 'favorite',
 'restaurant',
 'nashville',
 'breakfast',
 'burrito',
 'craving',
 'another',
 'one',
 'ever',
 'since',
 'really',
 'dont',
 'think',
 'go',
 'wrong',
 'anything',
 'menu',
 'service',
 'great',
 'little',
 'slow',
 'busy',
 'sunday',
 'morning',
 'im',
 'looking',
 'forward',
 'next',
 'time',
 'town',
 'try',
 'dinner']

In [8]:
# Run tokenize_text over every review text in the training set,
# then show two of the resulting token lists as a spot check.
doc = df_train['text'].apply(tokenize_text)
for sample_label in (0, 11):
    print(doc[sample_label])

['best', 'pho', 'santa', 'barbara', 'county', 'staff', 'great', 'drive', '3540', 'minutes', 'pho', 'would', 'drive', 'double', 'seriously', 'tho', 'pho', 'withdrawals', 'thanks', 'phamous']
['place', 'awesome', 'right', 'walk', 'ur', 'overwhelmed', 'many', 'options', 'available', 'hard', 'choose', 'guy', 'helped', 'us', 'friendly', 'made', 'polite', 'conversation', 'lets', 'talk', 'actual', 'gelato', 'good', 'quality', 'amazing', 'especially', 'liked', 'lotus', 'cookie', 'mango', 'outside', 'place', 'hopping', 'makes', 'feel', 'like', 'ur', 'somewhere', 'tucson', 'come']


# **Embeddings 1**: GloVe
## Load GloVe (https://nlp.stanford.edu/projects/glove/) and store its embeddings in memory

In [9]:
def load_glove(vector_len):
    """Load one GloVe 6B vector table from the working directory into a dict.

    Parameters
    ----------
    vector_len : int
        Embedding dimensionality; must be 50/100/200/300. It selects the file
        ``glove.6B.{vector_len}d.txt``.

    Returns
    -------
    dict
        Maps each word (str) to a float32 NumPy vector of length ``vector_len``.

    Raises
    ------
    FileNotFoundError
        If the selected GloVe file is not present.
    ValueError
        If ``vector_len`` is not a positive integer, or a row does not contain
        a word followed by ``vector_len`` numeric components.
    """
    GloVe = f"glove.6B.{vector_len}d.txt"
    dim = int(vector_len)
    if dim < 1:
        raise ValueError(f"vector_len must be a positive integer, got {vector_len!r}")

    embeddings_dict = {} # embeddings of all words stored in a dictionary

    with open(GloVe, 'r', encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                continue  # tolerate blank lines (e.g. a trailing newline)
            if len(values) <= dim:
                raise ValueError(
                    f"{GloVe}, line {line_no}: expected a word and {dim} components, "
                    f"got {len(values)} fields"
                )
            # The vector is always the LAST `dim` fields; whatever precedes it is
            # the word, which keeps rows whose token itself contains whitespace.
            word = " ".join(values[:-dim])
            embeddings_dict[word] = np.asarray(values[-dim:], "float32")
    return embeddings_dict

## Obtain the embedding of a given word from the Glove dictionary

In [10]:
# Load the 50-dimensional GloVe table and look up one word as a smoke test.
embeddings_dict = load_glove(50)  # vector length = 50
embedding = embeddings_dict['information']
print (embedding)
# Last expression of the cell: number of words in the loaded table.
len(embeddings_dict)

[ 0.63591   0.28142   1.103     0.90695   0.58408  -0.66616  -0.58817
 -0.55119   1.0063   -0.22333  -0.021339  0.59643   0.020229 -0.33389
  0.27095   0.099159 -0.62187  -0.62834   0.87429  -0.15716   0.97701
  0.36715   0.65559   0.15535   0.22763  -1.4113   -0.65703  -0.72715
  0.25938  -0.23776   3.3925   -0.58473  -0.34668  -1.7489   -0.015439
  0.50899  -0.25659   0.069998  0.086402  0.395     1.0702    0.088681
  0.54121   0.53468   0.09773  -0.25598  -0.15555   1.5154    0.81081
  0.11142 ]


400000

In [11]:
# Measure the cosine similarity between 'information' and 'news'
from numpy.linalg import norm

A = embedding  # bound earlier to embeddings_dict['information']
B = embeddings_dict['news']
print (B)
# cosine(A, B) = (A . B) / (|A| * |B|)
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

[-0.20825   0.47786   0.52196   1.0587   -0.10045  -1.1269   -1.2581
 -0.11041  -0.074125 -0.77976  -0.37942  -0.2486   -0.39224   0.42972
  0.9806    0.12668  -1.3772   -0.22793  -0.18497   0.41014   0.96781
  0.8916    0.84685   0.57416   0.46455  -1.7287   -0.63918   0.56256
 -0.12651   0.49711   3.3326    0.034399  0.46149  -0.44826  -1.1945
 -0.47593  -0.31927  -0.6442    0.089735  0.073952  0.70755   0.52948
 -0.12034  -0.46779   0.24722   0.28045  -0.62632   1.4458    0.51045
  0.74156 ]
Cosine Similarity: 0.7023798


In [12]:
# Rebind embeddings_dict to the 300-dimensional table (the 50-d table loaded
# earlier is replaced) and repeat the same lookup.
embeddings_dict = load_glove(300) # vector/embedding length = 300
embedding = embeddings_dict['information']
print (embedding)
# Last expression of the cell: number of words in the loaded table.
len(embeddings_dict)

[-3.4245e-01 -4.4337e-01  1.2016e-01 -4.4797e-01  3.6409e-01  1.9042e-01
  3.5814e-02  7.2083e-02  3.0849e-01 -2.7113e+00  1.6265e-01 -1.6390e-01
  1.4919e-01  3.6991e-01 -3.4511e-01 -5.0148e-02 -2.7712e-01 -2.9568e-01
  1.9507e-02  8.5272e-02  1.6408e-01  3.4738e-01  2.8669e-01 -6.5308e-02
 -3.3220e-01 -1.2347e-01 -2.5370e-01  7.3277e-01 -2.1210e-01  5.1321e-01
 -2.3863e-01 -1.4686e-02 -9.1284e-02  1.8715e-01 -2.0645e-01 -3.5094e-01
  1.1643e-01 -6.0074e-02  6.4171e-02 -7.6658e-01 -2.5077e-01  4.1920e-01
  2.3099e-01  9.6431e-01 -7.1394e-02 -1.5819e-01 -5.3712e-03 -6.7648e-03
 -2.7829e-01 -1.9475e-01  5.6329e-01 -3.4195e-02 -1.8190e-01 -3.4428e-01
 -5.7025e-01  4.5278e-01  1.5301e-01 -6.8768e-02 -2.6461e-01  2.1887e-02
 -2.1543e-01  1.2986e-01  6.0415e-02  1.3248e-01  1.5046e-01 -1.2934e-01
  3.1512e-01 -3.8397e-01 -8.2451e-02  4.8302e-01 -3.4851e-01  2.4238e-01
  3.0626e-01 -8.2572e-04 -2.7866e-01  3.5426e-01  1.3572e-01 -1.3797e-01
 -2.1691e-01  1.1389e-01  7.3912e-02 -3.1077e-01  3

400000

In [13]:
# Same 'information' vs 'news' comparison, now with the 300-dimensional vectors.
# `norm` comes from the `from numpy.linalg import norm` in an earlier cell.
A = embedding
B = embeddings_dict['news']
print (B)
# cosine(A, B) = (A . B) / (|A| * |B|)
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

[-4.1681e-01  1.9399e-01  2.2356e-01  1.8028e-01  3.3988e-01  1.5870e-01
 -2.1046e-01  4.2863e-01 -4.1325e-02 -1.8492e+00 -7.3289e-02  1.0118e-02
  4.5465e-01  7.2215e-01  6.3270e-01  2.2182e-04 -2.4905e-01  4.3111e-02
  3.3807e-01 -2.5771e-01  1.3360e-01  2.2773e-01  7.3658e-01  7.9592e-01
  1.1137e-02 -1.1906e-01 -8.9547e-03  1.7780e-01  1.8806e-01 -5.1765e-01
 -3.8165e-01 -2.5147e-01  8.0401e-01  3.5761e-01 -1.4509e+00 -1.8325e-01
 -4.0522e-01  1.0572e-01  3.6496e-01  3.2123e-01 -6.8881e-02 -1.9882e-02
  1.8024e-01  9.7504e-01 -8.8517e-02  2.0294e-01  9.8143e-02 -1.0290e-01
 -3.1582e-01  4.3241e-01  2.5816e-01  6.6642e-01  9.8665e-02 -5.2727e-01
 -2.8406e-02  1.1121e+00  2.5231e-01  1.2746e-01 -5.5072e-02 -7.4524e-01
  6.8430e-01  4.0475e-01  3.6269e-01 -1.1521e-01 -2.8835e-01 -1.9462e-01
  7.9845e-01  1.6139e-01  4.9639e-01 -5.1145e-01 -4.9031e-02  5.7478e-02
 -8.3950e-01 -2.8308e-02 -1.3099e-01 -5.4157e-02  4.9117e-01  4.0342e-01
  5.6126e-03  8.8418e-01  5.4271e-02 -2.5825e-01 -7

# **Embeddings 2**: Word2Vec
## Use the Word2Vec model from Gensim: train it on the 'text' field of the training data to create custom word embeddings for the given data.

In [14]:
# Apply gensim's tokenization to the texts in the training data
# (only df_train is processed in this cell).
import gensim

# Unlike tokenize_text, this tokenizer applies no stopword filter: the sample
# printed below still contains words such as 'in' and 'the'.
doc_train = df_train.text.apply(gensim.utils.simple_preprocess)
doc_train[0]

['best',
 'pho',
 'in',
 'santa',
 'barbara',
 'county',
 'staff',
 'are',
 'great',
 'drive',
 'minutes',
 'for',
 'the',
 'pho',
 'would',
 'drive',
 'double',
 'seriously',
 'tho',
 'no',
 'more',
 'pho',
 'withdrawals',
 'thanks',
 'phamous']

# Training Word2vec

Training the word2vec model is simple. First, initialize Word2Vec and pass it the tokenized reviews (`doc_train`). We are essentially passing a list of lists, where each inner list contains the tokens from one user review. Word2Vec uses all these tokens to build its vocabulary internally.

`min_count` – Ignores all words with total frequency lower than this. `min_count=2` means that only words appearing at least twice in the corpus are included in the Word2Vec model; words that occur only once are treated as not important enough to keep.

In [15]:
# output embedding size = 50, min_count = 2
# Passing the corpus to the constructor builds the vocabulary (and, per the
# gensim documentation, already runs an initial training pass).
model = gensim.models.Word2Vec(doc_train, vector_size=50, window=10, min_count=2, workers=10)
# Continue training for 10 more epochs. The example count comes from the model's
# own corpus_count instead of len(doc): `doc` is the separately tokenized series
# from the Text Preprocessing section, not the corpus being trained on here.
model.train(doc_train, total_examples=model.corpus_count, epochs=10)

(57792915, 76256390)

In [16]:
# Measure the cosine similarity between 'information' and 'news'
# using the Word2Vec vectors trained on the reviews (model.wv).
from numpy.linalg import norm

A = model.wv['information']
print (A)
B = model.wv['news']
print (B)
# cosine(A, B) = (A . B) / (|A| * |B|)
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

[-2.29723     0.54734135  1.1398841  -4.9755673   0.8207296  -3.2533984
  2.1525426   5.8857265   0.35786372 -2.6901011  -0.41517246 -2.8131886
  0.5537724   2.2947595   2.1166506  -0.11715601  0.45412186 -1.9970789
 -1.9985471   3.5602062  -1.6838204  -1.4851199   2.3134134  -1.608993
  1.4620036   2.5549567  -2.0004323   4.176901    3.3978307   6.120522
 -1.8383319  -1.2357478   4.3623133   0.10377906 -2.9185512   1.1944854
  1.415827    0.7359568   5.140141   -2.367191    3.4848642   0.25165075
  0.02422247 -2.278804   -3.1142998  -4.4722605   3.1232328  -1.9301746
  0.45980147  0.25844994]
[-4.4704957  -2.274274    1.0376229   0.44882897 -0.7288061  -0.14478914
  0.7676871   0.34742188 -0.6469588  -1.5454592  -1.2153357  -2.3829994
  2.3358543   1.7785823   0.10872804 -0.01414245  0.49865752  1.600994
 -0.475373   -1.4615468  -2.3719125  -1.3529972   0.06745874  1.3583021
 -0.5199218   2.7429185  -2.5349798   2.4108562   2.3540258   1.4238753
 -0.8715525  -2.0304763   1.9735405   0

In [17]:
# Cosine similarity between two in-domain words from the review-trained model.
A = model.wv['food']
B = model.wv['sushi']
print (B)
# cosine(A, B) = (A . B) / (|A| * |B|)
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

[-1.112097    2.5161662  -2.2774305   6.414314   -1.3511789  -3.3526843
  3.848867   -5.8530498  -0.9755648   3.9997578  -0.9868632   2.5663872
 -4.0773783  -2.5280395   1.3120728  -1.8470261   2.8048685   4.872776
 -2.3358688  -2.7807338  -7.1429024  -3.8915823   1.0343746  -0.83543456
 -1.4001312   1.1310145  -1.2556531   4.4016314  -4.0801234  -0.53187054
 -2.2935407  -4.1592164  -4.8167524   1.1580514  -2.305076    0.35823765
  2.9694278  -1.905421   -2.587871    1.2418319  -0.7484445   2.971731
  3.2535284   0.883229   -5.944626   -4.402496    2.2430937   7.6270385
  1.0499492  -3.6661077 ]
Cosine Similarity: 0.587862


In [18]:
# output embedding size = 300, min_count = 2
# Rebinds `model`: the 50-dimensional model trained earlier is replaced.
model = gensim.models.Word2Vec(doc_train, vector_size=300, window=10, min_count=2, workers=10)
# Continue training for 10 more epochs. The example count comes from the model's
# own corpus_count instead of len(doc): `doc` is the separately tokenized series
# from the Text Preprocessing section, not the corpus being trained on here.
model.train(doc_train, total_examples=model.corpus_count, epochs=10)
vector = model.wv['information']
print (vector)

[-0.66728294 -0.10488506 -0.9117284  -1.7343199  -0.96295947  0.49228936
  1.2747717  -0.39240065  0.09107962 -0.95404404 -1.0611935   1.3500797
 -0.90803444  0.6008549   1.0220027  -0.63893026  1.4461858   0.01178156
 -0.28633705 -0.15630358  0.42240393 -1.9890283  -0.17606387 -1.4696267
 -2.2937891   0.0143708   1.0840741   2.433878    0.36269525  0.03660487
  2.001248    1.4289607   0.0209853   2.3545148  -0.99441576 -1.4926776
 -2.4937112  -0.43898344 -2.2366722   0.3055381   2.3230176  -0.7071568
  2.0415905  -0.2844939  -1.1839508   1.2785306   2.2112544   2.2248802
 -0.74456966 -0.29452798 -1.5928805  -0.6397773   0.32857496 -1.6985784
 -1.8899299   1.57887     2.667446    1.4818647   0.55813605 -2.6092927
  1.5437765   0.1457746  -0.73217505  0.5289028  -0.92036754 -0.23738028
  0.3857955   1.0682045   1.3224308  -2.0912123   0.5981604   0.80643225
  1.2558258   0.34472114 -0.9071356  -0.6526262  -1.8421632   2.6603026
 -1.2121453   0.50639594  2.0224445   0.3158537  -0.1648226

In [19]:
# 'information' vs 'news' with the 300-dimensional Word2Vec model.
A = vector  # bound earlier to model.wv['information']
B = model.wv['news']
print (B)
# cosine(A, B) = (A . B) / (|A| * |B|)
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

[ 2.66006142e-01  4.14795905e-01  3.58055472e-01 -3.99000496e-01
 -6.80128276e-01  7.77110517e-01 -9.26768124e-01  9.45654750e-01
 -1.30171925e-01 -1.13393891e+00  2.74828613e-01 -4.48610932e-01
  7.16295764e-02  9.69849229e-01  9.40739572e-01  9.09580812e-02
 -9.89308476e-01 -5.71246207e-01  7.21447766e-01 -7.87812769e-01
 -7.96876967e-01  8.45518172e-01 -1.44345200e+00  5.38699627e-01
 -7.57263780e-01  7.28140712e-01 -6.48577809e-01  1.32008827e+00
  4.06153798e-01  7.54855752e-01 -1.82736814e-01  1.45320904e+00
  3.41088057e-01  4.85778004e-01 -5.80023676e-02  1.20017183e+00
  3.45992565e-01 -2.60541558e-01  6.80725500e-02 -4.53921586e-01
  8.16070437e-01 -1.31536222e+00  5.32338738e-01 -3.67165804e-01
  9.58203912e-01  1.91716266e+00  4.31451470e-01  2.44152471e-01
 -4.00880054e-02  3.26511711e-01  9.06788707e-01 -3.02958310e-01
  9.07654047e-01 -2.73276180e-01  6.38737261e-01 -7.59066641e-01
 -7.75861323e-01 -8.09660137e-01  2.54724741e-01 -6.84906304e-01
 -1.79656482e+00  1.29242

In [20]:
# 'food' vs 'sushi' with the 300-dimensional Word2Vec model.
A = model.wv['food']
B = model.wv['sushi']
# cosine(A, B) = (A . B) / (|A| * |B|)
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

Cosine Similarity: 0.40860918


## TASK 1: 5 QUERIES

In [21]:
# List the distinct city names after trimming whitespace and lowercasing,
# to see which spellings the queries below have to match.
print(sorted(df_businesses['city'].str.strip().str.lower().unique()))

['abington', 'abington township', 'affton', 'afton', 'aldan', 'algiers', 'aliso viejo', 'almonesson', 'alton', 'ambler', 'andalusia', 'antioch', 'apollo beach', 'arabi', 'arden', 'ardmore', 'arizona', 'arnold', 'arrington', 'ashland', 'ashland city', 'aston', 'atco', 'audubon', 'austin', 'avon', 'avondale', 'bala cynwyd', 'ballwin', 'balm', 'bargersville', 'barnhart', 'barrington', 'bayonet point', 'bear', 'beaumont', 'beech grove', 'beech grove,', 'bellair', 'belle chase', 'belle chasse', 'belle meade', 'belleair', 'belleair beach', 'belleair blf', 'belleair bluffs', 'belleair blufs', 'belleville', 'bellevue', 'bellmawr', 'bellville', 'belmont hills', 'bensalem', 'bensalem township', 'berkeley', 'berlin', 'berlin township', 'berry hill', 'berwyn', 'bethalto', 'bethel', 'bethel township', 'beverly', 'birchrunville', 'black jack', 'blackwood', 'blue bell', 'blvd', 'boise', 'boise (meridian)', 'boise city', 'boone', 'boothwyn', 'bordentown', 'bordentown township', 'bosie', 'boulevard', '

In [22]:
# Column names available for the queries below.
df_businesses.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [23]:
# Peek at the first rows; 'attributes' and 'hours' hold dict-like strings.
df_businesses.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


### Query 1: Best Restaurants in Philadelphia

In [24]:
# Row masks: city equals 'philadelphia' (trimmed, case-insensitive) and the
# categories text mentions 'Restaurants'.
in_philadelphia = df_businesses['city'].str.strip().str.lower() == 'philadelphia'
is_restaurant = df_businesses['categories'].str.contains('Restaurants', case=False, na=False)
philadelphia_restaurants = df_businesses[in_philadelphia & is_restaurant]

# Rank by stars, then review count (both descending); name breaks remaining ties.
top_philadelphia_restaurants = (
    philadelphia_restaurants
    .sort_values(by=['stars', 'review_count', 'name'], ascending=[False, False, True])
    .head(5)
)
print(top_philadelphia_restaurants[['name', 'stars', 'review_count', 'categories']])


                                         name  stars  review_count  \
69859                   Tortilleria San Roman    5.0           219   
85917                    Miss Rachel's Pantry    5.0           119   
79657                         El Rancho Viejo    5.0           110   
25331                       Circles + Squares    5.0           103   
5993   Mom Mom's Kitchen and Polish Food Cart    5.0            91   

                                              categories  
69859  Convenience Stores, Italian, Specialty Food, M...  
85917  Arts & Crafts, Food, Shopping, Food Delivery S...  
79657                               Restaurants, Mexican  
25331                                 Restaurants, Pizza  
5993   Food, Polish, Food Trucks, Street Vendors, Res...  


### Query 2: Best Chinese Restaurants in Philadelphia

In [25]:
# Narrow the Philadelphia restaurants from Query 1 to those whose categories mention 'Chinese'.
serves_chinese = philadelphia_restaurants['categories'].str.contains('Chinese', case=False, na=False)
chinese_restaurants_philadelphia = philadelphia_restaurants[serves_chinese]

# Same ranking as Query 1: stars, then review count (descending), then name.
top_chinese_restaurants_philadelphia = (
    chinese_restaurants_philadelphia
    .sort_values(by=['stars', 'review_count', 'name'], ascending=[False, False, True])
    .head(5)
)
print(top_chinese_restaurants_philadelphia[['name', 'stars', 'review_count', 'categories']])


                           name  stars  review_count  \
41818  Far East Chinese Cuisine    5.0            28   
67395             House of Chen    5.0            10   
23052             Peking Garden    5.0             9   
63522              Paradise Inn    5.0             8   
12494               Jade Palace    5.0             6   

                                         categories  
41818               Cantonese, Chinese, Restaurants  
67395                          Restaurants, Chinese  
23052  American (Traditional), Chinese, Restaurants  
63522                          Chinese, Restaurants  
12494                          Restaurants, Chinese  


### Query 3: Pubs in Philadelphia that are Wheelchair Accessible

In [26]:
# Import the necessary libraries
import pandas as pd
import ast  # To safely evaluate a string containing a Python literal or container display

# Assume df_businesses is your DataFrame
# Let's define a function to parse the 'attributes' column and check for wheelchair accessibility
def parse_wheelchair_accessible(attributes_str):
    """Report whether an 'attributes' cell marks the business as wheelchair accessible.

    Parameters
    ----------
    attributes_str : str
        Text of a Python dict literal, e.g. "{'WheelchairAccessible': 'True'}".
        Missing values (NaN/None) and text that cannot be parsed are accepted
        and count as "not accessible".

    Returns
    -------
    bool
        True only when the parsed dict has 'WheelchairAccessible' set to the
        string 'True' (or the boolean True); False in every other case.
    """
    try:
        # literal_eval only evaluates literals, so it is safe on data read from file
        attributes_dict = ast.literal_eval(attributes_str)
    except (ValueError, SyntaxError, TypeError):
        # ValueError: NaN or a non-literal expression; SyntaxError: malformed text
        return False

    # A valid literal that is not a dict (e.g. the text "None") has no attributes to read
    if not isinstance(attributes_dict, dict):
        return False

    return str(attributes_dict.get('WheelchairAccessible', 'False')) == 'True'

# Select Philadelphia businesses whose categories mention 'Pubs'. The .copy()
# makes the selection an independent frame, so adding a column to it below
# does not trigger SettingWithCopyWarning.
in_philadelphia = df_businesses['city'].str.strip().str.lower() == 'philadelphia'
is_pub = df_businesses['categories'].str.contains('Pubs', case=False, na=False)
philadelphia_pubs = df_businesses[in_philadelphia & is_pub].copy()

# Boolean flag per pub, derived from its 'attributes' text
philadelphia_pubs['is_wheelchair_accessible'] = (
    philadelphia_pubs['attributes'].apply(parse_wheelchair_accessible)
)

# Keep only the accessible pubs
wheelchair_accessible_pubs = philadelphia_pubs[philadelphia_pubs['is_wheelchair_accessible']]

# Rank by stars, then review count (both descending); name breaks remaining ties
top_accessible_pubs = (
    wheelchair_accessible_pubs
    .sort_values(by=['stars', 'review_count', 'name'], ascending=[False, False, True])
    .head(5)
)

print(top_accessible_pubs[['name', 'stars', 'review_count', 'categories']])


                            name  stars  review_count  \
94471                  Bar Hygge    4.5           387   
53651   Glory Beer Bar & Kitchen    4.5           203   
79730  Love City Brewing Company    4.5           162   
1106            Chase's Hop Shop    4.5           116   
78757     Original 13 Ciderworks    4.5            65   

                                              categories  
94471  Food, Restaurants, Breweries, Comfort Food, Br...  
53651  American (New), Local Flavor, Bars, Restaurant...  
79730  Brewpubs, Breweries, Nightlife, Bars, Food, Ba...  
1106   Chicken Wings, Nightlife, Bars, Delis, Food, B...  
78757  American (Traditional), Food, Restaurants, Bar...  


### Query 4: Business Hours for "DeSandro on Main" in Philadelphia for Friday

In [27]:
def format_hours(hour_str):
    """Render an 'H:M-H:M' opening range as 12-hour clock text.

    Example: '17:0-0:30' becomes '5:00 PM - 12:30 AM'.
    """
    def _as_12_hour(clock_text):
        # Parse 24-hour 'H:M', print as 'HH:MM AM/PM', then drop the leading zero
        parsed = pd.to_datetime(clock_text, format='%H:%M')
        return parsed.strftime('%I:%M %p').lstrip('0')

    open_time, close_time = hour_str.split('-')
    opening = _as_12_hour(open_time)
    closing = _as_12_hour(close_time)
    return f"{opening} - {closing}"

# This function parses the 'hours' attribute for the given day
def parse_hours(hours_str, day):
    """Return the formatted opening hours of a business for one weekday.

    Parameters
    ----------
    hours_str : str or dict
        The 'hours' cell: text of a dict such as
        "{'Monday': '8:0-22:0', 'Friday': '17:0-0:30'}", or an already parsed
        dict. Missing values (NaN/None) are accepted.
    day : str
        Weekday key to look up, e.g. 'Friday'.

    Returns
    -------
    str
        The range formatted by ``format_hours``; the raw range text if it
        cannot be formatted; or 'Not available' when the hours are missing,
        unparsable, or have no entry for ``day``.
    """
    if isinstance(hours_str, dict):
        hours_dict = hours_str
    elif isinstance(hours_str, str):
        try:
            # The cell holds a Python-style dict with single quotes; swapping the
            # quotes turns it into valid JSON (adequate here because day names
            # and time ranges contain no quote characters).
            hours_dict = json.loads(hours_str.replace("'", "\""))
        except json.JSONDecodeError:
            return 'Not available'
    else:
        # NaN / None (missing 'hours' data) or any other non-text value
        return 'Not available'

    # Valid JSON that is not an object (e.g. "[]") carries no per-day hours
    if not isinstance(hours_dict, dict):
        return 'Not available'

    hours_for_day = hours_dict.get(day, 'Not available')
    if hours_for_day == 'Not available' or not isinstance(hours_for_day, str):
        return 'Not available'

    try:
        # Format hours to a more readable format
        return format_hours(hours_for_day)
    except ValueError:
        # Unexpected range layout: show the raw text instead of failing the query
        return hours_for_day

# Look up Friday's hours for 'DeSandro on Main' in df_businesses (loaded earlier).
# Both city and name are compared after trimming whitespace and lowercasing.
city_matches = df_businesses['city'].str.strip().str.lower() == 'philadelphia'
name_matches = df_businesses['name'].str.strip().str.lower() == 'desandro on main'
desandro_on_main = df_businesses[city_matches & name_matches]

# If several rows match, only the first one is used
if desandro_on_main.empty:
    print("'DeSandro on Main' not found in Philadelphia.")
else:
    friday_hours = parse_hours(desandro_on_main.iloc[0]['hours'], 'Friday')
    print(f"The business hours for 'DeSandro on Main' on Friday are: {friday_hours}")

The business hours for 'DeSandro on Main' on Friday are: 5:00 PM - 12:30 AM


### Query 5: List of 5 Pubs Near a Given Landmark

In [28]:
import pandas as pd
from math import radians, cos, sin, asin, sqrt

# Sample data loading, replace this with your actual data loading code
# df_businesses = pd.read_csv('your_dataset.csv')

# Haversine formula to calculate the distance between two points on the Earth
def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in kilometres between two points in decimal degrees.

    Note the argument order: longitude comes before latitude for each point.
    """
    # Work in radians
    lam1, phi1, lam2, phi2 = (radians(angle) for angle in (lon1, lat1, lon2, lat2))

    earth_radius_km = 6371
    # Haversine term built from the latitude and longitude differences
    half_chord_sq = sin((phi2 - phi1)/2)**2 + cos(phi1) * cos(phi2) * sin((lam2 - lam1)/2)**2
    central_angle = 2 * asin(sqrt(half_chord_sq))
    return central_angle * earth_radius_km

# Reference point for the search (the Liberty Bell in Philadelphia)
landmark_lat, landmark_lon = 39.949610, -75.150282  # Liberty Bell coordinates

# Select every business whose categories mention 'Pubs'.
# NOTE: despite the variable name, no city filter is applied here; pubs from the
# whole dataset are ranked by distance, and this rebinds `philadelphia_pubs`
# from Query 3. The match is a substring test, so 'Gastropubs' and 'Brewpubs' also qualify.
is_pub = df_businesses['categories'].str.contains('Pubs', case=False, na=False)
philadelphia_pubs = df_businesses[is_pub].copy()

if not philadelphia_pubs.empty:
    # Distance in km from the landmark to each pub (haversine takes lon before lat)
    philadelphia_pubs['distance'] = philadelphia_pubs.apply(
        lambda pub: haversine(landmark_lon, landmark_lat, pub['longitude'], pub['latitude']),
        axis=1,
    )

    # The five closest pubs
    top_nearby_pubs = philadelphia_pubs.sort_values(by='distance').head(5)

    print("Top 5 pubs near the landmark:")
    print(top_nearby_pubs[['name', 'distance', 'categories']])
else:
    print("No pubs found.")


Top 5 pubs near the landmark:
                               name  distance  \
68235  Cooperage Wine & Whiskey Bar  0.234054   
41697      Six Feet Under Gastropub  0.330113   
73643        Common Wealth Old City  0.347829   
61025          Craftsman Row Saloon  0.349899   
33988            National Mechanics  0.366175   

                                              categories  
68235  Nightlife, Food, Southern, Beer, Wine & Spirit...  
41697  Breakfast & Brunch, Nightlife, Restaurants, Ba...  
73643   Gastropubs, Restaurants, American (New), Seafood  
61025                              Pubs, Nightlife, Bars  
33988  Pubs, Food, Breakfast & Brunch, Dance Clubs, B...  


#### OR

In [29]:
import pandas as pd
import folium

# Space Needle coordinates (decimal degrees). These rebind landmark_lat/landmark_lon
# from the Liberty Bell example above.
# NOTE(review): the query below only finds pubs if df_businesses contains rows with city 'seattle' — confirm.
landmark_lat, landmark_lon = 47.6205, -122.3493
landmark_name = "Space Needle, Seattle"

# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two (lat, lon) points in decimal degrees.

    Uses the haversine formula. Argument order is latitude first, then
    longitude — the opposite of ``haversine`` defined earlier in the notebook.
    """
    from math import radians, cos, sin, asin, sqrt

    R = 6371  # Earth radius in kilometers
    phi1, lam1, phi2, lam2 = (radians(angle) for angle in (lat1, lon1, lat2, lon2))
    half_chord_sq = sin((phi2 - phi1)/2)**2 + cos(phi1) * cos(phi2) * sin((lam2 - lam1)/2)**2
    return R * (2 * asin(sqrt(half_chord_sq)))

# Filter for pubs in Seattle. The .copy() makes the selection an independent
# frame, so adding the 'distance' column below does not trigger
# SettingWithCopyWarning (same pattern as the Philadelphia pub queries).
seattle_pubs = df_businesses[(df_businesses['city'].str.strip().str.lower() == 'seattle') &
                             df_businesses['categories'].str.contains('Pubs', case=False, na=False)].copy()

# Calculate distances (km) from the landmark
seattle_pubs['distance'] = seattle_pubs.apply(
    lambda row: calculate_distance(landmark_lat, landmark_lon, row['latitude'], row['longitude']),
    axis=1
)

# Sort by distance and get the top 5
nearest_pubs = seattle_pubs.sort_values('distance').head(5)

# Say so explicitly when nothing matched; otherwise the map would silently
# show only the landmark marker.
if nearest_pubs.empty:
    print("No pubs found in Seattle.")

# Create a map centered around the landmark
map_ = folium.Map(location=[landmark_lat, landmark_lon], zoom_start=14)

# Add a marker for the landmark
folium.Marker([landmark_lat, landmark_lon], tooltip=landmark_name, popup=landmark_name).add_to(map_)

# Add markers for the pubs
for idx, row in nearest_pubs.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=f"{row['name']}, {row['stars']} stars, {row['review_count']} reviews",
        icon=folium.Icon(color='blue', icon='glyphicon-glass')
    ).add_to(map_)

# Display the map (last expression of the cell)
map_


## Task 2: Recommender System using Embeddings

In [30]:
# Store the tokenize_text output for every training review in a 'tokens' column
df_train['tokens'] = df_train['text'].apply(tokenize_text)

# Spot-check the rows at positions 0 and 11 (the first and twelfth entries)
for position in (0, 11):
    print(df_train['tokens'].iloc[position])

['best', 'pho', 'santa', 'barbara', 'county', 'staff', 'great', 'drive', '3540', 'minutes', 'pho', 'would', 'drive', 'double', 'seriously', 'tho', 'pho', 'withdrawals', 'thanks', 'phamous']
['place', 'awesome', 'right', 'walk', 'ur', 'overwhelmed', 'many', 'options', 'available', 'hard', 'choose', 'guy', 'helped', 'us', 'friendly', 'made', 'polite', 'conversation', 'lets', 'talk', 'actual', 'gelato', 'good', 'quality', 'amazing', 'especially', 'liked', 'lotus', 'cookie', 'mango', 'outside', 'place', 'hopping', 'makes', 'feel', 'like', 'ur', 'somewhere', 'tucson', 'come']


In [31]:
# Reload the 50-dimensional GloVe table (embeddings_dict was last bound to the
# 300-dimensional table) and verify that every vector has length 50, the
# length used by the embedding code in this section.

embeddings_dict = load_glove(50)

# Fail fast with the offending word if any vector has an unexpected length
for word, vec in embeddings_dict.items():
    if len(vec) != 50:
        raise ValueError(f"Vector length for word '{word}' is not 50")

def create_embeddings(df_grouped, embeddings_dict, dim=50):
    """Average the word vectors of all tokens in each group of reviews.

    Parameters
    ----------
    df_grouped : iterable of (key, DataFrame)
        Typically a pandas ``GroupBy``. Each group must have a ``'tokens'``
        column whose cells are lists of tokens.
    embeddings_dict : mapping
        Token -> vector with ``dim`` components.
    dim : int, default 50
        Expected vector length. Also the length of the zero vector used for
        tokens missing from ``embeddings_dict`` and for groups with no tokens.

    Returns
    -------
    dict
        Group key -> mean vector of shape ``(dim,)``. Tokens that are not in
        ``embeddings_dict`` contribute a zero vector to the mean.

    Raises
    ------
    ValueError
        If any looked-up vector does not have shape ``(dim,)``.
    """
    unknown_token_vector = np.zeros(dim, dtype="float32")
    embeddings = {}
    for key, group in df_grouped:
        # Flatten in one pass. sum(list_of_lists, []) copies the growing
        # accumulator for every review, which is quadratic in the token count.
        all_tokens = [token for review_tokens in group['tokens'] for token in review_tokens]
        if not all_tokens:
            embeddings[key] = np.zeros(dim, dtype="float32")
            continue

        vectors = [embeddings_dict.get(token, unknown_token_vector) for token in all_tokens]
        # Validate before stacking so a bad vector gives a clear error
        if any(np.shape(vector) != (dim,) for vector in vectors):
            raise ValueError(f"Not all vectors are NumPy arrays of shape ({dim},)")
        embeddings[key] = np.mean(np.array(vectors), axis=0)
    return embeddings

# Partition the training reviews two ways: per business and per user
grouped_by_businesses = df_train.groupby('business_id')
grouped_by_users = df_train.groupby('user_id')

# One averaged token embedding per business and per user
business_embeddings = create_embeddings(df_grouped=grouped_by_businesses, embeddings_dict=embeddings_dict)
user_embeddings = create_embeddings(df_grouped=grouped_by_users, embeddings_dict=embeddings_dict)


In [32]:
# Step 2: Predict ratings and calculate RMSE
def predict_rating(user_id, business_id, user_embeddings, business_embeddings):
    """Return the dot product of a user's and a business's embedding.

    An id that is missing from its embedding dict is replaced by a
    50-component float32 zero vector, so the result is 0 for that pair.
    """
    fallback = np.zeros(50, dtype="float32")
    if user_id in user_embeddings:
        user_side = user_embeddings[user_id]
    else:
        user_side = fallback
    if business_id in business_embeddings:
        business_side = business_embeddings[business_id]
    else:
        business_side = fallback
    return np.dot(user_side, business_side)

# Score every (user, business) pair in the test set with the embedding dot product
df_test['predicted_stars'] = df_test.apply(
    lambda review: predict_rating(review['user_id'], review['business_id'], user_embeddings, business_embeddings),
    axis=1,
)

# Root-mean-squared error between the true and the predicted star ratings
squared_errors = (df_test['stars'] - df_test['predicted_stars']) ** 2
rmse = np.sqrt(squared_errors.mean())
print(f"RMSE: {rmse}")

RMSE: 4.164516949912093


### Machine Learning

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Prepare data for ML model
def build_pair_features(reviews, user_embeddings, business_embeddings, dim=50):
    """Build one feature row per review: user embedding * business embedding.

    Parameters
    ----------
    reviews : DataFrame with 'user_id' and 'business_id' columns.
    user_embeddings, business_embeddings : dict
        id -> vector with ``dim`` components.
    dim : int, default 50
        Length of the zero vector used for ids that have no embedding.

    Returns
    -------
    np.ndarray of shape (len(reviews), dim)
    """
    missing = np.zeros(dim, dtype="float32")
    # Iterate over the two id columns directly; iterrows() builds a Series per row
    features = [
        user_embeddings.get(user_id, missing) * business_embeddings.get(business_id, missing)
        for user_id, business_id in zip(reviews['user_id'], reviews['business_id'])
    ]
    # reshape keeps a 2-D (0, dim) result when `reviews` is empty
    return np.array(features).reshape(-1, dim)


X = build_pair_features(df_train, user_embeddings, business_embeddings)
y = df_train['stars'].to_numpy()

# Split the training data for validation
# NOTE(review): user_embeddings / business_embeddings were built from every row of
# df_train, including the rows that land in the validation split, so the validation
# RMSE is likely optimistic compared with the test RMSE.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a regressor
regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
regressor.fit(X_train, y_train)

# Predict on validation set and calculate RMSE
val_predictions = regressor.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE with ML: {rmse_val}")

# Prepare test data (same feature construction as for training) and make predictions
X_test = build_pair_features(df_test, user_embeddings, business_embeddings)
test_predictions = regressor.predict(X_test)

# Calculate RMSE for test predictions
rmse_test = np.sqrt(mean_squared_error(df_test['stars'], test_predictions))
print(f"Test RMSE with ML: {rmse_test}")

Validation RMSE with ML: 1.1005369679756283
Test RMSE with ML: 1.3972470156677497


## Task 3: Item-based Collaborative Recommendation using Embeddings

In [34]:
def find_similar_businesses(target_business_name, city, embeddings_dict, df_businesses, top_n=5, include_target=True):
    """Rank the businesses of one city by cosine similarity to a target business.

    Parameters
    ----------
    target_business_name : str
        Name of the reference business (matched case-insensitively, ignoring
        leading/trailing whitespace).
    city : str
        City to search in (matched the same way as the name).
    embeddings_dict : dict
        business_id -> embedding vector.
    df_businesses : DataFrame
        Must have 'business_id', 'name', 'city', 'stars' and 'categories' columns.
    top_n : int, default 5
        Maximum number of results.
    include_target : bool, default True
        When True the target itself is part of the ranking (it scores 1.0
        against itself). Pass False to get only other businesses.

    Returns
    -------
    list of (name, stars, categories, cosine_similarity) tuples, most similar
    first; or a ``str`` message when the target is not found in the city or
    has no usable (missing or all-zero) embedding.
    """
    wanted_name = target_business_name.strip().lower()
    wanted_city = city.strip().lower()

    # Normalise the city once and use it for both the target lookup and the
    # candidate list, so stray whitespace cannot hide the target in its own city.
    in_city = df_businesses['city'].str.strip().str.lower() == wanted_city
    city_businesses = df_businesses[in_city]

    target_business = city_businesses[city_businesses['name'].str.strip().str.lower() == wanted_name]
    if target_business.empty:
        return f"No business found with name {target_business_name} in {city}"

    target_business_id = target_business.iloc[0]['business_id']
    target_embedding = embeddings_dict.get(target_business_id)
    if target_embedding is None:
        return f"No embedding found for {target_business_name}"

    # Cosine similarity is undefined for a zero vector; treat it as unusable
    target_norm = np.linalg.norm(target_embedding)
    if target_norm == 0:
        return f"No embedding found for {target_business_name}"

    similarities = []
    candidates = zip(
        city_businesses['business_id'],
        city_businesses['name'],
        city_businesses['stars'],
        city_businesses['categories'],
    )
    for business_id, name, stars, categories in candidates:
        if not include_target and business_id == target_business_id:
            continue
        business_embedding = embeddings_dict.get(business_id)
        if business_embedding is None:
            continue
        business_norm = np.linalg.norm(business_embedding)
        if business_norm == 0:
            # Skip instead of dividing by zero, which would put NaN into the sort
            continue
        cosine_similarity = np.dot(target_embedding, business_embedding) / (target_norm * business_norm)
        similarities.append((name, stars, categories, cosine_similarity))

    # Sort by similarity score in descending order (stable, so ties keep frame order)
    similarities.sort(key=lambda item: item[3], reverse=True)

    # Return the top n similar businesses
    return similarities[:top_n]




### Query 1: Similar Auto Body Shops to 'Iron Horse Auto Body' in Santa Barbara

In [35]:
# Rank Santa Barbara businesses by embedding similarity to the target body shop
similar_auto_body_shops = find_similar_businesses("Iron Horse Auto Body", 'santa barbara', business_embeddings, df_businesses)

# Name the tuple fields explicitly (same construction as the other query cells);
# the frame then keeps its four columns even when the result list is empty.
df_similar_auto_body_shops = pd.DataFrame(
    similar_auto_body_shops,
    columns=['Name', 'Stars', 'Categories', 'Similarity'],
)

# Print the caption, then leave the frame as the cell's last expression for rich display
print("Similar Auto Body Shops to 'Iron Horse Auto Body' in Santa Barbara:")
df_similar_auto_body_shops


Similar Auto Body Shops to 'Iron Horse Auto Body' in Santa Barbara:


Unnamed: 0,Name,Stars,Categories,Similarity
0,Iron Horse Auto Body,4.5,"Towing, Automotive, Body Shops",1.0
1,Top Shop Automotive,4.0,"Auto Parts & Supplies, Wheel & Rim Repair, Oil...",0.991074
2,Movegreen,4.5,"Local Services, Shopping, Home Services, Mover...",0.98881
3,Sears Auto Center,2.5,"Auto Repair, Oil Change Stations, Automotive, ...",0.987441
4,Granny's Garage,4.5,"Auto Repair, Automotive",0.985977


### Query 2: Similar Salons to 'Kevin's Hair Salon' in Philadelphia

In [36]:
# Businesses in Philadelphia ranked by embedding similarity to the target salon
similar_salons = find_similar_businesses(
    target_business_name="Kevin's Hair Salon",
    city='Philadelphia',
    embeddings_dict=business_embeddings,
    df_businesses=df_businesses,
)

# Tabulate the (name, stars, categories, similarity) tuples
similar_salons_df = pd.DataFrame(
    similar_salons,
    columns=['Name', 'Stars', 'Categories', 'Similarity'],
)

print("Similar Salons to 'Kevin's Hair Salon' in Philadelphia:")
similar_salons_df


Similar Salons to 'Kevin's Hair Salon' in Philadelphia:


Unnamed: 0,Name,Stars,Categories,Similarity
0,Kevin's Hair Salon,3.0,"Beauty & Spas, Nail Salons, Hair Salons, Hair ...",1.0
1,Francis The Duke Barber Co.,4.5,"Barbers, Men's Clothing, Beauty & Spas, Shoppi...",0.986104
2,Andre Richard Salon,4.5,"Hair Stylists, Barbers, Bridal, Beauty & Spas,...",0.985781
3,Deluxe Hair Salon,4.5,"Beauty & Spas, Hair Salons",0.983984
4,Apsara Cutting Edge,4.5,"Hair Salons, Beauty & Spas",0.983732


### Query 3: Similar Event Planning to 'Two Rivers Campground' in Nashville

In [38]:
# Businesses in Nashville ranked by embedding similarity to the target campground
similar_planning = find_similar_businesses(
    target_business_name="Two Rivers Campground",
    city='Nashville',
    embeddings_dict=business_embeddings,
    df_businesses=df_businesses,
)

# Tabulate the (name, stars, categories, similarity) tuples
similar_campgrounds_df = pd.DataFrame(
    similar_planning,
    columns=['Name', 'Stars', 'Categories', 'Similarity'],
)

print("Similar Event Planning to 'Two Rivers Campground' in Nashville:")
similar_campgrounds_df


Similar Event Planning to 'Two Rivers Campground' in Nashville:


Unnamed: 0,Name,Stars,Categories,Similarity
0,Two Rivers Campground,3.5,"Hotels & Travel, Event Planning & Services, Ca...",1.0
1,Gaylord Opryland Resort & Convention Center,3.0,"Venues & Event Spaces, Performing Arts, Arts &...",0.994396
2,La Quinta Inn by Wyndham Nashville South,2.0,"Hotels, Hotels & Travel, Event Planning & Serv...",0.991707
3,Holiday Inn Nashville-Vanderbilt,3.5,"Fashion, Event Planning & Services, Venues & E...",0.991254
4,Courtyard by Marriott Nashville Downtown,3.0,"Event Planning & Services, Hotels & Travel, Ve...",0.990129


### Query 4: Similar Restaurants to 'Sapporo Sushi' in Edmonton

In [39]:
# Find similar restaurants to a specific restaurant
# Businesses in Edmonton ranked by embedding similarity to the target restaurant
similar_restaurants = find_similar_businesses(
    target_business_name="Sapporo Sushi",
    city='Edmonton',
    embeddings_dict=business_embeddings,
    df_businesses=df_businesses,
)

# Tabulate the (name, stars, categories, similarity) tuples
similar_restaurants_df = pd.DataFrame(
    similar_restaurants,
    columns=['Name', 'Stars', 'Categories', 'Similarity'],
)

# Caption first, then the frame as the cell's displayed value
print("Similar Restaurants to 'Sapporo Sushi' in Edmonton:")
similar_restaurants_df


Similar Restaurants to 'Sapporo Sushi' in Edmonton:


Unnamed: 0,Name,Stars,Categories,Similarity
0,Sapporo Sushi,2.5,"Japanese, Sushi Bars, Restaurants",1.0
1,Hudsons Canada's Pub,3.5,"Chicken Wings, Canadian (New), Restaurants, Sp...",0.995253
2,Pho Hoan Pasteur,3.5,"Vietnamese, Restaurants",0.992913
3,The Red Piano,3.0,"Jazz & Blues, Lounges, Restaurants, Nightlife,...",0.99286
4,PrimeTime Donair,3.5,"Donairs, Specialty Food, Ethnic Food, Food, Im...",0.992743


### Query 5: Similar Entertainment Venues to 'Du Bowl Lanes' in Saint Louis

In [40]:
# Businesses in Saint Louis ranked by embedding similarity to the target venue
similar_entertainment_venues = find_similar_businesses(
    target_business_name="Du Bowl Lanes",
    city='Saint Louis',
    embeddings_dict=business_embeddings,
    df_businesses=df_businesses,
)

# Tabulate the (name, stars, categories, similarity) tuples
similar_entertainment_venues_df = pd.DataFrame(
    similar_entertainment_venues,
    columns=['Name', 'Stars', 'Categories', 'Similarity'],
)

print("Similar Entertainment Venues to 'Du Bowl Lanes' in Saint Louis:")
similar_entertainment_venues_df


Similar Entertainment Venues to 'Du Bowl Lanes' in Saint Louis:


Unnamed: 0,Name,Stars,Categories,Similarity
0,Du Bowl Lanes,5.0,"Bars, Bowling, Cocktail Bars, Nightlife, Recre...",1.0
1,Nadine's Gin Joint,3.5,"Restaurants, Breakfast & Brunch, Bars, Diners,...",0.975629
2,Rue 13,3.5,"Dance Clubs, Sushi Bars, Restaurants, Bars, Ni...",0.975572
3,Seven Zero Eight,2.0,"Arts & Entertainment, Nightlife, Comfort Food,...",0.973524
4,Jack Patrick's Bar & Grill,4.0,"Sports Bars, Pubs, American (Traditional), Res...",0.973021


# THANK YOU!