In [7]:
import pandas as pd

In [8]:
# Load the Michelin review texts that serve as the reference corpus.
df = pd.read_csv('../data/michelin_reviews_full.csv')

In [9]:
# Show the loaded frame as this cell's output to eyeball its columns and size.
df

Unnamed: 0.1,Unnamed: 0,review
0,0,Chef Barry Dindyal is the talent behind this p...
1,1,It may be the signature cry for falling object...
2,2,Once a pop-up and now a full-fledged brick-and...
3,3,Chef Rob Sonderman of the Federalist Pig has e...
4,4,This cozy Ethiopian den is an homage to Owner ...
...,...,...
1293,1293,The architecturally intriguing Hayden Tract di...
1294,1294,"Set amidst the luxury shops of Rodeo Drive, th..."
1295,1295,"In contrast to its grand hotel setting, this b..."
1296,1296,"Every quarter of the year, Maude finds its new..."


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary on the Michelin reviews only. Any text that is later
# compared against them must go through this same fitted `vec` (transform, not
# fit_transform) so both sides share one feature space.
vec = TfidfVectorizer(max_features=10_000, stop_words="english")
features = vec.fit_transform(df["review"])

print(features.shape)  # (n_reviews, n_terms); n_terms is capped at 10_000 by max_features

(1298, 10000)


In [12]:
from sklearn.neighbors import NearestNeighbors

# Index the TF-IDF rows in `features` for cosine-distance neighbour lookups.
# n_neighbors=10 is only the default for later queries; n_jobs=-1 uses all cores.
knn = NearestNeighbors(
    metric='cosine',
    n_neighbors=10,
    n_jobs=-1,
)
knn.fit(features)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=10)

In [13]:
# Load the Infatuation review texts; these are the queries matched against `df`.
philly_df = pd.read_csv('../data/infatuation_reviews.csv')

In [14]:
# Show the loaded frame as this cell's output.
philly_df

Unnamed: 0,review
0,"Most of the time, it’s fun to be right. It’s f..."
1,When we’re in the mood for a creamy bowl of gr...
2,"Jim’s is a retro, counter-service place on Sou..."
3,You like Fork. Your boss likes Fork. Your neig...
4,Our review for this Israeli grill could probab...
...,...
558,Burg’s Hideaway is a small bar in Point Breeze...
559,If your date’s the type that does all of their...
560,"No matter the weekend or the weather, Day by D..."
561,There are mornings you feel invincible. You kn...


In [15]:
# Load the review URLs, drop exact duplicate rows, and renumber the index
# 0..n-1 so it can be used to pair rows by position. reset_index(drop=True)
# discards the old index directly, replacing the earlier two-step of adding
# an 'index' column and then dropping it.
philly_urls = (
    pd.read_csv('../data/philly_infatuation_review_urls.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)
philly_urls

Unnamed: 0,url
0,/philadelphia/reviews/spice-finch
1,/philadelphia/reviews/bookers-restaurant-bar
2,/philadelphia/reviews/jims-south-st
3,/philadelphia/reviews/fork
4,/philadelphia/reviews/laser-wolf
...,...
558,/philadelphia/reviews/burgs-hideaway-lounge
559,/philadelphia/reviews/pumpkin-restaurant
560,/philadelphia/reviews/day-by-day
561,/philadelphia/reviews/sams-morning-glory-diner


In [16]:
# Attach a URL to every review by joining the two frames on their index
# (inner join): a row is kept only when its index label exists in both frames.
# NOTE(review): this assumes row i of the reviews belongs to row i of the
# de-duplicated URLs - confirm the two files are in the same order.
merged_df = pd.merge(
    philly_df,
    philly_urls,
    left_index=True,
    right_index=True,
)
merged_df

Unnamed: 0,review,url
0,"Most of the time, it’s fun to be right. It’s f...",/philadelphia/reviews/spice-finch
1,When we’re in the mood for a creamy bowl of gr...,/philadelphia/reviews/bookers-restaurant-bar
2,"Jim’s is a retro, counter-service place on Sou...",/philadelphia/reviews/jims-south-st
3,You like Fork. Your boss likes Fork. Your neig...,/philadelphia/reviews/fork
4,Our review for this Israeli grill could probab...,/philadelphia/reviews/laser-wolf
...,...,...
558,Burg’s Hideaway is a small bar in Point Breeze...,/philadelphia/reviews/burgs-hideaway-lounge
559,If your date’s the type that does all of their...,/philadelphia/reviews/pumpkin-restaurant
560,"No matter the weekend or the weather, Day by D...",/philadelphia/reviews/day-by-day
561,There are mornings you feel invincible. You kn...,/philadelphia/reviews/sams-morning-glory-diner


In [17]:
# Project the Philadelphia reviews into the TF-IDF space that was fitted on the
# Michelin reviews (transform only, so the vocabulary and IDF weights are reused).
input_texts = merged_df['review']
input_features = vec.transform(input_texts)

# Ask for exactly one neighbour per review so the results line up one-to-one
# with the rows of merged_df. D holds the cosine distances and N the row
# positions of the matched reviews in the matrix `knn` was fitted on; both
# have shape (n_reviews, 1).
D, N = knn.kneighbors(input_features, n_neighbors=1, return_distance=True)

# Flatten in row-major order (the same order the former nested loops produced)
# into plain Python lists of float / int.
distance = D.ravel().tolist()
neighbor = N.ravel().tolist()

In [18]:
# Inspect the nearest-neighbour cosine distances, one per review in merged_df.
distance

[0.8599846264118738,
 0.8858679432058156,
 0.8742767931650804,
 0.8978447230325738,
 0.8821166261527997,
 0.8391636145247933,
 0.9044894713280516,
 0.8585608694925002,
 0.8803052741169259,
 0.8591542506743906,
 0.8658387458913352,
 0.8520173857070603,
 0.8438852464687024,
 0.8717453535790296,
 0.8212567853462588,
 0.906365022437209,
 0.8709764089319121,
 0.8506142100893576,
 0.8851308970572703,
 0.8623983756188365,
 0.8538439552733148,
 0.8960802560264448,
 0.8709606600963861,
 0.8021675112109068,
 0.9178628117627925,
 0.8883474612748476,
 0.9028272757149957,
 0.7907393459093587,
 0.9017684312898598,
 0.8597694825674516,
 0.8772632814703722,
 0.8886910297599686,
 0.8115319313455112,
 0.852758185839961,
 0.8442757404175192,
 0.8328801576774376,
 0.839581011982404,
 0.7798597955280853,
 0.8820821108964141,
 0.9039868949634984,
 0.9004082622920531,
 0.8836787146948804,
 0.9109777799270106,
 0.8413478465708581,
 0.9079879763705873,
 0.893303601968068,
 0.8345923001313624,
 0.89149096772956

In [19]:
# Store each review's cosine distance to its closest Michelin review.
merged_df['distance'] = distance

In [20]:
# Store the row position (in the fitted Michelin matrix) of that closest review.
merged_df['neighbor'] = neighbor

In [21]:
# Closest matches first (ascending cosine distance). The sorted frame is only
# displayed; merged_df itself keeps its original row order.
merged_df.sort_values(by='distance')

Unnamed: 0,review,url,distance,neighbor
263,Hajimaru is a small BYOB ramen shop on Girard ...,/philadelphia/reviews/hajimaru,0.618549,444
83,When you finish ordering at Neighborhood Ramen...,/philadelphia/reviews/neighborhood-ramen,0.680177,444
210,"Pop-up Anchor Light, which operates out of the...",/philadelphia/reviews/anchor-light,0.751574,225
474,There are a few pho places in the city that ev...,/philadelphia/reviews/ngon-ngon,0.752966,844
269,20th Street Pizza is from the people behind po...,/philadelphia/reviews/20th-street-pizza,0.753031,1027
...,...,...,...,...
48,Apparently back in the 1700s when most of Old ...,/philadelphia/reviews/revolution-house,0.919815,746
256,"Philadelphia Brewing Company has a 38,000-squa...",/philadelphia/reviews/philadelphia-brewing-com...,0.920659,943
52,Tradesman’s is part BBQ restaurant and part sp...,/philadelphia/reviews/tradesmans,0.921347,1274
487,Everything about The Library Bar in The Ritten...,/philadelphia/reviews/the-library-at-rittenhouse,0.923647,469


In [16]:
# Persist the frame without the index column.
# NOTE(review): the sort_values() result above is never assigned, so rows are
# written in their original order, not by distance - confirm this is intended.
merged_df.to_csv('../data/distance_classified.csv', index=False, encoding='utf-8')