In [137]:
#%pip install numpy
#%pip install pandas
#%pip install matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

#%pip install nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#%pip install sentence_transformers
from sentence_transformers import SentenceTransformer

%pip install scipy
from scipy.spatial import distance


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anishkarthik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [108]:
#Load and process data into single dataframe with two columns -> reviews + bar_name

data = pd.DataFrame()

#Add base bar data
base_directory = 'bar_data'

for filename in os.listdir(base_directory):
    f = os.path.join(base_directory, filename)

    #Foundies + Dragonfly Data has Nothing -> remove them
    if f[-4:] == '.csv' and 'foundies' not in f and 'dragonfly' not in f:
        temp_dataframe = pd.read_csv(f)
        temp_dataframe = pd.DataFrame(temp_dataframe.loc[ : , 'wiI7pd'])
        temp_dataframe['bar_name'] = [f.split("/")[-1][:-4]] * len(temp_dataframe)

        data = pd.concat([data, temp_dataframe])

#Add yelp bar data
yelp_directory = 'bar_data/yelp_data'
for filename in os.listdir(yelp_directory):
    f = os.path.join(yelp_directory, filename)

    temp_dataframe = pd.read_csv(f)
    temp_dataframe = pd.DataFrame(temp_dataframe.loc[ : , 'raw__09f24__T4Ezm'])
    temp_dataframe.columns = ['wiI7pd']
    temp_dataframe['bar_name'] = [f.split("yelp_")[-1][:-4]] * len(temp_dataframe)
    data = pd.concat([data, temp_dataframe])


#Set readable column names
data.columns = ["review", "bar_name"]
data.tail()

Unnamed: 0,review,bar_name
9,Literally the worst bar ever. Got kicked out f...,paddock
10,"2.5/5honestly, a lot of people like coming her...",paddock
11,Literally worst place ever. Rude staff. Spent ...,paddock
12,THIS PLACE IS GODDAMN RACIST!! I'm a brown Sou...,paddock
13,I would not recommend this bar to any of my fr...,paddock


In [117]:
#Calculate vectors for each review

#Tokenize
data['tokenized_review'] = data.apply(lambda x: " ".join(word_tokenize(x['review'])) if not isinstance(x['review'], float) else "", axis = 1)

#Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
data['sentence_embeddings'] = data.apply(lambda x: model.encode(x['tokenized_review']), axis = 1)

data.head()



.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 302kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 126kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 11.1MB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 399kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 81.2kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 1.31MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:01<00:00, 45.7MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 46.5kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 84.1kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.40MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 137kB/s]
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 9.37MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.48MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 285kB/s]


Unnamed: 0,review,bar_name,tokenized_review,sentence_embeddings
0,I'm delighted to share my review of Rough Drau...,rough_draught,I 'm delighted to share my review of Rough Dra...,"[-0.014059556, -0.011662058, 0.075196035, 0.06..."
1,This is by far the best bar in College Station...,rough_draught,This is by far the best bar in College Station...,"[-0.014935817, -0.029838728, 0.02188372, -0.01..."
2,"Traveled to town from Keller, TX and had to su...",rough_draught,"Traveled to town from Keller , TX and had to s...","[-0.010773048, 0.04728678, 0.07128533, 0.04252..."
3,You want a good craft cocktail in College Stat...,rough_draught,You want a good craft cocktail in College Stat...,"[0.004802652, -0.05520312, 0.01785648, -0.0385..."
4,Always a good place to have a great drink and ...,rough_draught,Always a good place to have a great drink and ...,"[0.009997331, -0.032421038, 0.031222597, 0.031..."


In [147]:
#Generate average bar vector
bar_vectors = {}
bar_names = np.unique(data['bar_name'])

for bar_name in bar_names:
    bar_vectors[bar_name] = np.mean([np.array(i) for i in data[data['bar_name'] == bar_name]['sentence_embeddings']], axis = 0)

#Generate similarities between bars
similarities_df = pd.DataFrame(bar_names, columns=['bar_name'])
for bar_name in bar_names:
        similarities_df[bar_name] = [1 - distance.cosine(bar_vectors[bar_name], bar_vectors[bar_name_compare]) for bar_name_compare in bar_names]

similarities_df


Unnamed: 0,bar_name,backyard,cedar_lane,chimys,commanders_cove,corner,dixie_chicken,dry_bean,drybean,duddleys,...,mama_sake,obannons,paddock,rebel,rough_draught,shiner_park,social,spot,tipsy_turtle,twelve
0,backyard,1.0,0.979725,0.968993,0.903778,0.985295,0.511283,0.969203,0.433328,0.984771,...,0.942575,0.973285,0.96614,0.986109,0.752494,0.981879,0.988133,0.813285,0.9324,0.981315
1,cedar_lane,0.979725,1.0,0.945919,0.858496,0.957676,0.396069,0.946611,0.30686,0.971518,...,0.946651,0.95949,0.923391,0.993301,0.675125,0.991393,0.985169,0.727211,0.886426,0.980275
2,chimys,0.968993,0.945919,1.0,0.914282,0.981532,0.621684,0.946052,0.479573,0.973381,...,0.918126,0.956842,0.95788,0.956605,0.794319,0.94238,0.9455,0.871199,0.927065,0.939222
3,commanders_cove,0.903778,0.858496,0.914282,1.0,0.922036,0.686829,0.895467,0.591777,0.924956,...,0.86017,0.920825,0.939725,0.879695,0.884203,0.863666,0.870859,0.842467,0.90765,0.861377
4,corner,0.985295,0.957676,0.981532,0.922036,1.0,0.604196,0.961689,0.494014,0.984649,...,0.928661,0.973183,0.973265,0.969372,0.800761,0.954262,0.961833,0.874108,0.93989,0.957729
5,dixie_chicken,0.511283,0.396069,0.621684,0.686829,0.604196,1.0,0.511888,0.741113,0.560908,...,0.454043,0.5471,0.62049,0.436911,0.775246,0.395973,0.428071,0.835967,0.606425,0.416827
6,dry_bean,0.969203,0.946611,0.946052,0.895467,0.961689,0.511888,1.0,0.545133,0.965347,...,0.931066,0.959104,0.952555,0.959879,0.775774,0.948965,0.952595,0.79909,0.965879,0.946173
7,drybean,0.433328,0.30686,0.479573,0.591777,0.494014,0.741113,0.545133,1.0,0.462997,...,0.375001,0.468497,0.553327,0.36167,0.720617,0.321918,0.353272,0.656811,0.652707,0.342435
8,duddleys,0.984771,0.971518,0.973381,0.924956,0.984649,0.560908,0.965347,0.462997,1.0,...,0.94169,0.982316,0.96801,0.981846,0.794192,0.968325,0.969774,0.823126,0.936643,0.960032
9,good_bull_icehouse,0.823004,0.761279,0.828403,0.924559,0.856281,0.720068,0.829573,0.64497,0.862425,...,0.772645,0.858401,0.884578,0.788952,0.889195,0.764167,0.77427,0.822823,0.855344,0.774853


In [148]:
def find_most_similar_bar(bar1, topk, similarities_df):
    return similarities_df['bar_name'].iloc[similarities_df[bar1].nlargest(topk + 1).index.values[1:]]
find_most_similar_bar('backyard', 3, similarities_df)

12    logies
19    social
16     rebel
Name: bar_name, dtype: object