In [2]:
# %pip install numpy
# %pip install pandas
# %pip install matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

# %pip install nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# %pip install sentence_transformers
from sentence_transformers import SentenceTransformer

# %pip install scipy
from scipy.spatial import distance

# %pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# %pip install gensim
import gensim
from gensim.models import Word2Vec

#%pip install transformers
from transformers import AutoModel
from numpy.linalg import norm

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Load and process data into single dataframe with two columns -> reviews + bar_name

#Collect one frame per file and concatenate once at the end; calling pd.concat
#inside the loop re-copies every previously loaded row on each iteration.
review_frames = []

#Add base bar data
base_directory = 'bar_data'

for filename in os.listdir(base_directory):
    f = os.path.join(base_directory, filename)

    #Foundies + Dragonfly Data has Nothing -> remove them
    if filename.endswith('.csv') and 'foundies' not in f and 'dragonfly' not in f:
        temp_dataframe = pd.read_csv(f)
        #Keep only the 'wiI7pd' column (presumably the scraped review text -- confirm against the scraper)
        temp_dataframe = temp_dataframe[['wiI7pd']].copy()
        #Bar name = file name without extension; taken from the file name itself
        #so the result does not depend on the OS path separator
        temp_dataframe['bar_name'] = os.path.splitext(filename)[0]

        review_frames.append(temp_dataframe)

#Add yelp bar data
yelp_directory = 'bar_data/yelp_data'
for filename in os.listdir(yelp_directory):
    #Skip anything that is not a CSV export (hidden files, checkpoint folders, ...)
    if not filename.endswith('.csv'):
        continue
    f = os.path.join(yelp_directory, filename)

    temp_dataframe = pd.read_csv(f)
    #Rename the yelp column so it lines up with the base data column
    temp_dataframe = temp_dataframe[['raw__09f24__T4Ezm']].rename(columns = {'raw__09f24__T4Ezm': 'wiI7pd'})
    #Files are named yelp_<bar>.csv -> keep the part after the last "yelp_"
    temp_dataframe['bar_name'] = os.path.splitext(filename)[0].split("yelp_")[-1]
    review_frames.append(temp_dataframe)

if not review_frames:
    raise ValueError("No review CSV files found in '" + base_directory + "' or '" + yelp_directory + "'")

data = pd.concat(review_frames)

#Set readable column names
data.columns = ["review", "bar_name"]
data.tail()

Unnamed: 0,review,bar_name
9,Literally the worst bar ever. Got kicked out f...,paddock
10,"2.5/5honestly, a lot of people like coming her...",paddock
11,Literally worst place ever. Rude staff. Spent ...,paddock
12,THIS PLACE IS GODDAMN RACIST!! I'm a brown Sou...,paddock
13,I would not recommend this bar to any of my fr...,paddock


In [4]:
#Calculate vectors for each review

#Tokenize
#Float reviews (presumably NaN from empty CSV cells -- confirm) cannot be
#tokenized, so they are replaced by an empty string.
data['tokenized_review'] = [
    " ".join(word_tokenize(review)) if not isinstance(review, float) else ""
    for review in data['review']
]

#Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
#Encode every review in a single batched call instead of one model call per row;
#encode() returns one embedding row per input sentence.
review_embeddings = model.encode(data['tokenized_review'].tolist())
#Store one 1-D vector per row (list assignment is positional, so the
#non-unique row labels left by the concat do not matter here)
data['sentence_embeddings'] = list(review_embeddings)
#NOTE: CSV stores the embedding arrays as their string representation
data.to_csv('processed_data.csv', index = False, encoding = 'utf-8')
data.head()



Unnamed: 0,review,bar_name,tokenized_review,sentence_embeddings
0,I'm delighted to share my review of Rough Drau...,rough_draught,I 'm delighted to share my review of Rough Dra...,"[-0.014059564, -0.011662008, 0.07519602, 0.060..."
1,This is by far the best bar in College Station...,rough_draught,This is by far the best bar in College Station...,"[-0.01493582, -0.029838772, 0.021883706, -0.01..."
2,"Traveled to town from Keller, TX and had to su...",rough_draught,"Traveled to town from Keller , TX and had to s...","[-0.010773019, 0.047286782, 0.07128532, 0.0425..."
3,You want a good craft cocktail in College Stat...,rough_draught,You want a good craft cocktail in College Stat...,"[0.0048026782, -0.05520314, 0.017856428, -0.03..."
4,Always a good place to have a great drink and ...,rough_draught,Always a good place to have a great drink and ...,"[0.009997322, -0.032421075, 0.031222556, 0.031..."


In [5]:
#Generate average bar vector
bar_names = np.unique(data['bar_name'])

#Mean of all review embeddings belonging to a bar, keyed by bar name
bar_vectors = {}
for bar_name in bar_names:
    reviews_of_bar = data[data['bar_name'] == bar_name]['sentence_embeddings']
    bar_vectors[bar_name] = np.mean([np.array(embedding) for embedding in reviews_of_bar], axis = 0)

#Generate similarities between bars
#One column per bar: cosine similarity (1 - cosine distance) between that bar's
#mean vector and the mean vector of every bar listed in the 'bar_name' column
similarity_columns = {'bar_name': bar_names}
for bar_name in bar_names:
    reference_vector = bar_vectors[bar_name]
    similarity_columns[bar_name] = [
        1 - distance.cosine(reference_vector, bar_vectors[bar_name_compare])
        for bar_name_compare in bar_names
    ]
similarities_df = pd.DataFrame(similarity_columns)

similarities_df


Unnamed: 0,bar_name,backyard,cedar_lane,chimys,commanders_cove,corner,dixie_chicken,dry_bean,duddleys,good_bull_icehouse,...,mama_sake,obannons,paddock,rebel,rough_draught,shiner_park,social,spot,tipsy_turtle,twelve
0,backyard,1.0,0.979725,0.968993,0.903778,0.985295,0.511283,0.867859,0.984771,0.823004,...,0.942575,0.973285,0.96614,0.986109,0.752494,0.981879,0.988133,0.813285,0.9324,0.981315
1,cedar_lane,0.979725,1.0,0.945919,0.858496,0.957676,0.396069,0.798939,0.971518,0.761279,...,0.946651,0.95949,0.923391,0.9933,0.675125,0.991392,0.985169,0.727211,0.886426,0.980275
2,chimys,0.968993,0.945919,1.0,0.914282,0.981532,0.621684,0.870809,0.973382,0.828403,...,0.918126,0.956842,0.95788,0.956605,0.794319,0.94238,0.9455,0.871199,0.927065,0.939222
3,commanders_cove,0.903778,0.858496,0.914282,1.0,0.922036,0.686829,0.881926,0.924956,0.924559,...,0.86017,0.920825,0.939725,0.879695,0.884203,0.863666,0.870859,0.842467,0.90765,0.861377
4,corner,0.985295,0.957676,0.981532,0.922036,1.0,0.604196,0.887929,0.984649,0.856281,...,0.928661,0.973183,0.973265,0.969372,0.800761,0.954262,0.961833,0.874108,0.93989,0.957729
5,dixie_chicken,0.511283,0.396069,0.621684,0.686829,0.604196,1.0,0.672699,0.560908,0.720068,...,0.454044,0.5471,0.62049,0.436911,0.775246,0.395973,0.428071,0.835967,0.606425,0.416827
6,dry_bean,0.867859,0.798939,0.870809,0.881926,0.887929,0.672699,1.0,0.877542,0.857506,...,0.816439,0.875421,0.906277,0.831271,0.85105,0.806906,0.822598,0.840868,0.957298,0.813514
7,duddleys,0.984771,0.971518,0.973382,0.924956,0.984649,0.560908,0.877542,1.0,0.862425,...,0.94169,0.982316,0.96801,0.981846,0.794192,0.968325,0.969774,0.823126,0.936643,0.960032
8,good_bull_icehouse,0.823004,0.761279,0.828403,0.924559,0.856281,0.720068,0.857506,0.862425,1.0,...,0.772645,0.858401,0.884578,0.788952,0.889195,0.764167,0.77427,0.822823,0.855344,0.774853
9,harrys,0.939929,0.90231,0.923048,0.935777,0.937065,0.621106,0.853867,0.941354,0.881833,...,0.888338,0.933984,0.942731,0.918712,0.815914,0.921372,0.921964,0.813925,0.89639,0.914015


In [6]:
def find_most_similar_bars(bar1, topk, similarities_df):
    """Return the names of the ``topk`` bars most similar to ``bar1``.

    Parameters
    ----------
    bar1 : str
        Name of the query bar. It must be a column of ``similarities_df``;
        that column holds the similarity of every listed bar to ``bar1``.
    topk : int
        Number of neighbours to return.
    similarities_df : pandas.DataFrame
        Frame with a ``bar_name`` column and one similarity column per bar.

    Returns
    -------
    pandas.Series
        Bar names ordered from most to least similar, carrying the row labels
        of ``similarities_df``. The query bar itself is never included.

    Raises
    ------
    KeyError
        If ``bar1`` is not a column of ``similarities_df``.
    """
    # Drop the query bar by name instead of assuming it is the first of the
    # top-(k+1) rows: when another bar ties with it, that assumption returns
    # the bar itself and loses a real neighbour.
    is_other_bar = similarities_df['bar_name'] != bar1
    top_labels = similarities_df.loc[is_other_bar, bar1].nlargest(topk).index

    # nlargest yields index labels, so select with label-based .loc; positional
    # .iloc is only equivalent when the frame has a default RangeIndex.
    return similarities_df.loc[top_labels, 'bar_name']

#Example: the 3 bars ranked closest to 'good_bull_icehouse' in similarities_df
find_most_similar_bars('good_bull_icehouse', 3, similarities_df)

3     commanders_cove
16      rough_draught
14            paddock
Name: bar_name, dtype: object

In [7]:
#Get key characteristics for each bar

def get_characteristics(data, num_characteristics = 10):
    """Return the highest-scoring TF-IDF terms for every bar.

    All tokenized reviews of a bar are merged into one document, the documents
    are vectorized with TF-IDF (English stop words removed), and the
    ``num_characteristics`` best-scoring terms of each document are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``bar_name`` and ``tokenized_review``.
    num_characteristics : int, default 10
        Number of terms to keep per bar. Values <= 0 yield empty lists.

    Returns
    -------
    dict
        Maps each bar name to its list of terms, highest score first.
    """
    bar_names = np.unique(data['bar_name'])

    # One document per bar. Reviews are joined with a space: concatenating
    # them directly fuses the last word of one review with the first word of
    # the next and creates bogus vocabulary entries.
    bar_reviews = [
        ' '.join(data.loc[data['bar_name'] == bar_name, 'tokenized_review'])
        for bar_name in bar_names
    ]

    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(bar_reviews)

    feature_names = vectorizer.get_feature_names_out()

    top_terms_per_bar = {}
    for reviews_index, bar in enumerate(bar_names):
        if num_characteristics <= 0:
            # A slice of [-0:] would select every term instead of none
            top_terms_per_bar[bar] = []
            continue

        bar_tfidf_scores = tfidf_matrix[reviews_index].toarray().flatten()

        # argsort is ascending: take the last N indices and reverse them so
        # the best-scoring term comes first
        top_term_indices = bar_tfidf_scores.argsort()[-num_characteristics:][::-1]

        # Map indices to terms and store them for the current bar
        top_terms_per_bar[bar] = [feature_names[index] for index in top_term_indices]
    return top_terms_per_bar

#Example: the 10 top-scoring TF-IDF terms for every bar in data
get_characteristics(data, 10)

{'backyard': ['backyard',
  'place',
  'food',
  'bar',
  'good',
  'bouncer',
  'great',
  'just',
  'bouncers',
  'drink'],
 'cedar_lane': ['wake',
  'place',
  'loco',
  'dollar',
  'ups',
  'friendly',
  'thursday',
  'went',
  'pricesgreat',
  'auburn'],
 'chimys': ['food',
  'good',
  'margaritas',
  'tacos',
  'great',
  'place',
  'chimy',
  'nachos',
  'salsa',
  'fish'],
 'commanders_cove': ['great',
  'cove',
  'good',
  'drinks',
  'dj',
  'commander',
  'commanders',
  'night',
  'music',
  'atmosphere'],
 'corner': ['food',
  'bar',
  'rooftop',
  'good',
  'great',
  'place',
  'drinks',
  'service',
  'got',
  'view'],
 'dixie_chicken': ['food',
  'chicken',
  'place',
  'great',
  'good',
  'fries',
  'burger',
  'burgers',
  'beer',
  'college'],
 'dry_bean': ['shots',
  'bean',
  'shot',
  'dry',
  'bar',
  'place',
  'ground',
  'bouncer',
  'shoulders',
  'grabbed'],
 'duddleys': ['bar',
  'duddley',
  'pool',
  'place',
  'great',
  'good',
  'beer',
  'food',
  '

In [8]:
# Search by keyword
def search_by_keyword(word, data, num_characteristics = 10, topk = 1, model = None):
    """Rank bars by how closely their characteristic terms match a keyword.

    For every bar, the embeddings of its characteristic terms (from
    ``get_characteristics``) are averaged and compared with the embedding of
    ``word`` using cosine similarity.

    Parameters
    ----------
    word : str
        Query keyword or phrase.
    data : pandas.DataFrame
        Review data, passed through to ``get_characteristics``.
    num_characteristics : int, default 10
        Number of characteristic terms used per bar.
    topk : int, default 1
        Accepted for backward compatibility but not applied: the full ranking
        is always returned, and existing callers rely on that.
    model : optional
        Object with an ``encode`` method (e.g. an already loaded
        ``SentenceTransformer``). When omitted, ``'all-MiniLM-L6-v2'`` is
        loaded on each call, which is slow; pass a loaded model to reuse it.

    Returns
    -------
    list of (similarity, bar_name) tuples
        Sorted from most to least similar. Bars without any characteristic
        term are left out.
    """
    bar_characteristics = get_characteristics(data, num_characteristics)

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(word)

    similarities = []
    for bar_name, terms in bar_characteristics.items():
        if len(terms) == 0:
            # Nothing to average; a mean over no vectors would be NaN
            continue
        # Encode all terms of the bar in one batch, then average them into a
        # single vector that represents the bar
        bar_embedding = np.mean(model.encode(list(terms)), axis = 0)
        similarities.append((1 - distance.cosine(query_embedding, bar_embedding), bar_name))

    similarities.sort(key = lambda x: x[0], reverse = True)

    return similarities

#Example: rank every bar against the query "chill", using 20 characteristic terms per bar
print(search_by_keyword("chill", data, 20))


[(0.4552651643753052, 'social'), (0.4385397136211395, 'mama_sake'), (0.43333661556243896, 'commanders_cove'), (0.43310990929603577, 'logies'), (0.4221939444541931, 'tipsy_turtle'), (0.4175388813018799, 'dry_bean'), (0.4102865159511566, 'cedar_lane'), (0.4065041244029999, 'paddock'), (0.4047152101993561, 'chimys'), (0.3980708718299866, 'good_bull_icehouse'), (0.38979920744895935, 'duddleys'), (0.38975247740745544, 'backyard'), (0.3887872099876404, 'spot'), (0.3800738751888275, 'twelve'), (0.3779323101043701, 'corner'), (0.3758172392845154, 'rebel'), (0.36477574706077576, 'dixie_chicken'), (0.36338096857070923, 'icon'), (0.35737472772598267, 'shiner_park'), (0.3511078655719757, 'obannons'), (0.34134411811828613, 'harrys'), (0.3247552216053009, 'rough_draught')]


In [9]:
#PCA for visualization
#Flatten every review embedding to 1-D and stack them into one 2-D matrix (one row per review)
X = np.stack([np.ravel(embedding) for embedding in data['sentence_embeddings']])
#random_state makes the projection reproducible when scikit-learn selects its randomized SVD solver
pca = PCA(n_components=3, random_state=0)
X_pca = pca.fit_transform(X)
df_pca = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2', 'PC3'])
#Bar name of each review, used to colour the points
df_pca['Class'] = data['bar_name'].values


In [10]:
#Readable axis titles for the three principal components
axis_labels = {
    'PC1': 'Principal Component 1',
    'PC2': 'Principal Component 2',
    'PC3': 'Principal Component 3',
}

#One point per review, coloured by the bar it belongs to
fig = px.scatter_3d(
    df_pca,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Class',
    labels=axis_labels,
    title='Interactive 3D Scatter Plot with PCA',
)
fig.show()