In [1]:
import pandas as pd 
import numpy as np
from gensim.models import Word2Vec as w2v
import matplotlib.pyplot as plt
from numpy.linalg import norm
from numpy import dot
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from math import radians, cos, sin, asin, sqrt
import joblib

In [2]:
# Location of the North Dakota business-metadata file (JSON Lines).
meta_path = "data/meta-North_Dakota.json"

In [3]:
# Each line of the file is one JSON record (lines=True), i.e. one business per row.
meta_df = pd.read_json(path_or_buf=meta_path, lines=True)

In [4]:
# (rows, columns) of the freshly loaded metadata frame.
meta_df.shape

(11987, 15)

In [5]:
# Preview the first five businesses to inspect the available columns.
meta_df.head(5)

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,St Peter & Paul Church,"St Peter & Paul Church, 500 Main St, Karlsruhe...",0x52d94fbefa0e6353:0xf709e2d8674fe3a,,48.093248,-100.618664,[Catholic church],4.9,7,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x52d9384b75abac93:0x13526f8266cae6cf, 0x52d9...",https://www.google.com/maps/place//data=!4m2!3...
1,Northwest Martial Arts Academy,"Northwest Martial Arts Academy, 1430 Main Ave,...",0x52c8cbe775edec7d:0xb46e15ed33643070,,46.875093,-96.802717,[Martial arts school],5.0,8,,"[[Thursday, 7:30AM–8PM], [Friday, 7:30AM–8PM],...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 7:30AM,"[0x52c8ccbcb1785327:0x2d50311eabd7afc, 0x52cf3...",https://www.google.com/maps/place//data=!4m2!3...
2,Thad's Amazing Magic - Fargo Birthday Party Magic,Thad's Amazing Magic - Fargo Birthday Party Ma...,0x52c8cd270f50bbbb:0x4ee4629598a8090e,,46.812415,-96.856729,"[Magician, Children's party service]",5.0,58,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Amenities': ['Good for kids']},Open 24 hours,"[0x52c8c9613725e9ef:0xc628b86d8593e7e6, 0x52c8...",https://www.google.com/maps/place//data=!4m2!3...
3,Threefold,"Threefold, 212 W Main Ave, Bismarck, ND 58501",0x52d7836b7314da5d:0xc3cc63667b8c13a0,,46.805707,-100.79299,"[Film production company, Video editing servic...",5.0,5,,"[[Wednesday, 9AM–6PM], [Thursday, 9AM–6PM], [F...",,Closed ⋅ Opens 9AM Thu,"[0x52d7836c2b519b77:0x74c84187e38f42b, 0x52d78...",https://www.google.com/maps/place//data=!4m2!3...
4,Gray Brothers Dairy,"Gray Brothers Dairy, 408 N Main St, Stanley, N...",0x5320bcc09c8e6f15:0xc888ebee3ea483b6,,48.324312,-102.39,,5.0,1,,,,,"[0x5320bcc63e8fe69d:0x4f22ad0dd39b1970, 0x5320...",https://www.google.com/maps/place//data=!4m2!3...


In [6]:
# Drop businesses that have not specified a category.
# The explicit .copy() makes meta_df an independent frame rather than a
# selection of the loaded one, so adding columns to it later does not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
meta_df = meta_df[meta_df['category'].notna()].copy()

In [7]:
# (rows, columns) after removing the businesses without a category.
meta_df.shape

(11930, 15)

In [8]:
# Collect every distinct category across all businesses, in order of first
# appearance. dict.fromkeys de-duplicates with constant-time lookups (a list
# membership test rescans the whole list for every category) and preserves
# insertion order, so the resulting list is the same as with a manual loop.
category_list = list(dict.fromkeys(
    category
    for business_categories in meta_df['category']
    for category in business_categories
))

len(category_list)

1960

In [9]:
# Normalise every category name (lower-case, tokenise, drop stop words and
# non-alphanumeric tokens, stem) and fit a TF-IDF model on the results.
sw = stopwords.words('english')
# Set copy of the stop words: membership is checked once per token, and a set
# lookup avoids scanning the whole list each time. `sw` itself stays a list.
stopword_set = frozenset(sw)
ps = PorterStemmer()

tokenized_corpus = []
for category in category_list:
    kept_tokens = (
        token
        for token in word_tokenize(category.lower())
        if token.isalnum() and token not in stopword_set
    )
    # One space-joined string of stemmed tokens per category, in category_list order.
    tokenized_corpus.append(' '.join(ps.stem(token) for token in kept_tokens))

vectorizer = TfidfVectorizer()
# Row i of tfidf_matrix is the TF-IDF vector of category_list[i].
tfidf_matrix = vectorizer.fit_transform(tokenized_corpus)

In [10]:
# Saving the fitted vectorizer so it can later be used to embed user inputs
# joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

In [11]:
# Build a DataFrame of the TF-IDF embeddings (one row per category, one column
# per stemmed term) so each category's vector can be looked up by name when
# the embeddings are attached to the business dataframe.
feature_names = vectorizer.get_feature_names_out()
corpus_index = list(category_list)
# tfidf_matrix rows are already in category_list order, so the frame is built
# directly with no transpose round-trip. toarray() returns a plain ndarray,
# whereas todense() returns the legacy np.matrix type.
category_embeddings_df = pd.DataFrame(
    tfidf_matrix.toarray(), index=corpus_index, columns=feature_names
)
category_embeddings_df.head(5)

Unnamed: 0,abbey,abort,abras,abus,academi,accessori,accommod,account,acoust,activ,...,workshop,worship,wrecker,yamaha,yard,yarn,yoga,yogurt,youth,zoo
Catholic church,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Martial arts school,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Magician,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Children's party service,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Film production company,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Attach to every business the TF-IDF embedding of each of its categories
# (a list with one embedding vector per category).
# Each distinct category row is converted to a Python list once up front,
# instead of a .loc lookup plus .tolist() for every (business, category) pair.
embedding_by_category = dict(zip(
    category_embeddings_df.index,
    category_embeddings_df.to_numpy().tolist(),
))
category_embeddings = [
    # list(...) hands every business its own copy, so no two rows share a list object.
    [list(embedding_by_category[category]) for category in business_categories]
    for business_categories in meta_df['category']
]
meta_df['category_embeddings'] = category_embeddings

In [13]:
# Preview the frame again to confirm the new category_embeddings column.
meta_df.head(5)

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url,category_embeddings
0,St Peter & Paul Church,"St Peter & Paul Church, 500 Main St, Karlsruhe...",0x52d94fbefa0e6353:0xf709e2d8674fe3a,,48.093248,-100.618664,[Catholic church],4.9,7,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x52d9384b75abac93:0x13526f8266cae6cf, 0x52d9...",https://www.google.com/maps/place//data=!4m2!3...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,Northwest Martial Arts Academy,"Northwest Martial Arts Academy, 1430 Main Ave,...",0x52c8cbe775edec7d:0xb46e15ed33643070,,46.875093,-96.802717,[Martial arts school],5.0,8,,"[[Thursday, 7:30AM–8PM], [Friday, 7:30AM–8PM],...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 7:30AM,"[0x52c8ccbcb1785327:0x2d50311eabd7afc, 0x52cf3...",https://www.google.com/maps/place//data=!4m2!3...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,Thad's Amazing Magic - Fargo Birthday Party Magic,Thad's Amazing Magic - Fargo Birthday Party Ma...,0x52c8cd270f50bbbb:0x4ee4629598a8090e,,46.812415,-96.856729,"[Magician, Children's party service]",5.0,58,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Amenities': ['Good for kids']},Open 24 hours,"[0x52c8c9613725e9ef:0xc628b86d8593e7e6, 0x52c8...",https://www.google.com/maps/place//data=!4m2!3...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,Threefold,"Threefold, 212 W Main Ave, Bismarck, ND 58501",0x52d7836b7314da5d:0xc3cc63667b8c13a0,,46.805707,-100.79299,"[Film production company, Video editing servic...",5.0,5,,"[[Wednesday, 9AM–6PM], [Thursday, 9AM–6PM], [F...",,Closed ⋅ Opens 9AM Thu,"[0x52d7836c2b519b77:0x74c84187e38f42b, 0x52d78...",https://www.google.com/maps/place//data=!4m2!3...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,Our Saviors Free Lutheran Church,"Our Saviors Free Lutheran Church, 602 8th Ave ...",0x5320bcb8b7859d53:0xa0382f2f3c0e8934,,48.310155,-102.380514,[Lutheran church],4.5,4,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x5320bcbff124f7a9:0x5fdfa67d74a4c5a, 0x5320b...",https://www.google.com/maps/place//data=!4m2!3...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [14]:
# meta_df.to_json('data/business_with_embeddings.json')