In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import torch
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertModel, BertTokenizer


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_json(r'D:\Study\Project\RecSys\CP-02\meta-North_Dakota.json', lines=True)

In [4]:
df.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,St Peter & Paul Church,"St Peter & Paul Church, 500 Main St, Karlsruhe...",0x52d94fbefa0e6353:0xf709e2d8674fe3a,,48.093248,-100.618664,[Catholic church],4.9,7,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x52d9384b75abac93:0x13526f8266cae6cf, 0x52d9...",https://www.google.com/maps/place//data=!4m2!3...
1,Northwest Martial Arts Academy,"Northwest Martial Arts Academy, 1430 Main Ave,...",0x52c8cbe775edec7d:0xb46e15ed33643070,,46.875093,-96.802717,[Martial arts school],5.0,8,,"[[Thursday, 7:30AM–8PM], [Friday, 7:30AM–8PM],...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 7:30AM,"[0x52c8ccbcb1785327:0x2d50311eabd7afc, 0x52cf3...",https://www.google.com/maps/place//data=!4m2!3...
2,Thad's Amazing Magic - Fargo Birthday Party Magic,Thad's Amazing Magic - Fargo Birthday Party Ma...,0x52c8cd270f50bbbb:0x4ee4629598a8090e,,46.812415,-96.856729,"[Magician, Children's party service]",5.0,58,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Amenities': ['Good for kids']},Open 24 hours,"[0x52c8c9613725e9ef:0xc628b86d8593e7e6, 0x52c8...",https://www.google.com/maps/place//data=!4m2!3...
3,Threefold,"Threefold, 212 W Main Ave, Bismarck, ND 58501",0x52d7836b7314da5d:0xc3cc63667b8c13a0,,46.805707,-100.79299,"[Film production company, Video editing servic...",5.0,5,,"[[Wednesday, 9AM–6PM], [Thursday, 9AM–6PM], [F...",,Closed ⋅ Opens 9AM Thu,"[0x52d7836c2b519b77:0x74c84187e38f42b, 0x52d78...",https://www.google.com/maps/place//data=!4m2!3...
4,Gray Brothers Dairy,"Gray Brothers Dairy, 408 N Main St, Stanley, N...",0x5320bcc09c8e6f15:0xc888ebee3ea483b6,,48.324312,-102.39,,5.0,1,,,,,"[0x5320bcc63e8fe69d:0x4f22ad0dd39b1970, 0x5320...",https://www.google.com/maps/place//data=!4m2!3...


In [5]:
df.shape

(11987, 15)

In [6]:
df.columns

Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')

In [7]:
df['category'].unique

<bound method Series.unique of 0                                        [Catholic church]
1                                    [Martial arts school]
2                     [Magician, Children's party service]
3        [Film production company, Video editing servic...
4                                                     None
                               ...                        
11982                                         [Campground]
11983    [Hotel, Indoor lodging, Meeting planning servi...
11984                                        [Gas station]
11985    [American restaurant, Bar & grill, Brewpub, Ha...
11986                            [Golf course, Campground]
Name: category, Length: 11987, dtype: object>

In [8]:
df_categories = pd.read_csv(r'D:\Study\Project\RecSys\CP-02\unique_categories.xlsx.csv')

In [9]:
df_categories.shape

(1960, 1)

In [10]:
df_categories.head()

Unnamed: 0,category
0,Catholic church
1,Martial arts school
2,Magician
3,Children's party service
4,Film production company


In [11]:
from transformers import BertModel, BertTokenizer

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)


In [16]:
embeddings = []

for text in df_categories["category"]:
    # Tokenize the text and truncate/pad to the specified maximum length
    tokens = tokenizer(text, truncation=True, padding=True,max_length=128, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**tokens)

    # Take the mean of the last hidden state along the sequence length
    embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())

df_categories["category_embeddings"] = embeddings

In [17]:
df_categories.to_json("Category_embeddings.json", index=False)


In [12]:
df_categories_embeddings = pd.read_json('Category_embeddings.json')

In [15]:
len(df_categories_embeddings.iloc[0]['category_embeddings'])

768

In [16]:
df['category_embeddings'] = pd.Series(dtype=object)


In [17]:
df_exploded = df.explode('category').reset_index(drop=True)


In [18]:
df_exploded.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url,category_embeddings
0,St Peter & Paul Church,"St Peter & Paul Church, 500 Main St, Karlsruhe...",0x52d94fbefa0e6353:0xf709e2d8674fe3a,,48.093248,-100.618664,Catholic church,4.9,7,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x52d9384b75abac93:0x13526f8266cae6cf, 0x52d9...",https://www.google.com/maps/place//data=!4m2!3...,
1,Northwest Martial Arts Academy,"Northwest Martial Arts Academy, 1430 Main Ave,...",0x52c8cbe775edec7d:0xb46e15ed33643070,,46.875093,-96.802717,Martial arts school,5.0,8,,"[[Thursday, 7:30AM–8PM], [Friday, 7:30AM–8PM],...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 7:30AM,"[0x52c8ccbcb1785327:0x2d50311eabd7afc, 0x52cf3...",https://www.google.com/maps/place//data=!4m2!3...,
2,Thad's Amazing Magic - Fargo Birthday Party Magic,Thad's Amazing Magic - Fargo Birthday Party Ma...,0x52c8cd270f50bbbb:0x4ee4629598a8090e,,46.812415,-96.856729,Magician,5.0,58,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Amenities': ['Good for kids']},Open 24 hours,"[0x52c8c9613725e9ef:0xc628b86d8593e7e6, 0x52c8...",https://www.google.com/maps/place//data=!4m2!3...,
3,Thad's Amazing Magic - Fargo Birthday Party Magic,Thad's Amazing Magic - Fargo Birthday Party Ma...,0x52c8cd270f50bbbb:0x4ee4629598a8090e,,46.812415,-96.856729,Children's party service,5.0,58,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Amenities': ['Good for kids']},Open 24 hours,"[0x52c8c9613725e9ef:0xc628b86d8593e7e6, 0x52c8...",https://www.google.com/maps/place//data=!4m2!3...,
4,Threefold,"Threefold, 212 W Main Ave, Bismarck, ND 58501",0x52d7836b7314da5d:0xc3cc63667b8c13a0,,46.805707,-100.79299,Film production company,5.0,5,,"[[Wednesday, 9AM–6PM], [Thursday, 9AM–6PM], [F...",,Closed ⋅ Opens 9AM Thu,"[0x52d7836c2b519b77:0x74c84187e38f42b, 0x52d78...",https://www.google.com/maps/place//data=!4m2!3...,


In [None]:

for count1,i in enumerate(range(len(df_categories_embeddings))):
    for count,j in enumerate(range(len(df_exploded))):
        if df_exploded['category'].iloc[j] == df_categories_embeddings['category'].iloc[i]:
            df_exploded['category_embeddings'].loc[j] = df_categories_embeddings['category_embeddings'].iloc[i]
            print("===============", count)
    print(count1)


# for i in range(len(df_categories)):
#     current_category = df_categories['category'].iloc[i]
    
#     # Use apply to check if current_category is in the 'category' column (either a list or a single value)
#     mask = df['category'].apply(lambda x: current_category in x if isinstance(x, list) else x == current_category)
    
#     # Update the 'category_embeddings' column where the condition is met
#     df.loc[mask.values, 'category_embeddings'] = df_categories['category_embeddings'].iloc[i]

# print(df)


In [23]:
df.shape

(11987, 16)

In [21]:
df_exploded.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url,category_embeddings
0,St Peter & Paul Church,"St Peter & Paul Church, 500 Main St, Karlsruhe...",0x52d94fbefa0e6353:0xf709e2d8674fe3a,,48.093248,-100.618664,Catholic church,4.9,7,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x52d9384b75abac93:0x13526f8266cae6cf, 0x52d9...",https://www.google.com/maps/place//data=!4m2!3...,"[0.2593699098, 0.0440241471, -0.35110336540000..."
1,Northwest Martial Arts Academy,"Northwest Martial Arts Academy, 1430 Main Ave,...",0x52c8cbe775edec7d:0xb46e15ed33643070,,46.875093,-96.802717,Martial arts school,5.0,8,,"[[Thursday, 7:30AM–8PM], [Friday, 7:30AM–8PM],...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 7:30AM,"[0x52c8ccbcb1785327:0x2d50311eabd7afc, 0x52cf3...",https://www.google.com/maps/place//data=!4m2!3...,"[0.0723896474, -0.1581346691, -0.2116216719, -..."
2,Thad's Amazing Magic - Fargo Birthday Party Magic,Thad's Amazing Magic - Fargo Birthday Party Ma...,0x52c8cd270f50bbbb:0x4ee4629598a8090e,,46.812415,-96.856729,Magician,5.0,58,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Amenities': ['Good for kids']},Open 24 hours,"[0x52c8c9613725e9ef:0xc628b86d8593e7e6, 0x52c8...",https://www.google.com/maps/place//data=!4m2!3...,"[0.0990275964, 0.12004911900000001, -0.3748326..."
3,Thad's Amazing Magic - Fargo Birthday Party Magic,Thad's Amazing Magic - Fargo Birthday Party Ma...,0x52c8cd270f50bbbb:0x4ee4629598a8090e,,46.812415,-96.856729,Children's party service,5.0,58,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Amenities': ['Good for kids']},Open 24 hours,"[0x52c8c9613725e9ef:0xc628b86d8593e7e6, 0x52c8...",https://www.google.com/maps/place//data=!4m2!3...,"[-0.1832211912, -0.401704371, -0.1263380498, -..."
4,Threefold,"Threefold, 212 W Main Ave, Bismarck, ND 58501",0x52d7836b7314da5d:0xc3cc63667b8c13a0,,46.805707,-100.79299,Film production company,5.0,5,,"[[Wednesday, 9AM–6PM], [Thursday, 9AM–6PM], [F...",,Closed ⋅ Opens 9AM Thu,"[0x52d7836c2b519b77:0x74c84187e38f42b, 0x52d78...",https://www.google.com/maps/place//data=!4m2!3...,"[0.3351577818, 0.23361778260000002, -0.4228747..."


In [22]:
df_exploded.to_json('All_Category_embeddings.json')