Esse notebook visa criar a estrutura de dados necessária para a avaliação das recomendações, sendo os embeddings gerados a partir dos textos em 'reviews' utilizando o BERT.

A abordagem consiste em pegar as 5 melhores avaliações de cada business_id, as 5 melhores avaliações de cada usuário e comparar a similaridade entre a média deles. Assim, esperamos que os lugares mais semelhantes ao gosto do usuário tenham maior similaridade com suas avaliações particulares.

# Carregando dados de eval

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

# loading dataset
eval_set = pd.read_csv('/content/eval_users.csv')

In [3]:
eval_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      1000 non-null   object
 1   user_perfil  1000 non-null   object
 2   gt_reclist   1000 non-null   object
 3   reclist      1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB


In [4]:
eval_set.head()

Unnamed: 0,user_id,user_perfil,gt_reclist,reclist
0,-1BSu2dt_rOAqllw9ZDXtA,5XsC0tB8chKjTIW7mU6TnQ,"['5XsC0tB8chKjTIW7mU6TnQ', 'wn4U347OALm5H0MOBR...","['XTIc2pKNdmmvX60lIHV0OQ', 'GyvtAyCurqFGovXp-t..."
1,-6DoXmdXEy_P5N-QZzntgA,Ifw5wqcChnL4zBigtR7NKA,"['Ifw5wqcChnL4zBigtR7NKA', 'v1GCQz7ZsntWI-GlGP...","['QB0NhiW--2rje9Fr1ek2eA', 'o4IiNbNybcy-L4vzTS..."
2,-8NOuak4Sipn7-zy7Nk5hg,OKPUO8zvBBL-OA6-SfDx8Q,"['OKPUO8zvBBL-OA6-SfDx8Q', 'OHplb2m_dKPXY46mS0...","['M6yUUIE8-incodeeJrMpVQ', 'fw6PlWy2ghCzuUH24p..."
3,-8rSnT5ztVk6vmTDkxTqsQ,VSjoo6kJ9MU4G0cfO_-CRA,"['VSjoo6kJ9MU4G0cfO_-CRA', 'DH-vk-XzWMT9rRLcbB...","['3zK9LTY3TgH7nU18-dnXtA', 'DH-vk-XzWMT9rRLcbB..."
4,-C7xxeVQI5qEZGAzFdx-cg,rXqlpCH6z9rSFNCL76FfLw,"['rXqlpCH6z9rSFNCL76FfLw', 'WY_dcOTyRA-AgksCXi...","['6aDmYbqNKeWn9tynvFQa-w', 'nMHM74eFQuJyS_a7EV..."


# Carregando os dados de embeddings

In [88]:
embs_business = pd.read_parquet('/content/businessEmbeddingsBaseline2.parquet')


In [89]:
embs_business.shape  # são todos os business (unique) que aparecem em eval Dataset

(15416, 3)

In [90]:
embs_business

Unnamed: 0_level_0,user_id,user_embeddings,users_embs
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
--onnLZrsCazmcy2P_7fcw,"[KpqzABNihVy5lLtX6PXfEQ, 61x46vYyVf5vb64lExD2b...","[5.494843244552612, 5.9034247398376465, 4.9828...","[KpqzABNihVy5lLtX6PXfEQ, lvsQfIx3LzlbxW0FVzmEKg]"
--x_BmZbxzK_nx_GHBaRVw,"[AwwyPAfZLuoq62p03gDPNA, gnTquanCmNEDLgaSi3B26...","[4.435993337631226, 4.348855876922608, 3.68807...","[YrlNGnrofZHseVYxrLrvJA, jdCgeInlFHnQ2DhQlRINd..."
-0EdehHjIQc0DtYU8QcAig,"[tJRsHAQUiewvv420lYnJeg, onBtzYEboBIBFKjvVHF15...","[2.7280593633651735, 2.887662434577942, 1.4550...","[3viTApQwBqnuOhQLJvfYSw, OD6BC1vXtnTra-2oo92HB..."
-0fOUV_llBAPMo7exZFHPA,"[GpS8XJnJLRM-O6Mx1DZUkA, ycLfawmWpOQoGbGTsKvOv...","[3.2525815963745117, 3.061083734035492, 1.9082...","[GpS8XJnJLRM-O6Mx1DZUkA, ycLfawmWpOQoGbGTsKvOvA]"
-0gWtMKg8_iV6vC5wRFDiA,"[0E-IU4qDXps1MHQhNZx6vA, amS7pSWPWHHs6MhiIT2YB...","[2.9798367500305174, 2.2399333715438843, 1.384...","[aq7IDp3cXKmq0NMcGBWbfg, hVzBH-uHWXAhH3jZj7D7i..."
...,...,...,...
zyrhpLocbo60EbS57jBTLw,"[0Igx-a1wAstiBDerGxXk2A, RDAhREQWbNLbuO71lK41K...","[3.028983497619629, 2.44912428855896, 1.976438...","[C7a-DnK1EIvpBUkPNapagA, -ajqjNKLQUtthACttISgv..."
zz07slKrNwzX_1uiF8QL6g,"[9WtMswZkXgPP-ak_68uj_w, LztR77cNEmdgUCwPpkp6M...","[3.5806991338729857, 3.3258294105529784, 1.842...","[LztR77cNEmdgUCwPpkp6MA, n-iTQUXN2Y-meUP_YrXGT..."
zz0l4dUf28wzPAaTdGqsSw,"[y1d-b-DIFSv7tLFM1X1z_w, rAgCPqxaQKgd7XQVhePBe...","[2.716435980796814, 2.1899059772491456, 1.4469...","[y1d-b-DIFSv7tLFM1X1z_w, unof6gSLKeSYDALVG7fm8..."
zzQWjZ_1Dr7kkDYlk17qRw,"[OUS4LDOuNudHuY57QdGGGw, yc0qNMY2YUWaESUp3h91d...","[3.34793701171875, 3.0982537984848024, 1.88180...","[sO-xgm8WOQGBD-_Ppm7USw, oFFL4erg_K-InCQYpNVma..."


# Carregando dataframe que relaciona business_id com embeddings + metadados

In [91]:
# criando dataframe que relaciona os business_id com os embeddings:
df_final = pd.read_parquet('/content/drive/MyDrive/yelp_academic_dataset_business.parquet')

In [92]:
df_final

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426678,-119.711197,5.0,7,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551125,-90.335693,3.0,15,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Shipping Centers, Local Services, Notaries, Ma...","{'Friday': '8:0-18:30', 'Monday': '0:0-0:0', '..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880455,3.5,22,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Department Stores, Shopping, Fashion, Home & G...","{'Friday': '8:0-23:0', 'Monday': '8:0-22:0', '..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155563,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338184,-75.471657,4.5,13,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Brewpubs, Breweries, Food","{'Friday': '12:0-22:0', 'Monday': None, 'Satur..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468418,-113.492050,3.0,13,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Nail Salons, Beauty & Spas","{'Friday': '10:0-19:30', 'Monday': '10:0-19:30..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115116,-86.766922,4.0,5,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Friday': '9:30-17:30', 'Monday': '9:30-17:30..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065086,3.5,8,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782352,-89.950562,4.0,24,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Friday': '9:0-20:0', 'Monday': '9:0-20:0', '..."


In [93]:
df_final = df_final[['business_id', 'name', 'categories']]

In [94]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  150346 non-null  object
 1   name         150346 non-null  object
 2   categories   150243 non-null  object
dtypes: object(3)
memory usage: 3.4+ MB


### Filtrando esse dataset para conter apenas os business_id que criamos embeddings + os business_id de preferência (perfil) do usuário

In [95]:
# selecionando os business que possuem embeddings
business_ = embs_business.reset_index().business_id

In [96]:
# selecionando os perfis dos users
users_ = eval_set.user_perfil

In [97]:
filtro = pd.concat([business_, users_])

In [98]:
df_final = df_final[df_final['business_id'].isin(filtro)]

In [46]:
df_final.shape

(16083, 3)

In [99]:
users = eval_set.user_id

In [100]:
df_user = pd.read_parquet('/content/drive/MyDrive/yelp_academic_dataset_user.parquet')

In [101]:
df_user = df_user[df_user['user_id'].isin(users)]

In [102]:
df_user.set_index("user_id",inplace=True)

In [103]:
df_user['business_id'] = df_user.join(eval_set[['user_id','user_perfil']].set_index('user_id'),on='user_id',how='left')['user_perfil']

In [35]:
df_user.head()

Unnamed: 0_level_0,review_count,useful,funny,cool,fans,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,account_age,chato,business_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
jh-yu1Ap3TAwDS1ko6eqhg,3.391762,3.681812,2.754824,3.207543,1.313262,1.0,1.0,1.0,1.0,1.0,1.313262,1.551445,1.313262,1.313262,1.551445,1.0,4557,0.944039,j8JOZvfeHEfUWq3gEz6ABQ
9vLQrTe8uY6NuHF49YLSfw,5.296908,5.970543,5.483627,5.035121,3.078154,2.618729,1.904832,1.551445,1.313262,1.0,2.46115,2.68909,2.618729,2.618729,2.274009,1.0,4737,1.037433,lRbHFOIFuusN2WOR_ypQ_A
rDORGCGHV984OBXwqu2FYQ,6.083858,6.339862,5.960279,5.630201,4.055574,3.166246,2.618729,2.165422,1.743668,1.313262,3.547266,3.247202,3.031017,3.031017,2.754824,2.543041,5649,0.887872,zu4p6IZLSVn2Noto-vcwzw
L4Rlr_iHSW_wnemqmksmKw,2.874597,2.981546,1.0,1.551445,1.904832,1.0,1.0,1.0,1.0,1.0,1.0,1.551445,1.0,1.0,1.0,1.0,4284,0.796715,b8EjtNcEKDbR1ATUaKO7WA
ZaudfTiuSeTX2RiILlfN5g,2.816503,3.078154,2.165422,2.043592,1.551445,1.0,1.0,1.0,1.0,1.0,1.0,1.313262,1.0,1.0,1.0,1.0,4327,0.868009,NFFS6Qow43IRG6dRPgZCzQ


In [104]:
df_user.set_index("business_id",inplace=True)

In [105]:
df_user

Unnamed: 0_level_0,review_count,useful,funny,cool,fans,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,account_age,chato
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
j8JOZvfeHEfUWq3gEz6ABQ,3.391762,3.681812,2.754824,3.207543,1.313262,1.000000,1.000000,1.000000,1.000000,1.000000,1.313262,1.551445,1.313262,1.313262,1.551445,1.000000,4557,0.944039
lRbHFOIFuusN2WOR_ypQ_A,5.296908,5.970543,5.483627,5.035121,3.078154,2.618729,1.904832,1.551445,1.313262,1.000000,2.461150,2.689090,2.618729,2.618729,2.274009,1.000000,4737,1.037433
zu4p6IZLSVn2Noto-vcwzw,6.083858,6.339862,5.960279,5.630201,4.055574,3.166246,2.618729,2.165422,1.743668,1.313262,3.547266,3.247202,3.031017,3.031017,2.754824,2.543041,5649,0.887872
b8EjtNcEKDbR1ATUaKO7WA,2.874597,2.981546,1.000000,1.551445,1.904832,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.551445,1.000000,1.000000,1.000000,1.000000,4284,0.796715
NFFS6Qow43IRG6dRPgZCzQ,2.816503,3.078154,2.165422,2.043592,1.551445,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.313262,1.000000,1.000000,1.000000,1.000000,4327,0.868009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ev8KX9xeLe9fP9y-vV81tQ,2.981546,1.904832,1.000000,1.313262,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.313262,1.313262,1.000000,1.000000,1422,0.815126
bjQrmBSu1A7f5vprEikOKA,2.929501,2.043592,1.000000,1.313262,1.313262,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1181,0.862222
ZRS0jT6vkyiNuhMbpjUahQ,3.207543,2.754824,1.313262,1.313262,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.313262,1.000000,1.000000,1.000000,1.000000,1342,1.182927
CTzC5vC7ZAz-sdKhIgj5aw,3.487934,2.043592,1.000000,1.904832,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.313262,1.000000,1.313262,1.313262,1.000000,1.000000,2760,0.820296


In [106]:
df_final.head()

Unnamed: 0,business_id,name,categories
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,..."
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,"Sushi Bars, Restaurants, Japanese"
18,8wGISYjYkE2tSqn3cDMu8A,Nifty Car Rental,"Automotive, Car Rental, Hotels & Travel, Truck..."
19,ROeacJQwBeh05Rqg7F6TCg,BAP,"Korean, Restaurants"
21,qhDdDeI3K4jy2KyzwFN53w,Barnes & Noble Booksellers,"Shopping, Books, Mags, Music & Video, Bookstores"


In [29]:
df_user.shape

(1000, 18)

In [54]:
df_final.shape

(16083, 3)

> Já tínhamos levado em consideração no dataset de business os business de preferência do usuário O.o
> Nesse caso, basta apenas substituirmos os embeddings dos business de perfil pelo dos usuários!

# Colocando os embeddings

In [107]:
# selecionando apenas os embs_best:
embs_business = embs_business['user_embeddings']

In [108]:
# unindo o dataframe final com o de embeddings
df_final = df_final.join(embs_business, on='business_id', how='left')

In [60]:
df_final.to_parquet('/content/EmbsBusinessMeta.parquet')

In [109]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16083 entries, 9 to 150326
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_id      16083 non-null  object
 1   name             16083 non-null  object
 2   categories       16074 non-null  object
 3   user_embeddings  15416 non-null  object
dtypes: object(4)
memory usage: 628.2+ KB


In [110]:
df_user['embs'] = [x for x in df_user.to_numpy()]

In [111]:
df_user['embs']

business_id
j8JOZvfeHEfUWq3gEz6ABQ    [3.3917624950408936, 3.681811571121216, 2.7548...
lRbHFOIFuusN2WOR_ypQ_A    [5.296907901763916, 5.970542907714844, 5.48362...
zu4p6IZLSVn2Noto-vcwzw    [6.083857536315918, 6.33986234664917, 5.960278...
b8EjtNcEKDbR1ATUaKO7WA    [2.8745970726013184, 2.98154616355896, 1.0, 1....
NFFS6Qow43IRG6dRPgZCzQ    [2.816502809524536, 3.0781543254852295, 2.1654...
                                                ...                        
ev8KX9xeLe9fP9y-vV81tQ    [2.98154616355896, 1.9048324823379517, 1.0, 1....
bjQrmBSu1A7f5vprEikOKA    [2.9295005798339844, 2.0435917377471924, 1.0, ...
ZRS0jT6vkyiNuhMbpjUahQ    [3.207543134689331, 2.75482439994812, 1.313261...
CTzC5vC7ZAz-sdKhIgj5aw    [3.487934112548828, 2.0435917377471924, 1.0, 1...
55ve4cBxbUH0r-EcWnBrCQ    [3.9649622440338135, 2.165422201156616, 1.3132...
Name: embs, Length: 1000, dtype: object

In [112]:
df_final = df_final.join(df_user['embs'], on='business_id', how='left')

In [113]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16197 entries, 9 to 150326
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_id      16197 non-null  object
 1   name             16197 non-null  object
 2   categories       16188 non-null  object
 3   user_embeddings  15481 non-null  object
 4   embs             1000 non-null   object
dtypes: object(5)
memory usage: 759.2+ KB


In [114]:
df_final[df_final.user_embeddings.isna()]

Unnamed: 0,business_id,name,categories,user_embeddings,embs
75,aCDY7vXYMs54EbYuQScsnQ,39 North Taproom & Grill,"American (Traditional), Sports Bars, Restauran...",,"[3.0310165882110596, 2.6890904903411865, 1.743..."
79,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,"Burgers, Sports Bars, Bars, Lounges, Restauran...",,"[3.166246175765991, 2.61872935295105, 1.313261..."
126,anLQj9AM8vjbcLSIE0iUgg,Papa Murphy's,"Pizza, Restaurants",,"[2.8745970726013184, 3.9649622440338135, 3.357..."
141,SZU9c8V2GuREDN5KgyHFJw,Santa Barbara Shellfish Company,"Live/Raw Food, Restaurants, Seafood, Beer Bar,...",,"[5.83690881729126, 6.086134433746338, 5.073411..."
141,SZU9c8V2GuREDN5KgyHFJw,Santa Barbara Shellfish Company,"Live/Raw Food, Restaurants, Seafood, Beer Bar,...",,"[3.487934112548828, 3.6301448345184326, 2.1654..."
...,...,...,...,...,...
63083,UaDnEIiRNCyLvrQWE4g0sg,Square Pie,"Italian, Pizza, Restaurants",,"[3.0781543254852295, 2.2740087509155273, 1.0, ..."
66085,eIKjxqdMduIeIqzPg7yc-Q,Full Belly Deli - Mill Street,"Sandwiches, Delis, Food, Food Trucks, Restaurants",,"[3.166246175765991, 1.9048324823379517, 1.0, 2..."
70852,CTzC5vC7ZAz-sdKhIgj5aw,Aripeka Stone Crab Company,"Food, Seafood Markets, Boating, Food Stands, S...",,"[3.487934112548828, 2.0435917377471924, 1.0, 1..."
81830,rI8yDXsFbVfc-rw3po5XFA,Magic Pot,"Restaurants, Asian Fusion, Hot Pot, Tapas/Smal...",,"[2.61872935295105, 2.9295005798339844, 1.0, 1...."


In [115]:
# Substitui os valores nulos em user_embeddings pelos valores de embs
df_final['user_embeddings'].fillna(df_final['embs'], inplace=True)


In [116]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16197 entries, 9 to 150326
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_id      16197 non-null  object
 1   name             16197 non-null  object
 2   categories       16188 non-null  object
 3   user_embeddings  16197 non-null  object
 4   embs             1000 non-null   object
dtypes: object(5)
memory usage: 759.2+ KB


In [117]:
df_final = df_final.drop(columns='embs')

In [118]:
df_final.head()

Unnamed: 0,business_id,name,categories,user_embeddings
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","[3.2906478881835937, 2.9965803623199463, 1.990..."
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,"Sushi Bars, Restaurants, Japanese","[3.4442252635955812, 3.4557546615600585, 1.963..."
18,8wGISYjYkE2tSqn3cDMu8A,Nifty Car Rental,"Automotive, Car Rental, Hotels & Travel, Truck...","[2.486134099960327, 2.0726041555404664, 1.3996..."
19,ROeacJQwBeh05Rqg7F6TCg,BAP,"Korean, Restaurants","[4.213016676902771, 5.026243495941162, 4.25037..."
21,qhDdDeI3K4jy2KyzwFN53w,Barnes & Noble Booksellers,"Shopping, Books, Mags, Music & Video, Bookstores","[5.889504432678223, 6.3557953357696535, 4.2729..."


In [119]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16197 entries, 9 to 150326
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_id      16197 non-null  object
 1   name             16197 non-null  object
 2   categories       16188 non-null  object
 3   user_embeddings  16197 non-null  object
dtypes: object(4)
memory usage: 632.7+ KB


In [120]:
df_final.user_embeddings.apply(len).value_counts()

18    16197
Name: user_embeddings, dtype: int64

# Primeira tentativa: levando em consideração somente os embeddings formados pelos reviews dos business (excluindo o perfil calculado com foco nos reviews individuais dos users)

In [121]:
# gerando os embeddings no formato desejado
import os
def export_dataset(df: pd.DataFrame, emb_column: str, output_file: str):
    """
    Export the embeddings to a csv file.
    """
    if not os.path.exists(output_file):
        os.makedirs(output_file)

    np.savetxt(output_file+'/embeddings.txt', np.stack(df[emb_column]), delimiter='\t')
    df.drop(emb_column, axis=1).to_csv(output_file+"/metadados.csv", sep="\t", index=False)

In [122]:
export_dataset(df_final, 'user_embeddings', '/content/FirstAttempt')

## Calculando resultados

In [123]:
!pip install nmslib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nmslib
  Downloading nmslib-2.1.1-cp38-cp38-manylinux2010_x86_64.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
Collecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 KB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [126]:
!python ../content/evaluation.py ../content/FirstAttempt/embeddings.txt ../content/FirstAttempt/metadados.csv

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************
*              business_id  ...                                         categories
0  bBDDEgkFA1Otx9Lfe7BZUQ  ...  Ice Cream & Frozen Yogurt, Fast Food, Burgers,...
1  MUTTqe8uqyMdBl186RmNeA  ...                  Sushi Bars, Restaurants, Japanese
2  8wGISYjYkE2tSqn3cDMu8A  ...  Automotive, Car Rental, Hotels & Travel, Truck...
3  ROeacJQwBeh05Rqg7F6TCg  ...                                Korean, Restaurants
4  qhDdDeI3K4jy2KyzwFN53w  ...   Shopping, Books, Mags, Music & Video