In [1]:
import pandas as pd
import numpy as np

# Importando dados

In [2]:
# Load the evaluation table from CSV; its structure is inspected in the following cells.
eval_df = pd.read_csv('../data/evaluation/eval_users.csv')

In [3]:
eval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      1000 non-null   object
 1   user_perfil  1000 non-null   object
 2   gt_reclist   1000 non-null   object
 3   reclist      1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB


In [4]:
eval_df.head()

Unnamed: 0,user_id,user_perfil,gt_reclist,reclist
0,-1BSu2dt_rOAqllw9ZDXtA,5XsC0tB8chKjTIW7mU6TnQ,"['5XsC0tB8chKjTIW7mU6TnQ', 'wn4U347OALm5H0MOBR...","['XTIc2pKNdmmvX60lIHV0OQ', 'GyvtAyCurqFGovXp-t..."
1,-6DoXmdXEy_P5N-QZzntgA,Ifw5wqcChnL4zBigtR7NKA,"['Ifw5wqcChnL4zBigtR7NKA', 'v1GCQz7ZsntWI-GlGP...","['QB0NhiW--2rje9Fr1ek2eA', 'o4IiNbNybcy-L4vzTS..."
2,-8NOuak4Sipn7-zy7Nk5hg,OKPUO8zvBBL-OA6-SfDx8Q,"['OKPUO8zvBBL-OA6-SfDx8Q', 'OHplb2m_dKPXY46mS0...","['M6yUUIE8-incodeeJrMpVQ', 'fw6PlWy2ghCzuUH24p..."
3,-8rSnT5ztVk6vmTDkxTqsQ,VSjoo6kJ9MU4G0cfO_-CRA,"['VSjoo6kJ9MU4G0cfO_-CRA', 'DH-vk-XzWMT9rRLcbB...","['3zK9LTY3TgH7nU18-dnXtA', 'DH-vk-XzWMT9rRLcbB..."
4,-C7xxeVQI5qEZGAzFdx-cg,rXqlpCH6z9rSFNCL76FfLw,"['rXqlpCH6z9rSFNCL76FfLw', 'WY_dcOTyRA-AgksCXi...","['6aDmYbqNKeWn9tynvFQa-w', 'nMHM74eFQuJyS_a7EV..."


> Sabemos que:
- user_id: é o id do usuário
- user_perfil: é o id do business que a pessoa melhor avaliou
- gt_reclist: business que a pessoa avaliou com 4 ou 5 estrelas
- reclist: business selecionados aleatoriamente + os business de gt_reclist -> devemos ordenar essa lista.

-> A ideia é criarmos os embeddings somente para os business que estão em gt_reclist e reclist e para os usuários selecionados.

# Separando os usuários que estão no dataset

In [5]:
# Distinct user ids of the evaluation set, in order of first appearance.
users = pd.unique(eval_df['user_id'])

In [6]:
users.shape

(1000,)

-> Para esses usuários, vamos gerar embeddings de suas avaliações e de seus perfis.

In [7]:
# Load the user table from the parquet file in the DatasetsLimpos folder.
users_df = pd.read_parquet('../data/DatasetsLimpos/yelp_academic_dataset_user.parquet')

In [8]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 19 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   review_count        float32
 2   useful              float32
 3   funny               float32
 4   cool                float32
 5   fans                float32
 6   compliment_hot      float32
 7   compliment_more     float32
 8   compliment_profile  float32
 9   compliment_cute     float32
 10  compliment_list     float32
 11  compliment_note     float32
 12  compliment_plain    float32
 13  compliment_cool     float32
 14  compliment_funny    float32
 15  compliment_writer   float32
 16  compliment_photos   float32
 17  account_age         int64  
 18  chato               float32
dtypes: float32(17), int64(1), object(1)
memory usage: 159.2+ MB


### Gerando os embeddings de seus reviews

In [9]:
# Load the review table from the parquet file in the DatasetsLimpos folder.
df_reviews = pd.read_parquet('../data/DatasetsLimpos/yelp_academic_dataset_review.parquet')

In [10]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user_id      object
 1   business_id  object
 2   stars        uint8 
 3   useful       int32 
 4   funny        int32 
 5   cool         int32 
 6   text         object
dtypes: int32(3), object(3), uint8(1)
memory usage: 246.7+ MB


In [11]:
# Keep only the reviews written by the selected users.
df_reviews = df_reviews.loc[
    df_reviews['user_id'].isin(users)
]

In [12]:
df_reviews.shape

(13758, 7)

In [13]:
df_reviews.user_id.nunique()

1000

In [14]:
# One row per user: the 'text' column holds the list of that user's review texts.
df_reviews_text = (
    df_reviews
    .groupby('user_id')['text']
    .agg(list)
    .to_frame()
)

In [15]:
df_reviews_text.head()

Unnamed: 0_level_0,text
user_id,Unnamed: 1_level_1
-1BSu2dt_rOAqllw9ZDXtA,[Hank and I love Brocatos!The freshest ingredi...
-6DoXmdXEy_P5N-QZzntgA,[We stopped in for breakfast burritos one morn...
-8NOuak4Sipn7-zy7Nk5hg,[One of Philadelphia's best restaurants in my ...
-8rSnT5ztVk6vmTDkxTqsQ,[PROS\nYelp deal - use BOGO 50% entree\nOxtail...
-C7xxeVQI5qEZGAzFdx-cg,[This place is the best! Their food isn't spic...


In [16]:
# Number of reviews per user (length of each list in 'text').
df_reviews_text['qtd_reviews'] = df_reviews_text['text'].map(len)

In [17]:
df_reviews_text.qtd_reviews.describe()

count    1000.000000
mean       13.758000
std         4.122673
min        10.000000
25%        11.000000
50%        12.000000
75%        15.000000
max        35.000000
Name: qtd_reviews, dtype: float64

> Pelo menos 75% dos usuários selecionados têm até 15 reviews (mínimo de 10, máximo de 35). Ainda assim, é uma quantidade razoável para termos uma boa noção do comportamento de cada um.

### Gerando os embeddings com o BERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import time


# Tokenizer and model are loaded from the same pretrained checkpoint; the model
# is configured to return the hidden states of every layer.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

In [None]:
def get_bert_embedding(text_list, batch_size=15, max_length=512, verbose=True):
    """Embed each text with BERT and return the embeddings grouped by batch.

    Each text is tokenized with the module-level ``tokenizer``, truncated to
    ``max_length`` tokens (``[CLS]`` and ``[SEP]`` included), padded to the
    longest sequence of its batch and run through the module-level ``model``.
    The hidden states of the last four layers are summed and then averaged
    over the real (non-padding) tokens of each text.

    Parameters
    ----------
    text_list : sequence of str
        Texts to embed. The sequence and its items are never modified.
    batch_size : int, default 15
        Maximum number of texts sent through the model in one forward pass.
    max_length : int, default 512
        Maximum sequence length in tokens, special tokens included. The
        default is the limit the original implementation assumed.
    verbose : bool, default True
        When true, print the time spent on each batch.

    Returns
    -------
    list of numpy.ndarray
        One array per batch, in input order, of shape
        ``(texts_in_batch, hidden_size)``. A batch holding a single text
        yields a 1-D ``(hidden_size,)`` array because of ``squeeze(0)``; this
        is kept so existing callers see the same shapes as before. An empty
        ``text_list`` returns an empty list.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or ``max_length`` is smaller
        than 2 (there must be room for ``[CLS]`` and ``[SEP]``).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if max_length < 2:
        raise ValueError("max_length must be >= 2")

    # Work on a copy so the caller's list (e.g. a DataFrame cell) is untouched.
    texts = list(text_list)

    # Inference only: set evaluation mode once instead of once per batch.
    model.eval()

    embs = []
    for first in range(0, len(texts), batch_size):
        start = time.time()
        batch = texts[first:first + batch_size]

        # Truncate on tokens, not characters, leaving room for [CLS] and [SEP].
        token_rows = [
            ["[CLS]"] + tokenizer.tokenize(text)[:max_length - 2] + ["[SEP]"]
            for text in batch
        ]
        longest = max(len(row) for row in token_rows)

        # Pad only up to the longest row of the batch; the attention mask
        # marks which positions are real tokens (1) and which are padding (0).
        input_ids = []
        attention_mask = []
        for row in token_rows:
            n_pad = longest - len(row)
            input_ids.append(tokenizer.convert_tokens_to_ids(row + ["[PAD]"] * n_pad))
            attention_mask.append([1] * len(row) + [0] * n_pad)
        input_ids = torch.tensor(input_ids)
        attention_mask = torch.tensor(attention_mask)

        with torch.no_grad():
            # Index 2 of the model output holds the hidden states of all layers.
            hidden_states = model(input_ids, attention_mask=attention_mask)[2]
            # Sum of the last four layers -> (batch, seq, hidden).
            summed = torch.stack(tuple(hidden_states[-4:]), dim=0).sum(dim=0)
            # Mean over real tokens only, so padding does not dilute the vector.
            mask = attention_mask.unsqueeze(-1).to(summed.dtype)
            pooled = (summed * mask).sum(dim=1) / mask.sum(dim=1)

        embs.append(pooled.squeeze(0).numpy())
        if verbose:
            print("Time to get embedding: ", time.time() - start)
    return embs

In [None]:
# Compute the BERT embeddings of every user's reviews and store them per row.
df_reviews_text['bert_embedding'] = df_reviews_text['text'].map(get_bert_embedding)