In [2]:
import pandas as pd
import numpy as np

In [3]:
df_business = pd.read_parquet('../data/yelp_dataset/yelp_academic_dataset_business.parquet')

# Alterações no df_business

In [4]:
# drop do name
df_business.drop(columns=['name'], inplace=True)

In [5]:
# drop do address, city e state
df_business.drop(columns=['address', 'city', 'state'], inplace=True)

In [6]:
# drop do postal_code
df_business.drop(columns=['postal_code'], inplace=True)

In [7]:
# ficar esperto com is_open, pois os lugares fechados podem ser úteis para treinar aqui, mas na hora de recomendar devem ser deixados de lado.

In [8]:
# drop dos atributos
df_business.drop(columns=['attributes'], inplace=True)

In [9]:
# drop dos horários
df_business.drop(columns=['hours'], inplace=True)

In [10]:
df_business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   latitude      150346 non-null  float32
 2   longitude     150346 non-null  float32
 3   stars         150346 non-null  float32
 4   review_count  150346 non-null  uint16 
 5   is_open       150346 non-null  uint8  
 6   categories    150243 non-null  object 
dtypes: float32(3), object(2), uint16(1), uint8(1)
memory usage: 4.4+ MB


> Pronto para transformações!

# Alteraçoes no df_review

In [11]:
df_reviews = pd.read_parquet('../data/yelp_dataset/yelp_academic_dataset_review_0.parquet')
df_reviews2 = pd.read_parquet('../data/yelp_dataset/yelp_academic_dataset_review_1.parquet')
df_reviews = pd.concat([df_reviews, df_reviews2])
del df_reviews2

In [17]:
# dropando o review_id
df_reviews.drop(columns=['review_id'], inplace=True)

In [15]:
# stars, useful, funny e cool podem ser tratados como intensificadores do embedding do texto.
# useful, funny e cool devem ser adicionados de np.e para evitar np.log(0) e depois aplicar np.log

df_reviews['useful'] = df_reviews['useful'].apply(lambda x: np.log(x + np.e))
df_reviews['funny'] = df_reviews['funny'].apply(lambda x: np.log(x + np.e))
df_reviews['cool'] = df_reviews['cool'].apply(lambda x: np.log(x + np.e))

In [16]:
# dropando a data
df_reviews.drop(columns=['date'], inplace=True)

In [20]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user_id      object
 1   business_id  object
 2   stars        uint8 
 3   useful       int32 
 4   funny        int32 
 5   cool         int32 
 6   text         object
dtypes: int32(3), object(3), uint8(1)
memory usage: 246.7+ MB


In [19]:
# transformando useful funny e cool em int32
df_reviews['useful'] = df_reviews['useful'].astype(np.int32)
df_reviews['funny'] = df_reviews['funny'].astype(np.int32)
df_reviews['cool'] = df_reviews['cool'].astype(np.int32)

# Alterações no df_user

In [21]:
df_user = pd.read_parquet('../data/yelp_dataset/yelp_academic_dataset_user_0.parquet')
df_user2 = pd.read_parquet('../data/yelp_dataset/yelp_academic_dataset_user_1.parquet')
df_user = pd.concat([df_user, df_user2])
del df_user2

In [22]:
# dropando o name
df_user.drop(columns=['name'], inplace=True)

In [23]:
# passando para escala logarítmica o review_count
df_user['review_count'] = df_user['review_count'].apply(lambda x: np.log(x + np.e))

In [27]:
# transformando yelping_since em idade em dias
df_user['account_age'] = (pd.to_datetime('today') - pd.to_datetime(df_user.yelping_since)).dt.days

In [28]:
# drop yelping_since
df_user.drop(columns=['yelping_since'], inplace=True)

In [29]:
# drop friends
df_user.drop(columns=['friends'], inplace=True)

In [30]:
# escala logaritmica de useful funny e cool
df_user['useful'] = df_user['useful'].apply(lambda x: np.log(x + np.e))
df_user['funny'] = df_user['funny'].apply(lambda x: np.log(x + np.e))
df_user['cool'] = df_user['cool'].apply(lambda x: np.log(x + np.e))

In [31]:
# escala logaritmica de fans
df_user['fans'] = df_user['fans'].apply(lambda x: np.log(x + np.e))

In [32]:
# dropando o elite
df_user.drop(columns=['elite'], inplace=True)

In [33]:
# criando a métrica "chato" usando a average_stars (quanto maior, mais chato)
df_user['chato'] = df_user.average_stars.median() / df_user.average_stars

In [34]:
# dropando o average_stars
df_user.drop(columns=['average_stars'], inplace=True)

In [35]:
# uasndo escala logaritmica no compliment_hot, compliment_more, compliment_profile, compliment_cute, compliment_list, compliment_note, compliment_plain, compliment_cool, compliment_funny, compliment_writer, compliment_photos
df_user['compliment_hot'] = df_user['compliment_hot'].apply(lambda x: np.log(x + np.e))
df_user['compliment_more'] = df_user['compliment_more'].apply(lambda x: np.log(x + np.e))
df_user['compliment_profile'] = df_user['compliment_profile'].apply(lambda x: np.log(x + np.e))
df_user['compliment_cute'] = df_user['compliment_cute'].apply(lambda x: np.log(x + np.e))
df_user['compliment_list'] = df_user['compliment_list'].apply(lambda x: np.log(x + np.e))
df_user['compliment_note'] = df_user['compliment_note'].apply(lambda x: np.log(x + np.e))
df_user['compliment_plain'] = df_user['compliment_plain'].apply(lambda x: np.log(x + np.e))
df_user['compliment_cool'] = df_user['compliment_cool'].apply(lambda x: np.log(x + np.e))
df_user['compliment_funny'] = df_user['compliment_funny'].apply(lambda x: np.log(x + np.e))
df_user['compliment_writer'] = df_user['compliment_writer'].apply(lambda x: np.log(x + np.e))
df_user['compliment_photos'] = df_user['compliment_photos'].apply(lambda x: np.log(x + np.e))


In [36]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 19 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   review_count        float64
 2   useful              float64
 3   funny               float64
 4   cool                float64
 5   fans                float64
 6   compliment_hot      float64
 7   compliment_more     float64
 8   compliment_profile  float64
 9   compliment_cute     float64
 10  compliment_list     float64
 11  compliment_note     float64
 12  compliment_plain    float64
 13  compliment_cool     float64
 14  compliment_funny    float64
 15  compliment_writer   float64
 16  compliment_photos   float64
 17  account_age         int64  
 18  chato               float32
dtypes: float32(1), float64(16), int64(1), object(1)
memory usage: 280.6+ MB


In [37]:
# passando as colunas para float32
df_user['review_count'] = df_user['review_count'].astype(np.float32)

df_user['useful'] = df_user['useful'].astype(np.float32)
df_user['funny'] = df_user['funny'].astype(np.float32)
df_user['cool'] = df_user['cool'].astype(np.float32)

df_user['fans'] = df_user['fans'].astype(np.float32)

df_user['chato'] = df_user['chato'].astype(np.float32)

df_user['compliment_hot'] = df_user['compliment_hot'].astype(np.float32)
df_user['compliment_more'] = df_user['compliment_more'].astype(np.float32)
df_user['compliment_profile'] = df_user['compliment_profile'].astype(np.float32)
df_user['compliment_cute'] = df_user['compliment_cute'].astype(np.float32)
df_user['compliment_list'] = df_user['compliment_list'].astype(np.float32)
df_user['compliment_note'] = df_user['compliment_note'].astype(np.float32)
df_user['compliment_plain'] = df_user['compliment_plain'].astype(np.float32)
df_user['compliment_cool'] = df_user['compliment_cool'].astype(np.float32)
df_user['compliment_funny'] = df_user['compliment_funny'].astype(np.float32)
df_user['compliment_writer'] = df_user['compliment_writer'].astype(np.float32)
df_user['compliment_photos'] = df_user['compliment_photos'].astype(np.float32)

# Salvando os dataframes

In [38]:
df_business.to_parquet('../data/DatasetsLimpos/yelp_academic_dataset_business.parquet')

In [40]:
df_reviews.to_parquet('../data/DatasetsLimpos/yelp_academic_dataset_review.parquet')

In [41]:
df_user.to_parquet('../data/DatasetsLimpos/yelp_academic_dataset_user.parquet')

# Unindo os datasets

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_business = pd.read_parquet('../data/DatasetsLimpos/yelp_academic_dataset_business.parquet')
df_reviews = pd.read_parquet('../data/DatasetsLimpos/yelp_academic_dataset_review.parquet')
df_user = pd.read_parquet('../data/DatasetsLimpos/yelp_academic_dataset_user.parquet')

In [3]:
df_business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   latitude      150346 non-null  float32
 2   longitude     150346 non-null  float32
 3   stars         150346 non-null  float32
 4   review_count  150346 non-null  uint16 
 5   is_open       150346 non-null  uint8  
 6   categories    150243 non-null  object 
dtypes: float32(3), object(2), uint16(1), uint8(1)
memory usage: 4.4+ MB


In [4]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user_id      object
 1   business_id  object
 2   stars        uint8 
 3   useful       int32 
 4   funny        int32 
 5   cool         int32 
 6   text         object
dtypes: int32(3), object(3), uint8(1)
memory usage: 246.7+ MB


In [5]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 19 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   review_count        float32
 2   useful              float32
 3   funny               float32
 4   cool                float32
 5   fans                float32
 6   compliment_hot      float32
 7   compliment_more     float32
 8   compliment_profile  float32
 9   compliment_cute     float32
 10  compliment_list     float32
 11  compliment_note     float32
 12  compliment_plain    float32
 13  compliment_cool     float32
 14  compliment_funny    float32
 15  compliment_writer   float32
 16  compliment_photos   float32
 17  account_age         int64  
 18  chato               float32
dtypes: float32(17), int64(1), object(1)
memory usage: 159.2+ MB
