In [1]:
import os
import yaml
import pandas as pd
from datasets import load_dataset
from plotly import express as px
from sklearn.model_selection import train_test_split

In [None]:
# Load the project parameters from the YAML configuration file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('../conf/base/parameters.yml', 'r', encoding='utf-8') as f:
    # The notebook settings live under the top-level 'parameters' key.
    params = yaml.safe_load(f)['parameters']

In [4]:
def download_dataset() -> pd.DataFrame:
    """
    Fetch the 'ruanchaves/b2w-reviews01' dataset with `load_dataset`.

    Returns:
        The 'train' split of the dataset converted to a pandas DataFrame.
    """
    return load_dataset('ruanchaves/b2w-reviews01')['train'].to_pandas()

In [5]:
# Fetch the reviews as a pandas DataFrame via `download_dataset()`.
df = download_dataset()

Found cached dataset b2w-reviews01 (/home/bobcasta/.cache/huggingface/datasets/ruanchaves___b2w-reviews01/default/1.0.0/414dce7ecb4a1d15781c59f850c2abb5f36dca77358f366349c56501c50ba38d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Display the DataFrame (a bare last expression is rendered as the cell output).
df

Unnamed: 0,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,overall_rating,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state
0,2018-01-01 00:11:28,d0fb1ca69422530334178f5c8624aa7a99da47907c44de...,132532965,Notebook Asus Vivobook Max X541NA-GO472T Intel...,,Informática,Notebook,Bom,4,Yes,Estou contente com a compra entrega rápida o ú...,1958.0,F,RJ
1,2018-01-01 00:13:48,014d6dc5a10aed1ff1e6f349fb2b059a2d3de511c7538a...,22562178,Copo Acrílico Com Canudo 500ml Rocie,,Utilidades Domésticas,"Copos, Taças e Canecas","Preço imbatível, ótima qualidade",4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ...",1996.0,M,SC
2,2018-01-01 00:26:02,44f2c8edd93471926fff601274b8b2b5c4824e386ae4f2...,113022329,Panela de Pressão Elétrica Philips Walita Dail...,philips walita,Eletroportáteis,Panela Elétrica,ATENDE TODAS AS EXPECTATIVA.,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...,1984.0,M,SP
3,2018-01-01 00:35:54,ce741665c1764ab2d77539e18d0e4f66dde6213c9f0863...,113851581,Betoneira Columbus - Roma Brinquedos,roma jensen,Brinquedos,Veículos de Brinquedo,presente mais que desejado,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...,1985.0,F,SP
4,2018-01-01 01:00:28,7d7b6b18dda804a897359276cef0ca252f9932bf4b5c8e...,131788803,"Smart TV LED 43"" LG 43UJ6525 Ultra HD 4K com C...",lg,TV e Home Theater,TV,"Sem duvidas, excelente",5,Yes,"A entrega foi no prazo, as americanas estão de...",1994.0,M,MG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132368,2018-05-31 23:30:50,15f20e95ff44163f3175aaf67a5ae4a94d5030b409e521...,17962233,Carregador De Pilha Sony + 4 Pilhas Aa 2500mah,,Câmeras e Filmadoras,Acessórios para Câmeras e Filmadoras,Ótimo produto!,5,Yes,"Vale muito, estou usando no controle do Xbox e...",1988.0,M,RS
132369,2018-05-31 23:42:25,def7cf9028b0673ab8bca3b1d06e085461fafb88cd48d9...,132631701,Mop Giratório Fit + Refil Extra - At Home,,Utilidades Domésticas,Material de Limpeza,Sensacional,5,Yes,"Prático e barato, super indico o produto para ...",1979.0,F,SP
132370,2018-05-31 23:44:16,7bcbf542f5d7dd9a9a192a6805adba7a7a4c1ce3bf00df...,16095859,Fita Led 5m Rgb 3528 Siliconada Com 300 Leds C...,,Automotivo,Iluminação,Ótimo produto,4,Yes,Chegou antes do prazo previsto e corresponde a...,1979.0,F,PR
132371,2018-05-31 23:46:48,e6fb0b19277d01c2a300c7837a105f3c369377e92f9c19...,6774907,Etiquetas Jurídicas Vade Mecum - Marca Fácil,marca facil,Papelaria,Material de Escritório,O produto não é bom.,1,No,"Material fraco, poderia ser melhor. Ficou deve...",1991.0,M,RJ


In [None]:
# `download_dataset()` already returns the 'train' split as a DataFrame, so
# there is no 'train' key left to select here (`df['train']` raises KeyError).
# Wrapping the frame itself keeps this cell a harmless, re-runnable step.
df = pd.DataFrame(df)

In [None]:
def load_reviews_dataset(
    dataset: str = 'ruanchaves/b2w-reviews01',
    path: str = '../data/01_raw/reviews.csv',
) -> pd.DataFrame:
    """
    Load the reviews dataset from the datasets library
    and save its 'train' split in a csv file.

    Args:
        dataset: name of the dataset to be loaded
        path: destination of the csv file; missing parent folders are created

    Returns:
        reviews: 'train' split of the dataset as a pandas DataFrame
            (the same rows that were written to `path`)
    """
    # `load_dataset` is imported by name at the top of the notebook; the
    # `datasets` module itself is not in scope.
    reviews = load_dataset(dataset)['train'].to_pandas()

    # Make sure the target folder exists before writing.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    reviews.to_csv(path, index=False)
    return reviews

In [None]:
# Dataset identifier and the location of the raw csv copy.
DATASET = 'ruanchaves/b2w-reviews01'
RAW_REVIEWS_PATH = '../data/01_raw/reviews.csv'

# Pass the dataset name and the output path explicitly.
reviews = load_reviews_dataset(DATASET, RAW_REVIEWS_PATH)

In [None]:
# Read the raw reviews csv; this rebinds `df` to the frame loaded from disk.
df = pd.read_csv('../data/01_raw/reviews.csv')

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Number of null values in each column
df.isnull().sum()

In [None]:
# Helper to drop every row that has a null value in one of the required columns
def drop_null_products(dataframe: pd.DataFrame, columns=None):
    """
    Drop all the rows that have a null value in any of the given columns

    Args:
        dataframe: dataframe to be cleaned
        columns: column name or list of column names that must not be null;
            when omitted, the list in params["columns"] is used

    Returns:
        dataframe: cleaned dataframe
    """
    if columns is None:
        columns = params["columns"]
    if isinstance(columns, str):
        # A single column name is treated as a one-element list
        columns = [columns]
    columns = list(columns)
    if not columns:
        # Nothing to check, return the input as it is
        return dataframe
    # One dropna over all the columns removes the same rows as dropping
    # column by column
    return dataframe.dropna(subset=columns)

In [None]:
# Columns of interest for the null check.
# NOTE(review): `cols` is not passed to the call below, and the returned frame
# is only displayed as the cell output, not assigned back to `df`.
cols = ['product_name', 'review_text']
drop_null_products(df)

In [None]:
# Drop every row whose product has no name or whose review has no text.
# dropna is called directly with both columns, so the result does not depend
# on the column list configured in `params`.
df = df.dropna(subset=['product_name', 'review_text'])

In [None]:
# Number of reviews for each overall rating value
df['overall_rating'].value_counts()

In [None]:
# Bar chart of the number of reviews per overall rating (plotly express).
# NOTE(review): `df` is passed as the data frame, but x, y and text are
# array-likes built from three separate value_counts() calls on the same column.
fig = px.bar(
    df, 
    x=df['overall_rating'].value_counts().index,
    y=df['overall_rating'].value_counts(), 
    title='Overall Rating',
    text=df['overall_rating'].value_counts(),
    labels={'x':'Overall Rating', 'y':'Quantity'}
)


# Fixed 800x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=800, plot_bgcolor='white')
fig.show()

In [None]:
# Relative stacked bar chart: for each overall rating, the share of each
# "recommend to a friend" answer (value_counts(normalize=True), rounded to 2 decimals)
fig = px.bar(
    df.groupby('overall_rating')['recommend_to_a_friend'].value_counts(normalize=True).round(2).reset_index(name='count'),
    x='overall_rating',
    y='count',
    color='recommend_to_a_friend',
    barmode='relative',
    title='Overall Rating by Recommendation',
    text='count',
    # plotly express looks labels up by column name, so 'x'/'y' keys would be ignored here
    labels={
        'overall_rating': 'Overall Rating',
        'count': 'Proportion',
        'recommend_to_a_friend': 'Recommend to a Friend',
    }
)


# Fixed 800x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=800, plot_bgcolor='white')
fig.show()

In [None]:
# Bar chart of the number of reviews per reviewer gender.
# NOTE(review): `df` is passed as the data frame, but x, y and text are
# array-likes built from three separate value_counts() calls on the same column.
fig = px.bar(
    df, 
    x=df['reviewer_gender'].value_counts().index,
    y=df['reviewer_gender'].value_counts(), 
    title='Reviewer Gender',
    text=df['reviewer_gender'].value_counts(),
    labels={'x':'Reviewer Gender', 'y':'Quantity'}
)


# Fixed 800x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=800, plot_bgcolor='white')
fig.show()

In [None]:
# Relative stacked bar chart: for each overall rating, the share of each
# reviewer gender (value_counts(normalize=True), rounded to 2 decimals)
fig = px.bar(
    df.groupby('overall_rating')['reviewer_gender'].value_counts(normalize=True).round(2).reset_index(name='count'),
    x='overall_rating',
    y='count',
    color='reviewer_gender',
    barmode='relative',
    title='Overall Rating by gender',
    text='count',
    # plotly express looks labels up by column name, so 'x'/'y' keys would be ignored here
    labels={
        'overall_rating': 'Overall Rating',
        'count': 'Proportion',
        'reviewer_gender': 'Reviewer Gender',
    }
)


# Fixed 800x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=800, plot_bgcolor='white')
fig.show()

In [None]:
# Summary statistics of the reviewer_birth_year column
df['reviewer_birth_year'].describe()

In [None]:
def fix_birth_year(dataframe: pd.DataFrame, column: str):
    """
    Fix values in a birth-year column that are 100 or less

    Values of 100 or less get 1900 added (e.g. 85 -> 1985); larger values
    and missing values are kept as they are.

    Args:
        dataframe: dataframe to be cleaned
        column: name of the column holding the birth years

    Returns:
        dataframe: cleaned dataframe (a new object, the input is not modified)
    """
    years = dataframe[column]
    # Keep values above 100 and add 1900 to everything else. NaN fails the
    # comparison, but NaN + 1900 is still NaN, so missing values stay missing.
    fixed = years.where(years > 100, years + 1900)
    # assign() returns a new frame, so the caller's dataframe is never mutated
    return dataframe.assign(**{column: fixed})

In [None]:
# Apply the birth-year fix to the reviewer_birth_year column.
df = fix_birth_year(df, 'reviewer_birth_year')
# Earlier inline form of the same fix, kept for reference:
#df['reviewer_birth_year'] = df['reviewer_birth_year'].apply(lambda x: x if x > 100 else x + 1900)

In [None]:
# Histogram of reviewer_birth_year with a box plot drawn as marginal
fig = px.histogram(
    df,
    x='reviewer_birth_year',
    marginal='box',
    title='Reviewer Birth Year',
    # plotly express looks labels up by column name, so an 'x' key would be ignored here
    labels={'reviewer_birth_year': 'Reviewer Birth Year'}
)

# Fixed 800x500 figure on a white plot background; the y axis of the histogram
# is not a column of `df`, so its title is set on the layout instead of through `labels`
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    height=500,
    width=800,
    plot_bgcolor='white',
    yaxis_title='Quantity',
)
fig.show()

In [None]:
# Box plot of reviewer_birth_year for each overall_rating value
fig = px.box(
    df,
    x='overall_rating',
    y='reviewer_birth_year',
    color='overall_rating',
    title='Reviewer Birth Year by Overall Rating',
    # plotly express looks labels up by column name, so 'x'/'y' keys would be ignored here
    labels={'overall_rating': 'Overall Rating', 'reviewer_birth_year': 'Reviewer Birth Year'}
)

# Fixed 800x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=800, plot_bgcolor='white')
# Render explicitly, like every other plotting cell of the notebook
fig.show()

In [None]:
# Bar chart of the number of reviews per reviewer_state, in value_counts() order.
# NOTE(review): `df` is passed as the data frame, but x, y and text are
# array-likes built from three separate value_counts() calls on the same column.
fig = px.bar(
    df,
    x=df['reviewer_state'].value_counts().index,
    y=df['reviewer_state'].value_counts(),
    title='Reviewer State',
    text=df['reviewer_state'].value_counts(),
    labels={'x':'Reviewer State', 'y':'Quantity'}
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Relative stacked horizontal bars: for each reviewer_state, the share of each
# overall_rating (value_counts(normalize=True), rounded to 3 decimals)
fig = px.bar(
    df.groupby('reviewer_state')['overall_rating'].value_counts(normalize=True).round(3).reset_index(name='count'),
    x='count',
    y='reviewer_state',
    color='overall_rating',
    barmode='relative',
    title='Overall Rating by State',
    text='count',
    # plotly express looks labels up by column name, so 'x'/'y' keys would be ignored here
    labels={
        'count': 'Proportion',
        'reviewer_state': 'Reviewer State',
        'overall_rating': 'Overall Rating',
    }
)

# Fixed 1080x720 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=720, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Bar chart of the 10 most frequent site_category_lv1 values
# (first 10 entries of value_counts()).
# NOTE(review): `df` is passed as the data frame, but x, y and text are
# array-likes built from three separate value_counts() calls on the same column.
fig = px.bar(
    df,
    x=df['site_category_lv1'].value_counts().index[:10],
    y=df['site_category_lv1'].value_counts()[:10],
    title='Site Category Lv1',
    text=df['site_category_lv1'].value_counts()[:10],
    labels={'x':'Site Category Lv1', 'y':'Quantity'}
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Per site_category_lv1: share of each overall_rating (rounded to 3 decimals) ...
df_site_category_lv1 = df.groupby('site_category_lv1')['overall_rating'].value_counts(normalize=True).round(3).reset_index(name='count')
# ... joined with the number of non-null product_id rows of the category (count_products)
df_site_category_lv1 = df_site_category_lv1.merge(df.groupby('site_category_lv1')['product_id'].count().reset_index(name='count_products'), on='site_category_lv1')

# Relative stacked horizontal bars for the first 50 (category, rating) rows
# after sorting by count_products in descending order
fig = px.bar(
    df_site_category_lv1.sort_values(by='count_products', ascending=False).head(50),
    x='count',
    y='site_category_lv1',
    color='overall_rating',
    barmode='relative',
    title='Overall Rating by Site Category Lv1',
    text='count',
    # plotly express looks labels up by column name, so 'x'/'y' keys would be ignored here
    labels={
        'count': 'Proportion',
        'site_category_lv1': 'Site Category Lv1',
        'overall_rating': 'Overall Rating',
    }
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Bar chart of the 10 most frequent site_category_lv2 values
# (first 10 entries of value_counts()).
# NOTE(review): `df` is passed as the data frame, but x, y and text are
# array-likes built from three separate value_counts() calls on the same column.
fig = px.bar(
    df,
    x=df['site_category_lv2'].value_counts().index[:10],
    y=df['site_category_lv2'].value_counts()[:10],
    title='Site Category Lv2',
    text=df['site_category_lv2'].value_counts()[:10],
    labels={'x':'Site Category Lv2', 'y':'Quantity'}
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Per site_category_lv2: share of each overall_rating (rounded to 3 decimals) ...
# The frame gets its own lv2 name so the lv1 frame built earlier is not overwritten.
df_site_category_lv2 = df.groupby('site_category_lv2')['overall_rating'].value_counts(normalize=True).round(3).reset_index(name='count')
# ... joined with the number of non-null product_id rows of the category (count_products)
df_site_category_lv2 = df_site_category_lv2.merge(df.groupby('site_category_lv2')['product_id'].count().reset_index(name='count_products'), on='site_category_lv2')

# Relative stacked horizontal bars for the first 50 (category, rating) rows
# after sorting by count_products in descending order
fig = px.bar(
    df_site_category_lv2.sort_values(by='count_products', ascending=False).head(50),
    x='count',
    y='site_category_lv2',
    color='overall_rating',
    barmode='relative',
    title='Overall Rating by Site Category Lv2',
    text='count',
    # plotly express looks labels up by column name, so 'x'/'y' keys would be ignored here
    labels={
        'count': 'Proportion',
        'site_category_lv2': 'Site Category Lv2',
        'overall_rating': 'Overall Rating',
    }
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Number of distinct reviewer_id and product_id values for every
# (site_category_lv1, site_category_lv2) pair.
# The two columns are selected with a list: selecting several groupby columns
# with a bare tuple (['a', 'b'] without the inner brackets) is rejected by pandas 2.x.
df_site_category_lv1_lv2 = (
    df.groupby(['site_category_lv1', 'site_category_lv2'])[['reviewer_id', 'product_id']]
    .nunique()
    .reset_index()
)

# Treemap of lv1 -> lv2 categories under a constant 'product' root:
# tile size is the distinct reviewer count, tile colour the distinct product count
fig = px.treemap(
    df_site_category_lv1_lv2,
    path=[px.Constant('product'), 'site_category_lv1', 'site_category_lv2'],
    values='reviewer_id',
    color='product_id',
    title='Site Category Lv1 and Lv2',
    labels={'site_category_lv1':'Site Category Lv1', 'site_category_lv2':'Site Category Lv2'}
)

# Fixed 1080x720 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=720, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Bar chart of the 10 most frequent product_brand values
# (first 10 entries of value_counts()).
# NOTE(review): `df` is passed as the data frame, but x, y and text are
# array-likes built from three separate value_counts() calls on the same column.
fig = px.bar(
    df,
    x=df['product_brand'].value_counts().index[:10],
    y=df['product_brand'].value_counts()[:10],
    title='Product Brand',
    text=df['product_brand'].value_counts()[:10],
    labels={'x':'Product Brand', 'y':'Quantity'}
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# Per product_brand: share of each overall_rating (rounded to 3 decimals) ...
df_product_brand = df.groupby('product_brand')['overall_rating'].value_counts(normalize=True).round(3).reset_index(name='count')
# ... joined with the number of non-null product_id rows of the brand (count_products)
df_product_brand = df_product_brand.merge(df.groupby('product_brand')['product_id'].count().reset_index(name='count_products'), on='product_brand')

# Relative stacked horizontal bars for the first 50 (brand, rating) rows
# after sorting by count_products in descending order
fig = px.bar(
    df_product_brand.sort_values(by='count_products', ascending=False).head(50),
    x='count',
    y='product_brand',
    color='overall_rating',
    barmode='relative',
    title='Overall Rating by Product Brand',
    text='count',
    # plotly express looks labels up by column name, so 'x'/'y' keys would be ignored here
    labels={
        'count': 'Proportion',
        'product_brand': 'Product Brand',
        'overall_rating': 'Overall Rating',
    }
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
# For every row, store the number of distinct review_text values found for its
# product_name; the flag and the bar plot built from it follow in the next cell
df['qt_review'] = df.groupby('product_name')['review_text'].transform('nunique')

In [None]:
# Flag each row by whether its qt_review equals 1 ('unique review') or not
# ('more than 1 review')
df['unique_review'] = df['qt_review'].apply(lambda x: 'unique review' if x == 1 else 'more than 1 review')

# Bar chart of how many rows carry each flag value.
# NOTE(review): `df` is passed as the data frame, but x, y and text are
# array-likes built from three separate value_counts() calls on the same column.
fig = px.bar(
    df,
    x=df['unique_review'].value_counts().index,
    y=df['unique_review'].value_counts(),
    title='Unique Review',
    text=df['unique_review'].value_counts(),
    labels={'x':'Unique Review', 'y':'Quantity'}
)

# Fixed 1080x500 figure on a white plot background
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', height=500, width=1080, plot_bgcolor='white')
fig.show()

In [None]:
def split_data(dataframe: pd.DataFrame, test_size: float, random_state: int = 42):
    """
    Split data into train and test sets

    Args:
        dataframe: dataframe to be split
        test_size: size of the test set, passed to train_test_split
        random_state: seed passed to train_test_split so the split is reproducible

    Returns:
        train: train set
        test: test set
    """
    # Split the frame received as argument, not the notebook-level `df`
    train, test = train_test_split(dataframe, test_size=test_size, random_state=random_state)
    return train, test