In [None]:
import os
from pathlib import Path
# Locate the project's 'data' directory by walking upwards from the current
# working directory until a folder that contains a 'data' sub-directory is found.
home = os.getcwd()
current = Path(home)
while not (current / 'data').is_dir():
    # The parent of the filesystem root is the root itself, so without this
    # guard the loop would never terminate when no 'data' folder exists.
    if current.parent == current:
        raise FileNotFoundError(
            f"no 'data' folder found in {home!r} or any of its parent directories"
        )
    current = current.parent
DATA_FOLDER = os.path.join(current, 'data')

# Exploring the products data

In [None]:
import pandas as pd
# Read the raw products table from the data folder and preview the first rows.
products_csv = os.path.join(DATA_FOLDER, 'olist_products_dataset.csv')
df = pd.read_csv(products_csv)
print(df.head())
# Keep category, description length, photo_quality and weight: the three
# physical-dimension columns are dropped.
dimension_columns = ["product_length_cm", "product_height_cm", "product_width_cm"]
df = df.drop(columns=dimension_columns)

In [None]:
# The dataset is relatively large, so rows containing any NaN are simply dropped.
# Print the row count before and after; the original author observed a loss of
# around 600 samples (about 2%), which is acceptable.
rows_before = len(df)
df = df.dropna()
print(rows_before)
print(len(df))

In [None]:
import matplotlib.pyplot as plt 
# Encode every category as an integer equal to its rank when the categories are
# ordered by ascending frequency (0 = rarest), so the histogram x-axis is ordered.
category_column = df['product_category_name']
cat_freq = category_column.value_counts()
categories = sorted(category_column.unique(), key=lambda name: cat_freq[name])
categories_map = dict(zip(categories, range(len(categories))))
# Work on a separate frame so that `df` keeps the original category names.
df_plot = df.assign(product_category_name=category_column.map(categories_map))

In [None]:
# Sanity check: `df` still holds the original category names; only the
# `df_plot` copy had its categories replaced by integer codes.
df['product_category_name']

In [None]:
# Histogram of the rank-encoded categories, grouped three codes per bin.
bin_width = 3
n_categories = len(categories)
# range() excludes its stop value, so the last generated edge can fall short of
# the largest code (n_categories - 1); values beyond the last edge are silently
# left out of a histogram. Appending n_categories as a closing edge guarantees
# that every category, including the most frequent ones, is counted.
bin_edges = list(range(0, n_categories, bin_width)) + [n_categories]

fig, ax = plt.subplots(figsize=(15, 8))
df_plot['product_category_name'].hist(bins=bin_edges, ax=ax)
ax.set_xticks(bin_edges)
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('categories: mapped to numerical values')
ax.set_ylabel('frequencies')
ax.set_title('frequencies of product categories')
plt.show()
# The plotting copy is no longer needed once the figure has been rendered.
del df_plot

We can see that the distribution of product categories is quite skewed: a few categories account for the majority of the products sold on the platform. For further analysis, we will consider only categories with more than 500 products (500 samples is more than enough to apply the Central Limit Theorem).

In [None]:
# Keep only the categories that occur strictly more than 500 times, then
# restrict the frame to the products belonging to those categories.
is_frequent = cat_freq > 500
product_categories = cat_freq[is_frequent].index.tolist()
keep_row = df['product_category_name'].isin(product_categories)
df = df[keep_row]
len(df)

In [None]:
# Inspect the category names that passed the frequency filter.
product_categories

It seems that we kept $27007 / 32951 \approx 82\%$ of the initial data. Great start!

Time to translate the categories to English

In [None]:
# let's start with data preparation
from google_trans_new import google_translator  
def translate_text(translator_obj: "google_translator", text, source_lang, target_lang) -> str:
    """Translate ``text`` from ``source_lang`` to ``target_lang``.

    Args:
        translator_obj: object exposing
            ``translate(text, lang_tgt=..., lang_src=...)``. When ``None``, a
            new ``google_translator`` instance is created.
        text: the text to translate.
        source_lang: language code of ``text``.
        target_lang: language code to translate into.

    Returns:
        The translated text.
    """
    if translator_obj is None:
        translator_obj = google_translator()
    res = translator_obj.translate(text, lang_tgt=target_lang, lang_src=source_lang)
    # google_trans_new returns the translation itself rather than a result
    # object, so an unconditional `res.text` raises AttributeError on a str.
    # Results that do carry a `.text` attribute are still unwrapped.
    # NOTE(review): the library may return a list for some inputs - confirm
    # whether that case needs handling here.
    return getattr(res, 'text', res)


In [None]:
# let's translate the categories from Portuguese to English
from deep_translator import GoogleTranslator
import re
# A single Portuguese -> English translator is reused for every category.
translator = GoogleTranslator(source='pt', target='en')
# Category names use underscores between words; replace them with spaces
# before translating each name.
categories_english = []
for category_pt in product_categories:
    readable_name = category_pt.replace('_', ' ')
    categories_english.append(translator.translate(text=readable_name))

In [None]:
# Replace the Portuguese category names with their English translations.
# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
pt_to_en = dict(zip(product_categories, categories_english))
df = df.assign(product_category_name=df['product_category_name'].map(pt_to_en))

# Save the prepared table in a 'data_prepared' directory next to the data folder.
prepared_data_folder = os.path.join(Path(DATA_FOLDER).parent, 'data_prepared')
os.makedirs(prepared_data_folder, exist_ok=True)
# The first argument of to_csv is the output file path and the second positional
# argument is the separator, so the directory and file name must be joined into
# one path rather than passed as two arguments.
df.to_csv(os.path.join(prepared_data_folder, 'products_prepared.csv'))