# CRO Project data mining / exploration of the type of products per seller

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import chisquare
from datetime import datetime

# Hide deprecated warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Input files, read relative to the notebook's working directory.
ORDER_ITEMS_CSV = 'olist_order_items_dataset.csv'
PRODUCTS_CSV = 'olist_products_dataset.csv'

# Order-item rows and the product attribute table.
items_data = pd.read_csv(ORDER_ITEMS_CSV)
product_data = pd.read_csv(PRODUCTS_CSV)

In [3]:
# (rows, columns) of the order-items table.
items_data.shape

(112650, 7)

In [4]:
# (rows, columns) of the products table.
product_data.shape

(32951, 9)

In [5]:
# Column dtypes and non-null counts of the products table.
product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
product_id                    32951 non-null object
product_category_name         32341 non-null object
product_name_lenght           32341 non-null float64
product_description_lenght    32341 non-null float64
product_photos_qty            32341 non-null float64
product_weight_g              32949 non-null float64
product_length_cm             32949 non-null float64
product_height_cm             32949 non-null float64
product_width_cm              32949 non-null float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [6]:
# Column dtypes and non-null counts of the order-items table.
items_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
order_id               112650 non-null object
order_item_id          112650 non-null int64
product_id             112650 non-null object
seller_id              112650 non-null object
shipping_limit_date    112650 non-null object
price                  112650 non-null float64
freight_value          112650 non-null float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB


## Merge datasets

In [7]:
# Left-join the product attributes onto every order item, so each order-item
# row is kept even when its product_id has no match in product_data.
# validate='many_to_one' makes the merge fail loudly if product_id is
# duplicated in product_data, which would otherwise silently multiply rows.
products_df = pd.merge(
    items_data,
    product_data,
    on='product_id',
    how='left',
    validate='many_to_one',
)

In [8]:
# Drop two product columns from the merged frame.
columns_to_discard = ['product_description_lenght', 'product_photos_qty']
products_df = products_df.drop(columns=columns_to_discard)

## Product category analysis

#### Most frequent product categories

In [9]:
# Number of order-item rows per product category. The count is stored in the
# 'seller_id' column and sorted ascending, so the most frequent categories end
# up at the bottom of the frame.
rows_per_category = products_df.groupby('product_category_name')['seller_id'].count()
category_freq = rows_per_category.reset_index().sort_values(by='seller_id')

#### Product categories per seller

In [10]:
# One row per (seller_id, product_category_name) pair, with the numeric
# columns summed. The value columns are listed explicitly: without `values`,
# pivot_table tries to aggregate every remaining column, and newer pandas
# releases (2.0+) no longer silently discard the non-numeric ones
# (order_id, product_id, shipping_limit_date) when summing.
numeric_value_columns = [
    'order_item_id', 'price', 'freight_value', 'product_name_lenght',
    'product_weight_g', 'product_length_cm', 'product_height_cm',
    'product_width_cm',
]
product_cat_df = products_df.pivot_table(
    index=['seller_id', 'product_category_name'],
    values=numeric_value_columns,
    aggfunc='sum',
).reset_index()

In [11]:
# Keep only the (seller_id, product_category_name) keys. Selecting the wanted
# columns, rather than dropping a fixed list of aggregated ones, gives the same
# two-column frame regardless of which value columns the pivot produced, and
# the cell can be re-run without a KeyError.
# .copy() so that later column assignments act on an independent frame.
product_cat_df = product_cat_df[['seller_id', 'product_category_name']].copy()

In [12]:
# Distinct category names present in the seller/category pairs; these are the
# keys that the category-to-mother-category mapping has to cover.
product_cat_df['product_category_name'].unique()

array(['eletroportateis', 'construcao_ferramentas_construcao',
       'ferramentas_jardim', 'esporte_lazer', 'moveis_decoracao',
       'cama_mesa_banho', 'utilidades_domesticas', 'beleza_saude',
       'fashion_bolsas_e_acessorios', 'moveis_colchao_e_estofado',
       'informatica_acessorios',
       'moveis_cozinha_area_de_servico_jantar_e_jardim',
       'construcao_ferramentas_ferramentas', 'bebes', 'fashion_calcados',
       'papelaria', 'sinalizacao_e_seguranca', 'cool_stuff',
       'climatizacao', 'eletrodomesticos', 'eletrodomesticos_2',
       'relogios_presentes', 'brinquedos', 'casa_construcao',
       'instrumentos_musicais', 'market_place', 'telefonia',
       'malas_acessorios', 'perfumaria', 'dvds_blu_ray', 'musica',
       'automotivo', 'consoles_games', 'alimentos', 'artes_e_artesanato',
       'eletronicos', 'audio', 'casa_conforto_2',
       'agro_industria_e_comercio', 'construcao_ferramentas_seguranca',
       'pet_shop', 'construcao_ferramentas_iluminacao',
     

In [13]:
# Mapping from the dataset's product category names to a coarser
# "mother category", grouped here by mother category so each bucket can be
# reviewed at a glance.
#
# Fix: 'pcs' (computers) is now 'electronics', consistent with 'pc_gamer' and
# 'informatica_acessorios'; it was previously filed under 'construction'.
#
# The label spellings 'forniture' and 'stationary_arts' are kept as they are:
# the labels become column names of the dummy-encoded frame, so renaming them
# here would change those column names.
#
# NOTE(review): 'esporte_lazer' is grouped under 'fashion', and
# 'cds_dvds_musicais' under 'electronics' while 'musica' is under 'music' --
# confirm these groupings are intended.
dicc = {
    # electronics
    'eletroportateis': 'electronics',
    'informatica_acessorios': 'electronics',
    'climatizacao': 'electronics',
    'eletrodomesticos': 'electronics',
    'eletrodomesticos_2': 'electronics',
    'telefonia': 'electronics',
    'telefonia_fixa': 'electronics',
    'dvds_blu_ray': 'electronics',
    'cds_dvds_musicais': 'electronics',
    'consoles_games': 'electronics',
    'eletronicos': 'electronics',
    'audio': 'electronics',
    'tablets_impressao_imagem': 'electronics',
    'pc_gamer': 'electronics',
    'pcs': 'electronics',
    # construction
    'construcao_ferramentas_construcao': 'construction',
    'construcao_ferramentas_ferramentas': 'construction',
    'construcao_ferramentas_seguranca': 'construction',
    'construcao_ferramentas_iluminacao': 'construction',
    'sinalizacao_e_seguranca': 'construction',
    'casa_construcao': 'construction',
    'agro_industria_e_comercio': 'construction',
    'industria_comercio_e_negocios': 'construction',
    # garden
    'ferramentas_jardim': 'garden',
    'construcao_ferramentas_jardim': 'garden',
    'flores': 'garden',
    # fashion
    'esporte_lazer': 'fashion',
    'fashion_bolsas_e_acessorios': 'fashion',
    'fashion_calcados': 'fashion',
    'fashion_esporte': 'fashion',
    'fashion_roupa_masculina': 'fashion',
    'fashion_roupa_feminina': 'fashion',
    'fashion_roupa_infanto_juvenil': 'fashion',
    'fashion_underwear_e_moda_praia': 'fashion',
    'relogios_presentes': 'fashion',
    'malas_acessorios': 'fashion',
    # forniture
    'moveis_decoracao': 'forniture',
    'cama_mesa_banho': 'forniture',
    'moveis_colchao_e_estofado': 'forniture',
    'moveis_cozinha_area_de_servico_jantar_e_jardim': 'forniture',
    'moveis_sala': 'forniture',
    'moveis_escritorio': 'forniture',
    'moveis_quarto': 'forniture',
    # house
    'utilidades_domesticas': 'house',
    'cool_stuff': 'house',
    'market_place': 'house',
    'casa_conforto': 'house',
    'casa_conforto_2': 'house',
    'portateis_casa_forno_e_cafe': 'house',
    'portateis_cozinha_e_preparadores_de_alimentos': 'house',
    'artigos_de_festas': 'house',
    'artigos_de_natal': 'house',
    'la_cuisine': 'house',
    'seguros_e_servicos': 'house',
    # cosmetics
    'beleza_saude': 'cosmetics',
    'perfumaria': 'cosmetics',
    # babies
    'bebes': 'babies',
    'fraldas_higiene': 'babies',
    # stationary_arts
    'papelaria': 'stationary_arts',
    'artes_e_artesanato': 'stationary_arts',
    'artes': 'stationary_arts',
    # toys
    'brinquedos': 'toys',
    # music
    'instrumentos_musicais': 'music',
    'musica': 'music',
    # car_acc
    'automotivo': 'car_acc',
    # food
    'alimentos': 'food',
    'alimentos_bebidas': 'food',
    'bebidas': 'food',
    # pets
    'pet_shop': 'pets',
    # books
    'livros_interesse_geral': 'books',
    'livros_tecnicos': 'books',
    'livros_importados': 'books',
    # cinema
    'cine_foto': 'cinema',
}



In [14]:
# Translate each category name to its mother category through `dicc`.
# Series.map yields NaN for any category name that is not a key of `dicc`.
category_names = product_cat_df['product_category_name']
product_cat_df['mother_cat'] = category_names.map(dicc)

In [15]:
# Preview the first rows only instead of rendering the whole frame.
product_cat_df.head(10)

Unnamed: 0,seller_id,product_category_name,mother_cat
0,0015a82c2db000af6aaaf3ae2ecb0532,eletroportateis,electronics
1,001cca7ae9ae17fb1caed9dfb1094831,construcao_ferramentas_construcao,construction
2,001cca7ae9ae17fb1caed9dfb1094831,ferramentas_jardim,garden
3,001e6ad469a905060d959994f1b41e4f,esporte_lazer,fashion
4,002100f778ceb8431b7a1020ff7ab48f,moveis_decoracao,forniture
5,004c9cd9d87a3c30c522c48c4fc07416,cama_mesa_banho,forniture
6,00720abe85ba0859807595bbf045a33b,utilidades_domesticas,house
7,00ab3eff1b5192e5f1a63bcecfee11c8,beleza_saude,cosmetics
8,00d8b143d12632bad99c0ad66ad52825,fashion_bolsas_e_acessorios,fashion
9,00ee68308b45bc5e2660cd833c3f81cc,cama_mesa_banho,forniture


In [16]:
# One indicator column per mother category, row-aligned with product_cat_df.
mother_cat_labels = product_cat_df['mother_cat']
df_mother_cat = pd.get_dummies(mother_cat_labels)

In [17]:
# Place the indicator columns next to the seller/category columns
# (column-wise concatenation, aligned on the row index).
frames_side_by_side = [product_cat_df, df_mother_cat]
new_df = pd.concat(frames_side_by_side, axis='columns')

In [18]:
# First five rows of the combined frame: keys, mother category and indicators.
new_df.head()

Unnamed: 0,seller_id,product_category_name,mother_cat,babies,books,car_acc,cinema,construction,cosmetics,electronics,fashion,food,forniture,garden,house,music,pets,stationary_arts,toys
0,0015a82c2db000af6aaaf3ae2ecb0532,eletroportateis,electronics,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,001cca7ae9ae17fb1caed9dfb1094831,construcao_ferramentas_construcao,construction,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,001cca7ae9ae17fb1caed9dfb1094831,ferramentas_jardim,garden,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,001e6ad469a905060d959994f1b41e4f,esporte_lazer,fashion,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,002100f778ceb8431b7a1020ff7ab48f,moveis_decoracao,forniture,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [19]:
# Remove the two text category columns; seller_id and the indicator columns
# remain.
category_label_columns = ['product_category_name', 'mother_cat']
new_cat = new_df.drop(columns=category_label_columns)

In [20]:
# Preview the first rows only instead of rendering the whole frame.
new_cat.head(10)

Unnamed: 0,seller_id,babies,books,car_acc,cinema,construction,cosmetics,electronics,fashion,food,forniture,garden,house,music,pets,stationary_arts,toys
0,0015a82c2db000af6aaaf3ae2ecb0532,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,001cca7ae9ae17fb1caed9dfb1094831,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,001cca7ae9ae17fb1caed9dfb1094831,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,001e6ad469a905060d959994f1b41e4f,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,002100f778ceb8431b7a1020ff7ab48f,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,004c9cd9d87a3c30c522c48c4fc07416,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
6,00720abe85ba0859807595bbf045a33b,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
7,00ab3eff1b5192e5f1a63bcecfee11c8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
8,00d8b143d12632bad99c0ad66ad52825,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
9,00ee68308b45bc5e2660cd833c3f81cc,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
