## Analyse des données Olist :

### Manipulation :

In [3]:
import pandas as pd
from termcolor import colored as cl

# Folder that holds the Olist CSV exports, relative to the notebook's working
# directory. Change it here if the data lives somewhere else.
DATA_DIR = "csv"

# Read every CSV export into its own DataFrame.

df_customer = pd.read_csv(f"{DATA_DIR}/olist_customers_dataset.csv")
df_geo = pd.read_csv(f"{DATA_DIR}/olist_geolocation_dataset.csv")
df_items = pd.read_csv(f"{DATA_DIR}/olist_order_items_dataset.csv")
df_payments = pd.read_csv(f"{DATA_DIR}/olist_order_payments_dataset.csv")
df_reviews = pd.read_csv(f"{DATA_DIR}/olist_order_reviews_dataset.csv")
df_orders = pd.read_csv(f"{DATA_DIR}/olist_orders_dataset.csv")
df_products = pd.read_csv(f"{DATA_DIR}/olist_products_dataset.csv")
df_sellers = pd.read_csv(f"{DATA_DIR}/olist_sellers_dataset.csv")
df_categ = pd.read_csv(f"{DATA_DIR}/product_category_name_translation.csv")



In [4]:
# Registry mapping each DataFrame's variable name to the DataFrame itself,
# so that later cells can loop over all the tables in one go.

dict_df = dict(
    df_customer=df_customer,
    df_geo=df_geo,
    df_items=df_items,
    df_payments=df_payments,
    df_reviews=df_reviews,
    df_orders=df_orders,
    df_products=df_products,
    df_sellers=df_sellers,
    df_categ=df_categ,
)


In [5]:
# Map each table name to the list of its column names.

dict_col = {name: list(frame.columns) for name, frame in dict_df.items()}

In [6]:
# For every table, print a coloured heading with its column count,
# followed by the list of column names.

for key, columns in dict_col.items():
    heading = cl(f"\n{key} :\n\n", "yellow", attrs=["bold", "underline"])
    column_count = cl(f"{len(columns)} colonnes\n", "green")
    print(heading, column_count)
    print(columns)


df_customer :

 5 colonnes

['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']

df_geo :

 5 colonnes

['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']

df_items :

 7 colonnes

['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']

df_payments :

 5 colonnes

['order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']

df_reviews :

 7 colonnes

['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message', 'review_creation_date', 'review_answer_timestamp']

df_orders :

 8 colonnes

['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

df_products :

 9 colonnes

['product_id', 'product_category_name', 'product_name_lenght', '

### Valeurs nulles et doublons :

In [7]:
# Report the number of fully duplicated rows for every table that has any.

for name, frame in dict_df.items():
    # Build the duplicate mask once and reuse the count for both the test
    # and the printout, instead of calling duplicated() twice per table.
    n_duplicates = frame.duplicated().sum()
    if n_duplicates > 0:
        print(cl(f"\n{name} :\n\n", "yellow", attrs=["bold", "underline"]))
        print(n_duplicates)


df_geo :


261831


In [8]:
# Report the per-column count of missing values for every table that has any.

for name, frame in dict_df.items():
    # Count the missing values once; the same Series drives both the test
    # and the printout, so the table is not scanned a second time.
    null_counts = frame.isna().sum()
    if null_counts.any():
        print(cl(f"\n{name} :\n\n", "yellow", attrs=["bold", "underline"]))
        print(null_counts)


df_reviews :


review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

df_orders :


order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

df_products :


product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64


### Analyses des commandes :

In [9]:
# Summary statistics for the orders table. The saved output lists
# count / unique / top / freq for every column, timestamps included.
df_orders.describe()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
count,99441,99441,99441,99441,99281,97658,96476,99441
unique,99441,99441,8,98875,90733,81018,95664,459
top,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2018-04-11 10:48:14,2018-02-27 04:31:10,2018-05-09 15:48:00,2018-05-08 23:38:46,2017-12-20 00:00:00
freq,1,1,96478,3,9,47,3,522


In [10]:
# Preview the first rows of the sellers table.
df_sellers.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [11]:
# Preview the last rows of the geolocation table.
df_geo.tail()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS
1000162,99950,-28.070104,-52.018658,tapejara,RS


In [14]:
# Number of reviews per score, most frequent score first.
df_reviews["review_score"].value_counts()

5    57328
4    19142
1    11424
3     8179
2     3151
Name: review_score, dtype: int64