In [2]:
import itertools
import os

import pandas as pd
import pgspecial
import sqlalchemy

In [3]:
%load_ext sql

In [4]:
db_connect = "benj@localhost/olist"
connectStr =  f"postgresql://{db_connect}"
%sql $connectStr

'Connected: benj@olist'

# Exploration db

Nombre de clients dans la base

In [4]:
%%sql
SELECT COUNT(DISTINCT customer_unique_id)
FROM customers

 * postgresql://benj@localhost/olist
1 rows affected.


count
96096


Status des commandes

In [5]:
%%sql
select order_status, count(*)
from orders
group by order_status
order by order_status

 * postgresql://benj@localhost/olist
8 rows affected.


order_status,count
approved,2
canceled,625
created,5
delivered,96478
invoiced,314
processing,301
shipped,1107
unavailable,609


Toutes les commandes ont au moins un paiement associé ?

In [7]:
%%sql
select o.order_status, count(*)
from orders o
where not exists (
    select 1 from order_payments pa where pa.order_id = o.order_id
)
group by o.order_status
order by o.order_status

 * postgresql://benj@localhost/olist
1 rows affected.


order_status,count
delivered,1


Les différents types de paiement

In [7]:
%%sql
select payment_type, count(*)
from order_payments
group by payment_type
order by payment_type

 * postgresql://benj@localhost/olist
5 rows affected.


payment_type,count
boleto,19784
credit_card,76795
debit_card,1529
not_defined,3
voucher,5775


Paiement en plusieurs versements = payment_installments

Les paiements par bon d'achat (voucher) et par boleto se font en une seule fois.

In [8]:
%%sql
select payment_installments, payment_type, count(*)
from order_payments
where payment_type in ('voucher', 'boleto')
group by payment_installments, payment_type
order by payment_installments, payment_type

 * postgresql://benj@localhost/olist
2 rows affected.


payment_installments,payment_type,count
1,boleto,19784
1,voucher,5775


Les paiements d'une commande peuvent se faire avec plusieurs modes de paiement ; chaque paiement se voit attribuer un numéro de séquence.

In [34]:
%%sql
select *
from order_payments
where order_id = (
    select order_id
    from order_payments
    order by payment_sequential desc, order_id
    limit 1
)
order by payment_sequential

 * postgresql://benj@localhost/olist
29 rows affected.


order_id,payment_sequential,payment_type,payment_installments,payment_value,id
fa65dad1b0e818e3ccc5cb0e39231352,1,voucher,1,3.71,14322
fa65dad1b0e818e3ccc5cb0e39231352,2,voucher,1,8.51,23075
fa65dad1b0e818e3ccc5cb0e39231352,3,voucher,1,2.95,65642
fa65dad1b0e818e3ccc5cb0e39231352,4,voucher,1,29.16,9986
fa65dad1b0e818e3ccc5cb0e39231352,5,voucher,1,0.66,28331
fa65dad1b0e818e3ccc5cb0e39231352,6,voucher,1,5.02,29649
fa65dad1b0e818e3ccc5cb0e39231352,7,voucher,1,0.32,82594
fa65dad1b0e818e3ccc5cb0e39231352,8,voucher,1,26.02,68854
fa65dad1b0e818e3ccc5cb0e39231352,9,voucher,1,1.08,17275
fa65dad1b0e818e3ccc5cb0e39231352,10,voucher,1,12.86,19566


Nombre de paiements par voucher

In [12]:
%%sql
SELECT COUNT(*)
FROM customers AS c
INNER JOIN orders AS o ON o.customer_id = c.customer_id
INNER JOIN order_payments AS pa ON pa.order_id = o.order_id
WHERE pa.payment_type = 'voucher'

 * postgresql://benj@localhost/olist
1 rows affected.


count
5775


Nombre de commandes sans commentaires

In [10]:
%%sql
SELECT COUNT(DISTINCT o.order_id)
FROM orders AS o
WHERE NOT EXISTS (
    SELECT 1 FROM order_reviews AS r WHERE r.order_id = o.order_id
)

 * postgresql://benj@localhost/olist
1 rows affected.


count
768


dates disponibles

In [41]:
# Earliest and latest customer delivery dates in `orders`, truncated to the day.
order_dates = %sql select date_trunc('day',min(order_delivered_customer_date)),date_trunc('day',max(order_delivered_customer_date)) from orders
print(f'order_dates: \n{order_dates}')
# Earliest and latest review creation dates in `order_reviews`.
review_dates = %sql select min(review_creation_date),max(review_creation_date) from order_reviews
print(f'review_dates: \n{review_dates}')

 * postgresql://benj@localhost/olist
1 rows affected.
order_dates: 
+---------------------+---------------------+
|      date_trunc     |     date_trunc_1    |
+---------------------+---------------------+
| 2016-10-11 00:00:00 | 2018-10-17 00:00:00 |
+---------------------+---------------------+
 * postgresql://benj@localhost/olist
1 rows affected.
review_dates: 
+---------------------+---------------------+
|         min         |         max         |
+---------------------+---------------------+
| 2016-10-02 00:00:00 | 2018-08-31 00:00:00 |
+---------------------+---------------------+


Mode de paiement du montant le plus important

In [18]:
#def payment_types(main_table,agg_table):
#    """
#    """
#
#    req = """
#        select c.customer_unique_id,pa.payment_type,count(*) 
#        from customers c
#        join orders o  on c.customer_id = o.customer_id
#        join order_payments pa on pa.order_id = o.order_id
#        group by c.customer_unique_id,pa.payment_type
#        order by c.customer_unique_id    
#    """
#
#    agg = f"""{agg_table} as ({req})"""
#
#    return (agg,
#            list(
#            map( 
#                lambda payment_type : (
#                    f"select {agg_table}.count from {agg_table} where {agg_table}.payment_type = '{payment_type}' and {agg_table}.customer_unique_id = {main_table}.customer_unique_id",
#                    f'nb_{payment_type}'
#                ),
#                selected_payment_types()
#            )
#            )
#    )

#def payment_type_max_amount(main_table,agg_table):
#
#    req = """
#        select c.customer_unique_id,pa.payment_type,max(payment_value) 
#        from customers c
#        join orders o  on c.customer_id = o.customer_id
#        join order_payments pa on pa.order_id = o.order_id
#        group by c.customer_unique_id,pa.payment_type
#        order by c.customer_unique_id
#    """
#
#    
#
#    return (agg,
#            list(
#            map( 
#                lambda payment_type : (
#                    f"select {agg_table}.count from {agg_table} where {agg_table}.payment_type = '{payment_type}' and {agg_table}.customer_unique_id = {main_table}.customer_unique_id",
#                    f'nb_{payment_type}'
#                ),
#                selected_payment_types()
#            )
#            )
#    )

In [47]:
#payment types
def take(n, res):
    """Return a list holding element ``n`` of every item of ``res``."""
    return [item[n] for item in res]

def make_sub_reqs(agg_req, fields_reqs):
    """Bind an aggregation query and its field builders into one factory.

    The returned callable takes ``main_table`` and ``agg_table`` and returns a
    pair: the CTE definition ``"<agg_table> as (<agg_req>)"`` and the list of
    results obtained by calling every item of ``fields_reqs`` with the tuple
    ``(agg_table, main_table)``.
    """
    def build(main_table, agg_table):
        cte_definition = f"{agg_table} as ({agg_req})"
        table_names = (agg_table, main_table)
        field_entries = []
        for make_field in fields_reqs:
            field_entries.append(make_field(table_names))
        return (cte_definition, field_entries)

    return build

def make_fields_req():
    """Unfinished placeholder: builds nothing and always returns ``None``.

    The original sketch declared an inner builder taking
    ``(main_table, agg_table)`` that was never implemented nor returned.
    """
    return None

def main_req(*args, histo_date, limit=None):
    """Assemble the feature-extraction SQL query and return it as a string.

    Parameters:
        *args: sub-request builders. Each one is called with the keyword
            arguments ``main_table`` and ``agg_table`` and must return
            ``(cte_definition, [(field_request, field_name), ...])``.
        histo_date: upper bound (inclusive) applied to
            ``order_delivered_customer_date``; it is placed between single
            quotes in the generated SQL.
        limit: when not ``None``, a ``LIMIT`` clause with this value is added.

    Returns:
        The SQL text: an optional ``WITH`` section holding one CTE per builder
        (named ``agg_0``, ``agg_1``, ...), then a select on ``customers``
        joined to ``orders``, with one scalar sub-query column per field.

    NOTE: ``histo_date`` and ``limit`` are pasted into the SQL text as-is, so
    only trusted values should be passed.
    """
    main_table = 'M'

    # Every builder receives the main-table alias and its own CTE name.
    sub_reqs = [
        build(main_table=main_table, agg_table=f'agg_{position}')
        for position, build in enumerate(args)
    ]

    # Common table expressions; omitted entirely when there is no builder,
    # because a bare "WITH" is not valid SQL.
    cte_defs = [sub_req[0] for sub_req in sub_reqs]
    ctes = "WITH\n" + ',\n'.join(cte_defs) + "\n" if cte_defs else ''

    # Feature columns, each one a scalar sub-query aliased with its name.
    # The leading comma is only emitted when there is at least one column.
    field_defs = [
        f"({field[0]}) as {field[1]}"
        for field in itertools.chain.from_iterable(sub_req[1] for sub_req in sub_reqs)
    ]
    fields = ",\n" + ',\n'.join(field_defs) + "\n" if field_defs else ''

    limits_e = f'LIMIT {limit}' if limit is not None else ''

    # The table is named "orders"; "order" is a reserved word in SQL.
    return f"""
    {ctes} select {main_table}.customer_unique_id,{main_table}.customer_city,{main_table}.customer_state{fields} from customers {main_table}
    join orders tmp on tmp.customer_id = {main_table}.customer_id and tmp.order_delivered_customer_date <= '{histo_date}' 
    {limits_e}
    """

# configs = [(req_agg, [field_req, ...]), ...]
# req_agg is an aggregation request (it is executed as a CTE)
# field_req is a callable that:
# - expects a tuple t as parameter: t[0] is the name of the CTE, t[1] the name of the main table
# - returns (req, field_name)
#   where req is a request that should join with the previous CTE
#   and field_name is the feature name in the final DataFrame

def selected_payment_types():
    types = %sql select distinct payment_type from order_payments
    return list(filter(lambda p : p != 'not_defined',take(0,types)))

def make_payment_type(payment_type):
    """Build the field request counting payments of one ``payment_type``.

    The returned callable takes a tuple ``t`` (``t[0]``: CTE name, ``t[1]``:
    main table name) and returns ``(request, field_name)``, the field name
    being ``nb_<payment_type>``.
    """
    def field_req(t):
        request = f"select {t[0]}.count from {t[0]} where {t[0]}.payment_type = '{payment_type}' and {t[1]}.customer_unique_id = {t[0]}.customer_unique_id"
        field_name = f'nb_{payment_type}'
        return (request, field_name)

    return field_req

# Feature configuration: a list of (aggregation request, [field builders]).
# Each aggregation request is turned into a CTE; each field builder receives
# the tuple (CTE name, main table name) and returns (request, feature name).
configs = [ 
    # Number of payments per customer and payment type -> one nb_<type> column per type.
    ("""
        select c.customer_unique_id,pa.payment_type,count(*) 
        from customers c
        join orders o  on c.customer_id = o.customer_id
        join order_payments pa on pa.order_id = o.order_id
        group by c.customer_unique_id,pa.payment_type
    """,
        [ make_payment_type(payment_type) for payment_type in selected_payment_types() ]
    ),
    # Total amount paid per customer and payment type -> payment type with the largest total.
    ("""
        select t.customer_unique_id,t.payment_type,max(t.sum) from (
		  select c.customer_unique_id,pa.payment_type,sum(payment_value)
		  from customers c
		  join orders o  on c.customer_id = o.customer_id
		  join order_payments pa on pa.order_id = o.order_id
		  group by c.customer_unique_id,pa.payment_type
	  ) t group by t.customer_unique_id,t.payment_type
    """,
        [
            # The CTE holds one row per (customer, payment type); the rows must be
            # ordered by amount before LIMIT 1, otherwise an arbitrary type is returned.
            lambda t: (
                f"select {t[0]}.payment_type from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id order by {t[0]}.max desc nulls last LIMIT 1"
                ,
                "payment_type_max"
            )
        ]
    ),
    # Largest number of installments and largest payment sequence number per customer.
    ("""
        select c.customer_unique_id,max(payment_installments) as m1,max(payment_sequential) as m2
        from customers c
        join orders o  on c.customer_id = o.customer_id
        join order_payments pa on pa.order_id = o.order_id
        group by c.customer_unique_id
    """,
        [
            lambda t: (
                f"select {t[0]}.m1 from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id"
                ,
                "payment_installments_max"
            ),
            lambda t: (
                f"select {t[0]}.m2 from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id"
                ,
                "payment_sequential_max"
            )
        ]
    ),
    # Value (price + freight) of each order -> max / min / mean / stddev per customer.
    ("""
        select c.customer_unique_id,sum(price+freight_value) as order_value
        from customers c
        join orders o  on c.customer_id = o.customer_id
        join order_items i on i.order_id = o.order_id
        group by c.customer_unique_id,i.order_id
    """,
        [
            lambda t: (
                f"select max({t[0]}.order_value) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"
                ,
                "order_value_max"
            ),
            lambda t: (
                f"select min({t[0]}.order_value) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"
                ,
                "order_value_min"
            ),
            lambda t: (
                f"select avg({t[0]}.order_value) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"
                ,
                "order_value_mean"
            ),
            lambda t: (
                f"select stddev_pop({t[0]}.order_value) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"
                ,
                "order_value_stddev"
            )
        ]
    ),
    # Number of orders per customer.
    ("""
        select c.customer_unique_id,count(*)
        from customers c
        join orders o  on c.customer_id = o.customer_id
        group by c.customer_unique_id
    """,
        [
            lambda t: (
                f"select {t[0]}.count from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id"
                ,
                "nb_orders"
            )
        ]
    ),
    # Number of items of each order -> average per customer.
    ("""
        select c.customer_unique_id,count(*)
        from customers c
        join orders o  on c.customer_id = o.customer_id
        join order_items i on i.order_id = o.order_id
        group by c.customer_unique_id,i.order_id
    """,
        [
            lambda t: (
                f"select avg({t[0]}.count) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id"
                ,
                "nb_items_order_avg"
            )
        ]
    ),
    # Freight / price ratio of each item -> largest ratio per customer.
    # nullif() turns a zero price into NULL (ignored by max) instead of a
    # division-by-zero error.
    ("""
        select c.customer_unique_id,i.freight_value/nullif(i.price,0) as r
        from customers c
        join orders o on c.customer_id = o.customer_id
        join order_items i on i.order_id = o.order_id
    """,
        [
            lambda t: (
                f"select max({t[0]}.r) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"
                ,
                "r_freight_price_max"
            )
        ]
    ),
    # Total price per customer and product category -> category with the largest total.
    ("""
        select c.customer_unique_id,p.product_category_name,sum(i.price)
        from customers c
        join orders o on c.customer_id = o.customer_id
        join order_items i on i.order_id = o.order_id
        join products p on p.product_id = i.product_id
        group by c.customer_unique_id,p.product_category_name
    """,
        [
            lambda t: (
                f"""select {t[0]}.product_category_name from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id and
                {t[0]}.sum = (select max({t[0]}.sum) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1) 
                LIMIT 1
                """
                ,
                "product_cat_total_max"
            )
        ]
    ),
    # Review scores of the customer's orders -> min / avg / max per customer.
    ("""
        select c.customer_unique_id,r.review_score
        from customers c
        join orders o on c.customer_id = o.customer_id
        join order_reviews r on r.order_id = o.order_id
    """,
        [
            lambda t: (
                f"""select min({t[0]}.review_score) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"""
                ,
                "review_score_min"
            ),
            lambda t: (
                f"""select avg({t[0]}.review_score) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"""
                ,
                "review_score_avg"
            ),
            lambda t: (
                f"""select max({t[0]}.review_score) from {t[0]} where {t[1]}.customer_unique_id = {t[0]}.customer_unique_id LIMIT 1"""
                ,
                "review_score_max"
            )            
        ]
    ),             
]

def generate_df(histo_date):
    main_req = main_req(*list(map(lambda config:make_sub_reqs(*config), configs)),histo_date=histo_date,limit=10)
    print(main_req)
    r = %sql $main_req
    return r.DataFrame()


 * postgresql://benj@localhost/olist
5 rows affected.

    WITH
agg_0 as (
        select c.customer_unique_id,pa.payment_type,count(*) 
        from customers c
        join orders o  on c.customer_id = o.customer_id
        join order_payments pa on pa.order_id = o.order_id
        group by c.customer_unique_id,pa.payment_type
    ),
agg_1 as (
        select t.customer_unique_id,t.payment_type,max(t.sum) from (
		  select c.customer_unique_id,pa.payment_type,sum(payment_value)
		  from customers c
		  join orders o  on c.customer_id = o.customer_id
		  join order_payments pa on pa.order_id = o.order_id
		  group by c.customer_unique_id,pa.payment_type
	  ) t group by t.customer_unique_id,t.payment_type
    ),
agg_2 as (
        select c.customer_unique_id,max(payment_installments) as m1,max(payment_sequential) as m2
        from customers c
        join orders o  on c.customer_id = o.customer_id
        join order_payments pa on pa.order_id = o.order_id
        group by c.customer_u