In [2]:
import sqlalchemy as sa
from sqlalchemy import orm
from dotenv import load_dotenv
import pandas as pd
import os
import sys

# Put the parent of the current working directory on sys.path so that
# project-local packages can be imported from this notebook
# (presumably `schemas` and `utils` live there — confirm against the repo layout).
root_path = os.path.abspath('..')
if root_path not in sys.path:
    sys.path.append(root_path)

# Load DB_* settings from a .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message instead of silently connecting as the
# literal user/host "None" when the environment is not configured.
missing_vars = [name for name in ('DB_USER', 'DB_HOST') if not os.getenv(name)]
if missing_vars:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_vars)
    )

db_port = os.getenv('DB_PORT')

# Build the URL from components rather than string formatting: URL.create
# escapes special characters (e.g. '@', '/', ':') in the user name/password,
# which would otherwise produce a malformed connection string.
conn_url = sa.engine.URL.create(
    drivername='postgresql',
    username=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=int(db_port) if db_port else None,  # None -> driver default port
    database='cosmetic',
)

# Kept for backward compatibility with code that expects a plain string.
# NOTE: contains the clear-text password; do not print or display it.
conn_string = conn_url.render_as_string(hide_password=False)
engine = sa.create_engine(conn_url)

In [2]:
# Rebuild `data_by_date`: one row per (product_id, category_id, calendar day)
# holding the min/max/avg price and the number of events of each type.
# Only events whose product matches a `products` row with
# events_count >= 107 and max_price > 0 are kept (see the INNER JOIN).
DAILY_EVENTS_SQL = '''
            SELECT 
                e.product_id,
                e.category_id,
                MAX(e.price) as max_price,
                MIN(e.price) as min_price,
                AVG(e.price) as avg_price,
                e.event_time::DATE as date,
                COUNT(*) FILTER(WHERE event_type = 'view') as view,
                COUNT(*) FILTER(WHERE event_type = 'cart') as cart,
                COUNT(*) FILTER(WHERE event_type = 'remove_from_cart') as remove_from_cart,
                COUNT(*) FILTER(WHERE event_type = 'purchase') as purchase
            FROM
                events e
                INNER JOIN products p ON p.product_id = e.product_id AND p.category_id = e.category_id AND p.events_count >= 107 AND p.max_price > 0
            GROUP BY
                e.product_id, e.category_id, e.event_time::DATE 
            ORDER BY
                e.category_id ASC,
                e.product_id ASC,
                date ASC
                
        '''

with engine.connect() as connection:
    # Run the aggregation and pull the full result into memory.
    df = pd.read_sql(DAILY_EVENTS_SQL, con=connection)

    # Empty the target table, then reload it from the frame on the same
    # connection; the explicit commit below makes both steps permanent.
    connection.execute(sa.text('TRUNCATE TABLE data_by_date'))

    df.to_sql(
        'data_by_date',
        connection,
        index=False,
        if_exists='append',
        chunksize=1000,  # rows per INSERT batch
        dtype={'date': sa.types.DATE},  # store `date` as a SQL DATE column
    )

    connection.commit()

# Show the aggregated frame as the cell output.
df

Count: 2684167


Unnamed: 0,product_id,category_id,max_price,min_price,avg_price,date,view,cart,remove_from_cart,purchase
0,5706113,1487580004832248652,15.08,15.08,15.08,2019-10-01,2,3,0,0
1,5706113,1487580004832248652,15.08,15.08,15.08,2019-10-02,7,0,0,0
2,5706113,1487580004832248652,15.08,15.08,15.08,2019-10-04,1,0,0,0
3,5706113,1487580004832248652,15.08,15.08,15.08,2019-10-05,0,1,0,0
4,5706113,1487580004832248652,15.08,15.08,15.08,2019-10-07,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2684162,5928473,2242903426784559183,12.54,12.54,12.54,2020-02-25,58,4,0,1
2684163,5928473,2242903426784559183,12.54,12.54,12.54,2020-02-26,12,1,0,1
2684164,5928473,2242903426784559183,12.54,12.54,12.54,2020-02-27,12,0,2,0
2684165,5928473,2242903426784559183,12.54,12.54,12.54,2020-02-28,1,0,0,0


In [4]:
# ranking by sales of the previous month
from collections import defaultdict
import datetime

from schemas.data_by_date import DataByDate
from schemas.product import Product
from utils.utils import date_range

# Length of the look-back window used for the "previous sales" ranking key.
RANK_WINDOW = datetime.timedelta(days=30)
ONE_DAY = datetime.timedelta(days=1)

# `session.begin()` commits when the block exits normally and rolls back on
# an exception, so no explicit commit is needed at the end of the block.
with orm.Session(engine) as session, session.begin():
    d = datetime.date(2019, 10, 1)
    end = datetime.date(2020, 3, 1)  # exclusive upper bound of the day loop

    # All products, indexed by their composite (product_id, category_id) key.
    products: dict[tuple[int, int], Product] = {
        (product.product_id, product.category_id): product
        for product in session.scalars(sa.select(Product))
    }

    # date -> {(product_id, category_id): purchases recorded on that date}.
    # Only dates still inside the look-back window are retained.
    memorized: dict[datetime.date, dict[tuple[int, int], int]] = {}

    # Per-day cache of the windowed purchase sum; cleared whenever `d` moves.
    prev_sales_cache: dict[tuple[int, int], int] = {}

    def key_func(data: DataByDate):
        """Return the sort key used to rank `data` on the current day `d`.

        Ordering: more purchases within the look-back window first, then
        fewer days since the product's release date, then lower max price.
        The (product_id, category_id) key is the final tie-breaker so that
        equal rows are ranked deterministically instead of in whatever
        order the database happened to return them.
        """
        key = (data.product_id, data.category_id)
        product = products[key]
        prev_sales = prev_sales_cache.get(key)
        if prev_sales is None:
            # `memorized` stores plain purchase counts, so they are summed
            # directly. `.get` avoids inserting empty entries for misses.
            # NOTE(review): `date_range` is a project helper; presumably it
            # yields each day from the start towards `d` — confirm whether
            # the end is inclusive. Either way `memorized` has no entry for
            # `d` itself while sorting, so today's purchases never count.
            prev_sales = sum(
                memorized.get(p, {}).get(key, 0)
                for p in date_range(d - RANK_WINDOW, d, ONE_DAY)
            )
            prev_sales_cache[key] = prev_sales
        return -prev_sales, data.date - product.release_date, product.max_price, key

    while d < end:
        print(d)
        prev_sales_cache.clear()  # the window is relative to `d`

        rows = list(session.scalars(sa.select(DataByDate).where(DataByDate.date == d)))
        categories: defaultdict[int, list[DataByDate]] = defaultdict(list)
        for row in rows:
            categories[row.category_id].append(row)

        # Rank inside each category (1 = best).
        for category in categories.values():
            category.sort(key=key_func)
            for index, row in enumerate(category, start=1):
                row.rank_in_category = index

        # Rank across all products of the day (1 = best).
        rows.sort(key=key_func)
        for index, row in enumerate(rows, start=1):
            row.rank = index

        # Record today's purchases only after ranking, then drop the day
        # that the next iteration's window no longer covers. `pop` with a
        # default tolerates days for which nothing was ever recorded.
        memorized[d] = {
            (row.product_id, row.category_id): row.purchase for row in rows
        }
        memorized.pop(d - RANK_WINDOW, None)

        # Push this day's rank updates to the database within the open
        # transaction before moving on.
        session.flush()
        d += ONE_DAY

2019-10-01
2019-10-02
2019-10-03
2019-10-04
2019-10-05
2019-10-06
2019-10-07
2019-10-08
2019-10-09
2019-10-10
2019-10-11
2019-10-12
2019-10-13
2019-10-14
2019-10-15
2019-10-16
2019-10-17
2019-10-18
2019-10-19
2019-10-20
2019-10-21
2019-10-22
2019-10-23
2019-10-24
2019-10-25
2019-10-26
2019-10-27
2019-10-28
2019-10-29
2019-10-30
2019-10-31
2019-11-01
2019-11-02
2019-11-03
2019-11-04
2019-11-05
2019-11-06
2019-11-07
2019-11-08
2019-11-09
2019-11-10
2019-11-11
2019-11-12
2019-11-13
2019-11-14
2019-11-15
2019-11-16
2019-11-17
2019-11-18
2019-11-19
2019-11-20
2019-11-21
2019-11-22
2019-11-23
2019-11-24
2019-11-25
2019-11-26
2019-11-27
2019-11-28
2019-11-29
2019-11-30
2019-12-01
2019-12-02
2019-12-03
2019-12-04
2019-12-05
2019-12-06
2019-12-07
2019-12-08
2019-12-09
2019-12-10
2019-12-11
2019-12-12
2019-12-13
2019-12-14
2019-12-15
2019-12-16
2019-12-17
2019-12-18
2019-12-19
2019-12-20
2019-12-21
2019-12-22
2019-12-23
2019-12-24
2019-12-25
2019-12-26
2019-12-27
2019-12-28
2019-12-29
2019-12-30

In [3]:
# calculate normalization parameters
import json

# Single-row query returning min / max / mean / sample standard deviation of
# every feature used for normalization: price, rank, rank_in_category,
# days on shelf (date - release_date), view, cart, remove_from_cart, purchase.
NORMALIZATION_SQL = '''
            SELECT
                MIN(d.max_price) AS price_min,
                MIN(rank) AS rank_min,
                MIN(rank_in_category) AS rank_in_category_min,
                MIN((d.date - p.release_date)::INT) AS days_on_shelf_min,
                MIN(view) AS view_min,
                MIN(cart) AS cart_min,
                MIN(remove_from_cart) AS remove_from_cart_min,
                MIN(purchase) AS purchase_min,

                MAX(d.max_price) AS price_max,
                MAX(rank) AS rank_max,
                MAX(rank_in_category) AS rank_in_category_max,
                MAX((d.date - p.release_date)::INT) AS days_on_shelf_max,
                MAX(view) AS view_max,
                MAX(cart) AS cart_max,
                MAX(remove_from_cart) AS remove_from_cart_max,
                MAX(purchase) AS purchase_max,

                AVG(d.max_price) AS price_mean,
                AVG(rank) AS rank_mean,
                AVG(rank_in_category) AS rank_in_category_mean,
                AVG((d.date - p.release_date)::INT) AS days_on_shelf_mean,
                AVG(view) AS view_mean,
                AVG(cart) AS cart_mean,
                AVG(remove_from_cart) AS remove_from_cart_mean,
                AVG(purchase) AS purchase_mean,

                STDDEV(d.max_price) AS price_std,
                STDDEV(rank) AS rank_std,
                STDDEV(rank_in_category) AS rank_in_category_std,
                STDDEV((d.date - p.release_date)::INT) AS days_on_shelf_std,
                STDDEV(view) AS view_std,
                STDDEV(cart) AS cart_std,
                STDDEV(remove_from_cart) AS remove_from_cart_std,
                STDDEV(purchase) AS purchase_std

            FROM data_by_date d
                INNER JOIN products p ON p.product_id = d.product_id AND p.category_id = d.category_id
        '''

# Destination of the exported parameters, relative to the working directory.
PARAMETERS_PATH = '../datasets/parameters.json'

with engine.connect() as connection:
    parameter_df = pd.read_sql(NORMALIZATION_SQL, con=connection)

    # The aggregate query yields exactly one row; export it as a flat
    # {"<feature>_<statistic>": value} JSON object.
    parameters = parameter_df.to_dict('records')[0]
    with open(PARAMETERS_PATH, 'w') as out_file:
        json.dump(parameters, out_file, indent=4)

    connection.commit()

# Show the one-row parameter frame as the cell output.
parameter_df

Unnamed: 0,price_min,rank_min,rank_in_category_min,days_on_shelf_min,view_min,cart_min,remove_from_cart_min,purchase_min,price_max,rank_max,...,remove_from_cart_mean,purchase_mean,price_std,rank_std,rank_in_category_std,days_on_shelf_std,view_std,cart_std,remove_from_cart_std,purchase_std
0,0.05,1,1,0,0,0,0,0,327.78,21304,...,1.415982,0.46467,14.000933,5194.34817,148.911938,44.010973,10.370086,5.673207,2.965847,1.378449
