In [1]:
import sqlalchemy as sa
from sqlalchemy import orm
from dotenv import load_dotenv
import pandas as pd
import os
import sys

# Put the notebook's parent directory on the import path (presumably so that
# project-local packages living one level up can be imported from here).
root_path = os.path.abspath(os.pardir)
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)
    

# Load the DB_* settings from a .env file into the process environment.
load_dotenv()

# Build the URL with SQLAlchemy rather than str.format: URL.create escapes
# reserved characters in the credentials (e.g. '@', ':', '/', '%'), which would
# otherwise produce a malformed or mis-parsed connection string. Unset
# variables are passed as None (component omitted) instead of the text "None".
_db_port = os.getenv('DB_PORT')
conn_string = sa.engine.URL.create(
    drivername='postgresql',
    username=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=int(_db_port) if _db_port else None,
    database='cosmetic',
).render_as_string(hide_password=False)  # keep `conn_string` a plain str

# Engine shared by the cells below.
engine = sa.create_engine(conn_string)

In [None]:
# Aggregate the raw `events` rows per product / category / week inside
# PostgreSQL, load the result into `df`, then replace the contents of
# `data_by_week` with it.
with engine.connect() as conn:
    df = pd.read_sql(
        sql='''
            SELECT 
                e.product_id,
                e.category_id,
                MAX(e.price) as max_price,
                MIN(e.price) as min_price,
                AVG(e.price) as avg_price,
                DATE_TRUNC('WEEK', e.event_time)::DATE as date,
                COUNT(*) FILTER(WHERE event_type = 'view') as view,
                COUNT(*) FILTER(WHERE event_type = 'cart') as cart,
                COUNT(*) FILTER(WHERE event_type = 'remove_from_cart') as remove_from_cart,
                COUNT(*) FILTER(WHERE event_type = 'purchase') as purchase
            FROM
                events e
            GROUP BY
                e.product_id, e.category_id, DATE_TRUNC('WEEK', e.event_time)::DATE 
            ORDER BY
                e.category_id ASC,
                e.product_id ASC,
                date ASC
                
        ''',
        con=conn,
    )

    # Empty the target table first: the write below only appends.
    truncate_stmt = sa.text('TRUNCATE TABLE data_by_week')
    conn.execute(truncate_stmt)

    # Write the frame back in batches of 1000 rows, storing `date` as SQL DATE.
    to_sql_options = dict(
        name='data_by_week',
        con=conn,
        if_exists='append',
        index=False,
        chunksize=1000,
        dtype={'date': sa.types.DATE},
    )
    df.to_sql(**to_sql_options)

    # Commit the truncate and the inserts issued on this connection.
    conn.commit()

print('Count:', len(df))
df.head(n=15)

Count: 808361


Unnamed: 0,product_id,category_id,max_price,min_price,avg_price,date,view,cart,remove_from_cart,purchase
0,5916194,1487580004807082827,11.7,11.7,11.7,2019-12-23,5,0,0,0
1,5916194,1487580004807082827,11.7,11.7,11.7,2019-12-30,3,0,0,0
2,5916194,1487580004807082827,11.7,11.7,11.7,2020-01-06,1,0,0,0
3,5916194,1487580004807082827,11.7,11.7,11.7,2020-01-13,6,1,0,1
4,5916194,1487580004807082827,11.7,11.7,11.7,2020-01-20,6,0,0,0
5,5916194,1487580004807082827,11.7,11.7,11.7,2020-01-27,1,1,1,0
6,5916194,1487580004807082827,11.7,11.7,11.7,2020-02-03,3,0,0,0
7,5916194,1487580004807082827,11.7,11.7,11.7,2020-02-10,5,1,0,0
8,5916194,1487580004807082827,11.7,11.7,11.7,2020-02-17,2,0,0,0
9,5916195,1487580004807082827,12.41,11.7,11.995833,2019-12-23,10,2,0,0


In [6]:
# ranking by sales of the previous month
from collections import defaultdict
import datetime

from schemas.data_by_week import DataByWeek
from schemas.product import Product

# Assign `rank` (overall) and `rank_in_category` to every DataByWeek row, week
# by week, ordered by each product's purchases over the preceding four weeks.
# The `session.begin()` context manager commits on a clean exit and rolls back
# on an exception, so no explicit commit is needed at the end of the block.
with orm.Session(engine) as session, session.begin():
    d = datetime.date(2019, 9, 30)   # first week to rank; assumed to match the weekly dates in data_by_week
    end = datetime.date(2020, 3, 1)  # exclusive upper bound
    one_week = datetime.timedelta(weeks=1)
    window = datetime.timedelta(weeks=4)  # length of the trailing sales window

    # (product_id, category_id) -> Product
    products = dict[tuple[int, int], Product]()
    # (product_id, category_id) -> purchases summed over the trailing window
    sales = defaultdict[tuple[int, int], int](int)
    # week -> {(product_id, category_id) -> purchases recorded for that week}
    memorized = defaultdict[datetime.date, defaultdict[tuple[int, int], int]](lambda: defaultdict(int))

    for row in session.scalars(sa.select(Product)).all():
        products[(row.product_id, row.category_id)] = row

    def key_func(data: DataByWeek):
        """Sort key for ranking a weekly row.

        Orders by trailing-window sales (highest first), then by
        `data.date - product.release_date` ascending, then by
        `product.max_price` ascending.

        Raises:
            KeyError: if the row's (product_id, category_id) is not in `products`.
        """
        key = (data.product_id, data.category_id)
        product = products[key]
        return -sales[key], data.date - product.release_date, product.max_price

    while d < end:
        rows = list[DataByWeek]()
        categories = defaultdict[int, list[DataByWeek]](list[DataByWeek])
        for row in session.scalars(sa.select(DataByWeek).where(DataByWeek.date == d)):
            rows.append(row)
            categories[row.category_id].append(row)

        # Rank within each category. At this point `sales` only contains the
        # weeks before `d`, so the current week's purchases do not affect it.
        for category in categories.values():
            category.sort(key=key_func)
            for index, row in enumerate(category):
                row.rank_in_category = index + 1

        # Rank across all categories with the same key.
        rows.sort(key=key_func)
        for index, row in enumerate(rows):
            row.rank = index + 1

        # Slide the window forward. The expiring week must be subtracted for
        # EVERY product recorded in it, not only for products that also have a
        # row this week; otherwise a product with a gap week keeps its old
        # purchases in `sales` forever. `pop` with a default also tolerates
        # weeks for which nothing was recorded.
        pre_d = d - window
        for key, purchased in memorized.pop(pre_d, {}).items():
            sales[key] -= purchased
        for row in rows:
            key = (row.product_id, row.category_id)
            sales[key] += row.purchase
            memorized[d][key] = row.purchase

        # Push this week's rank updates to the database before loading the next week.
        session.flush()
        d += one_week