In [1]:
import math
import pickle
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from neuralprophet import NeuralProphet
from tqdm import tqdm, trange

from constants import (
    DEBUG_FORECAST_FOLDER,
    DEBUG_POSTGRESQL_PARQUET_TRAIN,
    DEBUG_POSTGRESQL_PARQUET_FUTURE,
    PG_LOG_DTYPES,
)
from forecast_metadata import ForecastMD
from generated_forecast_md import GeneratedForecastMD


In [12]:
# Load the training query log: read every Parquet file in `pq_files`, convert
# `log_time` to UTC, and drop rows whose `query_template` is empty.
pq_files = [Path(DEBUG_POSTGRESQL_PARQUET_TRAIN)]
print(f"Parquet files: {pq_files}")
# Cleaned per-file frames are collected here and combined after the loop, so
# that listing more than one file does not silently keep only the last one.
cleaned_frames = []
for pq_file in tqdm(pq_files, desc="Reading Parquet files.", disable=True):
    file_df = pd.read_parquet(pq_file)
    file_df["log_time"] = file_df["log_time"].dt.tz_convert("UTC")
    print(f"{pq_file} has timestamps from {file_df['log_time'].min()} to {file_df['log_time'].max()}.")
    # Treat empty-string templates as missing so dropna() can remove them.
    file_df["query_template"] = file_df["query_template"].replace("", np.nan)
    dropna_before = file_df.shape[0]
    file_df = file_df.dropna(subset=["query_template"])
    dropna_after = file_df.shape[0]
    print(
        f"Dropped {dropna_before - dropna_after} empty query template rows in {pq_file}. {dropna_after} rows remain."
    )
    cleaned_frames.append(file_df)

# Each file's original row index is kept as-is (no renumbering), so a
# single-file run yields the same frame as before.
df = pd.concat(cleaned_frames)
df.head()

Parquet files: [PosixPath('artifacts/tmp/data/train.parquet')]
artifacts/tmp/data/train.parquet has timestamps from 2022-05-08 03:10:49.613000+00:00 to 2022-05-08 03:11:06.850000+00:00.
Dropped 15 empty query template rows in artifacts/tmp/data/train.parquet. 91880 rows remain.


Unnamed: 0,log_time,session_id,session_line_num,virtual_transaction_id,transaction_id,query_template,query_params
12,2022-05-08 03:10:51.151000+00:00,627734bb.115ed,3,3/3,0,BEGIN,[]
13,2022-05-08 03:10:51.152000+00:00,627734bb.115ed,4,3/3,0,SET extra_float_digits = $1,['3']
14,2022-05-08 03:10:51.153000+00:00,627734bb.115ed,5,3/3,0,SET application_name = $1,['PostgreSQL JDBC Driver']
15,2022-05-08 03:10:51.153000+00:00,627734bb.115ed,6,3/3,0,COMMIT,[]
16,2022-05-08 03:10:51.176000+00:00,627734bb.115ed,7,3/4,0,select current_schema(),[]


In [49]:
# Count the logged rows per virtual transaction, then keep the IDs of the
# transactions that have between 5 and 20 rows (both bounds inclusive).
txn_sizes = df.groupby("virtual_transaction_id").size()
vids = txn_sizes[txn_sizes.between(5, 20)].index
vids[-20:]

Index(['3/97', '3/970', '3/973', '3/974', '3/976', '3/978', '3/98', '3/981',
       '3/984', '3/986', '3/987', '3/988', '3/989', '3/99', '3/990', '3/993',
       '3/994', '3/996', '3/997', '3/999'],
      dtype='object', name='virtual_transaction_id')

In [58]:
# Show query templates and parameters in full instead of truncating long
# cells. This is a session-wide pandas option, so it stays in effect for
# cells executed after this one.
pd.options.display.max_colwidth = None
# Preview the rows of the first 10 selected transactions. Slicing up front
# (instead of breaking out of the loop after the print) avoids emitting an
# extra transaction id with no table underneath it.
for vid in vids[:10]:
    print(vid)
    display(df[df["virtual_transaction_id"] == vid][["query_template", "query_params"]])


3/100


Unnamed: 0,query_template,query_params
1982,BEGIN,[]
1983,"SELECT C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_ID = $3","['1.0', '2.0', '1823.0']"
1984,"SELECT O_ID, O_CARRIER_ID, O_ENTRY_D FROM oorder WHERE O_W_ID = $1 AND O_D_ID = $2 AND O_C_ID = $3 ORDER BY O_ID DESC LIMIT $4","['1.0', '2.0', '1823.0', '1']"
1985,"SELECT OL_I_ID, OL_SUPPLY_W_ID, OL_QUANTITY, OL_AMOUNT, OL_DELIVERY_D FROM order_line WHERE OL_O_ID = $1 AND OL_D_ID = $2 AND OL_W_ID = $3","['2443.0', '2.0', '1.0']"
1986,COMMIT,[]


3/1002


Unnamed: 0,query_template,query_params
29105,BEGIN,[]
29106,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['3524.550048828125', '1.0']"
29107,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29108,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['3524.550048828125', '1.0', '5.0']"
29109,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '5.0']"
29110,"SELECT C_FIRST, C_MIDDLE, C_ID, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_LAST = $3 ORDER BY C_FIRST","['1.0', '5.0', 'PRESOUGHTABLE']"
29111,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['3534.550048828125', '3534.550048828125', '2.0', '1.0', '5.0', '2674.0']"
29112,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['5.0', '1.0', '2674.0', '5.0', '1.0', '2022-05-07 23:10:56.843', '3524.550048828125', 'zexjfddn rqefnmwm']"
29113,COMMIT,[]


3/1004


Unnamed: 0,query_template,query_params
29178,BEGIN,[]
29179,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['1379.47998046875', '1.0']"
29180,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29181,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['1379.47998046875', '1.0', '5.0']"
29182,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '5.0']"
29183,"SELECT C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_ID = $3","['1.0', '6.0', '1267.0']"
29184,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['1389.47998046875', '1389.47998046875', '2.0', '1.0', '6.0', '1267.0']"
29185,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['6.0', '1.0', '1267.0', '5.0', '1.0', '2022-05-07 23:10:56.85', '1379.47998046875', 'zexjfddn rqefnmwm']"
29186,COMMIT,[]


3/1005


Unnamed: 0,query_template,query_params
29187,BEGIN,[]
29188,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['3075.60009765625', '1.0']"
29189,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29190,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['3075.60009765625', '1.0', '10.0']"
29191,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '10.0']"
29192,"SELECT C_FIRST, C_MIDDLE, C_ID, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_LAST = $3 ORDER BY C_FIRST","['1.0', '10.0', 'CALLYOUGHTANTI']"
29193,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['3085.60009765625', '3085.60009765625', '2.0', '1.0', '10.0', '717.0']"
29194,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['10.0', '1.0', '717.0', '10.0', '1.0', '2022-05-07 23:10:56.851', '3075.60009765625', 'zexjfddn vxcdym']"
29195,COMMIT,[]


3/1006


Unnamed: 0,query_template,query_params
29196,BEGIN,[]
29197,"SELECT C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_ID = $3","['1.0', '6.0', '1091.0']"
29198,"SELECT O_ID, O_CARRIER_ID, O_ENTRY_D FROM oorder WHERE O_W_ID = $1 AND O_D_ID = $2 AND O_C_ID = $3 ORDER BY O_ID DESC LIMIT $4","['1.0', '6.0', '1091.0', '1']"
29199,"SELECT OL_I_ID, OL_SUPPLY_W_ID, OL_QUANTITY, OL_AMOUNT, OL_DELIVERY_D FROM order_line WHERE OL_O_ID = $1 AND OL_D_ID = $2 AND OL_W_ID = $3","['1278.0', '6.0', '1.0']"
29200,COMMIT,[]


3/1008


Unnamed: 0,query_template,query_params
29257,BEGIN,[]
29258,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['4632.1201171875', '1.0']"
29259,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29260,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['4632.1201171875', '1.0', '4.0']"
29261,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '4.0']"
29262,"SELECT C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_ID = $3","['1.0', '4.0', '1054.0']"
29263,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['4642.1201171875', '4642.1201171875', '2.0', '1.0', '4.0', '1054.0']"
29264,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['4.0', '1.0', '1054.0', '4.0', '1.0', '2022-05-07 23:10:56.856', '4632.1201171875', 'zexjfddn orasqk']"
29265,COMMIT,[]


3/1010


Unnamed: 0,query_template,query_params
29310,BEGIN,[]
29311,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['3007.3701171875', '1.0']"
29312,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29313,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['3007.3701171875', '1.0', '2.0']"
29314,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '2.0']"
29315,"SELECT C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_ID = $3","['1.0', '2.0', '507.0']"
29316,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['3017.3701171875', '3017.3701171875', '2.0', '1.0', '2.0', '507.0']"
29317,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['2.0', '1.0', '507.0', '2.0', '1.0', '2022-05-07 23:10:56.878', '3007.3701171875', 'zexjfddn mcjzpbewb']"
29318,COMMIT,[]


3/1011


Unnamed: 0,query_template,query_params
29319,BEGIN,[]
29320,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['4138.91015625', '1.0']"
29321,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29322,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['4138.91015625', '1.0', '3.0']"
29323,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '3.0']"
29324,"SELECT C_FIRST, C_MIDDLE, C_ID, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_LAST = $3 ORDER BY C_FIRST","['1.0', '3.0', 'PRESANTIEING']"
29325,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['4148.91015625', '4148.91015625', '2.0', '1.0', '3.0', '2584.0']"
29326,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['3.0', '1.0', '2584.0', '3.0', '1.0', '2022-05-07 23:10:56.879', '4138.91015625', 'zexjfddn dkkqce']"
29327,COMMIT,[]


3/1012


Unnamed: 0,query_template,query_params
29328,BEGIN,[]
29329,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['3311.530029296875', '1.0']"
29330,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29331,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['3311.530029296875', '1.0', '5.0']"
29332,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '5.0']"
29333,"SELECT C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_ID = $3","['1.0', '5.0', '2037.0']"
29334,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['3321.530029296875', '3321.530029296875', '2.0', '1.0', '5.0', '2037.0']"
29335,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['5.0', '1.0', '2037.0', '5.0', '1.0', '2022-05-07 23:10:56.88', '3311.530029296875', 'zexjfddn rqefnmwm']"
29336,COMMIT,[]


3/1019


Unnamed: 0,query_template,query_params
29609,BEGIN,[]
29610,UPDATE warehouse SET W_YTD = W_YTD + $1 WHERE W_ID = $2,"['798.2000122070312', '1.0']"
29611,"SELECT W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, W_NAME FROM warehouse WHERE W_ID = $1",['1.0']
29612,UPDATE district SET D_YTD = D_YTD + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['798.2000122070312', '1.0', '7.0']"
29613,"SELECT D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, D_NAME FROM district WHERE D_W_ID = $1 AND D_ID = $2","['1.0', '7.0']"
29614,"SELECT C_FIRST, C_MIDDLE, C_ID, C_STREET_1, C_STREET_2, C_CITY, C_STATE, C_ZIP, C_PHONE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, C_YTD_PAYMENT, C_PAYMENT_CNT, C_SINCE FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_LAST = $3 ORDER BY C_FIRST","['1.0', '7.0', 'EINGPRESABLE']"
29615,"UPDATE customer SET C_BALANCE = -$1, C_YTD_PAYMENT = $2, C_PAYMENT_CNT = $3 WHERE C_W_ID = $4 AND C_D_ID = $5 AND C_ID = $6","['808.2000122070312', '808.2000122070312', '2.0', '1.0', '7.0', '943.0']"
29616,"INSERT INTO history (H_C_D_ID, H_C_W_ID, H_C_ID, H_D_ID, H_W_ID, H_DATE, H_AMOUNT, H_DATA) VALUES ($1,$2,$3,$4,$5,$6,$7,$8)","['7.0', '1.0', '943.0', '7.0', '1.0', '2022-05-07 23:10:56.905', '798.2000122070312', 'zexjfddn bdayjnrz']"
29617,COMMIT,[]


3/102


In [60]:
# Inspect one specific transaction: show the query template and parameters of
# every row logged under this virtual transaction id.
target_vid = '3/103'
is_target = df["virtual_transaction_id"] == target_vid
display(df.loc[is_target, ["query_template", "query_params"]])

Unnamed: 0,query_template,query_params
2056,BEGIN,[]
2057,"SELECT C_DISCOUNT, C_LAST, C_CREDIT FROM customer WHERE C_W_ID = $1 AND C_D_ID = $2 AND C_ID = $3","['1.0', '4.0', '155.0']"
2058,SELECT W_TAX FROM warehouse WHERE W_ID = $1,['1.0']
2059,"SELECT D_NEXT_O_ID, D_TAX FROM district WHERE D_W_ID = $1 AND D_ID = $2 FOR UPDATE","['1.0', '4.0']"
2060,UPDATE district SET D_NEXT_O_ID = D_NEXT_O_ID + $1 WHERE D_W_ID = $2 AND D_ID = $3,"['1', '1.0', '4.0']"
2061,"INSERT INTO oorder (O_ID, O_D_ID, O_W_ID, O_C_ID, O_ENTRY_D, O_OL_CNT, O_ALL_LOCAL) VALUES ($1, $2, $3, $4, $5, $6, $7)","['3003.0', '4.0', '1.0', '155.0', '2022-05-07 23:10:52.036', '6.0', '1.0']"
2062,"INSERT INTO new_order (NO_O_ID, NO_D_ID, NO_W_ID) VALUES ( $1, $2, $3)","['3003.0', '4.0', '1.0']"
2063,"SELECT I_PRICE, I_NAME , I_DATA FROM item WHERE I_ID = $1",['5158.0']
2064,"SELECT S_QUANTITY, S_DATA, S_DIST_01, S_DIST_02, S_DIST_03, S_DIST_04, S_DIST_05, S_DIST_06, S_DIST_07, S_DIST_08, S_DIST_09, S_DIST_10 FROM stock WHERE S_I_ID = $1 AND S_W_ID = $2 FOR UPDATE","['5158.0', '1.0']"
2065,"SELECT I_PRICE, I_NAME , I_DATA FROM item WHERE I_ID = $1",['71863.0']
