In [23]:
from glob import glob
from pathlib import Path
from tqdm.auto import tqdm
import gc

import numpy as np
import pandas as pd
import cudf


# memory management for cudf
cudf.set_option("default_integer_bitwidth", 32)
cudf.set_option("default_float_bitwidth", 32)

In [24]:
# save parquet format form jsonl
input_dir = Path("../../input")
phase = "test"
chunksize = 100000

chunks = pd.read_json(input_dir / f"{phase}.jsonl", lines=True, chunksize=chunksize)
Path.mkdir(input_dir / f"{phase}_parquet", exist_ok=True)

for i, chunk in enumerate(tqdm(chunks)):
    event_dict = {
        "session": [],
        "aid": [],
        "ts": [],
        "type": [],
    }

    for session, events in zip(chunk["session"].tolist(), chunk["events"].tolist()):
        for event in events:
            event_dict["session"].append(session)
            event_dict["aid"].append(event["aid"])
            event_dict["ts"].append(event["ts"])
            event_dict["type"].append(event["type"])

    start = str(i * chunksize).zfill(9)
    end = str(i * chunksize + chunksize).zfill(9)
    event_df = pd.DataFrame(event_dict)
    event_df.to_parquet(input_dir / f"{phase}_parquet" / f"{start}_{end}.parquet")
    
    del chunk, event_dict, event_df
    gc.collect()

0it [00:00, ?it/s]

In [4]:
input_dir = Path("../../input")
phase = "test"
dfs = []
for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
    dfs.append(pd.read_parquet(path))
df = pd.concat(dfs, axis=0)
df

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000278,clicks
1,12899780,1142000,1661724000378,clicks
2,12899780,582732,1661724058352,clicks
3,12899780,973453,1661724109199,clicks
4,12899780,736515,1661724136868,clicks
...,...,...,...,...
244438,14571577,1141710,1662328774770,clicks
244439,14571578,519105,1662328775009,clicks
244440,14571579,739876,1662328775605,clicks
244441,14571580,202353,1662328781067,clicks


In [5]:
df["session"].nunique()

1671803

In [16]:
df.type.unique()

array(['clicks', 'carts', 'orders'], dtype=object)

In [17]:
df.aid.nunique()

1855603

In [7]:
df[df["session"] == 12899780].head(30)

Unnamed: 0,session,aid,ts,type
1,12899780,1142000,1661724000378,clicks
2,12899780,582732,1661724058352,clicks
3,12899780,973453,1661724109199,clicks
4,12899780,736515,1661724136868,clicks
5,12899780,1142000,1661724155248,clicks


In [34]:
df["timestamp"] = pd.to_datetime(df["ts"], unit="ms")
df

Unnamed: 0,session,aid,ts,type,timestamp
0,12899779,59625,1661724000278,clicks,2022-08-28 22:00:00.278
1,12899780,1142000,1661724000378,clicks,2022-08-28 22:00:00.378
2,12899780,582732,1661724058352,clicks,2022-08-28 22:00:58.352
3,12899780,973453,1661724109199,clicks,2022-08-28 22:01:49.199
4,12899780,736515,1661724136868,clicks,2022-08-28 22:02:16.868
...,...,...,...,...,...
244438,14571577,1141710,1662328774770,clicks,2022-09-04 21:59:34.770
244439,14571578,519105,1662328775009,clicks,2022-09-04 21:59:35.009
244440,14571579,739876,1662328775605,clicks,2022-09-04 21:59:35.605
244441,14571580,202353,1662328781067,clicks,2022-09-04 21:59:41.067


In [3]:
# label_df = pd.read_json("../../output-label/test_labels.jsonl", lines=True)
label_df

Unnamed: 0,session,labels
0,11098528,"{'clicks': 1679529, 'carts': [1199737], 'order..."
1,11098529,{'clicks': 1105029}
2,11098530,{'orders': [409236]}
3,11098531,{'orders': [1365569]}
4,11098532,{'clicks': 1596491}
...,...,...
1783732,12899774,{'clicks': 1399483}
1783733,12899775,{'clicks': 1760714}
1783734,12899776,{'clicks': 1737908}
1783735,12899777,{'clicks': 384045}
