In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load data

In [2]:
# Read every JSON Lines source (one JSON record per line) into its own DataFrame.
# The generator is unpacked in order, so each name lines up with its path below.
deliveries, products, sessions, users = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "../data_new/deliveries.jsonl",
        "../data_new/products.jsonl",
        "../data_new/sessions.jsonl",
        "../data_new/users.jsonl",
    )
)

In [3]:
# Convert both timestamp columns of the deliveries table to pandas datetimes so
# that they can be subtracted and decomposed with the .dt accessor later on.
for timestamp_column in ("purchase_timestamp", "delivery_timestamp"):
    deliveries[timestamp_column] = pd.to_datetime(deliveries[timestamp_column])

# Merge data

In [4]:
# Join sessions with users, products and deliveries. The outer joins keep rows
# that have no match on either side; the filter afterwards retains only the
# purchase events and renumbers the index from zero.
df = (
    sessions
    .merge(users, on="user_id", how="outer")
    .merge(products, on="product_id", how="outer")
    .merge(deliveries, on="purchase_id", how="outer")
    .loc[lambda merged: merged["event_type"] == "BUY_PRODUCT"]
    .reset_index(drop=True)
)

# Feature engineering and encoding

In [5]:
# Target variable: time elapsed between purchase and delivery, expressed in hours
# (total seconds of the difference divided by 3600).
df["hours"] = (
    df["delivery_timestamp"]
    .sub(df["purchase_timestamp"])
    .dt.total_seconds()
    .div(3600)
)

In [6]:
# Break the purchase timestamp into its calendar and clock components.
# Keys are the new column names, values the matching .dt accessor attribute;
# the mapping order is the order in which the columns are added.
purchase_dt = df["purchase_timestamp"].dt
datetime_components = {
    "purchase_year": "year",
    "purchase_month": "month",
    "purchase_day": "day",
    "purchase_hour": "hour",
    "purchase_minute": "minute",
    "purchase_second": "second",
    "purchase_dayofweek": "dayofweek",
}
for column, component in datetime_components.items():
    df[column] = getattr(purchase_dt, component)

In [7]:
# Source: https://www.kaggle.com/avanwyk/encoding-cyclical-features-for-deep-learning
# Encode each periodic component as a (sin, cos) pair of the angle
# 2 * pi * value / period, so the model receives a circular representation.
# Keys are the source columns, values the period each one is divided by.
cyclical_periods = {
    "purchase_month": 12,
    "purchase_day": 31,
    "purchase_hour": 24,
    "purchase_minute": 60,
    "purchase_second": 60,
    "purchase_dayofweek": 7,
}
for column, period in cyclical_periods.items():
    angle = 2 * np.pi * df[column] / period
    df[column + "_sin"] = np.sin(angle)
    df[column + "_cos"] = np.cos(angle)

In [8]:
# One-hot encode the delivery company: one 0/1 column per known company id.
# The comparison is vectorized (Series.eq) instead of a per-row Python lambda;
# rows whose company is missing or not listed get 0 in every indicator column.
for delivery_company in [360, 516, 620]:
    df["delivery_company_" + str(delivery_company)] = (
        df["delivery_company"].eq(delivery_company).astype("int64")
    )

In [9]:
# One-hot encode the city: one 0/1 column per known city name.
# The comparison is vectorized (Series.eq) instead of a per-row Python lambda;
# rows whose city is missing or not listed get 0 in every indicator column.
for city in ["Gdynia", "Konin", "Kutno", "Mielec", "Police", "Radom", "Szczecin", "Warszawa"]:
    df["city_" + city] = df["city"].eq(city).astype("int64")

In [10]:
# Flag which top-level categories occur anywhere in the (lower-cased) category path.
# The lower-casing is done once, outside the loop. The match is a plain substring
# test (regex=False), and na=False maps a missing category_path to 0 instead of
# raising AttributeError as a per-row x.lower() call would.
category_path_lower = df["category_path"].str.lower()
for cat in ["gry i konsole", "komputery", "sprzęt rtv", "telefony i akcesoria"]:
    df["category_" + cat] = (
        category_path_lower.str.contains(cat, regex=False, na=False).astype("int64")
    )

In [11]:
# Standardise the price column: subtract its mean and divide by its standard
# deviation (pandas' default Series.std()).
# NOTE(review): both statistics are computed over every row of df, and the
# train/dev/test split only happens later in the notebook, so held-out rows
# influence the scaling — confirm this is intended.
# NOTE(review): the cell overwrites "price", so running it a second time rescales
# the already-scaled values and prints different statistics.
mean_price, std_price = df["price"].mean(), df["price"].std()

df["price"] = df["price"].sub(mean_price).div(std_price)

# Show the statistics that were used for the scaling.
print(mean_price, std_price)

689.4939303349793 1247.1321569552465


In [12]:
# Names of the model input columns, listed in alphabetical order.
FEATURES = [
    # product category indicators
    "category_gry i konsole",
    "category_komputery",
    "category_sprzęt rtv",
    "category_telefony i akcesoria",
    # city indicators
    "city_Gdynia",
    "city_Konin",
    "city_Kutno",
    "city_Mielec",
    "city_Police",
    "city_Radom",
    "city_Szczecin",
    "city_Warszawa",
    # delivery company indicators
    "delivery_company_360",
    "delivery_company_516",
    "delivery_company_620",
    # standardised product price
    "price",
    # purchase timestamp components, raw and sin/cos encoded
    "purchase_day",
    "purchase_day_cos",
    "purchase_day_sin",
    "purchase_dayofweek",
    "purchase_dayofweek_cos",
    "purchase_dayofweek_sin",
    "purchase_hour",
    "purchase_hour_cos",
    "purchase_hour_sin",
    "purchase_minute",
    "purchase_minute_cos",
    "purchase_minute_sin",
    "purchase_month",
    "purchase_month_cos",
    "purchase_month_sin",
    "purchase_second",
    "purchase_second_cos",
    "purchase_second_sin",
    "purchase_year",
]

# Name of the regression target column (delivery time in hours).
TARGET = "hours"

In [13]:
# Display the model input columns as a sanity check of the engineered features.
df.loc[:, FEATURES]

Unnamed: 0,category_gry i konsole,category_komputery,category_sprzęt rtv,category_telefony i akcesoria,city_Gdynia,city_Konin,city_Kutno,city_Mielec,city_Police,city_Radom,...,purchase_minute,purchase_minute_cos,purchase_minute_sin,purchase_month,purchase_month_cos,purchase_month_sin,purchase_second,purchase_second_cos,purchase_second_sin,purchase_year
0,0,0,1,0,0,0,0,0,1,0,...,46,1.045285e-01,-9.945219e-01,1,8.660254e-01,0.500000,41,-4.067366e-01,-0.913545,2021
1,0,0,1,0,0,0,0,0,1,0,...,41,-4.067366e-01,-9.135455e-01,4,-5.000000e-01,0.866025,48,3.090170e-01,-0.951057,2021
2,0,0,1,0,0,0,0,0,1,0,...,5,8.660254e-01,5.000000e-01,4,-5.000000e-01,0.866025,0,1.000000e+00,0.000000,2021
3,0,0,1,0,0,0,0,0,1,0,...,47,2.079117e-01,-9.781476e-01,4,-5.000000e-01,0.866025,5,8.660254e-01,0.500000,2021
4,0,0,1,0,0,0,0,1,0,0,...,15,2.832769e-16,1.000000e+00,3,6.123234e-17,1.000000,15,2.832769e-16,1.000000,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7488,0,0,0,1,0,0,0,1,0,0,...,9,5.877853e-01,8.090170e-01,4,-5.000000e-01,0.866025,12,3.090170e-01,0.951057,2021
7489,0,0,0,1,0,1,0,0,0,0,...,4,9.135455e-01,4.067366e-01,4,-5.000000e-01,0.866025,59,9.945219e-01,-0.104528,2021
7490,0,0,0,1,0,1,0,0,0,0,...,30,-1.000000e+00,5.665539e-16,3,6.123234e-17,1.000000,38,-6.691306e-01,-0.743145,2021
7491,0,0,0,1,0,1,0,0,0,0,...,45,-1.836970e-16,-1.000000e+00,2,5.000000e-01,0.866025,36,-8.090170e-01,-0.587785,2021


In [14]:
# Display the target column as a sanity check of the computed delivery times.
df.loc[:, TARGET]

0       51.005447
1       42.131664
2       29.112785
3       42.778009
4       98.765337
          ...    
7488    60.993458
7489    39.876568
7490    65.888192
7491    69.513081
7492    26.544417
Name: hours, Length: 7493, dtype: float64

# Split data

In [15]:
# First cut: 70% of the rows go to the training set, the rest is held out.
train, dev_test = train_test_split(
    df,
    train_size=0.7,
    random_state=42,
    shuffle=True,
)

# Second cut: the held-out rows are halved into a dev set and a test set.
# Both cuts shuffle with the same fixed seed, so the split is reproducible.
dev, test = train_test_split(
    dev_test,
    test_size=0.5,
    random_state=42,
    shuffle=True,
)

In [16]:
# Number of rows in each split, in (train, dev, test) order.
tuple(len(split_frame) for split_frame in (train, dev, test))

(5245, 1124, 1124)

In [17]:
# Share of the full dataset held by each split, in (train, dev, test) order.
tuple(len(split_frame) / len(df) for split_frame in (train, dev, test))

(0.6999866542105966, 0.1500066728947017, 0.1500066728947017)

# Export data

In [18]:
# Write each split to its own CSV file, leaving out the DataFrame index.
for split_frame, csv_path in (
    (train, "../data_preprocessed/train.csv"),
    (dev, "../data_preprocessed/dev.csv"),
    (test, "../data_preprocessed/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)