In [148]:
# Keep the “high-value transaction” target
# Remove Quantity and Price as input features
# Add features derived from customer history / past invoices / time / country
# Use LightGBM to train
# Monitor drift on these features over time

In [149]:
import pandas as pd
import numpy as np

In [150]:
import sys
from pathlib import Path

# Parent of the current working directory; the notebook is expected to be
# launched from a sub-folder of the project so that `src` becomes importable.
PROJECT_ROOT = Path().resolve().parent

# Only register the path once: re-running this cell must not keep growing
# sys.path with duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


In [151]:
from src.data_loader import load_data
from src.features import add_time_features
from src.features import create_target

In [152]:
# Location of the raw export, relative to the notebook's working directory.
raw_csv_path = "../data/raw/online_retail_II.csv"

# Read the raw transactions through the project loader and display the frame.
df = load_data(raw_csv_path)
df

Unnamed: 0,Quantity,InvoiceDate,Price,Country,StockCode
0,12,2009-12-01 07:45:00,6.95,United Kingdom,85048
1,12,2009-12-01 07:45:00,6.75,United Kingdom,79323P
2,12,2009-12-01 07:45:00,6.75,United Kingdom,79323W
3,48,2009-12-01 07:45:00,2.10,United Kingdom,22041
4,24,2009-12-01 07:45:00,1.25,United Kingdom,21232
...,...,...,...,...,...
1067366,6,2011-12-09 12:50:00,2.10,France,22899
1067367,4,2011-12-09 12:50:00,4.15,France,23254
1067368,4,2011-12-09 12:50:00,4.15,France,23255
1067369,3,2011-12-09 12:50:00,4.95,France,22138


In [153]:
# Run the project helper from src.features and rebind `df` to its result.
# The df.info() output further down lists Year, Month, Day, Hour, min and sec
# columns after this call.
# NOTE(review): `df` is rebound to its own transform, so the effect of running
# this cell twice depends on the helper — confirm it is safe to re-run.
df = add_time_features(df)

In [154]:
# Second argument passed to create_target; its exact meaning lives in
# src.features (presumably a quantile cutoff for "high-value" — TODO confirm).
target_cutoff = 0.8

# Build the label series from the frame and display it.
target = create_target(df, target_cutoff)
target

0          1
1          1
2          1
3          1
4          1
          ..
1067366    0
1067367    0
1067368    0
1067369    0
1067370    0
Length: 1067371, dtype: int64

In [155]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 11 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Quantity     1067371 non-null  int64         
 1   InvoiceDate  1067371 non-null  datetime64[ns]
 2   Price        1067371 non-null  float64       
 3   Country      1067371 non-null  object        
 4   StockCode    1067371 non-null  object        
 5   Year         1067371 non-null  int32         
 6   Month        1067371 non-null  int32         
 7   Day          1067371 non-null  int32         
 8   Hour         1067371 non-null  int32         
 9   min          1067371 non-null  int32         
 10  sec          1067371 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(6), int64(1), object(2)
memory usage: 65.1+ MB


In [156]:
# Raw transaction columns that are removed from the feature frame later on,
# so they are not available to the model as inputs.
columns_to_drop = [
    "Quantity",
    "Price",
]

In [157]:
# Number of distinct StockCode values (cardinality check before encoding).
len(df["StockCode"].unique())

5305

In [158]:
from sklearn.preprocessing import TargetEncoder

In [159]:
# Wrap the single StockCode column in a one-column DataFrame so it can be
# handed to the encoder as a 2-D input.
X = df["StockCode"]
X_df = X.to_frame("StockCode")

In [160]:
# Target encoder for the high-cardinality StockCode column.
# random_state is pinned because the encoder shuffles rows for its internal
# cross-fitting by default; without a seed the encoded values (and everything
# trained on them) change from run to run.
enc = TargetEncoder(smooth="auto", random_state=42)

In [161]:
# Learn the per-StockCode encoding against the label and encode in one step.
# NOTE(review): the encoder is fitted on every row of `df`; no train/test
# split is visible in this notebook — confirm the encoding is refit on
# training data only before any model evaluation.
X_encoded = enc.fit_transform(X=X_df, y=target)

In [162]:
# Display the encoded values; the output shows a 2-D array with one column.
X_encoded

array([[0.46563086],
       [0.51734214],
       [0.55131796],
       ...,
       [0.14998258],
       [0.23030998],
       [0.68621662]])

In [163]:
# Replace the raw StockCode values with their target encoding. The encoder
# output has a single column, so flatten it to 1-D before assigning.
# Note: the raw codes are overwritten, so the encoding cells above must not be
# re-run after this one without reloading `df`.
df["StockCode"] = X_encoded.ravel()

In [164]:
# Preview the first five rows after StockCode has been encoded.
df.head(5)

Unnamed: 0,Quantity,InvoiceDate,Price,Country,StockCode,Year,Month,Day,Hour,min,sec
0,12,2009-12-01 07:45:00,6.95,United Kingdom,0.465631,2009,12,1,7,45,0
1,12,2009-12-01 07:45:00,6.75,United Kingdom,0.517342,2009,12,1,7,45,0
2,12,2009-12-01 07:45:00,6.75,United Kingdom,0.551318,2009,12,1,7,45,0
3,48,2009-12-01 07:45:00,2.1,United Kingdom,0.39055,2009,12,1,7,45,0
4,24,2009-12-01 07:45:00,1.25,United Kingdom,0.204992,2009,12,1,7,45,0


In [165]:
# Integer day of week per invoice timestamp (Monday=0 ... Sunday=6).
day_of_week = df["InvoiceDate"].dt.dayofweek
day_of_week

0          1
1          1
2          1
3          1
4          1
          ..
1067366    4
1067367    4
1067368    4
1067369    4
1067370    4
Name: InvoiceDate, Length: 1067371, dtype: int32

In [166]:
# 1 when the invoice falls on Saturday (5) or Sunday (6), else 0.
df["is_weekend"] = day_of_week.ge(5).astype(int)

In [167]:
# Remove the columns listed in columns_to_drop from the feature frame.
# Raises KeyError if they are already gone (e.g. when this cell is re-run).
df = df.drop(columns_to_drop, axis="columns")

In [168]:
# Preview the first five rows after the raw columns were dropped.
df.head(5)

Unnamed: 0,InvoiceDate,Country,StockCode,Year,Month,Day,Hour,min,sec,is_weekend
0,2009-12-01 07:45:00,United Kingdom,0.465631,2009,12,1,7,45,0,0
1,2009-12-01 07:45:00,United Kingdom,0.517342,2009,12,1,7,45,0,0
2,2009-12-01 07:45:00,United Kingdom,0.551318,2009,12,1,7,45,0,0
3,2009-12-01 07:45:00,United Kingdom,0.39055,2009,12,1,7,45,0,0
4,2009-12-01 07:45:00,United Kingdom,0.204992,2009,12,1,7,45,0,0


In [169]:
# List the column names that remain in the feature frame.
df.columns

Index(['InvoiceDate', 'Country', 'StockCode', 'Year', 'Month', 'Day', 'Hour',
       'min', 'sec', 'is_weekend'],
      dtype='object')

In [170]:
# Attach the label as a `target` column so features and label are exported
# together.
df = df.assign(target=target)

In [171]:
# Preview the first five rows of the final frame, including the target.
df.head(5)

Unnamed: 0,InvoiceDate,Country,StockCode,Year,Month,Day,Hour,min,sec,is_weekend,target
0,2009-12-01 07:45:00,United Kingdom,0.465631,2009,12,1,7,45,0,0,1
1,2009-12-01 07:45:00,United Kingdom,0.517342,2009,12,1,7,45,0,0,1
2,2009-12-01 07:45:00,United Kingdom,0.551318,2009,12,1,7,45,0,0,1
3,2009-12-01 07:45:00,United Kingdom,0.39055,2009,12,1,7,45,0,0,1
4,2009-12-01 07:45:00,United Kingdom,0.204992,2009,12,1,7,45,0,0,1


In [172]:
# Persist the processed frame (features + target) for the training step.
# index=False keeps the meaningless RangeIndex out of the file; otherwise it
# is written as an unnamed first column and comes back as an extra
# "Unnamed: 0" column when the CSV is read again.
# NOTE(review): any existing reader that uses index_col=0 on this file must be
# updated accordingly.
df.to_csv("../data/processed/data.csv", index=False)