In [1]:
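# Plan for this notebook:
# 1. Keep the “high-value transaction” binary target.
# 2. Remove Quantity and Price (called UnitPrice in some versions of the dataset) as input features.
# 3. Add features derived from customer history / past invoices / time / country.
# 4. Train a LightGBM model on the resulting features.
# 5. Monitor drift on these features over time.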

In [2]:
import pandas as pd
import numpy as np

In [3]:
import sys
from pathlib import Path

# Put the directory one level above the current working directory on the
# import path so that the `src` package imported below can be resolved.
# NOTE(review): assumes the kernel is started from the notebook's own folder — confirm.
PROJECT_ROOT = Path().resolve().parent
sys.path.append(str(PROJECT_ROOT))


In [4]:
from src.data_loader import load_data
from src.features import add_time_features
from src.features import create_target

In [8]:
# Load the raw transactions through the project's loader.
# NOTE(review): the double underscore in the file name looks unusual — confirm
# it matches the file on disk.
df = load_data("../data/raw/online__retail_II.csv")
# A bare `df` renders pandas' truncated preview (first and last rows only).
df

Unnamed: 0,Quantity,InvoiceDate,Price,Country,StockCode
0,12,2009-12-01 07:45:00,6.95,United Kingdom,85048
1,12,2009-12-01 07:45:00,6.75,United Kingdom,79323P
2,12,2009-12-01 07:45:00,6.75,United Kingdom,79323W
3,48,2009-12-01 07:45:00,2.10,United Kingdom,22041
4,24,2009-12-01 07:45:00,1.25,United Kingdom,21232
...,...,...,...,...,...
1067366,6,2011-12-09 12:50:00,2.10,France,22899
1067367,4,2011-12-09 12:50:00,4.15,France,23254
1067368,4,2011-12-09 12:50:00,4.15,France,23255
1067369,3,2011-12-09 12:50:00,4.95,France,22138


In [9]:
# Adds the calendar columns that show up in df.info() below
# (Year, Month, Day, Hour, min, sec).
# NOTE: this rebinds `df`, so re-running the cell feeds an already-augmented
# frame back into the helper.
df = add_time_features(df)

In [10]:
# Build the 0/1 label Series for the “high-value transaction” target.
# NOTE(review): 0.8 is presumably a quantile threshold — confirm against
# src.features.create_target and consider promoting it to a named constant.
target = create_target(df , 0.8)
target

0          1
1          1
2          1
3          1
4          1
          ..
1067366    0
1067367    0
1067368    0
1067369    0
1067370    0
Length: 1067371, dtype: int64

In [11]:
# Inspect dtypes and non-null counts once the time features have been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 11 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Quantity     1067371 non-null  int64         
 1   InvoiceDate  1067371 non-null  datetime64[ns]
 2   Price        1067371 non-null  float64       
 3   Country      1067371 non-null  object        
 4   StockCode    1067371 non-null  object        
 5   Year         1067371 non-null  int32         
 6   Month        1067371 non-null  int32         
 7   Day          1067371 non-null  int32         
 8   Hour         1067371 non-null  int32         
 9   min          1067371 non-null  int32         
 10  sec          1067371 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(6), int64(1), object(2)
memory usage: 65.1+ MB


In [12]:
# Columns excluded from the feature set; they are dropped in a later cell.
# NOTE(review): presumably the target is derived from these two columns, so
# keeping them would leak the label — confirm against create_target.
columns_to_drop = ["Quantity" , "Price"]

In [13]:
# Cardinality of the raw StockCode column (number of distinct codes).
len(df["StockCode"].unique())

5305

In [14]:
from sklearn.preprocessing import TargetEncoder

In [15]:
# Raw stock codes, both as a Series and as the single-column frame that the
# encoder is fitted on.
X = df["StockCode"]
X_df = X.to_frame("StockCode")

In [16]:
# Smoothed target encoder for the high-cardinality StockCode column.
# TargetEncoder shuffles its internal cross-fitting folds by default, so
# random_state is pinned to keep the encoded values reproducible across
# kernel restarts.
enc = TargetEncoder(smooth='auto', random_state=42)

In [17]:
# Target-encode StockCode against the label.
# Per the scikit-learn docs, fit_transform() applies internal cross-fitting,
# so its output is not the same as fit(...).transform(...): the same stock
# code can receive different encoded values in different rows.
X_encoded = enc.fit_transform(X_df , target)

In [18]:
# Preview the encoded values (a 2-D array with a single column).
X_encoded

array([[0.45363881],
       [0.53943918],
       [0.53840622],
       ...,
       [0.14324001],
       [0.2379432 ],
       [0.67999639]])

In [19]:
# Replace the raw codes in `df` with their target-encoded values. The encoder
# returns an (n, 1) array, so take its only column explicitly.
# The raw codes remain available in `X` / `X_df`.
df["StockCode"] = X_encoded[:, 0]

In [20]:
# Check that StockCode now holds the encoded floats.
df.head()

Unnamed: 0,Quantity,InvoiceDate,Price,Country,StockCode,Year,Month,Day,Hour,min,sec
0,12,2009-12-01 07:45:00,6.95,United Kingdom,0.453639,2009,12,1,7,45,0
1,12,2009-12-01 07:45:00,6.75,United Kingdom,0.539439,2009,12,1,7,45,0
2,12,2009-12-01 07:45:00,6.75,United Kingdom,0.538406,2009,12,1,7,45,0
3,48,2009-12-01 07:45:00,2.1,United Kingdom,0.364935,2009,12,1,7,45,0
4,24,2009-12-01 07:45:00,1.25,United Kingdom,0.20456,2009,12,1,7,45,0


In [21]:
# Weekday number of each invoice timestamp (Monday=0 ... Sunday=6).
day_of_week = df["InvoiceDate"].dt.dayofweek
day_of_week

0          1
1          1
2          1
3          1
4          1
          ..
1067366    4
1067367    4
1067368    4
1067369    4
1067370    4
Name: InvoiceDate, Length: 1067371, dtype: int32

In [22]:
# 1 for Saturday/Sunday (weekday numbers 5 and 6), 0 otherwise.
df["is_weekend"] = day_of_week.ge(5).astype(int)

In [23]:
# Day of week
df['day_of_week'] = df['InvoiceDate'].dt.dayofweek

# Cyclic encoding for hour
df['hour_sin'] = np.sin(2 * np.pi * df['Hour']/24)
df['hour_cos'] = np.cos(2 * np.pi * df['Hour']/24)

# Quarter
df['quarter'] = df['Month'].apply(lambda x: (x-1)//3 + 1)

# Frequency encoding
df['stockcode_freq'] = df.groupby('StockCode')['StockCode'].transform('count')
df['country_freq'] = df.groupby('Country')['Country'].transform('count')

In [24]:
# Remove the excluded input columns listed in `columns_to_drop`.
df = df.drop(columns_to_drop, axis="columns")

In [25]:
# Confirm Quantity and Price are gone and the new features are present.
df.head()

Unnamed: 0,InvoiceDate,Country,StockCode,Year,Month,Day,Hour,min,sec,is_weekend,day_of_week,hour_sin,hour_cos,quarter,stockcode_freq,country_freq
0,2009-12-01 07:45:00,United Kingdom,0.453639,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,111,981330
1,2009-12-01 07:45:00,United Kingdom,0.539439,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,62,981330
2,2009-12-01 07:45:00,United Kingdom,0.538406,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,100,981330
3,2009-12-01 07:45:00,United Kingdom,0.364935,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,114,981330
4,2009-12-01 07:45:00,United Kingdom,0.20456,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,552,981330


In [26]:
# Column list of the feature frame before the target is attached.
df.columns

Index(['InvoiceDate', 'Country', 'StockCode', 'Year', 'Month', 'Day', 'Hour',
       'min', 'sec', 'is_weekend', 'day_of_week', 'hour_sin', 'hour_cos',
       'quarter', 'stockcode_freq', 'country_freq'],
      dtype='object')

In [27]:
# Attach the label as a new column; assigning a Series aligns it on the index.
df["target"] = target

In [28]:
# Final look at the frame before it is written to disk.
df.head()

Unnamed: 0,InvoiceDate,Country,StockCode,Year,Month,Day,Hour,min,sec,is_weekend,day_of_week,hour_sin,hour_cos,quarter,stockcode_freq,country_freq,target
0,2009-12-01 07:45:00,United Kingdom,0.453639,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,111,981330,1
1,2009-12-01 07:45:00,United Kingdom,0.539439,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,62,981330,1
2,2009-12-01 07:45:00,United Kingdom,0.538406,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,100,981330,1
3,2009-12-01 07:45:00,United Kingdom,0.364935,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,114,981330,1
4,2009-12-01 07:45:00,United Kingdom,0.20456,2009,12,1,7,45,0,0,1,0.965926,-0.258819,4,552,981330,1


In [29]:
# Persist the engineered frame (features + target) for later use.
# Create the output folder first: to_csv raises OSError when the parent
# directory does not exist, which breaks a run on a fresh checkout.
out_path = Path("../data/processed/data.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the RangeIndex is written as an unnamed first column (pandas'
# default index=True). Kept as-is so the file layout does not change; pass
# index=False only once every reader of this file has been checked.
df.to_csv(out_path)