In [1]:
import pandas as pd
import numpy as np
import joblib 
import os
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw transaction export into `df`, decoding the file as
# ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
df = pd.read_csv(
    filepath_or_buffer="../data/OnlineRetail.csv",
    encoding="ISO-8859-1",
)

# Preview the first five rows as the cell's displayed output.
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# Report the table dimensions as a (rows, columns) tuple, then let pandas
# print its per-column summary (non-null counts and dtypes).
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))
df.info()

Shape: (541909, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


Remove Invalid Records

In [4]:
# Keep only rows that (a) carry a CustomerID, (b) have a strictly positive
# Quantity and (c) have a strictly positive UnitPrice. The three conditions
# are combined into one boolean mask and applied in a single selection.
has_customer = df["CustomerID"].notna()
positive_quantity = df["Quantity"].gt(0)
positive_price = df["UnitPrice"].gt(0)

df = df[has_customer & positive_quantity & positive_price]

# Show how many rows and columns survive the filter.
df.shape

(397884, 8)

Create Transaction Value and Convert Date

In [5]:
# Add the line-item value (Quantity * UnitPrice) and turn InvoiceDate from text
# into datetimes.
#
# `assign` returns a new frame instead of writing columns into `df` in place.
# At this point `df` is the result of boolean filtering, and in-place column
# assignment on such a selection can raise SettingWithCopyWarning on pandas
# versions that do not use Copy-on-Write.
df = df.assign(
    TotalPrice=df["Quantity"] * df["UnitPrice"],
    # Explicit month/day/year format, matching the values shown by df.head()
    # (e.g. "12/1/2010 8:26"). This avoids relying on format inference, which
    # is slower and can differ between pandas versions for ambiguous dates.
    InvoiceDate=pd.to_datetime(df["InvoiceDate"], format="%m/%d/%Y %H:%M"),
)


Build Customer-Level Features

In [6]:
# Reference point for recency: the most recent invoice timestamp in the data.
# Computed once here rather than once per customer group.
snapshot_date = df["InvoiceDate"].max()

# One row per customer:
#   Recency       - whole days between snapshot_date and the customer's last invoice
#   Frequency     - number of distinct invoices
#   MonetaryValue - sum of TotalPrice
#   TotalQuantity - sum of Quantity
# Named aggregation ties each output name to its source column, so the result
# does not depend on positional column renaming afterwards.
customer_df = (
    df.groupby("CustomerID")
    .agg(
        # Holds the last purchase timestamp for now; converted to days below.
        Recency=("InvoiceDate", "max"),
        Frequency=("InvoiceNo", "nunique"),
        MonetaryValue=("TotalPrice", "sum"),
        TotalQuantity=("Quantity", "sum"),
    )
    # Overwriting an existing column with assign keeps its position, so the
    # final column order stays Recency, Frequency, MonetaryValue, TotalQuantity.
    .assign(Recency=lambda agg: (snapshot_date - agg["Recency"]).dt.days)
    .reset_index()
)

customer_df.head()

Unnamed: 0,CustomerID,Recency,Frequency,MonetaryValue,TotalQuantity
0,12346.0,325,1,77183.6,74215
1,12347.0,1,7,4310.0,2458
2,12348.0,74,4,1797.24,2341
3,12349.0,18,1,1757.55,631
4,12350.0,309,1,334.4,197


Feature Scaling

In [8]:
# Select the four numeric customer-level columns that are fed to the scaler.
features = customer_df.loc[
    :, ["Recency", "Frequency", "MonetaryValue", "TotalQuantity"]
]

# Fit a StandardScaler (default settings) on the selected columns, then apply
# it to the same data. The fitted scaler is kept so it can be saved and reused.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

# Display the scaled array as the cell output.
features_scaled

array([[ 2.33457414e+00, -4.25096503e-01,  8.35866818e+00,
         1.44730378e+01],
       [-9.05340320e-01,  3.54416797e-01,  2.50966264e-01,
         2.51057567e-01],
       [-1.75359593e-01, -3.53398530e-02, -2.85960063e-02,
         2.27868586e-01],
       ...,
       [-8.45341904e-01, -2.95177619e-01, -2.08742313e-01,
        -2.15694995e-01],
       [-8.85340848e-01,  1.52368675e+00,  4.51854273e-03,
         4.07711685e-02],
       [-4.95351144e-01, -1.65258736e-01, -2.41412739e-02,
         7.82302913e-02]], shape=(4338, 4))

Save Artifacts

In [9]:
# Make sure the output directory exists; an existing directory is not an error.
os.makedirs("../artifacts", exist_ok=True)

# Persist each object with joblib, in the same order as before: the customer
# table, the unscaled feature columns, the scaled array and the fitted scaler.
for artifact, target_path in (
    (customer_df, "../artifacts/customer_raw.pkl"),
    (features, "../artifacts/customer_features.pkl"),
    (features_scaled, "../artifacts/customer_features_scaled.pkl"),
    (scaler, "../artifacts/scaler.pkl"),
):
    joblib.dump(artifact, target_path)

print("Artifacts saved successfully")


Artifacts saved successfully
