# Feature Store Demo

## Step 1: Transform Customers Data

In [2]:
import pandas as pd
from sklearn.preprocessing import  LabelEncoder,MinMaxScaler
from datetime import datetime,date,timezone

In [7]:
customers_df = pd.read_csv("raw/customers.csv")

In [8]:
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,name,sex,state,age,is_married,active_since,event_time
0,0,C1,brooke williams,F,alabama,76,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z
1,1,C2,jim reese,M,oregon,76,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z
2,2,C3,adam walker phd,M,north dakota,40,True,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z
3,3,C4,nathan roberts,M,louisiana,68,False,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z
4,4,C5,richard adkins,M,idaho,62,False,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z


In [9]:
label_encoder = LabelEncoder()
min_max_scaler = MinMaxScaler()

In [10]:
customers_df.drop(columns = ['name','state'], inplace=True)

In [11]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    10000 non-null  int64 
 1   customer_id   10000 non-null  object
 2   sex           10000 non-null  object
 3   age           10000 non-null  int64 
 4   is_married    10000 non-null  bool  
 5   active_since  10000 non-null  object
 6   event_time    10000 non-null  object
dtypes: bool(1), int64(2), object(4)
memory usage: 478.6+ KB


In [12]:
bins = [18, 30, 40, 50, 60, 70, 90]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']
customers_df['age_range'] = pd.cut(customers_df['age'], bins, labels=labels, include_lowest=True)

In [13]:
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,sex,age,is_married,active_since,event_time,age_range
0,0,C1,F,76,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,70-plus
1,1,C2,M,76,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,70-plus
2,2,C3,M,40,True,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,30-39
3,3,C4,M,68,False,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z,60-69
4,4,C5,M,62,False,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z,60-69


In [14]:
df_age_group = pd.get_dummies(customers_df['age_range'],prefix="age", dtype='int')

In [15]:
df_age_group.head()

Unnamed: 0,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,1,0,0,0,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0


In [16]:
customers_df = pd.concat([customers_df,df_age_group],axis=1)
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,sex,age,is_married,active_since,event_time,age_range,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,0,C1,F,76,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,70-plus,0,0,0,0,0,1
1,1,C2,M,76,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,70-plus,0,0,0,0,0,1
2,2,C3,M,40,True,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,30-39,0,1,0,0,0,0
3,3,C4,M,68,False,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z,60-69,0,0,0,0,1,0
4,4,C5,M,62,False,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z,60-69,0,0,0,0,1,0


In [17]:
customers_df.drop(columns=['age','age_range'], inplace=True)

In [18]:
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,sex,is_married,active_since,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,0,C1,F,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,0,0,0,0,0,1
1,1,C2,M,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,0,0,0,0,0,1
2,2,C3,M,True,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,0,1,0,0,0,0
3,3,C4,M,False,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z,0,0,0,0,1,0
4,4,C5,M,False,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z,0,0,0,0,1,0


In [19]:
customers_df['sex'] = customers_df['sex'].map({"F":0,"M":1})

In [20]:
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,sex,is_married,active_since,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,0,C1,0,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,0,0,0,0,0,1
1,1,C2,1,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,0,0,0,0,0,1
2,2,C3,1,True,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,0,1,0,0,0,0
3,3,C4,1,False,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z,0,0,0,0,1,0
4,4,C5,1,False,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z,0,0,0,0,1,0


In [21]:
customers_df['is_married'] = customers_df['is_married'].astype('int')

In [22]:
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,sex,is_married,active_since,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,0,C1,0,1,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,0,0,0,0,0,1
1,1,C2,1,1,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,0,0,0,0,0,1
2,2,C3,1,1,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,0,1,0,0,0,0
3,3,C4,1,0,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z,0,0,0,0,1,0
4,4,C5,1,0,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z,0,0,0,0,1,0


In [23]:
customers_df['active_since'] = pd.to_datetime(customers_df['active_since'], format="%Y-%m-%d %H:%M:%S")

In [24]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    10000 non-null  int64         
 1   customer_id   10000 non-null  object        
 2   sex           10000 non-null  int64         
 3   is_married    10000 non-null  int64         
 4   active_since  10000 non-null  datetime64[ns]
 5   event_time    10000 non-null  object        
 6   age_18-29     10000 non-null  int64         
 7   age_30-39     10000 non-null  int64         
 8   age_40-49     10000 non-null  int64         
 9   age_50-59     10000 non-null  int64         
 10  age_60-69     10000 non-null  int64         
 11  age_70-plus   10000 non-null  int64         
dtypes: datetime64[ns](1), int64(9), object(2)
memory usage: 937.6+ KB


In [25]:
def get_delta_days(datetime) -> int :
    today = date.today()
    delta = today - datetime.date()
    return delta.days

In [26]:
customers_df['n_days_active'] = customers_df['active_since'].apply(lambda x: get_delta_days(x))

In [27]:
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,sex,is_married,active_since,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus,n_days_active
0,0,C1,0,1,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,0,0,0,0,0,1,2345
1,1,C2,1,1,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,0,0,0,0,0,1,3303
2,2,C3,1,1,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,0,1,0,0,0,0,2818
3,3,C4,1,0,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z,0,0,0,0,1,0,3188
4,4,C5,1,0,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z,0,0,0,0,1,0,3057


In [28]:
customers_df['n_days_active'] = min_max_scaler.fit_transform(customers_df[['n_days_active']])
customers_df.drop("active_since",axis=1, inplace=True)

In [88]:
customers_df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,sex,is_married,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus,n_days_active
0,0,C1,0,1,2024-05-02T05:39:10.965Z,0,0,0,0,0,1,0.203425
1,1,C2,1,1,2024-05-02T05:39:10.966Z,0,0,0,0,0,1,0.859589
2,2,C3,1,1,2024-05-02T05:39:10.967Z,0,1,0,0,0,0,0.527397
3,3,C4,1,0,2024-05-02T05:39:10.967Z,0,0,0,0,1,0,0.780822
4,4,C5,1,0,2024-05-02T05:39:10.968Z,0,0,0,0,1,0,0.691096


In [91]:
customers_df.head()

Unnamed: 0,customer_id,sex,is_married,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus,n_days_active
0,C1,0,1,2024-05-02T05:39:10.965Z,0,0,0,0,0,1,0.203425
1,C2,1,1,2024-05-02T05:39:10.966Z,0,0,0,0,0,1,0.859589
2,C3,1,1,2024-05-02T05:39:10.967Z,0,1,0,0,0,0,0.527397
3,C4,1,0,2024-05-02T05:39:10.967Z,0,0,0,0,1,0,0.780822
4,C5,1,0,2024-05-02T05:39:10.968Z,0,0,0,0,1,0,0.691096


In [93]:
customers_df = customers_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [92]:
customers_df.to_csv("transformed/customers.csv", index=False)

## Step 2: Transform Orders Data

In [112]:
orders_df = pd.read_csv("raw/orders.csv")

In [113]:
orders_df.head()

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,product_id,purchase_amount,is_reordered,purchased_on,event_time
0,0,O1,C9765,P11660,58.84,0,2021-01-10 10:31:27,2024-05-02T05:39:17.172Z
1,1,O2,C3674,P6868,71.08,0,2020-03-20 16:47:36,2024-05-02T05:39:17.172Z
2,2,O3,C2139,P4749,57.17,1,2020-11-01 22:09:22,2024-05-02T05:39:17.172Z
3,3,O4,C7794,P542,5.35,1,2020-03-22 10:10:38,2024-05-02T05:39:17.172Z
4,4,O5,C2229,P7605,47.85,1,2021-01-14 01:08:56,2024-05-02T05:39:17.172Z


In [114]:
orders_df['purchased_on'] =  pd.to_datetime(orders_df['purchased_on'], format='%Y-%m-%d %H:%M:%S')
orders_df['n_days_since_last_purchase'] = orders_df['purchased_on'].apply(lambda x: get_delta_days(x))

In [115]:
orders_df.head()

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,product_id,purchase_amount,is_reordered,purchased_on,event_time,n_days_since_last_purchase
0,0,O1,C9765,P11660,58.84,0,2021-01-10 10:31:27,2024-05-02T05:39:17.172Z,1672
1,1,O2,C3674,P6868,71.08,0,2020-03-20 16:47:36,2024-05-02T05:39:17.172Z,1968
2,2,O3,C2139,P4749,57.17,1,2020-11-01 22:09:22,2024-05-02T05:39:17.172Z,1742
3,3,O4,C7794,P542,5.35,1,2020-03-22 10:10:38,2024-05-02T05:39:17.172Z,1966
4,4,O5,C2229,P7605,47.85,1,2021-01-14 01:08:56,2024-05-02T05:39:17.172Z,1668


In [116]:
orders_df['n_days_since_last_purchase'] = min_max_scaler.fit_transform(orders_df[['n_days_since_last_purchase']])
orders_df.head()

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,product_id,purchase_amount,is_reordered,purchased_on,event_time,n_days_since_last_purchase
0,0,O1,C9765,P11660,58.84,0,2021-01-10 10:31:27,2024-05-02T05:39:17.172Z,0.273256
1,1,O2,C3674,P6868,71.08,0,2020-03-20 16:47:36,2024-05-02T05:39:17.172Z,0.846899
2,2,O3,C2139,P4749,57.17,1,2020-11-01 22:09:22,2024-05-02T05:39:17.172Z,0.408915
3,3,O4,C7794,P542,5.35,1,2020-03-22 10:10:38,2024-05-02T05:39:17.172Z,0.843023
4,4,O5,C2229,P7605,47.85,1,2021-01-14 01:08:56,2024-05-02T05:39:17.172Z,0.265504


In [117]:
orders_df['purchase_amount'] = min_max_scaler.fit_transform(orders_df[['purchase_amount']])

In [118]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Unnamed: 0                  100000 non-null  int64         
 1   order_id                    100000 non-null  object        
 2   customer_id                 100000 non-null  object        
 3   product_id                  100000 non-null  object        
 4   purchase_amount             100000 non-null  float64       
 5   is_reordered                100000 non-null  int64         
 6   purchased_on                100000 non-null  datetime64[ns]
 7   event_time                  100000 non-null  object        
 8   n_days_since_last_purchase  100000 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 6.9+ MB


In [119]:
orders_df['is_reordered'].value_counts()

is_reordered
1    66509
0    33491
Name: count, dtype: int64

In [120]:
orders_df.head()

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,product_id,purchase_amount,is_reordered,purchased_on,event_time,n_days_since_last_purchase
0,0,O1,C9765,P11660,0.572673,0,2021-01-10 10:31:27,2024-05-02T05:39:17.172Z,0.273256
1,1,O2,C3674,P6868,0.693861,0,2020-03-20 16:47:36,2024-05-02T05:39:17.172Z,0.846899
2,2,O3,C2139,P4749,0.556139,1,2020-11-01 22:09:22,2024-05-02T05:39:17.172Z,0.408915
3,3,O4,C7794,P542,0.043069,1,2020-03-22 10:10:38,2024-05-02T05:39:17.172Z,0.843023
4,4,O5,C2229,P7605,0.463861,1,2021-01-14 01:08:56,2024-05-02T05:39:17.172Z,0.265504


In [121]:
orders_df.drop(columns="purchased_on", inplace=True)
orders_df = orders_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [122]:
orders_df.head()

Unnamed: 0,order_id,customer_id,product_id,purchase_amount,is_reordered,event_time,n_days_since_last_purchase
0,O1,C9765,P11660,0.572673,0,2024-05-02T05:39:17.172Z,0.273256
1,O2,C3674,P6868,0.693861,0,2024-05-02T05:39:17.172Z,0.846899
2,O3,C2139,P4749,0.556139,1,2024-05-02T05:39:17.172Z,0.408915
3,O4,C7794,P542,0.043069,1,2024-05-02T05:39:17.172Z,0.843023
4,O5,C2229,P7605,0.463861,1,2024-05-02T05:39:17.172Z,0.265504


In [123]:
orders_df.to_csv("transformed/orders.csv", index=False)

## Step 3: Prepare Environment

In [124]:
import sagemaker
import boto3
import sys
import pandas as pd
import numpy as np 
import io 
from sagemaker.session import Session
from sagemaker import get_execution_role

In [125]:
prefix = "customer-feature-store"
role = get_execution_role()
sagemaker_session = Session() 
region = sagemaker_session.boto_region_name
s3_bucket_name = sagemaker_session.default_bucket()

## Step 4: Load Transformed Data

In [126]:
# Load the Data 
customer_data = pd.read_csv("transformed/customers.csv") 
orders_data = pd.read_csv("transformed/orders.csv")

In [127]:
customer_data.head()

Unnamed: 0,customer_id,sex,is_married,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus,n_days_active
0,C1,0,1,2024-05-02T05:39:10.965Z,0,0,0,0,0,1,0.203425
1,C2,1,1,2024-05-02T05:39:10.966Z,0,0,0,0,0,1,0.859589
2,C3,1,1,2024-05-02T05:39:10.967Z,0,1,0,0,0,0,0.527397
3,C4,1,0,2024-05-02T05:39:10.967Z,0,0,0,0,1,0,0.780822
4,C5,1,0,2024-05-02T05:39:10.968Z,0,0,0,0,1,0,0.691096


In [128]:
orders_data.head()

Unnamed: 0,order_id,customer_id,product_id,purchase_amount,is_reordered,event_time,n_days_since_last_purchase
0,O1,C9765,P11660,0.572673,0,2024-05-02T05:39:17.172Z,0.273256
1,O2,C3674,P6868,0.693861,0,2024-05-02T05:39:17.172Z,0.846899
2,O3,C2139,P4749,0.556139,1,2024-05-02T05:39:17.172Z,0.408915
3,O4,C7794,P542,0.043069,1,2024-05-02T05:39:17.172Z,0.843023
4,O5,C2229,P7605,0.463861,1,2024-05-02T05:39:17.172Z,0.265504


## Step 5: Create feature group

In [129]:
# Create feature group
from time import gmtime, strftime, sleep

customers_feature_group_name = "customers-fg-"+strftime("%d-%H-%M-%S", gmtime()) 
orders_feature_group_name = "orders-fg-"+strftime("%d-%H-%M-%S", gmtime()) 
print(f"Customer Feature Group Name: {customers_feature_group_name}")
print(f"Orders Feature Group Name: {orders_feature_group_name}")

Customer Feature Group Name: customers-fg-09-19-26-55
Orders Feature Group Name: orders-fg-09-19-26-55


In [130]:
from sagemaker.feature_store.feature_group import FeatureGroup

customers_feature_group = FeatureGroup(name=customers_feature_group_name , 
                                       sagemaker_session = sagemaker_session)

orders_feature_group = FeatureGroup(name=orders_feature_group_name , 
                                       sagemaker_session = sagemaker_session)

In [131]:
record_identifier_feature_name = "customer_id"

In [132]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   customer_id    10000 non-null  object 
 1   sex            10000 non-null  int64  
 2   is_married     10000 non-null  int64  
 3   event_time     10000 non-null  object 
 4   age_18-29      10000 non-null  int64  
 5   age_30-39      10000 non-null  int64  
 6   age_40-49      10000 non-null  int64  
 7   age_50-59      10000 non-null  int64  
 8   age_60-69      10000 non-null  int64  
 9   age_70-plus    10000 non-null  int64  
 10  n_days_active  10000 non-null  float64
dtypes: float64(1), int64(8), object(2)
memory usage: 859.5+ KB


In [133]:
orders_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   order_id                    100000 non-null  object 
 1   customer_id                 100000 non-null  object 
 2   product_id                  100000 non-null  object 
 3   purchase_amount             100000 non-null  float64
 4   is_reordered                100000 non-null  int64  
 5   event_time                  100000 non-null  object 
 6   n_days_since_last_purchase  100000 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 5.3+ MB


In [134]:
# Load Customer Feature definitions
customers_feature_group.load_feature_definitions(data_frame = customer_data)

[FeatureDefinition(feature_name='customer_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='sex', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='is_married', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='age_18-29', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='age_30-39', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='age_40-49', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='age_50-59', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='age_60-69

In [135]:
# Load Order Feature definitions
orders_feature_group.load_feature_definitions(data_frame = orders_data)

[FeatureDefinition(feature_name='order_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='customer_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='product_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='purchase_amount', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='is_reordered', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='n_days_since_last_purchase', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None)]

In [136]:
customers_feature_group.create(s3_uri = f"s3://{s3_bucket_name}/{prefix}",
                               record_identifier_name=record_identifier_feature_name, event_time_feature_name="event_time",
                               role_arn=role,
                               enable_online_store=True)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:590183953604:feature-group/customers-fg-09-19-26-55',
 'ResponseMetadata': {'RequestId': 'cba1131c-295a-4edd-85ca-d8af773e6dd9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'cba1131c-295a-4edd-85ca-d8af773e6dd9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '101',
   'date': 'Sat, 09 Aug 2025 19:26:55 GMT'},
  'RetryAttempts': 0}}