#### Prerequisites

In [2]:
%%capture

!pip install joblib

### Imports 

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from itertools import chain
import pandas as pd
import numpy as np
import sklearn
import logging
import joblib
import os

#### Setup logging

In [4]:
# Configure the notebook's logger. The handler is attached only when no plain
# StreamHandler is present yet, so re-running this cell does not stack
# handlers (which would print every message more than once).
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

In [5]:
# Record the library versions this notebook was executed with.
version_messages = [
    f'Using Sklearn: {sklearn.__version__}',
    f'Using Joblib: {joblib.__version__}',
    f'Using Pandas: {pd.__version__}',
    f'Using Numpy: {np.__version__}',
]
for version_message in version_messages:
    logger.info(version_message)

Using Sklearn: 0.22.1
Using Joblib: 0.14.1
Using Pandas: 1.3.5
Using Numpy: 1.21.6


### Feature transforms 

#### Read data

In [6]:
# Load the raw profile records that every encoder below is fitted on.
profile_csv = './data/profile-info.csv'
df = pd.read_csv(profile_csv)
df

Unnamed: 0,active_since,total_purchases,total_reviews,purchases_last_60_days,reviews_last_60_days,country,age_group
0,11-02-2015,35,3,3,1,india,18-24
1,04-23-2011,47,23,2,0,portugal,65+
2,01-13-2014,34,2,1,0,usa,35-44
3,12-16-2022,2,72,1,67,china,55-64
4,04-04-2022,10,34,10,32,usa,25-34
5,10-19-2021,345,24,82,14,usa,35-44
6,11-04-2019,32,2,2,1,canada,45-54
7,03-08-2020,34,23,1,1,canada,18-24
8,04-03-2019,234,24,2,5,spain,25-34
9,09-04-2022,24,45,2,10,belgium,18-24


#### Create custom FunctionTransformer to encode `active_since` feature

In [7]:
def days_since(date_str, now=None):
    """Return the number of whole days between `date_str` and a reference time.

    Parameters
    ----------
    date_str : str
        Date formatted as ``MM-DD-YYYY`` (for example ``'03-03-2022'``).
    now : datetime, optional
        Reference instant to measure against. When omitted, the current local
        time (``datetime.now()``) is used, which makes the result depend on
        when the function is called. Pass a fixed value for reproducible
        output.

    Returns
    -------
    int
        ``(now - date).days``; negative when `date_str` lies after `now`.

    Raises
    ------
    ValueError
        If `date_str` does not match the ``%m-%d-%Y`` format.
    """
    date = datetime.strptime(date_str, '%m-%d-%Y')
    if now is None:
        now = datetime.now()
    return (now - date).days

In [8]:
# Wrap `days_since` as a transformer; validation is disabled so the raw date
# string is handed to the function untouched.
date_transformer = FunctionTransformer(func=days_since, validate=False)

##### Test FunctionTransformer

In [9]:
# Sanity-check the transformer on a single date string. `days_since` measures
# against the current time, so the displayed value changes from run to run.
test_date = '03-03-2022'
encoded_data = date_transformer.transform(test_date)
encoded_data

287

#### Create OrdinalEncoder to encode `age_group` feature

In [10]:
# Every age group the encoder must know, one single-column row per group.
# Fitting on this explicit list (rather than on whatever groups happen to be
# present in the sample data) guarantees that all six groups always receive a
# code, so the ordinal codes span 0..5 regardless of the contents of `df`.
age_groups = [['18-24'], ['25-34'], ['35-44'], ['45-54'], ['55-64'], ['65+']]
age_group_encoder = OrdinalEncoder().fit(age_groups)

##### Test OrdinalEncoder

In [11]:
# Encode one sample age group. The sample is wrapped in an outer list so that
# `transform` receives a 2-D input (one row, one column).
test_age_group = ['35-44']
encoded_data = age_group_encoder.transform([test_age_group])
encoded_data

array([[2.]])

#### Encode numeric features using MinMaxScaler

In [12]:
# Fit a min-max scaler on the `total_purchases` column; the trailing bare
# name displays the fitted scaler.
minmax_scaler_total_purchases = MinMaxScaler().fit(df[['total_purchases']])
minmax_scaler_total_purchases

MinMaxScaler(copy=True, feature_range=(0, 1))

##### Test MinMaxScaler

In [13]:
# Scale one sample value. The sample is wrapped in an outer list so that
# `transform` receives a 2-D input. In the ten rows displayed above the column
# ranges from 2 to 345, so 34 maps to (34 - 2) / (345 - 2).
test_total_purchases = [34]
encoded_data = minmax_scaler_total_purchases.transform([test_total_purchases])
encoded_data

array([[0.09329446]])

##### Apply MinMaxScaler on remaining numeric cols

In [14]:
# Fit a min-max scaler on the `total_reviews` column; the trailing bare name
# displays the fitted scaler.
minmax_scaler_total_reviews = MinMaxScaler().fit(df[['total_reviews']])
minmax_scaler_total_reviews

MinMaxScaler(copy=True, feature_range=(0, 1))

In [15]:
# Fit a min-max scaler on the `purchases_last_60_days` column; the trailing
# bare name displays the fitted scaler.
minmax_scaler_purchases_last_60_days = MinMaxScaler().fit(df[['purchases_last_60_days']])
minmax_scaler_purchases_last_60_days

MinMaxScaler(copy=True, feature_range=(0, 1))

In [16]:
# Fit a min-max scaler on the `reviews_last_60_days` column; the trailing
# bare name displays the fitted scaler.
minmax_scaler_reviews_last_60_days = MinMaxScaler().fit(df[['reviews_last_60_days']])
minmax_scaler_reviews_last_60_days

MinMaxScaler(copy=True, feature_range=(0, 1))

#### Encode feature `country` using OneHotEncoder

In [17]:
# Fit a one-hot encoder (sparse output) on the `country` column; the trailing
# bare name displays the fitted encoder.
onehot_encoder = OneHotEncoder(sparse=True).fit(df[['country']])
onehot_encoder

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

##### Test OneHotEncoder

In [18]:
# Encode one sample country and flatten the sparse single-row result into a
# plain list for display.
test_country = ['india']
encoded_data = list(onehot_encoder.transform([test_country]).toarray()[0])
encoded_data

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]

#### Serialize individual encoders as `.joblib` files

In [19]:
# Each fitted encoder paired with the file it is persisted to. The file names
# are the ones the FeatureEncoder loaders read back.
encoder_artifacts = [
    (date_transformer, './data/date_transformer.joblib'),
    (age_group_encoder, './data/age_group_encoder.joblib'),
    (minmax_scaler_total_purchases, './data/minmax_scaler_total_purchases.joblib'),
    (minmax_scaler_total_reviews, './data/minmax_scaler_total_reviews.joblib'),
    (minmax_scaler_purchases_last_60_days, './data/minmax_scaler_purchases_last_60_days.joblib'),
    (minmax_scaler_reviews_last_60_days, './data/minmax_scaler_reviews_last_60_days.joblib'),
    (onehot_encoder, './data/onehot_encoder.joblib'),
]
for fitted_encoder, artifact_path in encoder_artifacts:
    written_files = joblib.dump(fitted_encoder, artifact_path)
# Display the return value of the final dump call.
written_files

['./data/onehot_encoder.joblib']

### Feature Encoder

* The feature encoder below encapsulates all the individual encoders created above into a callable interface.
* It transforms an incoming raw payload into a feature vector that can be consumed by either a machine learning model or a service.

In [20]:
# Directory that FeatureEncoder reads the serialized `.joblib` encoders from.
DATA_PATH = './data'

In [21]:
class FeatureEncoder:
    """Turn a raw comma-separated profile record into a list of encoded features.

    Each fitted encoder is read from ``DATA_PATH/<name>.joblib`` the first time
    it is needed and is then cached on the class, so later calls reuse the
    in-memory object instead of reading the file again.

    NOTE(review): ``joblib.load`` unpickles its input, so only artefacts
    produced by a trusted source should be placed in ``DATA_PATH``. The pickled
    date transformer presumably refers to its wrapped function by name, in
    which case that function must be defined in the loading process — confirm
    before using these artefacts outside this notebook.
    """

    # Encoder caches; each stays None until its loader is first called.
    date_transformer = None
    minmax_scaler_total_purchases = None
    minmax_scaler_total_reviews = None
    minmax_scaler_purchases_last_60_days = None
    minmax_scaler_reviews_last_60_days = None
    onehot_encoder = None
    age_group_encoder = None

    # Divisor applied to the value returned by the date transformer.
    # NOTE(review): the origin of 10024 is not shown here — confirm.
    MAX_ACTIVE_DAYS = 10024
    # Divisor applied to the ordinal age-group code.
    MAX_AGE_GROUP = 5

    # Number of comma-separated fields `encode` expects in a payload.
    _FIELD_COUNT = 7

    @classmethod
    def _load(cls, name):
        """Return the encoder cached under class attribute `name`, loading
        ``DATA_PATH/<name>.joblib`` on first use."""
        encoder = getattr(cls, name)
        if encoder is None:
            with open(os.path.join(DATA_PATH, f'{name}.joblib'), 'rb') as file_:
                encoder = joblib.load(file_)
            setattr(cls, name, encoder)
        return encoder

    @staticmethod
    def _scale(encoder, feature):
        """Run one value through a fitted single-column scaler and return the
        scalar result.

        `feature` may be a number or a numeric string (payload fields arrive
        as strings). It is converted with ``float()`` so the scaler always
        receives a numeric input instead of depending on scikit-learn's
        implicit string-to-number coercion.
        """
        return encoder.transform([[float(feature)]])[0][0]

    @classmethod
    def load_date_transformer(cls):
        """Return the cached date transformer, loading it on first use."""
        return cls._load('date_transformer')

    @classmethod
    def load_minmax_scaler_total_purchases(cls):
        """Return the cached `total_purchases` scaler, loading it on first use."""
        return cls._load('minmax_scaler_total_purchases')

    @classmethod
    def load_minmax_scaler_total_reviews(cls):
        """Return the cached `total_reviews` scaler, loading it on first use."""
        return cls._load('minmax_scaler_total_reviews')

    @classmethod
    def load_minmax_scaler_purchases_last_60_days(cls):
        """Return the cached `purchases_last_60_days` scaler, loading it on first use."""
        return cls._load('minmax_scaler_purchases_last_60_days')

    @classmethod
    def load_minmax_scaler_reviews_last_60_days(cls):
        """Return the cached `reviews_last_60_days` scaler, loading it on first use."""
        return cls._load('minmax_scaler_reviews_last_60_days')

    @classmethod
    def load_age_group_encoder(cls):
        """Return the cached age-group encoder, loading it on first use."""
        return cls._load('age_group_encoder')

    @classmethod
    def load_onehot_encoder(cls):
        """Return the cached country one-hot encoder, loading it on first use."""
        return cls._load('onehot_encoder')

    @classmethod
    def encode_date(cls, feature):
        """Return the date transformer's output for `feature` divided by
        ``MAX_ACTIVE_DAYS``."""
        encoder = cls.load_date_transformer()
        return encoder.transform(feature) / cls.MAX_ACTIVE_DAYS

    @classmethod
    def encode_total_purchases(cls, feature):
        """Return `feature` scaled by the `total_purchases` scaler."""
        return cls._scale(cls.load_minmax_scaler_total_purchases(), feature)

    @classmethod
    def encode_total_reviews(cls, feature):
        """Return `feature` scaled by the `total_reviews` scaler."""
        return cls._scale(cls.load_minmax_scaler_total_reviews(), feature)

    @classmethod
    def encode_purchases_last_60_days(cls, feature):
        """Return `feature` scaled by the `purchases_last_60_days` scaler."""
        return cls._scale(cls.load_minmax_scaler_purchases_last_60_days(), feature)

    @classmethod
    def encode_reviews_last_60_days(cls, feature):
        """Return `feature` scaled by the `reviews_last_60_days` scaler."""
        return cls._scale(cls.load_minmax_scaler_reviews_last_60_days(), feature)

    @classmethod
    def encode_country(cls, feature):
        """Return the one-hot encoding of `feature` as a flat list."""
        encoder = cls.load_onehot_encoder()
        return list(encoder.transform([[feature]]).toarray()[0])

    @classmethod
    def encode_age_group(cls, feature):
        """Return the ordinal code of `feature` divided by ``MAX_AGE_GROUP``."""
        encoder = cls.load_age_group_encoder()
        return encoder.transform([[feature]])[0][0] / cls.MAX_AGE_GROUP

    @classmethod
    def encode(cls, features):
        """Encode one raw record.

        Parameters
        ----------
        features : str
            Seven comma-separated fields in this order: active_since,
            total_purchases, total_reviews, purchases_last_60_days,
            reviews_last_60_days, country, age_group. Whitespace around each
            field is ignored.

        Returns
        -------
        list
            One entry per field, in the same order. The country entry is
            itself a list (the one-hot vector); the others are scalars.

        Raises
        ------
        ValueError
            If the record does not contain exactly seven fields, or if a
            numeric field cannot be converted to a number.
        """
        fields = [field.strip() for field in features.split(',')]
        if len(fields) != cls._FIELD_COUNT:
            raise ValueError(
                f'Expected {cls._FIELD_COUNT} comma-separated fields, got {len(fields)}'
            )
        (active_since, total_purchases, total_reviews, purchases_last_60_days,
         reviews_last_60_days, country, age_group) = fields
        return [
            cls.encode_date(active_since),
            cls.encode_total_purchases(total_purchases),
            cls.encode_total_reviews(total_reviews),
            cls.encode_purchases_last_60_days(purchases_last_60_days),
            cls.encode_reviews_last_60_days(reviews_last_60_days),
            cls.encode_country(country),
            cls.encode_age_group(age_group),
        ]

In [22]:
def get_encoded_data(features):
    """Encode a raw comma-separated record and return it as one flat list.

    The per-feature results from ``FeatureEncoder.encode`` (scalars plus the
    one-hot list) are stacked horizontally into a single 1-D sequence and
    converted to a plain Python list.
    """
    per_feature = FeatureEncoder.encode(features)
    flattened = np.hstack(per_feature)
    return flattened.tolist()

#### Test Feature Encoder

In [23]:
# Sample raw record, in the order FeatureEncoder.encode unpacks it:
# active_since, total_purchases, total_reviews, purchases_last_60_days,
# reviews_last_60_days, country, age_group.
payload = '10-19-2021,145,24,22,14,usa,65+'

In [24]:
# Encode the sample payload and display the flattened feature vector.
response = get_encoded_data(payload)
response

[0.04209896249002394,
 0.41690962099125367,
 0.3142857142857143,
 0.25925925925925924,
 0.208955223880597,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0]