#### Prerequisites

In [None]:
%%capture

!pip install joblib

### Imports 

In [2]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import pandas as pd
import joblib

### Feature transforms 

##### Read data

In [3]:
df = pd.read_csv('./data/profile-info.csv')
df

Unnamed: 0,active_since,total_purchases,total_reviews,purchases_last_60_days,reviews_last_60_days,country,age_group
0,11-02-2015,35,3,3,1,india,18-24
1,04-23-2011,47,23,2,0,portugal,65+
2,01-13-2014,34,2,1,0,usa,35-44
3,12-16-2022,2,72,1,67,china,55-64
4,04-04-2022,10,34,10,32,usa,25-34
5,10-19-2021,345,24,82,14,usa,35-44
6,11-04-2019,32,2,2,1,canada,45-54
7,03-08-2020,34,23,1,1,canada,18-24
8,04-03-2019,234,24,2,5,spain,25-34
9,09-04-2022,24,45,2,10,belgium,18-24


#### Custom date transformer

In [4]:
def days_since(date_str):
    date = datetime.strptime(date_str, '%m-%d-%Y')
    now = datetime.now()
    return (now - date).days

In [5]:
date_transformer = FunctionTransformer(days_since, validate=False)

In [6]:
test_date = '03-03-2022'
encoded_data = date_transformer.transform(test_date)
encoded_data

286

#### Encode `age_groups` using Ordinal Encoder

In [7]:
age_groups = [['18-24'], ['25-34'], ['35-44'], ['45-54'], ['55-64'], ['65+']]
age_group_encoder = OrdinalEncoder().fit(df[['age_group']])

In [8]:
test_age_group = ['35-44']
encoded_data = age_group_encoder.transform([test_age_group])
encoded_data

array([[2.]])

#### Transform numeric columns using MinMax Scaler

In [9]:
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(df[['total_purchases']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [10]:
test_total_purchases = [34]
encoded_data = minmax_scaler.transform([test_total_purchases])
encoded_data

array([[0.09329446]])

#### Transform column `country` using one-hot encoding

In [11]:
onehot_encoder = OneHotEncoder(sparse=True)
onehot_encoder.fit(df[['country']])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [12]:
# Use the fitted encoder to transform new data
test_country = ['india']
encoded_data = onehot_encoder.transform([test_country])
encoded_data = list(encoded_data.toarray()[0])
encoded_data

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]

#### Serialize transformers/encoders as .jbl files

In [13]:
joblib.dump(date_transformer, './data/date_transformer.joblib')

['./data/date_transformer.joblib']

In [14]:
joblib.dump(age_group_encoder, './data/age_group_encoder.joblib')

['./data/age_group_encoder.joblib']

In [15]:
joblib.dump(minmax_scaler, './data/minmax_scaler.joblib')

['./data/minmax_scaler.joblib']

In [16]:
joblib.dump(onehot_encoder, './data/onehot_encoder.joblib')

['./data/onehot_encoder.joblib']

### Feature Encoder

* Bundles all feature transforms into a callable interface for transforming incoming payload 

In [17]:
import joblib
import os

In [18]:
DATA_PATH = './data'

In [35]:
class FeatureEncoder:
    
    date_transformer = None
    age_group_encoder = None
    minmax_scaler = None
    onehot_encoder = None
    
    @classmethod
    def load_date_transformer(cls):
        if cls.date_transformer is None:
            with open(os.path.join(DATA_PATH, 'date_transformer.joblib'), 'rb') as file_:
                cls.date_transformer = joblib.load(file_)
        return cls.date_transformer
    
    @classmethod
    def load_age_group_encoder(cls):
        if cls.age_group_encoder is None:
            with open(os.path.join(DATA_PATH, 'age_group_encoder.joblib'), 'rb') as file_:
                cls.age_group_encoder = joblib.load(file_)
        return cls.age_group_encoder
    
    @classmethod
    def load_minmax_scaler(cls):
        if cls.minmax_scaler is None:
            with open(os.path.join(DATA_PATH, 'minmax_scaler.joblib'), 'rb') as file_:
                cls.minmax_scaler = joblib.load(file_)
        return cls.minmax_scaler
    
    @classmethod
    def load_onehot_encoder(cls):
        if cls.onehot_encoder is None:
            with open(os.path.join(DATA_PATH, 'onehot_encoder.joblib'), 'rb') as file_:
                cls.onehot_encoder = joblib.load(file_)
        return cls.onehot_encoder
    
    
    @classmethod
    def encode_date(cls, feature):
        encoder = cls.load_date_transformer()
        return encoder.transform(feature)
    
    @classmethod
    def encode_age_group(cls, feature):
        encoder = cls.load_age_group_encoder()
        return encoder.transform(feature)
    
    @classmethod
    def encode_numeric_cols(cls, feature):
        encoder = cls.load_minmax_scaler()
        return encoder.transform(feature)
    
    @classmethod
    def encode_country(cls, feature):
        encoder = cls.load_onehot_encoder()
        return encoder.transform(feature)
    
    @classmethod
    def encode(cls, features):
        feature_vector = []
        active_since, total_purchases, total_reviews, purchases_last_60_days, reviews_last_60_days, country, age_group = features.split(',')
        feature_vector.append(cls.encode_date(active_since))
        feature_vector.append(cls.encode_numeric_cols())
        feature_vector.append()
        feature_vector.append()
        feature_vector.append()
        
        return feature_vector

In [36]:
def get_encoded_data(features):
    encoded_data = FeatureEncoder().encode(features)
    return encoded_data

#### Test Feature Encoder

In [37]:
payload = '10-19-2021,345,24,82,14,usa,35-44'

In [38]:
response = get_encoded_data(payload)
response

10-19-2021,345,24,82,14,usa,35-44


[421]