In [None]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"


import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers
from keras.layers import StringLookup
import math

In [None]:
import plotly.express as px
import plotly.io as pio
# Select the Plotly renderer used by every fig.show() call in this notebook.
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'

In [None]:
from mlp import Params

# Data
- The Instacart Market Basket Analysis dataset from Kaggle is used as the sample data.

In [None]:
class Data:
    """Namespace holding the raw Instacart CSV tables as class attributes.

    The CSV files are read once, when this class body is executed. Later cells
    attach derived frames (e.g. ``Data.train``) as additional attributes.
    """

    # Directory containing the CSV files. The original location stays the
    # default; set the INSTACART_DATA_DIR environment variable to point at a
    # different copy of the dataset without editing the notebook.
    data_path = os.environ.get(
        "INSTACART_DATA_DIR",
        "/Volumes/PS2000W/instacart-market-basket-analysis/",
    )
    # os.path.join works whether or not data_path ends with a separator.
    order_products__prior = pd.read_csv(
        os.path.join(data_path, "order_products__prior.csv"))
    order_products__train = pd.read_csv(
        os.path.join(data_path, "order_products__train.csv"))
    orders = pd.read_csv(os.path.join(data_path, "orders.csv"))
    products = pd.read_csv(os.path.join(data_path, "products.csv"))
    

In [None]:
# This conversion is needed because TensorFlow detects numeric ids as integers even when they are converted to strings.
def convert_to_str(x):
    """Return ``x`` converted to an int and rendered as a ``'b_'``-prefixed string."""
    as_int = int(x)
    return f"b_{as_int}"

In [None]:
# Rewrite every id column as a 'b_'-prefixed string, one (frame, column) pair
# at a time and in the same order as the individual assignments it replaces.
for _frame, _column in (
    (Data.products, 'product_id'),
    (Data.products, 'aisle_id'),
    (Data.products, 'department_id'),
    (Data.orders, 'user_id'),
    (Data.order_products__train, 'product_id'),
    (Data.order_products__prior, 'product_id'),
):
    _frame[_column] = _frame[_column].apply(convert_to_str)

In [None]:
# Train order lines enriched with product attributes and with the order-level
# columns of the orders whose eval_set is 'train'.
Data.train = pd.merge(
    pd.merge(
        Data.order_products__train,
        Data.products,
        on='product_id',
        how='left',
    ),
    Data.orders.loc[
        Data.orders['eval_set'] == 'train',
        ['order_id', 'user_id', 'order_dow', 'order_hour_of_day', 'order_number'],
    ],
    on='order_id',
    how='left',
)

In [None]:
# Prior order lines enriched with product attributes and with the order-level
# columns of the orders whose eval_set is 'prior'.
Data.prior = pd.merge(
    pd.merge(
        Data.order_products__prior,
        Data.products,
        on='product_id',
        how='left',
    ),
    Data.orders.loc[
        Data.orders['eval_set'] == 'prior',
        ['order_id', 'user_id', 'order_dow', 'order_hour_of_day', 'order_number'],
    ],
    on='order_id',
    how='left',
)

In [None]:
# 1-based position of each order line within its user's history, ordered by
# order number and then by add-to-cart position. The result keeps the original
# row labels, so the assignment aligns it back onto Data.train.
Data.train['ts'] = (
    Data.train
    .sort_values(by=["user_id", "order_number", "add_to_cart_order"])
    .groupby("user_id")
    .cumcount()
    .add(1)
)

In [None]:
# Show the first row transposed, so every column is listed vertically.
Data.train.head(1).T

In [None]:
# One row per distinct user that appears in the train frame.
Data.users = pd.DataFrame(
    {'user_id': Data.train.user_id.unique().tolist()}
)

# EDA

### product order cnt & product user cnt

In [None]:
# Number of train order lines (non-null order_id values) per product,
# most-ordered products first.
product_order_cnt = (
    Data.order_products__train
    .groupby("product_id")["order_id"]
    .count()
    .rename("p_order_cnt")
    .reset_index()
    .sort_values(by='p_order_cnt', ascending=False)
)

# Min-max scale the counts.
_min = product_order_cnt['p_order_cnt'].min()
_max = product_order_cnt['p_order_cnt'].max()
product_order_cnt['p_order_cnt_norm'] = (
    product_order_cnt['p_order_cnt'].sub(_min).div(_max - _min)
)

# Any missing value left by the scaling becomes 0.
product_order_cnt = product_order_cnt.fillna(0)

In [None]:
# Preview of the train order lines with the ordering user's id attached.
# The result is only displayed, not stored.
Data.order_products__train.merge(
    Data.orders[['order_id', 'user_id']],
    on='order_id',
    how='left'
)

In [None]:
# Number of distinct users who bought each product in the train orders,
# most widely bought products first.
product_user_cnt = (
    pd.merge(
        Data.order_products__train,
        Data.orders[['order_id', 'user_id']],
        on='order_id',
        how='left',
    )
    .groupby("product_id")["user_id"]
    .nunique()
    .rename("p_user_cnt")
    .reset_index()
    .sort_values(by='p_user_cnt', ascending=False)
)

# Min-max scale the counts.
_min = product_user_cnt['p_user_cnt'].min()
_max = product_user_cnt['p_user_cnt'].max()
product_user_cnt['p_user_cnt_norm'] = (
    product_user_cnt['p_user_cnt'].sub(_min).div(_max - _min)
)

# Any missing value left by the scaling becomes 0.
product_user_cnt = product_user_cnt.fillna(0)

In [None]:
# Attach the per-product user and order counts to the product table.
Data.products = pd.merge(
    pd.merge(
        Data.products,
        product_user_cnt[['product_id', 'p_user_cnt']],
        on='product_id',
        how='left',
    ),
    product_order_cnt[['product_id', 'p_order_cnt']],
    on='product_id',
    how='left',
)
Data.products.head(1).T

### user order cnt

In [None]:
# Number of distinct orders per user, across all rows of Data.orders.
user_order_cnt = (
    Data.orders
    .groupby("user_id")["order_id"]
    .nunique()
    .rename("u_order_cnt")
    .reset_index()
)

# Distribution of orders per user.
fig = px.histogram(user_order_cnt, x="u_order_cnt", nbins=20)
fig.show()

In [None]:
# Attach the per-user order count to the user table.
Data.users = pd.merge(
    Data.users,
    user_order_cnt,
    on='user_id',
    how='left',
)
Data.users.head(1).T

In [None]:
### user product cnt

In [None]:
# Number of distinct products per user in the train frame.
user_product_cnt = (
    Data.train
    .groupby("user_id")["product_id"]
    .nunique()
    .rename("u_product_cnt")
    .reset_index()
)

# Distribution of distinct products per user.
fig = px.histogram(user_product_cnt, x="u_product_cnt", nbins=20)
fig.show()

In [None]:
# Attach the per-user distinct-product count to the user table.
Data.users = pd.merge(
    Data.users,
    user_product_cnt,
    on='user_id',
    how='left',
)
Data.users.head(1).T

### rankings

In [None]:
# Train order lines together with the two normalised product popularity
# measures; these are the ingredients of the relevance score.
rankings = pd.merge(
    pd.merge(
        Data.order_products__train[
            ['order_id', 'product_id', 'add_to_cart_order', 'reordered']
        ],
        product_order_cnt[['product_id', 'p_order_cnt_norm']],
        on='product_id',
        how='left',
    ),
    product_user_cnt[['product_id', 'p_user_cnt_norm']],
    on='product_id',
    how='left',
)
rankings.head()

In [None]:
# Weighted sum of the four signals (weights 0.1 / 0.3 / 0.3 / 0.3). The terms
# are accumulated in the same left-to-right order as a plain `+` chain.
# NOTE(review): add_to_cart_order is used as-is while the two popularity terms
# are min-max scaled -- confirm the differing scales are intended.
rankings['relevance_scores'] = (
    rankings['add_to_cart_order'].mul(.1)
    .add(rankings['reordered'].mul(.3))
    .add(rankings['p_order_cnt_norm'].mul(.3))
    .add(rankings['p_user_cnt_norm'].mul(.3))
)

In [None]:
# Distribution of the relevance scores, used to eyeball the rating buckets.
fig = px.histogram(rankings, x="relevance_scores", nbins=20)
fig.show()

In [None]:
def get_ranking(r):
    """Bucket a relevance score into a rating from 1 to 5.

    Upper bounds are inclusive: ``<= 0.5`` -> 1, ``<= 1.5`` -> 2, ``<= 2`` -> 3,
    ``<= 3.5`` -> 4, anything larger -> 5. Returns ``None`` when no comparison
    holds (for example when ``r`` is NaN).
    """
    for upper_bound, rating in ((.5, 1), (1.5, 2), (2, 3), (3.5, 4)):
        if r <= upper_bound:
            return rating
    if r > 3.5:
        return 5
    return None

# Turn every relevance score into its 1-5 rating bucket, then display the frame.
rankings['rating'] = rankings.relevance_scores.apply(
    get_ranking
)
rankings

In [None]:
# Number of distinct products that fall into each rating bucket.
rankings.groupby("rating").product_id.agg(pd.Series.nunique)

In [None]:
# Attach the rating of every (order, product) pair to the train frame.
Data.train = pd.merge(
    Data.train,
    rankings[['order_id', 'product_id', 'rating']],
    on=['order_id', 'product_id'],
    how='left',
)

In [None]:
# Show the first row transposed, now including the rating column.
Data.train.head(1).T

# pre-process

In [None]:
# Columns of Data.train that are cast to strings before sequences are built.
lookups = [
    "user_id",
    "product_id",
    "order_dow",
    "order_number",
    "order_hour_of_day",
    "aisle_id",
    "department_id"
]


sequential_features = [
    "product_id",
    "rating",
]


# NOTE(review): "user_order_cnt" does not match the "u_order_cnt" column built
# in the EDA section, and this list is not referenced in the cells visible
# here -- confirm before relying on it.
user_features = [
    "user_order_cnt",
    "order_hour_of_day",
]


item_features = [
    "p_order_cnt",
    "p_user_cnt",
    "department_id",
    "aisle_id",
]

# Window length (history items + 1 target item) and stride between windows.
sequence_length = 4
step_size = 2


params = {
    "user_id": "user_id",
    "item_id": "product_id",
    # Single source of truth: reuse the window length defined above.
    "sequence_length": sequence_length,
    "num_heads": 3,
    "hidden_layers": 2,
    "hidden_units": 256,
    "dropout_rate": 0.1
}


# Categorical model inputs that get a lookup/embedding, i.e. every entry of
# `categorical_features` except 'sequence_ratings'. The list is spelled out
# because `categorical_features` is only defined in the later
# "feature_selection" cell, so deriving it here raised NameError on a fresh
# kernel; an explicit list also keeps the feature order deterministic, which a
# set difference does not. Keep it in sync with `categorical_features`.
lookup_features = [
    "user_id",
    "sequence_product_ids",
    "target_product_id",
    "aisle_id",
    "department_id",
]

In [None]:
# Cast all lookup columns to strings in a single assignment.
Data.train[lookups] = Data.train[lookups].astype(str)

In [None]:
# Per-user groups of the train rows, each ordered by the 'ts' position.
train_data = (
    Data.train
    .sort_values(by=["user_id", "ts"])
    .groupby(by="user_id")
)

In [None]:
# Collapse the per-user groups into one row per user holding the full product
# and rating histories as Python lists.
# Note: `train_data` is rebound here from the groupby object to a DataFrame,
# so this cell cannot be re-run without re-running the groupby cell first.
# NOTE(review): assumes groups.keys() iterates in the same order as the
# per-group apply results -- confirm for the pandas version in use.
train_data = pd.DataFrame(
    {
        "user_id": list(train_data.groups.keys()),
        "product_ids": list(train_data['product_id'].apply(list)),
        "ratings": list(train_data['rating'].apply(list))
    }
)

In [None]:
def convert_to_str(seq):
    """Join the items of ``seq`` into one comma-separated string.

    Note: this reuses the name of the id-prefixing helper defined near the top
    of the notebook and replaces it once this cell has run.
    """
    return ",".join(map(str, seq))

    
def create_sequences(values, window_size, step_size):
    """Cut ``values`` into fixed-length windows.

    Windows start at 0, ``step_size``, ``2 * step_size``, ... for as long as a
    full window fits. If those strides leave trailing items uncovered, one
    extra window made of the last ``window_size`` items is appended, so the end
    of the history is never dropped. (Previously that tail window was computed
    but never appended.)

    Args:
        values: ordered items of one history (any iterable; copied to a list).
        window_size: number of items per window.
        step_size: stride between consecutive window starts.

    Returns:
        A list of lists, each of length ``window_size``. A history shorter than
        ``window_size`` yields a single window padded by repeating its last
        item; an empty history yields ``[]`` instead of raising IndexError.
    """
    values = list(values)
    total = len(values)
    if total == 0:
        return []
    if total < window_size:
        # Too short for one full window: pad by repeating the last item.
        return [values + [values[-1]] * (window_size - total)]

    last_start = total - window_size
    sequences = [
        values[start:start + window_size]
        for start in range(0, last_start + 1, step_size)
    ]
    # The strided windows reach the end only when last_start is a multiple of
    # step_size; otherwise add the tail window (never a duplicate).
    if last_start % step_size != 0:
        sequences.append(values[last_start:])
    return sequences

In [None]:
# Window both per-user histories with the same length and stride, so the
# product and rating windows stay aligned.
for seq in ('product_ids', 'ratings'):
    train_data[f"sequence_{seq}"] = train_data[seq].apply(
        create_sequences,
        args=(sequence_length, step_size),
    )

In [None]:
# Inspect the first user's histories and windows.
train_data.head(1).T

In [None]:
# One row per window: explode both window columns together so each product
# window stays paired with its rating window.
train_data = (
    train_data
    .loc[:, ['user_id', 'sequence_product_ids', 'sequence_ratings']]
    .explode(column=['sequence_product_ids', 'sequence_ratings'])
)

train_data.head(2)

In [None]:
# Split every window into model input and label: all items but the last stay
# as the input sequence, the last product becomes 'target_product_id' and the
# last rating becomes 'target'.
# All four values are computed from the row before any column is overwritten.
train_data[[
    'sequence_product_ids',
    'sequence_ratings',
    'target_product_id', 
    'target'
]] = train_data.apply(
    lambda row:
    pd.Series([
        row['sequence_product_ids'][:-1],
        row['sequence_ratings'][:-1],
        row['sequence_product_ids'][-1], 
        row['sequence_ratings'][-1]
    ]),
    axis=1
)

In [None]:
# Inspect the first window after the sequence/target split.
train_data.head(1).T

In [None]:
# Attach the target product's attributes (the product table is keyed by
# 'target_product_id' for this join) and then the per-user features.
train_data = pd.merge(
    pd.merge(
        train_data,
        Data.products.rename(columns={"product_id": "target_product_id"}),
        on='target_product_id',
        how='left',
    ),
    Data.users,
    on='user_id',
    how='left',
)
train_data.head(1).T

# feature_selection

In [None]:
# Categorical model inputs; each of them gets a keras.Input in `Inputs`.
categorical_features = [
    'user_id', 
    'sequence_product_ids', 
    'sequence_ratings',
    'target_product_id',
    'aisle_id',
    'department_id'
]
# Numeric model inputs (product-level and user-level counts).
numeric_features = [
    'p_user_cnt', 
    'p_order_cnt', 
    'u_order_cnt',
    'u_product_cnt'
]
# Name of the label column.
target = "target"

In [None]:
# Keep only the model inputs and the label, in that order.
train_data = train_data[[*categorical_features, *numeric_features, target]]

In [None]:
# Cast the numeric inputs and the label to float in one pass. astype() with a
# column mapping returns a new frame, which avoids assigning column-by-column
# into a frame that is itself a column subset of an earlier frame (the pattern
# that triggers pandas' SettingWithCopyWarning).
train_data = train_data.astype(
    {column: float for column in numeric_features + ['target']}
)

In [None]:
# Drop the product count frames from the notebook namespace.
del product_order_cnt, product_user_cnt

In [None]:
# Drop the user count frames from the notebook namespace.
del user_order_cnt, user_product_cnt

# train - validation split

In [None]:
# Probability that a row is assigned to the training split.
split_ratio = 0.85

In [None]:
# Row-wise random split. A seeded generator makes the split identical on every
# Restart & Run All; change the seed to draw a different split.
split_rng = np.random.default_rng(42)
random_selection = split_rng.random(len(train_data.index)) <= split_ratio
val_dataset = train_data[~random_selection]
train_dataset = train_data[random_selection]

In [None]:
# (rows, columns) of the validation and training splits.
val_dataset.shape, train_dataset.shape

# Lookups & Encoders

In [None]:
class Encoder:
    """Holds the string lookups and embedding layers used to encode model inputs.

    One ``StringLookup`` / ``Embedding`` pair is created for every lookup
    feature that is not an item column, plus a single pair keyed by the item id
    that is shared by the target item and the item sequence.

    Args:
        params: dict read for ``'item_id'`` and ``'sequence_length'``.
        lookup_features: names of the categorical features to embed.
    """

    def __init__(self, params, lookup_features):
        self.params = params
        self.item_id = params.get('item_id')
        self.target_item_id = f"target_{params.get('item_id')}"
        self.sequence_item_ids = 'sequence_' + params.get('item_id') + 's'
        self.sequence_length = params.get('sequence_length')
        # One position index per sequence step: 0 .. sequence_length - 2.
        self.positions = tf.range(start=0, limit=self.sequence_length - 1, delta=1)
        self.lookups = {}
        self.lookup_features = lookup_features
        # Both item columns share the lookup/embedding stored under item_id.
        self.item_lookup_features = [self.target_item_id, self.sequence_item_ids]
        self.embedding_encoders = {}
        self.embedding_dims = {}
        self.item_embedding_processor = None
        self.position_embedding_encoder = None

    @classmethod
    def generate(cls, train_data, params, products, lookup_features):
        """Create an encoder and build all of its lookups and embedding layers.

        Args:
            train_data: frame whose columns supply the non-item vocabularies.
            params: see the class docstring.
            products: frame whose ``item_id`` column supplies the item vocabulary.
            lookup_features: names of the categorical features to embed.

        Returns:
            The initialised instance (of ``cls``, so subclasses are honoured).
        """
        encoder = cls(
            params=params,
            lookup_features=lookup_features
        )
        encoder.get_lookups(
            train_data,
            products
        )
        return encoder

    def update_lookups_and_embeddings(self, vocabulary, lookup):
        """Register the lookup, embedding size and embedding layer for ``lookup``.

        The embedding width is ``int(sqrt(len(vocabulary)))`` and the embedding
        table has ``len(vocabulary) + 1`` rows (vocabulary plus one OOV index).
        """
        # NOTE(review): oov_token is the integer 0 on a string lookup --
        # confirm this is intended.
        self.lookups[lookup] = StringLookup(
            vocabulary=vocabulary, mask_token=None, oov_token=0,  num_oov_indices=1)
        self.embedding_dims[lookup] = int(math.sqrt(len(vocabulary)))
        self.embedding_encoders[lookup] = layers.Embedding(
                input_dim=len(vocabulary)+1,
                output_dim=self.embedding_dims[lookup],
                name=f"{lookup}_embedding",
            )

    def get_lookups(self, train_data, products):
        """Build lookups/embeddings for every feature, the item id and positions."""
        for lookup in self.lookup_features:
            if lookup not in self.item_lookup_features:
                # Vocabulary = distinct string values of the training column.
                vocabulary = train_data[lookup].astype(str).unique().tolist()
                self.update_lookups_and_embeddings(vocabulary, lookup)

        # Item id lookup and embedding, built from the product table.
        vocabulary = products[self.item_id].astype(str).unique().tolist()
        self.update_lookups_and_embeddings(vocabulary, self.item_id)
        self.item_embedding_processor = layers.Dense(
            units=self.embedding_dims[self.item_id],
            activation="relu",
            name=f"process_{self.item_id}_embedding",
        )
        self.position_embedding_encoder = layers.Embedding(
            input_dim=self.sequence_length - 1,
            output_dim=self.embedding_dims[self.item_id],
            name="position_embedding",
        )

    def query(self, inp, lookup):
        """Apply the embedding layer registered under ``lookup`` to ``inp``."""
        return self.embedding_encoders[lookup](inp)

    def item_embeddings(self, inputs):
        """Embed the target item and the item sequence with the shared item layers.

        Returns:
            ``(target_embedding, sequence_embedding)``, each passed through the
            item embedding layer and then the item Dense processor.
        """
        emb_target = self.query(inputs[self.target_item_id], self.item_id)
        emb_target = self.item_embedding_processor(emb_target)
        emb_seq = self.query(inputs[self.sequence_item_ids], self.item_id)
        emb_seq = self.item_embedding_processor(emb_seq)
        return emb_target, emb_seq

    def get_embeddings(self, inputs):
        """Encode the model inputs.

        Args:
            inputs: mapping from feature name to tensor; must contain the
                non-item lookup features, both item columns and
                ``"sequence_ratings"``.

        Returns:
            ``(encoded_transformer, encoded)``: the rating-weighted,
            position-aware item sequence, and the concatenated embeddings of
            the remaining lookup features (``None`` when there are none).
        """
        encoded = []
        encoded_transformer = []
        for lookup in self.lookup_features:
            if lookup not in self.item_lookup_features:
                encoded.append(self.query(inputs[lookup], lookup))

        # Collapse the non-item embeddings into a single tensor.
        if len(encoded) > 1:
            encoded = layers.concatenate(encoded)
        elif len(encoded) == 1:
            encoded = encoded[0]
        else:
            encoded = None

        # NOTE(review): the target item embedding is computed here but not used
        # below -- confirm whether it should feed the transformer input.
        (
            encoded_target_item,
            encoded_sequence_items
        ) = self.item_embeddings(
            inputs
        )
        encoded_positions = self.position_embedding_encoder(self.positions)
        sequence_ratings = keras.ops.expand_dims(inputs["sequence_ratings"], -1)

        # Add the position embedding to each step, then scale by its rating.
        encoded_sequence_items_with_position_and_rating = layers.Multiply()(
            [(encoded_sequence_items + encoded_positions), sequence_ratings]
        )

        # Construct the transformer inputs: one slice per step, re-joined on axis 1.
        for i in range(self.sequence_length - 1):
            feature = encoded_sequence_items_with_position_and_rating[:, i, ...]
            feature = keras.ops.expand_dims(feature, 1)
            encoded_transformer.append(feature)

        encoded_transformer = layers.concatenate(
            encoded_transformer, axis=1
        )

        return encoded_transformer, encoded
        

In [None]:
# Build all lookups and embedding layers. Non-item vocabularies are taken from
# the full train_data frame; the item vocabulary comes from Data.products.
encoders = Encoder.generate(
    train_data=train_data,
    params=params,
    products=Data.products,
    lookup_features=lookup_features
)

In [None]:
# Display the lookup layers that were created, keyed by feature name.
encoders.lookups

# create `tf.data.Dataset`

In [None]:
# Tensors for the item columns, the rating sequence and the label.
# Both item columns are indexed through the shared item-id lookup; per-row
# scalars are reshaped to (rows, 1).
train_feature_dataset = {
    encoders.target_item_id: tf.reshape(
        encoders.lookups[encoders.item_id](train_dataset[encoders.target_item_id]), (len(train_dataset),1)
    ),
    encoders.sequence_item_ids: (
        encoders.lookups[encoders.item_id](train_dataset[encoders.sequence_item_ids].tolist())),
    "sequence_ratings": tf.cast(train_dataset['sequence_ratings'].tolist(), tf.float32),
    "target": tf.reshape(tf.cast(train_dataset['target'], tf.float32), (len(train_dataset),1))
}

In [None]:
# Add the remaining lookup features (those not already present in the dict),
# each indexed through its own lookup and reshaped to (rows, 1).
for f in lookup_features:
    if f in train_feature_dataset:
        continue
    train_feature_dataset[f] = tf.reshape(
        encoders.lookups[f](train_dataset[f]),
        (len(train_dataset), 1),
    )

In [None]:
# Add the numeric features as float32 tensors reshaped to (rows, 1).
for f in numeric_features:
    as_float32 = tf.cast(train_dataset[f], tf.float32)
    train_feature_dataset[f] = tf.reshape(as_float32, (len(train_dataset), 1))

In [None]:
def preprocess(feature):
    """Split a feature dict into ``(features, label)``.

    The ``'target'`` entry is removed from ``feature`` in place and returned as
    the label; the same dict object is returned as the features.
    """
    label = feature.pop('target')
    return feature, label

In [None]:
# Build the input pipeline: slice the tensor dict row-wise, cache, shuffle
# with a 5000-element buffer, batch by 32, and prefetch last so that whole
# batches (not single rows) are prepared ahead of the consumer.
# Note: this rebinds `train_feature_dataset` from a dict to a Dataset, so the
# cell cannot be re-run without rebuilding the dict first.
train_feature_dataset = (
    tf.data.Dataset.from_tensor_slices(train_feature_dataset)
    .cache()
    .shuffle(5000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Turn every batch dict into a (features, target) pair.
train_feature_dataset = train_feature_dataset.map(preprocess)

In [None]:
# Grab the first (features, target) batch for the sanity checks below.
i = next(iter(train_feature_dataset.take(1)))
i

In [None]:
# Target tensor of the sampled batch.
i[1]

In [None]:
class Inputs:
    """Creates one ``keras.Input`` per model feature, keyed by feature name.

    Args:
        params: dict read for ``'item_id'`` and ``'sequence_length'``.
        categorical_features: names of the categorical inputs.
        numeric_features: names of the numeric inputs.

    The inputs are stored in ``self.inputs`` (categorical first, then numeric).
    """

    def __init__(
        self,
        params,
        categorical_features,
        numeric_features
    ):
        item_id = params.get('item_id')
        self.params = params
        self.item_id = item_id
        self.target_item_id = f"target_{item_id}"
        self.sequence_item_ids = 'sequence_' + item_id + 's'
        self.sequence_length = params.get('sequence_length')
        self.categorical_features = categorical_features
        self.numeric_features = numeric_features
        self.inputs = {}
        self.collect_inputs()

    def collect_inputs(self):
        """Populate ``self.inputs``.

        The item sequence and ``"sequence_ratings"`` get width
        ``sequence_length - 1``; every other feature gets width 1.
        """
        sequence_names = (self.sequence_item_ids, "sequence_ratings")
        for cat in self.categorical_features:
            width = self.sequence_length - 1 if cat in sequence_names else 1
            self.inputs[cat] = keras.Input(name=cat, shape=(width,))

        for num in self.numeric_features:
            self.inputs[num] = keras.Input(name=num, shape=(1,))

In [None]:
# Create the keras.Input placeholders for all categorical and numeric features.
inputs = Inputs(
        params,
        categorical_features, 
        numeric_features
)

In [None]:
# Names of the model inputs that were created.
inputs.inputs.keys()

In [None]:
# The keras.Input objects themselves, keyed by feature name.
inputs.inputs

In [None]:
class Transformer:
    """Assembles the Keras model: one attention block over the item sequence,
    followed by a stack of fully-connected layers and a single-unit output.

    Args:
        params: dict read for ``'num_heads'``, ``'dropout_rate'``,
            ``'hidden_layers'`` and ``'hidden_units'``.
        inputs: the ``Inputs`` holder whose ``inputs`` dict feeds the model.
        encoders: the ``Encoder`` that turns those inputs into embeddings.
    """

    def __init__(self, params, inputs: Inputs, encoders: Encoder):
        self.params = params
        self.inputs = inputs
        self.encoders = encoders
        self.num_heads = params.get('num_heads')
        self.dropout_rate = params.get('dropout_rate')
        # Widths of the fully-connected layers, e.g. [256, 128].
        self.hidden_units = self.cal_hidden_layer_of_units(
            params.get('hidden_layers'),
            params.get('hidden_units')
        )

    @staticmethod
    def cal_hidden_layer_of_units(
        hidden_layers, _encoding_dim, autoencoder_layers=False
    ):
        """Return the list of layer widths for a tower.

        hidden_layers:
            number of hidden layers to create
        _encoding_dim:
            width of the first hidden layer; each following layer is half as
            wide (truncated to int)
        autoencoder_layers:
            when True, after the halving stops the widths are doubled again
            for ``hidden_layers + 1`` more entries, starting at the bottleneck
        Halving stops early once the next width would truncate to 1.
        Examples:
            hidden_layers=3, _encoding_dim=16, autoencoder_layers=False
                -> [16, 8, 4]
            hidden_layers=3, _encoding_dim=16, autoencoder_layers=True
                -> [16, 8, 4, 2 (bottleneck), 4, 8, 16]
        """
        widths = []
        width = _encoding_dim
        stop_at = hidden_layers + 1
        layer_no = 1
        while layer_no != stop_at:
            widths.append(int(width))
            width /= 2
            # A width of 1 ends the shrinking phase immediately.
            layer_no = stop_at if int(width) == 1 else layer_no + 1
        if autoencoder_layers:
            layer_no = 1
            while layer_no != hidden_layers + 2:
                widths.append(int(width))
                width *= 2
                layer_no += 1
        return widths

    def create_model(self):
        """Build and return the (uncompiled) ``keras.Model``."""
        sequence_features, context_features = self.encoders.get_embeddings(
            self.inputs.inputs
        )

        # Self-attention over the encoded item sequence.
        attended = layers.MultiHeadAttention(
            num_heads=self.num_heads,
            key_dim=sequence_features.shape[2],
            dropout=self.dropout_rate
        )(sequence_features, sequence_features)

        # Transformer block: residual + norm, then a dense sub-layer with its
        # own residual + norm.
        attended = layers.Dropout(self.dropout_rate)(attended)
        residual = layers.Add()([sequence_features, attended])
        residual = layers.LayerNormalization()(residual)
        feed_forward = layers.LeakyReLU()(residual)
        feed_forward = layers.Dense(units=feed_forward.shape[-1])(feed_forward)
        feed_forward = layers.Dropout(self.dropout_rate)(feed_forward)
        sequence_features = layers.Add()([residual, feed_forward])
        sequence_features = layers.LayerNormalization()(sequence_features)
        features = layers.Flatten()(sequence_features)

        # Append the non-sequence embeddings, when there are any.
        if context_features is not None:
            flat_context = layers.Reshape([context_features.shape[-1]])(context_features)
            features = layers.concatenate([features, flat_context])

        # Fully-connected stack.
        for num_units in self.hidden_units:
            features = layers.Dense(num_units)(features)
            features = layers.BatchNormalization()(features)
            features = layers.LeakyReLU()(features)
            features = layers.Dropout(self.dropout_rate)(features)

        outputs = layers.Dense(units=1)(features)
        return keras.Model(inputs=self.inputs.inputs, outputs=outputs)
        

In [None]:
# Wire the hyper-parameters, input placeholders and encoders together.
transformer = Transformer(
    params,
    inputs,
    encoders
)

In [None]:
# Sanity check: run the encoders eagerly on the features of the sampled batch.
transformer.encoders.get_embeddings(
    i[0]
)

In [None]:
# Build the functional Keras model.
model = transformer.create_model()

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Regression setup: mean squared error loss on the rating target, with mean
# absolute error reported as a metric.
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)

In [None]:
# Sanity check: forward pass of the model on the features of the sampled batch.
model(i[0])

In [None]:
# Train on the batched dataset. The batch size is set by the dataset pipeline
# (`.batch(32)`); Keras documents that `batch_size` must not be passed to
# fit() when the input is a tf.data.Dataset, so it is omitted here.
model.fit(train_feature_dataset)