<h1>CS677 - Deep Learning Final Project</h1> 
<hr>

Project team members - 

<i>Sajin Shajee</i><br>
<i>Sanjeet Navinbhai Gajjar</i><br>
<i>Ahamed Arafaath Muthalif Mubarak Ali</i><br>
<i>Rohit Subramanian</i>   

In [38]:
# Install TensorFlow Recommenders quietly (-q); it is imported below as `tfrs`.
!pip install -q tensorflow-recommenders
# NOTE(review): no ScaNN API is referenced in the visible cells (retrieval uses BruteForce) - confirm this install is still needed.
!pip install -q scann

<h2><u>Load essential packages</u></h2>

In [24]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from typing import Dict, Text
from tensorflow import keras
from tqdm.notebook import tqdm
tqdm.pandas()
import tensorflow_recommenders as tfrs
# TensorFlow >= 2.0 is required, with or without a GPU.
assert tf.__version__ >= "2.0"

# Probe for a GPU. Only the "API not available" case falls back to the legacy
# probe; a bare `except:` here would also swallow assertion failures and
# keyboard interrupts.
try:
    gpu_available = bool(tf.config.list_physical_devices('GPU'))
except AttributeError:
    # Older TensorFlow builds do not expose tf.config.list_physical_devices.
    gpu_available = tf.test.is_gpu_available()

if not gpu_available:
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os
import datetime as dt
from pathlib import Path

# to make this notebook's output stable across runs
np.random.seed(42)
# Seed TensorFlow's global RNG. The same seed is used whether or not a GPU is
# present, so no device probe (and no exception handling) is needed here.
tf.random.set_seed(42)

# Figure defaults: render plots inline and enlarge axis/tick label fonts.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

<h4><u>Function zero_f</u></h4>
Input: A single ID value (anything convertible to a string).<br>
Output: The ID as a string, zero-padded to a standard length of 10 characters.<hr>
Description: Converts the ID to a string, then prepends zeros on the left until it is 10 characters long. In this notebook it is applied to `article_id`.

In [25]:
def zero_f(item):
    """Return ``item`` as a string left-padded with zeros to at least 10 characters.

    Args:
        item: Any value; it is converted with ``str()`` before padding.

    Returns:
        str: The string form of ``item``. Strings shorter than 10 characters
        receive leading zeros; strings of 10 or more characters are returned
        unchanged.
    """
    # str.zfill leaves strings that are already long enough untouched, so no
    # explicit length check is required.
    return str(item).zfill(10)

<h2><u>Data Loading and Preprocessing with tensorflow</u></h2>

In [26]:
# Transactions: IDs are read as strings; every row is given a unit quantity.
trans_train = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'customer_id': str, 'article_id': str},
).assign(quantity=1)
# Keep only recent purchases. `t_dat` is compared against a string literal
# here - presumably ISO formatted (YYYY-MM-DD) so the ordering is chronological; TODO confirm.
recent_mask = trans_train['t_dat'] > '2020-08-01'
trans_train = trans_train.loc[recent_mask]

# Article and customer catalogue tables, again with string IDs.
articles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str, 'product_code': str},
)
customers = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv',
    dtype={'customer_id': str},
)

# Interaction table: string customer/article/date columns, article IDs
# zero-padded to 10 characters, plus the quantity as a float.
master_df = (
    trans_train[['customer_id', 'article_id', 't_dat']]
    .astype(str)
    .assign(
        article_id=lambda frame: frame['article_id'].apply(zero_f),
        quantity=trans_train['quantity'].astype(float),
    )
)
masterdf = master_df  # second name for the same frame, used by later cells

# Number of distinct customers in the filtered transactions (cell output).
np.unique(trans_train['customer_id']).size

In [27]:
# Preview the first 10 rows of the interaction table.
masterdf.head(10)

In [28]:
# Preview the first 10 rows of the article table.
articles.head(10)

In [30]:
# Preview the first 10 rows of the customer table.
customers.head(10)

In [31]:
# Feature Transformation
interactions = masterdf
interactions['t_dat'] = pd.to_datetime(interactions['t_dat'])

# Chronological train / validation / test split.
# The transactions loaded above are all later than 2020-08-01, so the cut-offs
# must be in 2020. With 2019 cut-offs the validation set is empty and the test
# set contains every training row.
train = interactions[interactions['t_dat'] <= '2020-09-15']
valid = interactions[(interactions['t_dat'] > '2020-09-15') & (interactions['t_dat'] <= '2020-09-17')]
test = interactions[interactions['t_dat'] > '2020-09-17']
print(len(np.unique(train['customer_id'])))


def _id_pairs_dataset(frame):
    """Wrap the customer_id / article_id columns of ``frame`` in a tf.data.Dataset of dicts."""
    return tf.data.Dataset.from_tensor_slices(dict(frame[['customer_id', 'article_id']]))


# Batch processing and loading with tensorflow.
# Only the training set is shuffled; all three are batched by 256 and cached.
train_ds = _id_pairs_dataset(train).shuffle(100_000).batch(256).cache()
valid_ds = _id_pairs_dataset(valid).batch(256).cache()
test_ds = _id_pairs_dataset(test).batch(256).cache()

In [32]:
# Feature Engineering
# Distinct IDs from the catalogue tables, each wrapped in a one-key dict of NumPy arrays.
items_dict = {'article_id': np.array(articles['article_id'].drop_duplicates())}
customer_dict = {'customer_id': np.array(customers['customer_id'].drop_duplicates())}

# Turn the dicts into datasets and unwrap them so each element is a bare ID.
# NOTE: `customers` is rebound here from a DataFrame to a tf.data.Dataset, so
# this cell cannot be re-run without reloading the customer table first.
items = tf.data.Dataset.from_tensor_slices(items_dict).map(lambda row: row['article_id'])
customers = tf.data.Dataset.from_tensor_slices(customer_dict).map(lambda row: row['customer_id'])

In [33]:
def _unique_ids(dataset):
    """Collect every element of ``dataset`` (in batches of 1,000) and return the sorted unique values."""
    return np.unique(np.concatenate(list(dataset.batch(1_000))))


### Vocabularies of unique item and user IDs, used as lookup tables by the models.
unique_items = _unique_ids(items)
unique_user_ids = _unique_ids(customers)

# Re-seed TensorFlow's global RNG (no shuffling or splitting happens in this cell).
tf.random.set_seed(42)

<h3><u>Function my_l1_regularizer and my_positive_weights</u></h3>
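Input: Trainable weights <br>
Output: `my_l1_regularizer` returns a scalar L1 penalty (the sum of |0.01 · w|); `my_positive_weights` returns the weights with every negative entry set to zero <hr>
Description: A custom L1 regularizer and a non-negativity weight constraint, both used by the final dense layer of the improved user model.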

In [34]:
# Custom regularizer used when building the improved model
def my_l1_regularizer(weights):
    """Return the L1 penalty for ``weights``: the sum of the absolute values of 0.01 * weights."""
    scaled = 0.01 * weights
    penalty = tf.reduce_sum(tf.abs(scaled))
    return penalty
def my_positive_weights(weights):
    """Return ``weights`` with every negative entry replaced by zero.

    Non-negative entries are passed through unchanged.
    """
    is_negative = weights < 0.
    zeros = tf.zeros_like(weights)
    return tf.where(is_negative, zeros, weights)

<h2><u>Model Training</u></h2>

Original Model

In [35]:
class CandidateModel(tfrs.Model):
    """Two-tower retrieval model pairing a user (query) tower with an item (candidate) tower.

    Args:
        user_model: Keras model mapping a batch of customer IDs to embeddings.
        item_model: Keras model mapping a batch of article IDs to embeddings.
    """

    def __init__(self, user_model, item_model):
        super().__init__()

        self.item_model: tf.keras.Model = item_model
        self.user_model: tf.keras.Model = user_model

        # Top-K retrieval metrics, computed against the embeddings of every
        # article in the module-level `items` dataset.
        metrics = tfrs.metrics.FactorizedTopK(candidates=items.batch(256).map(item_model))

        # The retrieval task supplies both the loss and the metrics.
        self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(metrics=metrics)

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the retrieval loss for one batch.

        Args:
            features: Dict with "customer_id" and "article_id" tensors.
            training: Whether the call happens during training. It is forwarded
                to both towers so that mode-dependent layers (Dropout,
                BatchNormalization) behave correctly; without it they fall back
                to their default mode.

        Returns:
            The loss tensor produced by the retrieval task.
        """
        # Embed the customers with the user tower.
        user_embeddings = self.user_model(features["customer_id"], training=training)
        # Embed the purchased articles with the item tower.
        article_embeddings = self.item_model(features["article_id"], training=training)

        # The task computes the loss and updates the metrics.
        return self.task(user_embeddings, article_embeddings)

In [None]:
### Fitting and evaluating the baseline model

### Dimensionality of the query and candidate representations.
embedding_dimension = 64

## Candidate tower: article ID -> vocabulary index -> embedding.
item_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_items, mask_token=None),
    # One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
    tf.keras.layers.Embedding(len(unique_items) + 1, embedding_dimension),
])

## Query tower: customer ID -> vocabulary index -> embedding.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    # One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
])

model_1 = CandidateModel(user_model, item_model)
model_1.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
# `train_ds` is already batched (256 per batch), so no `batch_size` is passed:
# Keras documents that batch_size must not be specified for dataset inputs.
model_1.fit(train_ds, epochs=5)
model_1.evaluate(test_ds, return_dict=True)

In [None]:
# generate submission file for kaggle submission
submission_file = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
                              dtype={'customer_id': str})

# Build the retrieval index for model_1 in this cell, so it does not depend on
# an `index` / `run_f` that only exist if later cells were executed first.
index_1 = tfrs.layers.factorized_top_k.BruteForce(model_1.user_model)
index_1.index_from_dataset(items.batch(100).map(lambda ids: (ids, model_1.item_model(ids))))


def _recommend_with_model_1(customer_id):
    """Return the top 12 article IDs for ``customer_id`` as one space-separated string."""
    _, top_articles = index_1(tf.constant([customer_id]), k=12)
    return " ".join(article.decode('UTF-8') for article in top_articles[0].numpy())


# NOTE(review): `sub_cust` and `sub_df` are not read anywhere in the visible cells.
sub_cust = submission_file["customer_id"]
sub_df = pd.DataFrame(columns=['Customer_Id', 'Article_Id'])
submission_file["prediction"] = submission_file['customer_id'].progress_apply(_recommend_with_model_1)
submission_file.head(10)

In [None]:
# Write the current submission table to submission_1.csv without the index column.
submission_file.to_csv('submission_1.csv',index=False)

<h3><u>Model Improvement</u></h3>

In [None]:
### Fitting and evaluating the improved model

### Dimensionality of the query and candidate representations.
embedding_dimension = 64

## Candidate tower: article ID -> vocabulary index -> embedding.
item_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_items, mask_token=None),
    # One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
    tf.keras.layers.Embedding(len(unique_items) + 1, embedding_dimension),
])

## Query tower: embedding followed by a small MLP with batch norm and dropout.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    # One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
    # NOTE(review): presumably a no-op when the input is a 1-D batch of IDs - confirm.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(embedding_dimension, activation="relu",
                          kernel_initializer='he_normal', use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(embedding_dimension, activation="relu",
                          kernel_initializer='he_normal', use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    # Output projection back to the embedding size, with the custom L1 penalty
    # and the non-negative weight constraint defined earlier.
    tf.keras.layers.Dense(embedding_dimension,
                          kernel_regularizer=my_l1_regularizer,
                          kernel_constraint=my_positive_weights),
])

model_2 = CandidateModel(user_model, item_model)
model_2.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
# `train_ds` is already batched (256 per batch), so no `batch_size` is passed:
# Keras documents that batch_size must not be specified for dataset inputs.
model_2.fit(train_ds, validation_data=valid_ds, epochs=5)
model_2.evaluate(test_ds, return_dict=True)

In [None]:
# Vocabulary of unique article IDs
unique_items

In [None]:
# Vocabulary of unique customer IDs
unique_user_ids 

<h2><u>Model evaluation</u></h2>

In [None]:
# Build a brute-force top-K index over the article embeddings.
# No variable named `model` is defined in this notebook; `model_2` (the
# improved model) is used, since the predictions from this index are saved as
# submission_2.csv.
index = tfrs.layers.factorized_top_k.BruteForce(model_2.user_model)
# Each indexed element is an (article IDs, article embeddings) pair.
index.index_from_dataset(items.batch(100).map(lambda ids: (ids, model_2.item_model(ids))))

In [None]:
# Predicted recommendations for one sample customer
j = '000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210'
# The index returns (scores, identifiers); only the identifiers are needed.
_, titles = index(tf.constant([j]),k=12)
# Use a single f-string: applying `%` formatting on top of an f-string fails
# whenever the interpolated text itself contains a '%'.
print(f"Recommendations for user {j}: {titles[0]}")

<h3><u>Function decoder and run_f</u></h3>
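Input: `decoder` takes a UTF-8 byte string; `run_f` takes a customer ID from the submission file <br>
Output: `decoder` returns the decoded text; `run_f` returns the customer's top 12 recommended article IDs joined into one space-separated string <hr>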

In [None]:
def decoder(e):
    """Decode the UTF-8 byte string ``e`` and return the resulting text."""
    text = e.decode('UTF-8')
    return text
def run_f(item):
    """Return the top 12 recommendations for ``item`` as one space-separated string.

    ``item`` is wrapped in a one-element constant tensor and passed to the
    module-level ``index`` with ``k=12``; the returned identifiers are decoded
    from bytes with ``decoder`` and joined with single spaces.
    """
    query = tf.constant([item])
    _, top_ids = index(query, k=12)
    decode_all = np.vectorize(decoder)
    decoded_ids = decode_all(np.array(top_ids[0]))
    return " ".join(decoded_ids)

The next cell converts each customer's recommendations into a single space-separated string of article IDs and stores it in the `prediction` column, which is then written to the CSV file. Because `progress_apply` is used instead of `apply`, TQDM displays a progress bar while the customers are processed.

In [None]:
# NOTE(review): `sub_cust` and `sub_df` are assigned but not read in the visible cells.
sub_cust = submission_file["customer_id"]
sub_df = pd.DataFrame(columns=['Customer_Id', 'Article_Id'])
# Overwrite the prediction column with run_f's output for every customer ID
# (progress_apply shows a tqdm progress bar while it runs).
submission_file["prediction"] = submission_file['customer_id'].progress_apply(run_f)
# Preview the first 10 rows.
submission_file.head(10)

In [None]:
# Write the current submission table to submission_2.csv without the index column.
submission_file.to_csv('submission_2.csv',index=False)