In [1]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
# (Detected by checking whether the platform-specific module is already loaded.)
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare (major, minor) numerically: comparing the raw version strings is
# lexicographic, so e.g. "0.9" would wrongly be accepted as >= "0.20".
assert tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2]) >= (0, 20)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
import tensorflow_recommenders as tfrs

# Check the major version numerically and unconditionally; previously the
# assertion only ran when no GPU was found and compared version strings.
assert int(tf.__version__.split(".")[0]) >= 2

# Detect a GPU once. Fall back to the older probe only when the newer
# `tf.config.list_physical_devices` API is missing from this TensorFlow build.
try:
    HAS_GPU = bool(tf.config.list_physical_devices('GPU'))
except AttributeError:
    HAS_GPU = tf.test.is_gpu_available()

if not HAS_GPU:
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os
import datetime as dt
from pathlib import Path

# To make this notebook's output stable across runs
np.random.seed(42)
# Which seeding function exists depends on the TensorFlow version, not on
# whether a GPU is attached, so probe for the API rather than the hardware.
try:
    tf.random.set_seed(42)
except AttributeError:
    # Legacy spelling kept as a fallback for TensorFlow builds without
    # `tf.random.set_seed`.
    tf.random.set_random_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied in one update.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

No GPU was detected. LSTMs and CNNs can be very slow without a GPU.


In [2]:
def read_files(file_path, **kwargs):
    """Load the articles, customers and transactions CSVs and index their IDs.

    Parameters
    ----------
    file_path : str or path-like
        Directory containing ``articles.csv``, ``customers.csv`` and
        ``transactions_train.csv``. A trailing separator is optional.
    **kwargs
        Optional column selections forwarded to ``pd.read_csv`` as ``usecols``:
        ``art_cols`` (articles), ``cust_cols`` (customers) and ``trans_cols``
        (transactions). When given, they must still include ``article_id`` /
        ``customer_id``, which this function needs to build the lookups.

    Returns
    -------
    tuple
        ``(customer_lookup, article_lookup, trans_df, unique_users, unique_items)``

        * ``customer_lookup`` -- dict mapping each ``customer_id`` to the string
          form of its row position in ``customers.csv``.
        * ``article_lookup`` -- dict mapping each ``article_id`` to the string
          form of its row position in ``articles.csv``.
        * ``trans_df`` -- the transactions with ``customer_id`` / ``article_id``
          replaced by the mapped ``user_id`` / ``item_id`` columns. IDs missing
          from a lookup become NaN.
        * ``unique_users`` / ``unique_items`` -- arrays of the distinct
          ``user_id`` / ``item_id`` values that occur in the transactions.
    """

    def _load_csv(file_name, usecols_key):
        # Joining path components (instead of concatenating strings) makes the
        # trailing separator optional and accepts path-like objects.
        csv_args = dict(filepath_or_buffer=os.path.join(file_path, file_name),
                        low_memory=False)
        if usecols_key in kwargs:
            csv_args['usecols'] = kwargs[usecols_key]
        return pd.read_csv(**csv_args)

    art_df = _load_csv('articles.csv', 'art_cols')
    cust_df = _load_csv('customers.csv', 'cust_cols')
    trans_df = _load_csv('transactions_train.csv', 'trans_cols')

    # reset_index() exposes the row position as an 'index' column, which is
    # then keyed by the original ID and stored as a string.
    customer_lookup = cust_df.reset_index().set_index('customer_id')['index'].astype(str).to_dict()
    article_lookup = art_df.reset_index().set_index('article_id')['index'].astype(str).to_dict()

    trans_df['user_id'] = trans_df['customer_id'].map(customer_lookup)
    trans_df['item_id'] = trans_df['article_id'].map(article_lookup)

    unique_users = trans_df['user_id'].unique()
    unique_items = trans_df['item_id'].unique()

    # Only the mapped IDs are kept downstream.
    trans_df = trans_df.drop(columns=['customer_id', 'article_id'])

    return customer_lookup, article_lookup, trans_df, unique_users, unique_items


In [3]:
file_path = 'h-and-m-personalized-fashion-recommendations/'
customer_lookup, article_lookup, trans_data, user_vocab, item_vocab = read_files(
    file_path,
    cust_cols=['customer_id'],
    trans_cols=['customer_id', 'article_id', 't_dat'],
)

# t_dat is read as plain 'YYYY-MM-DD' strings, so the label slices below compare
# lexicographically and include both end points.
# NOTE(review): this relies on the transactions being sorted by t_dat -- confirm.
trans_data = trans_data.set_index('t_dat')

# Chronological split: 2018-2019 for training, 2020 onwards for testing.
# The upper bound is the real last day of 2019 (previously the invalid date
# '2019-12-32'). reset_index() already restores the 't_dat' column name, so no
# column renaming is needed afterwards.
train = trans_data.loc['2018-01-01':'2019-12-31'].reset_index()
test = trans_data.loc['2020-01-01':].reset_index()

In [4]:
# Inspect the last training rows to check where the training split ends.
train.tail()

Unnamed: 0,t_dat,user_id,item_id
20808187,2019-12-31,1371597,87180
20808188,2019-12-31,1371597,86968
20808189,2019-12-31,1371691,3914
20808190,2019-12-31,1371691,70758
20808191,2019-12-31,1371691,82035


In [5]:
# Inspect the first test rows to check where the test split begins.
test.head()

Unnamed: 0,t_dat,user_id,item_id
0,2020-01-01,1113,88913
1,2020-01-01,1402,80680
2,2020-01-01,1402,70323
3,2020-01-01,1618,81102
4,2020-01-01,1618,80281


In [6]:
# Turn each split into a tf.data.Dataset that yields one {column: value} dict per row.
# The names `train` and `test` are rebound from DataFrames to Datasets here.
train, test = (
    tf.data.Dataset.from_tensor_slices(dict(split_frame))
    for split_frame in (train, test)
)

<TensorSliceDataset shapes: {t_dat: (), user_id: (), item_id: ()}, types: {t_dat: tf.string, user_id: tf.string, item_id: tf.string}>