# Data Preparation

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import mean_squared_error, pairwise

In [2]:
pd.set_option('display.max_columns', 50)

In [3]:
articles = pd.read_csv('Data/h-and-m-personalized-fashion-recommendations/articles.csv')
articles.head(3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


This table contains all H&M articles with details such as a type of product, a color, a product group and other features.

Article data description:

- article_id : A unique identifier of every article.
- product_code, prod_name : A unique identifier of every product and its name (not the same).
- product_type, product_type_name : The group of product_code and its name
- graphical_appearance_no, graphical_appearance_name : The group of graphics and its name
- colour_group_code, colour_group_name : The group of color and its name
- perceived_colour_value_id, perceived_colour_value_name, perceived_colour_master_id, perceived_colour_master_name : The added color info
- department_no, department_name: : A unique identifier of every dep and its name
- index_code, index_name: : A unique identifier of every index and its name
- index_group_no, index_group_name: : A group of indeces and its name
- section_no, section_name: : A unique identifier of every section and its name
- garment_group_no, garment_group_name: : A unique identifier of every garment and its name
- detail_desc: : Details

In [4]:
articles.shape

(105542, 25)

In [5]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

In [6]:
customers = pd.read_csv('Data/h-and-m-personalized-fashion-recommendations/customers.csv')
customers.head(3)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...


Customers data description:

- customer_id : A unique identifier of every customer
- FN : 1 or missed
- Active : 1 or missed
- club_member_status : Status in club
- fashion_news_frequency : How often H&M may send news to customer
- age : The current age
- postal_code : Postal code of customer

In [7]:
customers.shape

(1371980, 7)

In [8]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 73.3+ MB


In [9]:
transactions = pd.read_csv('Data/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
transactions.head(3)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2


In [10]:
transactions['Bought'] = int(True)

In [11]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Bought
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,1
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,1
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,1
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,1
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,1
...,...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,1
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,1
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,1


In [12]:
transactions.shape

(31788324, 6)

Transactions data description:

- t_dat : A unique identifier of every customer
- customer_id : A unique identifier of every customer (in customers table)
- article_id : A unique identifier of every article (in articles table)
- price : Price of purchase
- sales_channel_id : 1 or 2

In [13]:
transactions.isnull().sum()

t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
Bought              0
dtype: int64

In [14]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Bought
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,1
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,1
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,1
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,1
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,1


Let's try surprise library for the collaborative recommender system. For this we want three columns -
- customer id
- article id 
- 1/0 for either the customer bought the article or not

So for every customer in the transactions df, we want to see which article did they buy or not. These articles are only the ones from the transactions df. 

In [15]:
import surprise
from surprise.prediction_algorithms import *
from surprise import Reader, Dataset

Data Cleaning and EDA again

In [16]:
transactions['InvoiceDate'] = pd.to_datetime(transactions['t_dat'],format='%Y-%m-%d')
transactions=transactions[["InvoiceDate", "customer_id", "article_id", "price","sales_channel_id"]].drop_duplicates()

In [17]:
transactions.shape

(28813419, 5)

Shape of the transactions df reduced from 31,788,324 transactions to 28,813,419.

In [18]:
transactions.head(3)

Unnamed: 0,InvoiceDate,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2


In [19]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28813419 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   InvoiceDate       datetime64[ns]
 1   customer_id       object        
 2   article_id        int64         
 3   price             float64       
 4   sales_channel_id  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 1.3+ GB


In [20]:
# unique customer ids in the transactions df 

len(transactions['customer_id'].unique())
unique_customers = transactions['customer_id'].unique().tolist()
unique_customers

['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318',
 '00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2',
 '00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280',
 '0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2',
 '000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8',
 '001127bffdda108579e6cb16080440e89bf1250a776c6e55f56e35e9ee029a8d',
 '001ea4e9c54f7e9c88811260d954edc059d596147e1cf8adc73323aebf571fd8',
 '001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61',
 '0021da829b898f82269fc51feded4eac2129058ee95bd75bb1591e2eb14ecc79',
 '00228762ecff5b8d1ea6a2e52b96dafa198febddbc3bf350eb611f28e61ea6ce',
 '002b3c0a44a22c45a8d62ea9d2b88d1a89e335f8b8400335a85c3b5845794cb9',
 '003ac6eff3ca20cbe4e14a106a5e28b7551616005de8f9dd0f9455e5543b31ac',
 '00401a367c5ac085cb9d4b77c56f3edcabf25153615db982fcc7991d182e10a9',
 '00402f4463c8dc1b3ee54abfdea280e96cd87320449eca8953eb06769a5c20d4',
 '0045c79125b4dc958579f902b49eacd8

In [21]:
# unique article ids in the transactions df 

len(transactions['article_id'].unique())
unique_articles = transactions['article_id'].unique().tolist()
unique_articles

[663713001,
 541518023,
 505221004,
 685687003,
 685687004,
 685687001,
 505221001,
 688873012,
 501323011,
 598859003,
 688873020,
 688873011,
 531310002,
 529841001,
 501820043,
 674681001,
 671505001,
 631848002,
 680187001,
 676827002,
 685687002,
 680912006,
 692454002,
 640639001,
 664421002,
 680912009,
 553139001,
 377277001,
 700819006,
 397068015,
 652075001,
 670295001,
 631744002,
 562252035,
 649356002,
 579941002,
 629760002,
 625229004,
 688545001,
 673531001,
 464277014,
 617322003,
 620405001,
 678239001,
 613456009,
 633675001,
 648719001,
 427114015,
 567475001,
 567594001,
 681358001,
 613456001,
 573937001,
 622745001,
 617322004,
 507909001,
 665481004,
 211143037,
 503729006,
 662857005,
 640174002,
 625939005,
 508184020,
 560783010,
 660150001,
 611415005,
 578374001,
 673677002,
 676352001,
 611415001,
 255396006,
 594834010,
 516712001,
 524825010,
 634015003,
 636351002,
 633781016,
 534795002,
 659550001,
 547752003,
 610671001,
 652731008,
 618476004,
 622

Now we know there are 1,362,281 (1,371,980 from customers df) customers who have transacted with 104,547 articles (105,542 from articles df). Now, we want to add a column in df where we know if a customer bought an article or not. 

In [22]:
purchase_df = transactions[['customer_id', 'article_id']]
purchase_df['purchase'] = 1
purchase_df.reset_index()
purchase_df.head(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  purchase_df['purchase'] = 1


Unnamed: 0,customer_id,article_id,purchase
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,1
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,1
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,1


In [23]:
purchase_df.isna().sum()

customer_id    0
article_id     0
purchase       0
dtype: int64

In [24]:
unique_customers

['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318',
 '00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2',
 '00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280',
 '0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2',
 '000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8',
 '001127bffdda108579e6cb16080440e89bf1250a776c6e55f56e35e9ee029a8d',
 '001ea4e9c54f7e9c88811260d954edc059d596147e1cf8adc73323aebf571fd8',
 '001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61',
 '0021da829b898f82269fc51feded4eac2129058ee95bd75bb1591e2eb14ecc79',
 '00228762ecff5b8d1ea6a2e52b96dafa198febddbc3bf350eb611f28e61ea6ce',
 '002b3c0a44a22c45a8d62ea9d2b88d1a89e335f8b8400335a85c3b5845794cb9',
 '003ac6eff3ca20cbe4e14a106a5e28b7551616005de8f9dd0f9455e5543b31ac',
 '00401a367c5ac085cb9d4b77c56f3edcabf25153615db982fcc7991d182e10a9',
 '00402f4463c8dc1b3ee54abfdea280e96cd87320449eca8953eb06769a5c20d4',
 '0045c79125b4dc958579f902b49eacd8

In [25]:
purchase_df.head(1)

Unnamed: 0,customer_id,article_id,purchase
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,1


In [26]:

# for customer in unique_customers:
# #     print(customer)
#     reqd_index = purchase_df.index[purchase_df['customer_id'] == customer].tolist()
# #     print(customer)
# #     print(reqd_index)
#     articles_bought = [items for items in purchase_df['article_id'][[index for index in reqd_index]]]
# #     print(len(articles_bought))
# #now add rows for customer id and articles not in the articles bought list and 0 in purchase column
#     articles_not_bought = [item for item in unique_articles if item not in articles_bought]
# #     print(len(articles_not_bought))
#     for article in articles_not_bought:
#         row = {'customer_id': customer, 'article_id': article, 'purchase': 0}
#         purchase_df = purchase_df.append(row, ignore_index = True)
# #         break
# #     break

KeyboardInterrupt: 

In [27]:
purchase_df.tail()

Unnamed: 0,customer_id,article_id,purchase
28819041,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,612891002,0
28819042,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,564792001,0
28819043,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,655869001,0
28819044,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,617245003,0
28819045,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,617279001,0


In [None]:
len(unique_articles)

In [None]:
purchase_df.loc[[23934158]]

In [None]:
purchase_df.loc[[0]]

In [None]:
reader = Reader(rating_scale = (0,1))

In [None]:
ds = Dataset.load_from_df(transactions[['customer_id', 'article_id', 'Bought']], reader)

In [None]:
ds

# RFM Analysis - 

RFM analysis is a marketing technique used to quantitatively rank and group customers based on the recency, frequency and monetary total of their recent transactions to identify the best customers and perform targeted marketing campaigns.

In [None]:
# import required libraries for clustering
# import sklearn
# from sklearn.preprocessing import StandardScaler
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# from scipy.cluster.hierarchy import linkage
# from scipy.cluster.hierarchy import dendrogram
# from scipy.cluster.hierarchy import cut_tree
from sklearn.metrics.pairwise import cosine_similarity

import datetime



In [None]:
transactions['InvoiceDate'] = pd.to_datetime(transactions['t_dat'],format='%Y-%m-%d')
transactions=transactions[["InvoiceDate", "customer_id", "article_id", "price","sales_channel_id"]].drop_duplicates()

In [None]:
transactions.shape

In [None]:
transactions.head(3)

In [None]:
start_date = datetime.datetime(2020,3,1)

# Filter transactions by date
transactions["t_dat"] = pd.to_datetime(transactions["InvoiceDate"])
transactions = transactions.loc[transactions["t_dat"] >= start_date]

In [None]:
#analysis_date = max(transactions['InvoiceDate']) + dt.timedelta(days= 1)
analysis_date=datetime.datetime(2020,9,23)
print((analysis_date).date())

In [None]:
transactions['date']=transactions['InvoiceDate']
(analysis_date - transactions['InvoiceDate'].max()).days

In [None]:
rfm = transactions.groupby('customer_id').agg({
    'InvoiceDate': lambda x: (analysis_date - x.max()).days,
    'date': 'count',
    'price': 'sum'})

In [None]:
rfm.head()

In [None]:
rfm.columns=["Recency","Frequency","Monetary"]
rfm = rfm[rfm["Monetary"] > 0]

In [None]:
#Date from customer's last purchase.The nearest date gets 5 and the furthest date gets 1.
rfm["recency_score"] = pd.qcut(rfm['Recency'], 5, labels=[5, 4, 3, 2, 1])
# Total number of purchases.The least frequency gets 1 and the maximum frequency gets 5.
rfm["frequency_score"] = pd.qcut(rfm["Frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
#Total spend by the customer.The least money gets 1, the most money gets 5.
rfm["monetary_score"]= pd.qcut(rfm["Monetary"],5,labels=[1,2,3,4,5])
rfm.head()

In [None]:
#RFM - The value of 2 different variables that were formed was recorded as a RFM_SCORE
rfm["RFM_SCORE"] = (rfm["recency_score"].astype(str) + rfm["frequency_score"].astype(str))

In [None]:
seg_map = {
    r'[1-2][1-2]': 'hibernating',
    r'[1-2][3-4]': 'at_Risk',
    r'[1-2]5': 'cant_loose',
    r'3[1-2]': 'about_to_sleep',
    r'33': 'need_attention',
    r'[3-4][4-5]': 'loyal_customers',
    r'41': 'promising',
    r'51': 'new_customers',
    r'[4-5][2-3]': 'potential_loyalists',
    r'5[4-5]': 'champions'
}
rfm['segment'] = rfm['RFM_SCORE'].replace(seg_map, regex=True)
rfm.head()

In [None]:
rfm[["segment", "Recency","Frequency","Monetary"]].groupby("segment").agg(["mean","count","max"]).round()

In [None]:
import plotly.express as px

In [None]:
x = rfm.segment.value_counts()
fig = px.treemap(x, path=[x.index], values=x)
fig.update_layout(title_text='Distribution of the RFM Segments', title_x=0.5,
                  title_font=dict(size=20))
fig.update_traces(textinfo="label+value+percent root")
fig.show()

### Item-Based Collaborative Filtering -using Probabilistic Matrix Factorization

Preparing the data We need to restrict the data respect to a minimum transaction date. In that way, we reduce the dimensionality of the problem and we get rid of transactions that are not important in terms of the time decaying popularity.

Also, we are getting rid of articles that have not been bought enough. (Minimum 10 purchases are required)

In [None]:
from tqdm import tqdm

In [None]:
rfm=rfm.reset_index()

In [None]:
 transactions.head(1)

In [None]:
transactions=pd.merge(transactions,rfm[["customer_id","segment"]],how='inner',on='customer_id')
training_segment = ['champions', 'potential_loyalists', 'new_customers','promising','loyal_customers']
transactions = transactions[transactions['segment'].isin(training_segment)]
transactions=transactions.drop('segment', axis=1)

In [None]:
start_date = datetime.datetime(2020,9,1)
# Filter transactions by date
transactions["t_dat"] = transactions["InvoiceDate"]
transactions = transactions.loc[transactions["InvoiceDate"] >= start_date]

In [None]:
# Filter transactions by number of an article has been bought
article_bought_count = transactions[['article_id', 'InvoiceDate']].groupby('article_id').count().reset_index().rename(columns={'InvoiceDate': 'count'})
most_bought_articles = article_bought_count[article_bought_count['count']>10]['article_id'].values
transactions = transactions[transactions['article_id'].isin(most_bought_articles)]
transactions["bought"]=1 

In [None]:
transactions

Due to the big amount of items, we can not consider the whole matrix in order to train. Therefore, we need to generate some negative samples: transactions that have never occured.

In [None]:
# Generate negative samples
np.random.seed(0)

negative_samples = pd.DataFrame({
    'article_id': np.random.choice(transactions.article_id.unique(), transactions.shape[0]),
    'customer_id': np.random.choice(transactions.customer_id.unique(), transactions.shape[0]),
    'bought': np.zeros(transactions.shape[0])
})

Model will be based on recommendations computed through the time decaying popularity and the most similar items to those items bought the most times by each user. Similarity among items is computed through cosine distance.

In [None]:
negative_samples.head()

In [None]:
transactions.head()

In [None]:
total_transactions = pd.concat([transactions, negative_samples])
total_transactions

In [None]:
# assign index to unique customers 
customer_id2index = {c: i for i, c in enumerate(np.unique(total_transactions.customer_id.values))}
len(customer_id2index)

In [None]:
#sniff test

customer_id2index['000b31552d3785c79833262bbeefa484cbc43d7b612b3c8fc696260b5afaadc4']

In [None]:
# assign index to unique articles in the transactions list
articles_id2index = {c: i for i, c in enumerate(np.unique(total_transactions.article_id.values))}
len(articles_id2index)

# only 9,225 articles compared to 105,542 in total!

In [None]:
class ItemBased_RecSys:
    ''' Collaborative filtering using a custom sim(u,u'). '''

    def __init__(self, positive_transactions, negative_transactions, num_components=10):
        ''' Constructor '''
        self.positive_transactions = positive_transactions
        self.transactions = pd.concat([positive_transactions, negative_transactions])
        self.customers = self.transactions.customer_id.values
        self.articles = self.transactions.article_id.values
        self.bought = self.transactions.bought.values
        self.num_components = num_components

        self.customer_id2index = {c: i for i, c in enumerate(np.unique(self.customers))}
        self.article_id2index = {a: i for i, a in enumerate(np.unique(self.articles))}
        
    def __sdg__(self):
        for idx in tqdm(self.training_indices):
            # Get the current sample
            customer_id = self.customers[idx]
            article_id = self.articles[idx]
            bought = self.bought[idx]

            # Get the index of the user and the article
            customer_index = self.customer_id2index[customer_id]
            article_index = self.article_id2index[article_id]

            # Compute the prediction and the error
            prediction = self.predict_single(customer_index, article_index)
            error = (bought - prediction) # error
            
            # Update latent factors in terms of the learning rate and the observed error
            self.customers_latent_matrix[customer_index] += self.learning_rate * \
                                    (error * self.articles_latent_matrix[article_index] - \
                                     self.lmbda * self.customers_latent_matrix[customer_index])
            self.articles_latent_matrix[article_index] += self.learning_rate * \
                                    (error * self.customers_latent_matrix[customer_index] - \
                                     self.lmbda * self.articles_latent_matrix[article_index])
                
                
    def fit(self, n_epochs=10, learning_rate=0.001, lmbda=0.1):
        ''' Compute the matrix factorization R = P x Q '''
        self.learning_rate = learning_rate
        self.lmbda = lmbda
        n_samples = self.transactions.shape[0]
        
        # Initialize latent matrices
        self.customers_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(self.customers)), self.num_components))
        self.articles_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(self.articles)), self.num_components))

        for epoch in range(n_epochs):
            print('Epoch: {}'.format(epoch))
            self.training_indices = np.arange(n_samples)
            
            # Shuffle training samples and follow stochastic gradient descent
            np.random.shuffle(self.training_indices)
            self.__sdg__()

    def predict_single(self, customer_index, article_index):
        ''' Make a prediction for an specific user and article '''
        prediction = np.dot(self.customers_latent_matrix[customer_index], self.articles_latent_matrix[article_index])
        prediction = np.clip(prediction, 0, 1)
        
        return prediction

    def default_recommendation(self):
        ''' Calculate time decaying popularity '''
        # Calculate time decaying popularity. This leads to items bought more recently having more weight in the popularity list.
        # In simple words, item A bought 5 times on the first day of the train period is inferior than item B bought 4 times on the last day of the train period.
        self.positive_transactions['pop_factor'] = self.positive_transactions['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days)
        transactions_by_article = self.positive_transactions[['article_id', 'pop_factor']].groupby('article_id').sum().reset_index()
        return transactions_by_article.sort_values(by='pop_factor', ascending=False)['article_id'].values[:12]


    def predict(self, customers):
        ''' Make recommendations '''
        recommendations = []
        self.articles_latent_matrix[np.isnan(self.articles_latent_matrix)] = 0
        # Compute similarity matrix (cosine)
        similarity_matrix = cosine_similarity(self.articles_latent_matrix, self.articles_latent_matrix, dense_output=False)

        # Convert similarity matrix into a matrix containing the 12 most similar items' index for each item
        similarity_matrix = np.argsort(similarity_matrix, axis=1)
        similarity_matrix = similarity_matrix[:, -12:]

        # Get default recommendation (time decay popularity)
        default_recommendation = self.default_recommendation()

        # Group articles by user and articles to compute the number of times each article has been bought by each user
        transactions_by_customer = self.positive_transactions[['customer_id', 'article_id', 'bought']].groupby(['customer_id', 'article_id']).count().reset_index()
        most_bought_article = transactions_by_customer.loc[transactions_by_customer.groupby('customer_id').bought.idxmax()]['article_id'].values

        # Make predictions
        for customer in tqdm(customers):
            try:
                rec_aux1 = []
                rec_aux2 = []
                aux = []

                # Retrieve the most bought article by customer
                user_most_bought_article_id = most_bought_article[self.customer_id2index[customer]]

                # Using the similarity matrix, get the 6 most similar articles
                rec_aux1 = self.articles[similarity_matrix[self.article_id2index[user_most_bought_article_id]]]
                # Return the half of the default recommendation
                rec_aux2 = default_recommendation

                # Merge half of both recommendation lists
                for rec_idx in range(6):
                    aux.append(rec_aux2[rec_idx])
                    aux.append(rec_aux1[rec_idx])

                recommendations.append(' '.join(aux))
            except:
                # Return the default recommendation
                recommendations.append(' '.join(default_recommendation))
        
        return pd.DataFrame({
            'customer_id': customers,
            'prediction': recommendations,
        })

Define your hyperparameters and fit the model. Take into account that there are more customizable parameters in the data processing section.

In [None]:
rec = ItemBased_RecSys(transactions, negative_samples, num_components=1000)
rec.fit(n_epochs=1)

In [None]:
customers_id_unique = customers['customer_id'].unique()

In [None]:
recommendations = rec.predict(customers_id_unique)

In [None]:
recommendations.head()