In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for plotting the data 
import seaborn as sns # Advanced data plotting on top of matplotlib
import os


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))
#/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv
#/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv
#/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv
#/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv    

**About data**

*images* - a folder of images corresponding to each article_id; images are placed in subfolders starting with the first three digits of the article_id; note, not all article_id values have a corresponding image.

*articles.csv* - detailed metadata for each article_id available for purchase

*customers.csv* - metadata for each customer_id in dataset

*sample_submission.csv* - a sample submission file in the correct format

*transactions_train.csv* - the training data, consisting of the purchases each customer for each date, as well as additional information. Duplicate rows correspond to multiple purchases of the same item. Your task is to predict the article_ids each customer will purchase during the 7-day period immediately after the training data period.

*target*: Make predictions for all customer_id values found in the sample submission. All customers who made purchases during the test period are scored, regardless of whether they had purchase history in the training data.

Submissions are evaluated according to the Mean Average Precision @ 12 (MAP@12).
For each customer_id observed in the training data, you may predict up to 12 labels for the article_id, which is the predicted items a customer will buy in the next 7-day period after the training time period. 

# EDA

In [3]:
#customers_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv")
#sample_submission = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv" )
#customers_df.head()

In [4]:
transactions_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")#, nrow=10000)
print(transactions_df.shape)
transactions_df.head()

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [5]:
# let's takr peace of data
transactions_df_sample = transactions_df.iloc[:100000,:].copy()
transactions_df_sample.shape

(100000, 5)

In [6]:
articles_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")
print(articles_df.shape)
articles_df.head().T  

(105542, 25)


Unnamed: 0,0,1,2,3,4
article_id,108775015,108775044,108775051,110065001,110065002
product_code,108775,108775,108775,110065,110065
prod_name,Strap top,Strap top,Strap top (1),OP T-shirt (Idro),OP T-shirt (Idro)
product_type_no,253,253,253,306,306
product_type_name,Vest top,Vest top,Vest top,Bra,Bra
product_group_name,Garment Upper body,Garment Upper body,Garment Upper body,Underwear,Underwear
graphical_appearance_no,1010016,1010016,1010017,1010016,1010016
graphical_appearance_name,Solid,Solid,Stripe,Solid,Solid
colour_group_code,9,10,11,9,10
colour_group_name,Black,White,Off White,Black,White


In [7]:
# Number of unique meanings of article features
for col in articles_df.columns:
    print(col, len(articles_df[col].unique()))
    

article_id 105542
product_code 47224
prod_name 45875
product_type_no 132
product_type_name 131
product_group_name 19
graphical_appearance_no 30
graphical_appearance_name 30
colour_group_code 50
colour_group_name 50
perceived_colour_value_id 8
perceived_colour_value_name 8
perceived_colour_master_id 20
perceived_colour_master_name 20
department_no 299
department_name 250
index_code 10
index_name 10
index_group_no 5
index_group_name 5
section_no 57
section_name 56
garment_group_no 21
garment_group_name 21
detail_desc 43405


In [8]:
# Number of unique meanings  of customers and articles
print(len(transactions_df_sample.customer_id.unique()))
print(len(transactions_df_sample.article_id.unique()))

28317
15581


In [9]:
print(transactions_df_sample.groupby("customer_id")["article_id"].count().mean())
transactions_df_sample.groupby("customer_id")["article_id"].count().value_counts()

3.531447540346788


1      8676
2      6346
3      4022
4      2663
5      1653
6      1289
7       821
8       689
9       470
10      373
11      282
12      208
13      157
15      115
14      103
16       71
19       52
18       51
17       49
20       38
21       27
24       25
22       19
23       17
25       14
28       12
26       12
31        8
27        8
29        7
37        6
32        6
36        4
39        3
40        3
52        3
30        3
38        2
43        2
33        1
103       1
34        1
35        1
47        1
45        1
42        1
72        1
Name: article_id, dtype: int64

In [10]:
print(transactions_df_sample.groupby("article_id")["customer_id"].count().value_counts().mean())
transactions_df_sample.groupby("article_id")["customer_id"].count().value_counts()

114.56617647058823


1      5207
2      2635
3      1577
4      1100
5       800
       ... 
142       1
163       1
167       1
86        1
797       1
Name: customer_id, Length: 136, dtype: int64

In [11]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()
transactions_df_sample[['article_id_enc', 'customer_id_enc']] = enc.fit_transform(transactions_df_sample[['article_id', 'customer_id']]) 

transactions_df_sample.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_enc,customer_id_enc
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,13274.0,1.0
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,3024.0,1.0
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,2027.0,2.0
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,14860.0,2.0
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,14861.0,2.0


In [12]:
transactions_df_sample['article_id_enc'] = transactions_df_sample['article_id_enc'].astype(int)
transactions_df_sample['customer_id_enc'] = transactions_df_sample['customer_id_enc'].astype(int)

In [13]:
item_features = articles_df[['product_type_no', 'index_group_no', 'garment_group_no', 'article_id']].copy()
item_features = item_features.merge(transactions_df_sample[['article_id_enc','article_id']].drop_duplicates(), how="left", on="article_id")
item_features.dropna(axis=0, inplace=True)
item_features['article_id_enc'] = item_features['article_id_enc'].astype(int)
item_features.set_index("article_id_enc", inplace=True)
item_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15581 entries, 0 to 15580
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   product_type_no   15581 non-null  int64
 1   index_group_no    15581 non-null  int64
 2   garment_group_no  15581 non-null  int64
 3   article_id        15581 non-null  int64
dtypes: int64(4)
memory usage: 608.6 KB


In [14]:
train_df, test_df = train_test_split(transactions_df_sample, test_size=0.1, random_state=42)

# Collaborative filtering 

In [15]:
import scipy.sparse as sp
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset


In [16]:
# Convert data to sparse matrix
# (np.float32 coo_matrix of shape [n_users, n_items]) – the matrix containing user-item interactions. 
# Will be converted to numpy.float32 dtype if it is not of that type.

data_sp = sp.coo_matrix((np.ones(train_df.shape[0]), (train_df.customer_id_enc.values, train_df.article_id_enc.values ) ))
data_sp.shape

(28317, 15581)

In [17]:
# build item features
# item_features (np.float32 csr_matrix of shape [n_items, n_item_features], optional) – Each row contains that item’s weights over features.
item_features_product_type_no = pd.get_dummies(item_features["product_type_no"])
item_features_product_type_no.shape

item_features_csr = sp.csr_matrix(item_features_product_type_no.values)
item_features_csr.shape

(15581, 96)

In [18]:
# Make split
train_sp, valid_sp = random_train_test_split(data_sp, test_percentage=0.2, random_state=42)

In [19]:
# Define the model

model = LightFM(no_components=30, loss='warp', random_state=42)
model.fit(train_sp, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f1e65fc1d10>

In [20]:
# Evaluate the model
print("Train AUC: %.2f" % auc_score(model, train_sp).mean())
print("Valid AUC: %.2f" % auc_score(model, valid_sp).mean())


Train AUC: 0.99
Valid AUC: 0.78


In [21]:
model.predict(train_df.customer_id_enc.values, train_df.article_id_enc.values)

array([-0.00773799,  0.5967291 , -0.8796036 , ..., -0.3862144 ,
        1.6393838 ,  1.1997406 ], dtype=float32)

In [22]:
# Model with item features
model2 = LightFM(no_components=150, loss='warp', random_state=42)
model2.fit(train_sp, item_features=item_features_csr, epochs=20, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f1e65fbd6d0>

In [23]:
# Evaluate the model
print("Train AUC: %.2f" % auc_score(model2, train_sp, item_features=item_features_csr).mean())
print("Valid AUC: %.2f" % auc_score(model2, valid_sp, item_features=item_features_csr).mean())

Train AUC: 0.92
Valid AUC: 0.69


In [24]:
# Dictinary for index converting
article_dict = dict(zip(train_df["article_id_enc"].values, train_df["article_id"].values))


In [25]:
def sample_recommendation(model, transactions_df, articles_df, article_dict, num, user_ids):

    items = transactions_df.article_id_enc.unique()
    #print(items[:10])

    for user_id in user_ids:
        print()
        print("*" * 60)
        print("  Recomendations for user_id =", user_id)
        print("*" * 60)
        
        print("\n     Already bought:")
        rtg = transactions_df[transactions_df["customer_id_enc"] == user_id]["article_id"].values
        
        #print("{:10}  {:25} {:20} {:20} {:40}".format("id", "prod name", "product type name","index group name", "section name"))
        #print()

        for item in rtg:           
            print("{:10}  {:25} {:20} {:20} {:40}".format(item, articles_df.loc[item, "prod_name"], articles_df.loc[item, "product_type_name"],
                                                          articles_df.loc[item, "index_group_name"],
                                                          articles_df.loc[item, "section_name"]
                                                          ))

        # make prediction 
        scores = model.predict(np.ones(len(items)) * user_id, items)
        
        # n-top score 
        num_of_cand = 50
        argsort_scores_idx = np.argsort(scores)[-num_of_cand:]
        argsort_scores_idx = np.flip(argsort_scores_idx)

        
        
        print("\n     Recommended:")
        k = 1
        for ind in argsort_scores_idx:
            #print(i, items[i],article_dict[items[i]])
            ii = article_dict[items[ind]]
            
            # check if already bought
            if ii in rtg:
                continue
            else: k+=1
            
            print("{:10}  {:25} {:20} {:20} {:40} score={:.3}".format(ii, articles_df.loc[ii, "prod_name"], 
                                                          articles_df.loc[ii, "product_type_name"],
                                                          articles_df.loc[ii, "index_group_name"],
                                                          articles_df.loc[ii, "section_name"],
                                                          scores[ind]     
                                                          ))
            if k > num: break
        #print()
        
sample_recommendation(model, transactions_df_sample, articles_df.set_index("article_id"), article_dict, 5, [4, 25, 45, 444, 777, 2784, 4525, 9876, 5443])


************************************************************
  Recomendations for user_id = 4
************************************************************

     Already bought:
 625480003  Flora                     Dress                Ladieswear           Womens Everyday Collection              
 658183002  BOOGIE hooded tee         T-shirt              Sport                Ladies H&M Sport                        

     Recommended:
 658183001  BOOGIE hooded tee         T-shirt              Sport                Ladies H&M Sport                         score=1.79
 671505002  EDC ROMAN BLOUSE          Blouse               Ladieswear           Womens Everyday Collection               score=1.36
 685687004  W YODA KNIT OL OFFER      Sweater              Ladieswear           Womens Everyday Collection               score=1.35
 688873012  Gyda!                     Blouse               Ladieswear           Womens Tailoring                         score=1.33
 615021021  Luisa tee            