# Related Products Recommender Engine

### This is a try in order to build a related products Recommender for the cart page of an online grocery store.

#### importing the events data

In [106]:
import json
import pandas as pd
# loading the json file
with open('data/events.json') as file:
    events_data = json.load(file)
# convering the input as a dataframe
events_df = pd.DataFrame.from_dict(events_data['events'])
events_df.head()

Unnamed: 0,event,sessionid,eventtime,price,productid
0,cart,a0655eee-1267-4820-af21-ad8ac068ff7a,2020-06-01T08:59:16.406Z,14.48,HBV00000NVZE8
1,cart,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,2020-06-01T08:59:46.580Z,49.9,HBV00000U2B18
2,cart,5e594788-78a0-44dd-8e66-37022d48f691,2020-06-01T08:59:33.308Z,1.99,OFIS3101-080
3,cart,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,2020-06-01T08:59:31.911Z,2.25,HBV00000NVZBW
4,cart,9e9d4f7e-898c-40fb-aae9-256c40779933,2020-06-01T08:59:33.888Z,9.95,HBV00000NE0T4


Since there are only "cart" type events in the events column, this column was removed from this dataframe.

In [107]:
events_df.event.unique()

array(['cart'], dtype=object)

In [108]:
events_df.drop('event', axis=1, inplace = True)

In [109]:
events_df.shape

(387656, 4)

In [110]:
with open('data/meta.json') as file:
    meta_data = json.load(file)
meta_df = pd.DataFrame.from_dict(meta_data['meta'])
meta_df.head()

Unnamed: 0,productid,brand,category,subcategory,name
0,HBV00000AX6LR,Palette,Kişisel Bakım,Saç Bakımı,Palette Kalıcı Doğal Renkler 10-4 PAPATYA
1,HBV00000BSAQG,Best,Pet Shop,Kedi,Best Pet Jöle İçinde Parça Etli Somonlu Konser...
2,HBV00000JUHBA,Tarım Kredi,Temel Gıda,"Bakliyat, Pirinç, Makarna",Türkiye Tarım Kredi Koop.Yeşil Mercimek 1 kg
3,HBV00000NE0QI,Namet,"Et, Balık, Şarküteri",Şarküteri,Namet Fıstıklı Macar Salam 100 gr
4,HBV00000NE0UQ,Muratbey,Kahvaltılık ve Süt,Peynir,Muratbey Burgu Peyniri 250 gr


In [111]:
meta_df.shape

(10236, 5)

#### Merging the datasets into a single dataset based on the common column (productid)

In [112]:
data = pd.merge(left=meta_df, right=events_df, how='right', on = 'productid')
data.head()

Unnamed: 0,productid,brand,category,subcategory,name,sessionid,eventtime,price
0,HBV00000NVZE8,,"Et, Balık, Şarküteri",Kırmızı Et,Dana Kıyma (%5-%7 Yağ) 250 gr,a0655eee-1267-4820-af21-ad8ac068ff7a,2020-06-01T08:59:16.406Z,14.48
1,HBV00000U2B18,Oral-B,Kişisel Bakım,Ağız Bakım,Diş Fırçası Yedek Başlığı Stages Çocuk 2 Adet,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,2020-06-01T08:59:46.580Z,49.9
2,OFIS3101-080,Noki,Oyuncak ve Kırtasiye,Dosyalama ve Arşivleme,Noki Dosya Çıtçıtlı Evrak Zarfı Kırmızı 3101 T...,5e594788-78a0-44dd-8e66-37022d48f691,2020-06-01T08:59:33.308Z,1.99
3,HBV00000NVZBW,,Meyve ve Sebze,Sebze,Domates 500 gr,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,2020-06-01T08:59:31.911Z,2.25
4,HBV00000NE0T4,Carrefour,Temel Gıda,Sıvı Yağ,Carrefour Ayçiçek Yağı 1 lt,9e9d4f7e-898c-40fb-aae9-256c40779933,2020-06-01T08:59:33.888Z,9.95


In [113]:
data.shape

(387656, 8)

#### Data Cleaning 

In [114]:
data.dtypes

productid      object
brand          object
category       object
subcategory    object
name           object
sessionid      object
eventtime      object
price          object
dtype: object

In [115]:
data.isna().sum()

productid           6
brand          131851
category            6
subcategory         6
name                6
sessionid           0
eventtime           0
price               6
dtype: int64

The rows with missing values in all of the columns were removed.

In [116]:
# not considering the 6 rows with all nan values
data = data[data.productid.notna()]
data.isna().sum()

productid           0
brand          131845
category            0
subcategory         0
name                0
sessionid           0
eventtime           0
price               0
dtype: int64

Hereby, since the number of Nan values in the brand column was significant, removing the rows with Nan numbers would not be rational. Thus, they were treated as another brand; namely "Belirsiz".

In [117]:
data.brand = data.brand.fillna('Belirsiz')
data.brand

0           Belirsiz
1             Oral-B
2               Noki
3           Belirsiz
4          Carrefour
             ...    
387651     Carrefour
387652     Carrefour
387653        Vernel
387654    Carte D'or
387655     Carrefour
Name: brand, Length: 387650, dtype: object

There are no missing values at this point;

In [118]:
data.isna().values.any()

False

Duplicated rows were handled.

In [119]:
data.duplicated().sum()

350

In [120]:
data = data.drop_duplicates()
data.shape

(387300, 8)

### Data Preprocessing 
In this step, the meta_df data frame which contains the metadata of products and their information, was considered. 
Then for each column, the string values in 'brand','category','subcategory', and 'name'columns were combined together as in a new column named 'description'.

In [121]:
meta_df.brand = meta_df.brand.fillna('Belirsiz')
# Combining the four columns into a column and assigning it as a dataframe named description
description = meta_df[['brand','category','subcategory','name']].astype(str).apply(lambda x: ' '.join(x), axis=1).to_frame('description')
# Concatinating the two dataframes
meta_data = pd.concat([meta_df,description], axis=1)

### Building the model
At this point the aim is to build a content based recommender system. The CountVectorizer was used in order to be applied on the description column of the dataframe.

In [122]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(meta_data.description)

Cosine similarity was considered as the metric for finding the similarities between pairs of products.
The cosine_similarity matrix were calculated. 

In [123]:
from sklearn.metrics.pairwise import cosine_similarity
cs_matrix = cosine_similarity(X)
cs_matrix

array([[1.        , 0.        , 0.        , ..., 0.06362848, 0.06726728,
        0.05337605],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06362848, 0.        , 0.        , ..., 1.        , 0.66769786,
        0.35320863],
       [0.06726728, 0.        , 0.        , ..., 0.66769786, 1.        ,
        0.32673202],
       [0.05337605, 0.        , 0.        , ..., 0.35320863, 0.32673202,
        1.        ]])

Note that the length of the matrix in each axis is equal to the length of unique products list.

In [124]:
meta_data.productid.shape

(10236,)

In [125]:
cs_matrix.shape

(10236, 10236)

#### Finding related products for a given product
The input of the function below will be a productid. And the output of it will be ten most related products to it. Here, the names of those products will be shown. However, we could also show the productid of them.

In [144]:
def related_products(input_product):
    # finding the index of the input_procduct in the meta_data dataframe
    input_productid=meta_data[meta_data.productid == input_product].index.values[0]
    # geting a list of the values in the i'th row ; i==input_productid
    scores = list(enumerate(cs_matrix[input_productid]))
    # descending sorting of the list 
    scores_sorted=sorted(scores, key=lambda x:x[1], reverse=True)[1:]

    print(f'     Related Products  \t\t\t\t\t Similarity')
    
    # pringting names of the related products
    j=0
    for score in scores_sorted:
        recomm=meta_data[meta_data.index==score[0]]['name'].values

        print(f'{j+1:3}. {recomm[0]:55}   {scores_sorted[j][1]:.2f}')
        j+=1
        if j>9:
            break
            
print ('     Input product:      ',meta_data.name[0],'\n')
related_products(meta_data.productid[0])

     Input product:       Palette Kalıcı Doğal Renkler 10-4 PAPATYA 

     Related Products  					 Similarity
  1. Palette Kalıcı Doğal Renkler 10-0 AÇIK SARI               0.89
  2. Palette Kalıcı Doğal Renkler 4-0 KAHVE                    0.88
  3. Palette Kalıcı Doğal Renkler 1-0 SİYAH                    0.85
  4. Palette Kalıcı Doğal Renkler 9-4 SAHRA SARISI             0.85
  5. Palette Kalıcı Doğal Renkler 6-0 KOYU KUMRAL              0.85
  6. Palette Kalıcı Doğal Renkler 1-1 GECE MAVİSİ              0.82
  7. Palette Kalıcı Doğal Renkler 5-89 GECE KIZILI             0.82
  8. Palette Kalıcı Doğal Renkler 6-70 BRONZ KAHVE             0.82
  9. Palette Saç Boyası Kalıcı Doğal Renkler 4-0 Amber Kahve   0.81
 10. Palette Saç Boyası Kalıcı Doğal Renkler 3-0 Koyu Kakao    0.81


### Offering Another Method 
This is a method that iteratively looks for sessions in the events dataset. Thus it would be reasonable to use it on a larger dataset. In this method, we will use the data of events in order to get all the items that a particular session has included in the basket. This way, we will be able to find related products based on the frequency of them being included together in the baskets of distinct sessions.

In [145]:
# building a dictionarry of unique productid
unique_productids = data.productid.unique()
productid_dict = {}
c=0
for p in unique_productids:
    productid_dict[p] = c
    c+=1

In [146]:
# grouping the products for each session, and then getting the results as in lists
carts = data.groupby(['sessionid'])['productid'].apply(list)
carts

sessionid
000280f4-62fc-4dcd-b51d-c66ac14d7d8c                                      [HBV00000NE1WT]
0002e53b-1f60-4309-8380-31ca03de51f8                       [HBV00000NVZGQ, HBV00000NE1LU]
0002ef34-6bee-4953-874b-8298ec26b625                       [HBV00000OELAO, HBV00000OEL9W]
000618de-d415-408c-863e-6124db43f529    [HBV00000NVZGU, HBV00000NVZGU, HBV00000NVZGU, ...
000770d6-c2d4-4ad2-bb2c-b35274bc5e7e                                      [HBV00000LML23]
                                                              ...                        
fffb5e6a-2676-4cd9-b4e4-ab8b6621e0fe    [HBV00000OE7V9, HBV00000OE7H3, HBV00000OE7V9, ...
fffbae5f-8102-4a14-84d7-6c11c724cf8d                         [SGPAN971946, HBV00000NFI3G]
fffbba74-6999-460f-bd5f-70eaebe689cf    [HBV00000NE11I, HBV00000NE11I, HBV00000NE11I, ...
fffd3c61-2f71-4437-986c-e1c30ef5b5fe    [HBV000006IYTS, HBV00000O2SC1, HBV00000OE7X7, ...
ffffcd3c-da03-4667-9c75-9fcafb609c9e    [HBV00000TIRU5, HBV00000TIRUB, HBV00000TIRU5, ...


At this point a matrix will be built in order to store the frequency of items being in the cart pages together.

In [147]:
import numpy as np
# building a zeros matrix
matrix = np.zeros((len(unique_productids),len(unique_productids)))
# filling the matrix
for cart in carts:
    for item1 in cart:
        for item2 in cart:
            i=productid_dict[item1]
            j=productid_dict[item2]
            if i<j:
                matrix[i,j]+=1.0
                   

In [148]:
matrix.shape

(10235, 10235)

#### Finding related products for a given product

In [149]:
def similar_products(input_product):
    # finding the index of the input_procduct in the meta_data dataframe
    input_productid=data[data.productid == input_product].index.values[0]
    # geting a list of the values in the i'th row ; i==input_productid
    scores = list(enumerate(matrix[input_productid]))
    # descending sorting of the list 
    scores_sorted=sorted(scores, key=lambda x:x[1], reverse=True)[1:]
    print ('Similar products')
    # pringting names of the similar products
    j=0
    for score in scores_sorted:
        recomm=data[data.index==score[0]]['name'].values

        print(f'{j+1:3}. {recomm[0]:55} ')
        j+=1
        if j>9:
            break
print ('Input product: ',data.name[0],'\n')
similar_products(data.productid[0])

Input product:  Dana Kıyma (%5-%7 Yağ) 250 gr 

Similar products
  1. Patates 1 kg                                            
  2. Nestle Su 5 Lt                                          
  3. Carrefour Su 1,5 lt                                     
  4. Piliç But Pirzola 500 gr                                
  5. Carrefour Su 5 lt                                       
  6. Eti Süt Çilekli Burger 35 gr                            
  7. Milupa Organik Sütlaç Kavanoz 125 Gr 6+ Ay              
  8. İçim Rahat Laktozsuz Süt 1 L                            
  9. Carrefour Islak Havlu Papatyalı 3x60 Adet               
 10. Tadım Fındık İçi 200gr                                  


# Conclusion

The combination of methods presented above, would be a good choice to consider. 

Following is another work which is unfinished.

### Another Method: Word Embedding

In [150]:
data = df[['productid','brand','category','subcategory','name','sessionid']]
# Combining the four columns into a column and assigning it as a dataframe named description
description = data[['brand','category','subcategory','name']].astype(str).apply(lambda x: ' '.join(x), axis=1).to_frame('description')
data = pd.concat([data,description], axis=1)
# dropping the columns that we just combined into one column
data.drop(['brand','category','subcategory','name'], axis=1, inplace=True)
data.head()

Unnamed: 0,productid,sessionid,description
0,HBV00000NVZE8,a0655eee-1267-4820-af21-ad8ac068ff7a,"Belirsiz Et, Balık, Şarküteri Kırmızı Et Dana ..."
1,HBV00000U2B18,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,Oral-B Kişisel Bakım Ağız Bakım Diş Fırçası Ye...
2,OFIS3101-080,5e594788-78a0-44dd-8e66-37022d48f691,Noki Oyuncak ve Kırtasiye Dosyalama ve Arşivle...
3,HBV00000NVZBW,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,Belirsiz Meyve ve Sebze Sebze Domates 500 gr
4,HBV00000NE0T4,9e9d4f7e-898c-40fb-aae9-256c40779933,Carrefour Temel Gıda Sıvı Yağ Carrefour Ayçiçe...


In [151]:
data.description= data.description.astype(str)

In [152]:
sessions = data.sessionid.unique().tolist()
len(sessions)

54442

In [153]:
import random
random.shuffle(sessions)
# getting the 90% of session randomly
sessions_train = [sessions[i] for i in range(round(0.9*len(sessions)))]

# splittin data into train and validation set
train_data = data[data.sessionid.isin(sessions_train)]
validation_data = data[~data.sessionid.isin(sessions_train)]

In [154]:
train_data.head()

Unnamed: 0,productid,sessionid,description
1,HBV00000U2B18,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,Oral-B Kişisel Bakım Ağız Bakım Diş Fırçası Ye...
2,OFIS3101-080,5e594788-78a0-44dd-8e66-37022d48f691,Noki Oyuncak ve Kırtasiye Dosyalama ve Arşivle...
3,HBV00000NVZBW,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,Belirsiz Meyve ve Sebze Sebze Domates 500 gr
4,HBV00000NE0T4,9e9d4f7e-898c-40fb-aae9-256c40779933,Carrefour Temel Gıda Sıvı Yağ Carrefour Ayçiçe...
5,HBV00000NE0T6,9e9d4f7e-898c-40fb-aae9-256c40779933,Carrefour Temel Gıda Sıvı Yağ Carrefour Ayçiçe...


In [155]:
import warnings;
warnings.filterwarnings('ignore')

from gensim.models import Word2Vec
# building the model
model = Word2Vec(window = 10, min_count=2,  sg = 1, hs = 0,
                 negative = 10,
                 alpha=0.03, min_alpha=0.0007,
                 seed = 42)
# building the vocabulary
model.build_vocab(data.description, progress_per=200)

In [None]:
#### Basicaly we want to use word2vec in order to get vectors for the description column of our dataframe

In [156]:
# training the model
model.train(data.description, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(65642448, 246501150)

In [157]:
model.init_sims(replace=True)

In [158]:
print(model)

Word2Vec(vocab=101, vector_size=100, alpha=0.03)


In [159]:
products = data[["productid", "description"]]
# removing duplicates
products.drop_duplicates(inplace=True, subset='productid', keep="last")

# creating productid and description dictionary
products_dict = products.groupby('productid')['description'].apply(list).to_dict()

The dictionarry values for a given product is as follows.

In [160]:
products_dict['AILEDALIN275105']

['Dalin Bebek Bebek Bakım ve Sağlığı Dalin Şampuan Klasik 500ML']