In [1]:
import json
import pandas as pd

with open('data/events.json') as file:
    events_data = json.load(file)
events_df = pd.DataFrame.from_dict(events_data['events'])
events_df.head()

Unnamed: 0,event,sessionid,eventtime,price,productid
0,cart,a0655eee-1267-4820-af21-ad8ac068ff7a,2020-06-01T08:59:16.406Z,14.48,HBV00000NVZE8
1,cart,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,2020-06-01T08:59:46.580Z,49.9,HBV00000U2B18
2,cart,5e594788-78a0-44dd-8e66-37022d48f691,2020-06-01T08:59:33.308Z,1.99,OFIS3101-080
3,cart,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,2020-06-01T08:59:31.911Z,2.25,HBV00000NVZBW
4,cart,9e9d4f7e-898c-40fb-aae9-256c40779933,2020-06-01T08:59:33.888Z,9.95,HBV00000NE0T4


In [2]:
events_df.event.unique()

array(['cart'], dtype=object)

In [3]:
events_df.drop('event', axis=1, inplace = True)

In [4]:
events_df.shape

(387656, 4)

In [5]:
with open('data/meta.json') as file:
    meta = json.load(file)
meta_df = pd.DataFrame.from_dict(meta['meta'])
meta_df.head()

Unnamed: 0,productid,brand,category,subcategory,name
0,HBV00000AX6LR,Palette,Kişisel Bakım,Saç Bakımı,Palette Kalıcı Doğal Renkler 10-4 PAPATYA
1,HBV00000BSAQG,Best,Pet Shop,Kedi,Best Pet Jöle İçinde Parça Etli Somonlu Konser...
2,HBV00000JUHBA,Tarım Kredi,Temel Gıda,"Bakliyat, Pirinç, Makarna",Türkiye Tarım Kredi Koop.Yeşil Mercimek 1 kg
3,HBV00000NE0QI,Namet,"Et, Balık, Şarküteri",Şarküteri,Namet Fıstıklı Macar Salam 100 gr
4,HBV00000NE0UQ,Muratbey,Kahvaltılık ve Süt,Peynir,Muratbey Burgu Peyniri 250 gr


In [6]:
meta_df.shape

(10236, 5)

In [7]:
df = pd.merge(left=meta_df, right=events_df, how='right', on = 'productid')
df.head()

Unnamed: 0,productid,brand,category,subcategory,name,sessionid,eventtime,price
0,HBV00000NVZE8,,"Et, Balık, Şarküteri",Kırmızı Et,Dana Kıyma (%5-%7 Yağ) 250 gr,a0655eee-1267-4820-af21-ad8ac068ff7a,2020-06-01T08:59:16.406Z,14.48
1,HBV00000U2B18,Oral-B,Kişisel Bakım,Ağız Bakım,Diş Fırçası Yedek Başlığı Stages Çocuk 2 Adet,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,2020-06-01T08:59:46.580Z,49.9
2,OFIS3101-080,Noki,Oyuncak ve Kırtasiye,Dosyalama ve Arşivleme,Noki Dosya Çıtçıtlı Evrak Zarfı Kırmızı 3101 T...,5e594788-78a0-44dd-8e66-37022d48f691,2020-06-01T08:59:33.308Z,1.99
3,HBV00000NVZBW,,Meyve ve Sebze,Sebze,Domates 500 gr,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,2020-06-01T08:59:31.911Z,2.25
4,HBV00000NE0T4,Carrefour,Temel Gıda,Sıvı Yağ,Carrefour Ayçiçek Yağı 1 lt,9e9d4f7e-898c-40fb-aae9-256c40779933,2020-06-01T08:59:33.888Z,9.95


In [8]:
df.shape

(387656, 8)

In [9]:
df.dtypes

productid      object
brand          object
category       object
subcategory    object
name           object
sessionid      object
eventtime      object
price          object
dtype: object

In [10]:
df.isna().sum()

productid           6
brand          131851
category            6
subcategory         6
name                6
sessionid           0
eventtime           0
price               6
dtype: int64

In [11]:
df = df[df.productid.notna()]
df.isna().sum()

productid           0
brand          131845
category            0
subcategory         0
name                0
sessionid           0
eventtime           0
price               0
dtype: int64

In [12]:
df.brand = df.brand.fillna('Belirsiz')
df.brand

0           Belirsiz
1             Oral-B
2               Noki
3           Belirsiz
4          Carrefour
             ...    
387651     Carrefour
387652     Carrefour
387653        Vernel
387654    Carte D'or
387655     Carrefour
Name: brand, Length: 387650, dtype: object

In [13]:
df.isna().values.any()

False

In [14]:
df.duplicated().sum()

350

In [15]:
df = df.drop_duplicates()
df.shape

(387300, 8)

In [16]:
data = df[['productid','brand','category','subcategory','name','sessionid']]
description = data[['brand','category','subcategory','name']].astype(str).apply(lambda x: ' '.join(x), axis=1).to_frame('description')
data = pd.concat([data,description], axis=1)
data.drop(['brand','category','subcategory','name'], axis=1, inplace=True)
data.head()

Unnamed: 0,productid,sessionid,description
0,HBV00000NVZE8,a0655eee-1267-4820-af21-ad8ac068ff7a,"Belirsiz Et, Balık, Şarküteri Kırmızı Et Dana ..."
1,HBV00000U2B18,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,Oral-B Kişisel Bakım Ağız Bakım Diş Fırçası Ye...
2,OFIS3101-080,5e594788-78a0-44dd-8e66-37022d48f691,Noki Oyuncak ve Kırtasiye Dosyalama ve Arşivle...
3,HBV00000NVZBW,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,Belirsiz Meyve ve Sebze Sebze Domates 500 gr
4,HBV00000NE0T4,9e9d4f7e-898c-40fb-aae9-256c40779933,Carrefour Temel Gıda Sıvı Yağ Carrefour Ayçiçe...


In [17]:
meta_df.brand = meta_df.brand.fillna('Belirsiz')

description = meta_df[['brand','category','subcategory','name']].astype(str).apply(lambda x: ' '.join(x), axis=1).to_frame('description')
meta_data = pd.concat([meta_df,description], axis=1)


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(meta_data.description)


In [154]:
from sklearn.metrics.pairwise import cosine_similarity
cs = cosine_similarity(X)
cs

array([[1.        , 0.        , 0.        , ..., 0.06362848, 0.06726728,
        0.05337605],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06362848, 0.        , 0.        , ..., 1.        , 0.66769786,
        0.35320863],
       [0.06726728, 0.        , 0.        , ..., 0.66769786, 1.        ,
        0.32673202],
       [0.05337605, 0.        , 0.        , ..., 0.35320863, 0.32673202,
        1.        ]])

In [155]:
input_product = meta_data.productid[0]
input_productid=meta_data[meta_data.productid == input_product].index.values[0]
scores = list(enumerate(cs[input_productid]))
scores_sorted=sorted(scores, key=lambda x:x[1], reverse=True)[1:]


In [156]:
print(f'     Similar Products  \t\t\t\t\t  Similarity')
j=0
for score in scores_sorted:
    recomm=meta_data[meta_data.index==score[0]]['name'].values
    
    print(f'{j+1:3}. {recomm[0]:55}   {scores_sorted[j][1]:.2f}')
    j+=1
    if j>9:
        break

     Similar Products  					  Similarity
  1. Palette Kalıcı Doğal Renkler 10-0 AÇIK SARI               0.89
  2. Palette Kalıcı Doğal Renkler 4-0 KAHVE                    0.88
  3. Palette Kalıcı Doğal Renkler 1-0 SİYAH                    0.85
  4. Palette Kalıcı Doğal Renkler 9-4 SAHRA SARISI             0.85
  5. Palette Kalıcı Doğal Renkler 6-0 KOYU KUMRAL              0.85
  6. Palette Kalıcı Doğal Renkler 1-1 GECE MAVİSİ              0.82
  7. Palette Kalıcı Doğal Renkler 5-89 GECE KIZILI             0.82
  8. Palette Kalıcı Doğal Renkler 6-70 BRONZ KAHVE             0.82
  9. Palette Saç Boyası Kalıcı Doğal Renkler 4-0 Amber Kahve   0.81
 10. Palette Saç Boyası Kalıcı Doğal Renkler 3-0 Koyu Kakao    0.81


In [222]:
import re
def data_cleaner(text):
    text = re.sub(r',', '', text)
    text = text.lower()
    return text

data.description = data.description.apply(data_cleaner)

In [233]:
data.description= data.description.astype(str)

In [234]:
sessions = data.sessionid.unique().tolist()
len(sessions)

54442

In [235]:
import random

random.shuffle(sessions)

# extract 90% of customer ID's
sessions_train = [sessions[i] for i in range(round(0.9*len(sessions)))]

# split data into train and validation set
train_data = data[data.sessionid.isin(sessions_train)]
validation_data = data[~data.sessionid.isin(sessions_train)]

In [236]:
train_data.head()

Unnamed: 0,productid,sessionid,description
1,HBV00000U2B18,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,Oral-B Kişisel Bakım Ağız Bakım Diş Fırçası Ye...
2,OFIS3101-080,5e594788-78a0-44dd-8e66-37022d48f691,Noki Oyuncak ve Kırtasiye Dosyalama ve Arşivle...
3,HBV00000NVZBW,fdfeb652-22fa-4153-b9b5-4dfa0dcaffdf,Belirsiz Meyve ve Sebze Sebze Domates 500 gr
6,HBV00000O2SDO,86427bdb-4542-406d-b71b-e3bf4facd36c,Belirsiz Meyve ve Sebze Sebze Taze Fasulye 500 gr
7,HBV00000U2B4I,bf3a141e-ed91-4dfa-b4e1-de5aadf61d97,Şölen Atıştırmalık Bisküvi ve Kekler Şölen Ozm...


In [237]:
import warnings;
warnings.filterwarnings('ignore')

from gensim.models import Word2Vec

model = Word2Vec(window = 10, min_count=2,  sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 42)
model.build_vocab(data.description, progress_per=200)


In [238]:
model.train(data.description, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(65650076, 246501150)

In [239]:
model.init_sims(replace=True)

In [240]:
print(model)

Word2Vec(vocab=101, vector_size=100, alpha=0.03)


In [241]:
products = data[["productid", "description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='productid', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('productid')['description'].apply(list).to_dict()

In [242]:
products_dict.keys()

dict_keys(['AILEBIZIZSMTLDGY54', 'AILEBIZIZSMTLDHB18', 'AILEBS179526', 'AILEBSHSB22037', 'AILEDALIN275101', 'AILEDALIN275103', 'AILEDALIN275105', 'AILEDALIN275106', 'AILEDALIN275107', 'AILEDALIN275114', 'AILEDALIN275122', 'AILEDALIN275169', 'AILEEBBKB-56556', 'AILEEBBKR-W4472', 'AILEELITDIS5285B', 'AILEELITDIS5288B', 'AILEETI1811300', 'AILEETI3511300', 'AILEETI5311300', 'AILEFIMAN5299094', 'AILEHDMAR8967772', 'AILEHDMAR8967774', 'AILEHDMAR8967775', 'AILEHDMAR8967777', 'AILEHDMAR8967779', 'AILEHERO0595800', 'AILEHERO0595801', 'AILEHERO0598801', 'AILEHIPPTR2087', 'AILEHIPPTR2342', 'AILEHIPPTR2468', 'AILEHIPPTR2473', 'AILEHIPPTR2477', 'AILEHIPPTR2769', 'AILEHIPPTR2963', 'AILEHIPPTR3141', 'AILEHIPPTR3331', 'AILEHIPPTR3551', 'AILEHIPPTR4202', 'AILEHIPPTR4212', 'AILEHIPPTR4233', 'AILEHIPPTR4242', 'AILEHIPPTR4253', 'AILEHIPPTR4263', 'AILEHIPPTR4403', 'AILEHIPPTR5110', 'AILEHPULKERTMA201', 'AILEHPULKERTMA202', 'AILEHPULKERTMA204', 'AILEHPULKERTMA206', 'AILEHSBR36033', 'AILEHSBR53378', 'AILEHSB

In [243]:
products_dict['AILEDALIN275105']


['Dalin Bebek Bebek Bakım ve Sağlığı Dalin Şampuan Klasik 500ML']

In [244]:
model.wv.most_similar(['a'], topn= 10)

[('i', 0.7083442807197571),
 (' ', 0.7056373357772827),
 ('ı', 0.6761066913604736),
 ('k', 0.6667896509170532),
 ('r', 0.6593756675720215),
 ('l', 0.6579871773719788),
 ('e', 0.6498638391494751),
 ('t', 0.6322890520095825),
 ('n', 0.6120158433914185),
 ('m', 0.6032317280769348)]

In [None]:
df1=df.copy()
df1.price = df1.price.astype(float)
df2 = df1.groupby(['sessionid','eventtime','productid']).sum().reset_index()
df2