## Реализация похожести товаров по пользовательским сессиям

In [43]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import datetime as dt
from sklearn.preprocessing import normalize

In [13]:
# Mount Google Drive inside the Colab runtime so that the project files
# become reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
# IPython automagic: switch the working directory to the project folder so
# the relative 'data/...' paths used by later cells resolve against it.
cd /content/gdrive/My Drive/OZON

/content/gdrive/My Drive/OZON


## Считывание и предобработка данных

### Функция для переиндексации сессий и товаров

In [44]:
def newIndex(session_item):
    """Re-index session ids and item ids to dense, zero-based codes.

    The frame is modified in place: its ``sessionid`` and ``itemid`` columns
    are overwritten with the new codes, and the very same object is returned.

    Parameters
    ----------
    session_item : pandas.DataFrame
        Frame with ``sessionid`` and ``itemid`` columns.

    Returns
    -------
    tuple
        ``(session_item, user_mapping, item_mapping)``. Each mapping is a
        DataFrame with columns ``old`` (original id, in order of first
        appearance) and ``new`` (uint32 code starting at 0).
    """
    def _build_mapping(column):
        # One row per distinct original id; its code is the position of the
        # id's first appearance in the column.
        originals = session_item[column].unique()
        codes = np.arange(0, len(originals), dtype='uint32')
        return pd.DataFrame({'old': originals, 'new': codes})

    # Both mappings are built from the untouched columns before any rewrite.
    user_mapping = _build_mapping('sessionid')
    item_mapping = _build_mapping('itemid')

    for column, mapping in (('sessionid', user_mapping), ('itemid', item_mapping)):
        lookup = mapping.set_index('old').new  # Series: original id -> code
        session_item[column] = session_item[column].map(lookup)

    return (session_item, user_mapping, item_mapping)

### Данные сессий

In [45]:
# Load the raw session log and preview its first rows.
item_sessions = pd.read_parquet('data/sessions.parquet')
item_sessions.head()

Unnamed: 0,sessionid,itemid,action_type,event_date
0,0,146494488,0,2019-08-19
1,0,147656978,0,2019-08-19
2,0,136271000,0,2019-08-19
3,0,147019789,0,2019-08-19
4,0,32117708,0,2019-08-19


In [46]:
# Downcast the id and flag columns to compact dtypes to reduce memory use.
for column_name, compact_dtype in (
    ('sessionid', 'uint32'),
    ('itemid', 'uint32'),
    ('action_type', 'bool'),
):
    item_sessions[column_name] = item_sessions[column_name].astype(compact_dtype)

# Dates are replaced by their ordinal day numbers, stored as uint32.
date_ordinals = item_sessions['event_date'].apply(dt.datetime.toordinal)
item_sessions['event_date'] = date_ordinals.astype('uint32')
item_sessions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38956702 entries, 0 to 38956701
Data columns (total 4 columns):
sessionid      uint32
itemid         uint32
action_type    bool
event_date     uint32
dtypes: bool(1), uint32(3)
memory usage: 483.0 MB


### Делим данные из сессий для просмотров

In [47]:
# Rows whose action_type flag is False are treated as views in this notebook.
is_view = ~item_sessions['action_type']
data_view = item_sessions[is_view].reset_index(drop=True)
# Only the session/item ids are needed to build the co-occurrence matrix.
data_view = data_view.drop(columns=['event_date', 'action_type'])
session_view, session_mapping_view, item_mapping_view = newIndex(data_view)

### Делим данные из сессий для добавлений в корзину

In [48]:
# Rows whose action_type flag is True are treated as add-to-cart events.
is_tocart = item_sessions['action_type']
data_tocart = item_sessions[is_tocart].reset_index(drop=True)
# Only the session/item ids are needed to build the co-occurrence matrix.
data_tocart = data_tocart.drop(columns=['event_date', 'action_type'])
session_tocart, session_mapping_tocart, item_mapping_tocart = newIndex(data_tocart)

### train test

In [49]:
# Load the train/test pair tables that will be enriched with session-based
# similarity features.
# NOTE(review): the final cell of this notebook writes its results back to
# these same two paths, so a second run would read already-augmented files.
# Confirm that overwriting the inputs is intended.
train_target = pd.read_csv('data/train_with_scores.csv')
test_target = pd.read_csv('data/test_with_scores.csv')

In [50]:
# Apply the same compact dtypes to the id/count/label columns of both tables.
target_dtypes = (
    ('clientid', 'uint32'),
    ('jointitemid', 'uint32'),
    ('novelty_cnt', 'uint32'),
    ('itemid', 'uint32'),
    ('label', 'uint8'),
)
for target_frame in (train_target, test_target):
    for target_column, target_dtype in target_dtypes:
        target_frame[target_column] = target_frame[target_column].astype(target_dtype)
test_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597158 entries, 0 to 597157
Data columns (total 15 columns):
clientid                          597158 non-null uint32
itemid                            597158 non-null uint32
jointitemid                       597158 non-null uint32
label                             597158 non-null uint8
timestamp                         597158 non-null object
view_cnt                          597158 non-null float64
to_cart_cnt                       597158 non-null float64
ctr                               597158 non-null float64
novelty_cnt                       597158 non-null uint32
to_cart_day_avg_cnt               597158 non-null float64
view_day_avg_cnt                  597158 non-null float64
to_cart_Lust_Day_cnt              597158 non-null float64
last_day_views                    597158 non-null float64
lastDayToCart/dayAvgToCart_cnt    597158 non-null float64
lastDayView/dayAvgView_cnt        597158 non-null float64
dtypes: float64(9), object

## Составим матрицу товар/сессия



In [51]:
# Item x session matrix for views: rows are re-indexed items, columns are
# re-indexed sessions.
n_items_view = item_mapping_view['new'].max() + 1
n_sessions_view = session_mapping_view['new'].max() + 1
shape = (n_items_view, n_sessions_view)

view_rows = session_view['itemid'].values
view_cols = session_view['sessionid'].values
# One unit per logged event; scipy sums duplicate (row, col) entries, so a
# cell ends up holding the number of events for that item in that session.
view_ones = np.ones_like(view_cols)
item_session_view = sp.csr_matrix((view_ones, (view_rows, view_cols)), shape=shape)

In [52]:
# Item x session matrix for add-to-cart events: rows are re-indexed items,
# columns are re-indexed sessions.
# The shape must come from the to-cart mappings (shape_1). Using the view
# matrix's shape here either raises when a to-cart index exceeds it or
# silently produces a matrix with the wrong dimensions.
shape_1 = (item_mapping_tocart['new'].max() + 1, session_mapping_tocart['new'].max() + 1)
item_session_tocart = sp.csr_matrix(
    (
        # One unit per logged event; duplicate (row, col) entries are summed.
        np.ones_like(session_tocart['sessionid'].values),
        (session_tocart['itemid'].values, session_tocart['sessionid'].values),
    ),
    shape=shape_1,
)

## Функция для:
### – Расчета похожести товаров от частоты попадания в одну сессию
### – Расчета похожести товаров от количества сессий, в которых встречались товары


In [53]:
def count_same_score(data, matrix, action):
    """Attach session co-occurrence features to every (item, joint item) pair.

    For each distinct pair of matrix row indices found in ``data`` two values
    are computed from the corresponding rows of ``matrix``:

    * ``count_on_session_<action>`` - the dot product of the two rows;
    * ``same_items_on_session_<action>`` - their cosine similarity, i.e. the
      dot product divided by the product of the rows' L2 norms (0 when
      either row is all zeros).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``item_cat`` and ``jointitem_cat`` columns holding row
        indices into ``matrix``; NaN marks an item absent from the matrix.
    matrix : scipy.sparse matrix
        Row-indexable sparse matrix (items in rows, sessions in columns).
    action : str
        Suffix appended to the names of the two new columns.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-joined with the two feature columns, exact duplicate
        rows removed. Pairs with a missing index get NaN in both features.
    """
    key_cols = ['item_cat', 'jointitem_cat']
    cosine_col = 'same_items_on_session_' + action
    count_col = 'count_on_session_' + action

    def row_sums(sparse_rows):
        # Sparse ``sum(axis=1)`` yields an (n, 1) matrix; flatten it to 1-D
        # so it can be stored as an ordinary DataFrame column.
        return np.asarray(sparse_rows.sum(axis=1)).ravel()

    # Only the two key columns decide whether a pair can be scored, and each
    # distinct pair is scored once. Keeping duplicate pairs here would turn
    # the join below into a many-to-many merge (k * k rows per repeated pair).
    pairs = data.dropna(subset=key_cols)[key_cols].drop_duplicates()

    if pairs.empty:
        # Nothing can be scored: keep the output schema, fill with NaN.
        scored = data.copy()
        scored[cosine_col] = np.nan
        scored[count_col] = np.nan
        return scored.drop_duplicates()

    item_rows = matrix[pairs['item_cat'].values.astype(np.intp)]
    joint_rows = matrix[pairs['jointitem_cat'].values.astype(np.intp)]

    counts = row_sums(item_rows.multiply(joint_rows))

    # Norms are accumulated in float64 so squaring integer counts cannot
    # overflow the matrix's own dtype.
    item_norms = np.sqrt(row_sums(item_rows.astype(np.float64).power(2)))
    joint_norms = np.sqrt(row_sums(joint_rows.astype(np.float64).power(2)))
    denominators = item_norms * joint_norms

    cosine = np.zeros(len(pairs), dtype=np.float64)
    has_norm = denominators > 0
    cosine[has_norm] = counts[has_norm] / denominators[has_norm]

    # ``assign`` builds a new frame, so no column is written into a slice.
    pairs = pairs.assign(**{cosine_col: cosine, count_col: counts})

    return pd.merge(data, pairs, on=key_cols, how='left').drop_duplicates()

## обработка  train и test

### train
train. Для просмотров

In [54]:
# Series translating an original item id into its row of the view matrix.
view_row_of_item = item_mapping_view.set_index('old').new

train_target_view = train_target.drop(columns=['timestamp', 'label'])
# Items that never occur in view sessions map to NaN.
train_target_view['item_cat'] = train_target_view['itemid'].map(view_row_of_item)
train_target_view['jointitem_cat'] = train_target_view['jointitemid'].map(view_row_of_item)

train. Для добавлений в корзину

In [55]:
# Series translating an original item id into its row of the to-cart matrix.
tocart_row_of_item = item_mapping_tocart.set_index('old').new

train_target_tocart = train_target.drop(columns=['timestamp', 'label'])
# Items that never occur in to-cart sessions map to NaN.
train_target_tocart['item_cat'] = train_target_tocart['itemid'].map(tocart_row_of_item)
train_target_tocart['jointitem_cat'] = train_target_tocart['jointitemid'].map(tocart_row_of_item)


In [56]:
# Score every train pair against the view item/session matrix.
train_view = count_same_score(train_target_view, item_session_view, 'view')

In [57]:
# Score every train pair against the to-cart item/session matrix.
train_tocart = count_same_score(train_target_tocart, item_session_tocart, 'to_cart')

In [58]:
# Preview the view-based features computed for the train pairs.
train_view.head()

Unnamed: 0,clientid,itemid,jointitemid,view_cnt,to_cart_cnt,ctr,novelty_cnt,to_cart_day_avg_cnt,view_day_avg_cnt,to_cart_Lust_Day_cnt,last_day_views,lastDayToCart/dayAvgToCart_cnt,lastDayView/dayAvgView_cnt,item_cat,jointitem_cat,same_items_on_session_view,count_on_session_view
0,7833842,31499843,138176581,31.0,9.0,3.444444,737272,1.5,1.722222,0.0,0.0,0.0,0.0,53719.0,557229.0,0.0,0.0
2,19548158,147389610,148381589,24.0,9.0,2.666667,737272,1.125,1.6,0.0,0.0,0.0,0.0,201366.0,1104614.0,0.160623,5.0
4,32943407,6261257,4490956,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,549251.0,524064.0,0.0,0.0
5,10185243,148455169,148455173,10.0,1.0,10.0,737275,1.0,1.111111,0.0,0.0,0.0,0.0,947588.0,1517350.0,0.0,0.0
6,30552232,152440009,152440052,9.0,1.0,9.0,737274,1.0,1.285714,0.0,0.0,0.0,0.0,1449.0,281873.0,0.005222,1.0


In [59]:
# Preview the to-cart-based features computed for the train pairs.
train_tocart.head()

Unnamed: 0,clientid,itemid,jointitemid,view_cnt,to_cart_cnt,ctr,novelty_cnt,to_cart_day_avg_cnt,view_day_avg_cnt,to_cart_Lust_Day_cnt,last_day_views,lastDayToCart/dayAvgToCart_cnt,lastDayView/dayAvgView_cnt,item_cat,jointitem_cat,same_items_on_session_to_cart,count_on_session_to_cart
0,7833842,31499843,138176581,31.0,9.0,3.444444,737272,1.5,1.722222,0.0,0.0,0.0,0.0,442586.0,395289.0,0.069338,1.0
2,19548158,147389610,148381589,24.0,9.0,2.666667,737272,1.125,1.6,0.0,0.0,0.0,0.0,340934.0,340935.0,0.308607,2.0
4,32943407,6261257,4490956,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,684930.0,,,
5,10185243,148455169,148455173,10.0,1.0,10.0,737275,1.0,1.111111,0.0,0.0,0.0,0.0,,443197.0,,
6,30552232,152440009,152440052,9.0,1.0,9.0,737274,1.0,1.285714,0.0,0.0,0.0,0.0,36358.0,24179.0,0.0,0.0


In [60]:
# Attach the view and to-cart similarity features to the train pairs.
# The join keys are stated explicitly, and each score table is reduced to one
# row per (itemid, jointitemid) before merging. The score tables repeat a pair
# once per client row, so merging them as-is is a many-to-many join that
# inflates the result (k * k rows per repeated pair) only for
# drop_duplicates() to shrink it back.
pair_keys = ['itemid', 'jointitemid']

view_scores = train_view[
    pair_keys + ['same_items_on_session_view', 'count_on_session_view']
].drop_duplicates()
train = pd.merge(train_target, view_scores, on=pair_keys, how='left').drop_duplicates()

tocart_scores = train_tocart[
    pair_keys + ['same_items_on_session_to_cart', 'count_on_session_to_cart']
].drop_duplicates()
train = pd.merge(train, tocart_scores, on=pair_keys, how='left').drop_duplicates()
train.head()

Unnamed: 0,clientid,itemid,jointitemid,label,timestamp,view_cnt,to_cart_cnt,ctr,novelty_cnt,to_cart_day_avg_cnt,view_day_avg_cnt,to_cart_Lust_Day_cnt,last_day_views,lastDayToCart/dayAvgToCart_cnt,lastDayView/dayAvgView_cnt,same_items_on_session_view,count_on_session_view,same_items_on_session_to_cart,count_on_session_to_cart
0,7833842,31499843,138176581,1,2019-09-07 20:11:01,31.0,9.0,3.444444,737272,1.5,1.722222,0.0,0.0,0.0,0.0,0.0,0.0,0.069338,1.0
2,19548158,147389610,148381589,0,2019-08-31 22:32:31,24.0,9.0,2.666667,737272,1.125,1.6,0.0,0.0,0.0,0.0,0.160623,5.0,0.308607,2.0
4,32943407,6261257,4490956,0,2019-09-06 15:19:30,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
5,10185243,148455169,148455173,0,2019-09-03 17:43:50,10.0,1.0,10.0,737275,1.0,1.111111,0.0,0.0,0.0,0.0,0.0,0.0,,
6,30552232,152440009,152440052,0,2019-09-01 07:07:59,9.0,1.0,9.0,737274,1.0,1.285714,0.0,0.0,0.0,0.0,0.005222,1.0,0.0,0.0


In [61]:
# Pairs that could not be scored (missing features) get 0 everywhere.
train = train.fillna(0)
# With the NaNs gone the co-occurrence counts can be stored as integers.
for count_column in ('count_on_session_view', 'count_on_session_to_cart'):
    train[count_column] = train[count_column].astype('uint32')
train.head()

Unnamed: 0,clientid,itemid,jointitemid,label,timestamp,view_cnt,to_cart_cnt,ctr,novelty_cnt,to_cart_day_avg_cnt,view_day_avg_cnt,to_cart_Lust_Day_cnt,last_day_views,lastDayToCart/dayAvgToCart_cnt,lastDayView/dayAvgView_cnt,same_items_on_session_view,count_on_session_view,same_items_on_session_to_cart,count_on_session_to_cart
0,7833842,31499843,138176581,1,2019-09-07 20:11:01,31.0,9.0,3.444444,737272,1.5,1.722222,0.0,0.0,0.0,0.0,0.0,0,0.069338,1
2,19548158,147389610,148381589,0,2019-08-31 22:32:31,24.0,9.0,2.666667,737272,1.125,1.6,0.0,0.0,0.0,0.0,0.160623,5,0.308607,2
4,32943407,6261257,4490956,0,2019-09-06 15:19:30,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
5,10185243,148455169,148455173,0,2019-09-03 17:43:50,10.0,1.0,10.0,737275,1.0,1.111111,0.0,0.0,0.0,0.0,0.0,0,0.0,0
6,30552232,152440009,152440052,0,2019-09-01 07:07:59,9.0,1.0,9.0,737274,1.0,1.285714,0.0,0.0,0.0,0.0,0.005222,1,0.0,0


### test
test. Для просмотров

In [62]:
# Series translating an original item id into its row of the view matrix.
view_row_of_item = item_mapping_view.set_index('old').new

test_target_view = test_target.drop(columns=['timestamp', 'label'])
# Items that never occur in view sessions map to NaN.
test_target_view['item_cat'] = test_target_view['itemid'].map(view_row_of_item)
test_target_view['jointitem_cat'] = test_target_view['jointitemid'].map(view_row_of_item)

test. Для добавлений в корзину

In [63]:
# Series translating an original item id into its row of the to-cart matrix.
tocart_row_of_item = item_mapping_tocart.set_index('old').new

test_target_tocart = test_target.drop(columns=['timestamp', 'label'])
# Items that never occur in to-cart sessions map to NaN.
test_target_tocart['item_cat'] = test_target_tocart['itemid'].map(tocart_row_of_item)
test_target_tocart['jointitem_cat'] = test_target_tocart['jointitemid'].map(tocart_row_of_item)


In [64]:
# Score every test pair against the view item/session matrix.
test_view = count_same_score(test_target_view, item_session_view, 'view')

In [65]:
# Score every test pair against the to-cart item/session matrix.
test_tocart = count_same_score(test_target_tocart, item_session_tocart, 'to_cart')

In [66]:
# Preview the view-based features computed for the test pairs.
test_view.head()

Unnamed: 0,clientid,itemid,jointitemid,view_cnt,to_cart_cnt,ctr,novelty_cnt,to_cart_day_avg_cnt,view_day_avg_cnt,to_cart_Lust_Day_cnt,last_day_views,lastDayToCart/dayAvgToCart_cnt,lastDayView/dayAvgView_cnt,item_cat,jointitem_cat,same_items_on_session_view,count_on_session_view
0,8081929,152898248,152875664,10.0,2.0,5.0,737276,1.0,2.0,0.0,0.0,0.0,0.0,281044.0,441643.0,0.0,0.0
1,33378638,144847078,140715321,211.0,19.0,11.105263,737272,1.727273,7.814815,1.0,1.0,0.578947,0.127962,3446.0,250078.0,0.023322,7.0
2,36237195,149717596,149390783,282.0,20.0,14.1,737272,1.666667,9.096774,1.0,1.0,0.6,0.109929,127375.0,62292.0,0.247489,61.0
4,4190203,141835810,147130456,892.0,210.0,4.247619,737272,6.774194,28.774194,2.0,2.0,0.295238,0.069507,111117.0,54691.0,0.0,0.0
5,35362099,24560937,155457711,0.0,2.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,796611.0,,,


In [67]:
# Attach the view and to-cart similarity features to the test pairs.
# The join keys are stated explicitly, and each score table is reduced to one
# row per (itemid, jointitemid) before merging. The score tables repeat a pair
# once per client row, so merging them as-is is a many-to-many join that
# inflates the result (k * k rows per repeated pair) only for
# drop_duplicates() to shrink it back.
pair_keys = ['itemid', 'jointitemid']

view_scores = test_view[
    pair_keys + ['same_items_on_session_view', 'count_on_session_view']
].drop_duplicates()
test = pd.merge(test_target, view_scores, on=pair_keys, how='left').drop_duplicates()

tocart_scores = test_tocart[
    pair_keys + ['same_items_on_session_to_cart', 'count_on_session_to_cart']
].drop_duplicates()
test = pd.merge(test, tocart_scores, on=pair_keys, how='left').drop_duplicates()

In [68]:
# Pairs that could not be scored (missing features) get 0 everywhere.
test = test.fillna(0)
# With the NaNs gone the co-occurrence counts can be stored as integers.
for count_column in ('count_on_session_view', 'count_on_session_to_cart'):
    test[count_column] = test[count_column].astype('uint32')

In [69]:
# Preview the final test table with both feature sets attached.
test.head()

Unnamed: 0,clientid,itemid,jointitemid,label,timestamp,view_cnt,to_cart_cnt,ctr,novelty_cnt,to_cart_day_avg_cnt,view_day_avg_cnt,to_cart_Lust_Day_cnt,last_day_views,lastDayToCart/dayAvgToCart_cnt,lastDayView/dayAvgView_cnt,same_items_on_session_view,count_on_session_view,same_items_on_session_to_cart,count_on_session_to_cart
0,8081929,152898248,152875664,0,2019-09-06 18:57:23,10.0,2.0,5.0,737276,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
1,33378638,144847078,140715321,0,2019-09-02 07:09:31,211.0,19.0,11.105263,737272,1.727273,7.814815,1.0,1.0,0.578947,0.127962,0.023322,7,0.0,0
2,36237195,149717596,149390783,0,2019-09-01 07:14:01,282.0,20.0,14.1,737272,1.666667,9.096774,1.0,1.0,0.6,0.109929,0.247489,61,0.139573,3
4,4190203,141835810,147130456,0,2019-09-06 07:18:33,892.0,210.0,4.247619,737272,6.774194,28.774194,2.0,2.0,0.295238,0.069507,0.0,0,0.011117,1
5,35362099,24560937,155457711,0,2019-09-05 17:16:28,0.0,2.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0


## Запись в файл

In [70]:
# Persist the enriched tables without the DataFrame index.
# NOTE(review): these are the same paths the train/test tables were read
# from earlier in the notebook, so the inputs are overwritten in place.
# Confirm this is intended before re-running.
test.to_csv("data/test_with_scores.csv", index=False)
train.to_csv("data/train_with_scores.csv", index=False)