In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tnrange, tqdm_notebook
import gc
import operator

In [2]:
import warnings
warnings.filterwarnings('ignore', message='Changing the shape of non-C contiguous array')

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
sns.set_context('talk')

In [5]:
import average_precision

In [6]:
import datetime

# Thoughts

Для каждого пользователя нужно предсказать пять категорий, по товарным предложениям которых он чаще всего кликал на восьмой неделе, упорядочив их по убыванию числа кликов.

Можно выделить седьмую неделю и валидироваться по агрегированным кликам за неё.

Для начала будем работать только с train_clicks и train_category_views. Только с полями user_id, category_id, day

In [7]:
views = pd.read_csv('data/train_category_views.csv', parse_dates=['day'])
clicks = pd.read_csv('data/train_clicks.csv', parse_dates=['day'])[views.columns]

In [8]:
test_users = pd.read_csv('data/test_users.csv')

In [9]:
clicks.head()

Unnamed: 0,user_id,category_id,day
0,46,672,2016-08-04
1,48,170,2016-08-04
2,48,170,2016-08-04
3,53,1190,2016-08-04
4,93,56,2016-08-04


In [10]:
clicks.day.max() - clicks.day.min()

Timedelta('52 days 00:00:00')

In [11]:
clicks.category_id.max()

2653

In [12]:
views.category_id.max()

2682

Переведём данные в "клики (просмотры) в день пользователя по категории"

In [10]:
# Collapse raw view events into one row per (user, day, category);
# the `count` column holds how many events fell into that group.
agg_views = (
    views
    .assign(count=1)
    .groupby(['user_id', 'day', 'category_id'])
    .count()
    .sort_index()
)

In [11]:
# Collapse raw click events into one row per (user, day, category);
# the `count` column holds how many events fell into that group.
agg_clicks = (
    clicks
    .assign(count=1)
    .groupby(['user_id', 'day', 'category_id'])
    .count()
    .sort_index()
)

In [12]:
(clicks.groupby('user_id').day.max() - clicks.groupby('user_id').day.min()).median()

Timedelta('0 days 00:00:00')

In [13]:
(views.groupby('user_id').day.max() - views.groupby('user_id').day.min()).median()

Timedelta('0 days 00:00:00')

In [14]:
clicks.groupby('user_id')['category_id'].count().median()

2.0

In [15]:
clicks.groupby('user_id')['category_id'].count().mean()

3.9594260406245434

In [16]:
(clicks.groupby('user_id')['category_id'].count() > 4).sum()

38643

In [20]:
clicks['user_id'].nunique()

177922

Давайте мягко предсказывать категорию, в которой пользователь кликнет больше всего. В качестве ответа будем давать упорядоченную пятёрку лучших предсказаний, а в качестве y_train возьмём softmax от числа кликов по категориям.

Можно пытаться предсказать поведение среднего пользователя.

In [17]:
len(set(test_users.user_id) - set(clicks.user_id))

0

In [18]:
test_users.shape

(31712, 1)

In [20]:
clicks[clicks.user_id.isin(set(test_users.user_id))].groupby('user_id')['category_id'].count().median()

3.0

In [21]:
clicks[clicks.user_id.isin(set(test_users.user_id))].groupby('user_id')['category_id'].count().mean()

6.8187752270433908

In [36]:
clicks[clicks.user_id.isin(set(test_users.user_id))].groupby('user_id')['day'].max().min()

Timestamp('2016-09-20 00:00:00')

In [49]:
clicks[clicks.user_id.isin(set(test_users.user_id))].groupby('user_id')['day'].count().sort_values(ascending=False)

user_id
78036     487
31407     480
185942    431
179927    367
77906     334
91671     320
33654     267
52846     266
13303     264
46427     260
35698     253
44708     245
73402     233
133443    230
140650    222
30457     205
26267     195
111101    193
37938     192
113619    186
118140    179
11399     173
60817     172
109296    171
42618     170
192180    169
151581    168
4619      168
155819    167
129663    166
         ... 
84402       1
84706       1
84751       1
85119       1
84915       1
85116       1
85090       1
85034       1
85021       1
84989       1
84957       1
84953       1
84940       1
177745      1
84931       1
84922       1
177773      1
84772       1
84884       1
84871       1
84862       1
177775      1
84830       1
177790      1
84822       1
177796      1
177805      1
84793       1
84792       1
199977      1
Name: day, dtype: int64

In [33]:
clicks[clicks.user_id.isin(set(test_users.user_id))].groupby(['user_id', 'day', 'category_id']).count().sort_index()

user_id,day,category_id
8,2016-08-06,134
8,2016-08-17,134
8,2016-08-17,795
8,2016-08-18,1244
8,2016-08-23,1409
8,2016-08-26,108
8,2016-08-26,937
8,2016-08-26,970
8,2016-09-11,2330
8,2016-09-12,103


In [41]:
clicks.day.max()

Timestamp('2016-09-25 00:00:00')

In [42]:
views.day.max()

Timestamp('2016-09-25 00:00:00')

In [44]:
views.day.min()

Timestamp('2016-08-04 00:00:00')

Все пользователи из теста есть в трейне

In [22]:
test_users.head()

Unnamed: 0,user_id
0,8
1,12
2,27
3,39
4,40


In [34]:
agg_clicks

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
user_id,day,category_id,Unnamed: 3_level_1
0,2016-08-25,672,1
1,2016-08-19,428,1
1,2016-09-01,44,2
1,2016-09-01,1967,2
2,2016-08-11,892,1
3,2016-08-06,1257,1
3,2016-08-10,2318,1
3,2016-08-17,1351,1
4,2016-08-10,108,1
5,2016-08-12,2149,2


In [49]:
print(datetime.date(2016, 8, 3))

2016-08-03


In [51]:
# Top-5 clicked categories for days strictly between 2016-08-03 and 2016-08-11.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
clicks[(pd.Timestamp('2016-08-03') < clicks.day) & (clicks.day < pd.Timestamp('2016-08-11'))].category_id.value_counts()[:5]

672    10456
134     1513
66      1484
424     1329
163     1279
Name: category_id, dtype: int64

In [53]:
# Top-5 clicked categories for days strictly between 2016-09-18 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
clicks[(pd.Timestamp('2016-09-18') < clicks.day) & (clicks.day < pd.Timestamp('2016-09-26'))].category_id.value_counts()[:5]

672    12417
66      2566
163     1729
424     1615
134     1188
Name: category_id, dtype: int64

In [62]:
# Top-5 clicked categories for days strictly between 2016-09-19 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
clicks[(pd.Timestamp('2016-09-19') < clicks.day) & (clicks.day < pd.Timestamp('2016-09-26'))].category_id.value_counts()[:5]

672     10697
66       2205
163      1494
424      1411
1409      986
Name: category_id, dtype: int64

In [61]:
# Top-5 clicked categories for days strictly between 2016-09-24 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
clicks[(pd.Timestamp('2016-09-24') < clicks.day) & (clicks.day < pd.Timestamp('2016-09-26'))].category_id.value_counts()[:5]

672     1516
66       329
163      247
424      198
1409     142
Name: category_id, dtype: int64

In [63]:
last_days_answer = "672 66 163 424 1409"

In [67]:
# Top-5 viewed categories for days strictly between 2016-09-18 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
views[(pd.Timestamp('2016-09-18') < views.day) & (views.day < pd.Timestamp('2016-09-26'))].category_id.value_counts()[:5]

672    18303
663     3003
163     2958
424     2845
66      2696
Name: category_id, dtype: int64

In [73]:
# Top-5 viewed categories for days strictly between 2016-09-24 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
views[(pd.Timestamp('2016-09-24') < views.day) & (views.day < pd.Timestamp('2016-09-26'))].category_id.value_counts()[:5]

672    2402
663     416
424     353
163     348
66      286
Name: category_id, dtype: int64

In [75]:
# Top-10 clicked categories for days strictly between 2016-09-18 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
clicks[(pd.Timestamp('2016-09-18') < clicks.day) & (clicks.day < pd.Timestamp('2016-09-26'))].category_id.value_counts()[:10]

672     12417
66       2566
163      1729
424      1615
134      1188
1409     1130
120      1123
440       913
674       777
237       705
Name: category_id, dtype: int64

In [76]:
# Top-10 clicked categories for days strictly between 2016-09-11 and 2016-09-19.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
clicks[(pd.Timestamp('2016-09-11') < clicks.day) & (clicks.day < pd.Timestamp('2016-09-19'))].category_id.value_counts()[:10]

672     12395
66       2307
424      1882
163      1676
134      1445
1409     1068
440       939
674       820
120       798
110       696
Name: category_id, dtype: int64

Предсказание по последним дням даёт скор 0.037. Наша модель с затуханием улавливает эффект последних дней, но экстраполирует константой, не пытаясь экстраполировать более сложными функциями.

Выделим седьмую неделю

In [None]:
def split_by_date(df, date):
    """Split a frame into a training part and a validation part by day.

    The ``day`` values are taken from a column named ``day`` when the frame
    has one, otherwise from the index level named ``day`` (this covers frames
    indexed by ``['user_id', 'day', 'category_id']``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``day`` column or a ``day`` index level.
    date : anything accepted by ``pd.Timestamp``
        First day that belongs to the validation part.

    Returns
    -------
    (train, validation) : tuple of pandas.DataFrame
        ``train`` holds the rows with ``day < date``; ``validation`` holds the
        rows with ``day >= date``. Row order is preserved and either part may
        be empty.

    Raises
    ------
    KeyError
        If ``day`` is neither a column nor an index level of ``df``.
    """
    if 'day' in df.columns:
        days = df['day']
    else:
        days = df.index.get_level_values('day')
    # A plain boolean array makes the selection positional, so it behaves the
    # same for a flat index and for a MultiIndex.
    in_validation = np.asarray(days >= pd.Timestamp(date))
    return df[~in_validation], df[in_validation]

In [None]:
train_agg_views, validation_agg_views = split_by_date(agg_views, views.day.max() - datetime.timedelta(6))

In [14]:
user_profile = pd.read_csv('data/train_user_profile.csv', parse_dates=['day'])

In [15]:
user_profile.user_id.nunique()

53428

In [12]:
test_users.shape

(31712, 1)

In [16]:
len(set(test_users.user_id) - set(user_profile.user_id))

22120

Что-то пока ничего толком не работает. Возьмём бейзлайн и потюним его.

In [25]:
clicks.day.dt.dayofyear

0         217
1         217
2         217
3         217
4         217
5         217
6         217
7         217
8         217
9         217
10        217
11        217
12        217
13        217
14        217
15        217
16        217
17        217
18        217
19        217
20        217
21        217
22        217
23        217
24        217
25        217
26        217
27        217
28        217
29        217
         ... 
704439    269
704440    269
704441    269
704442    269
704443    269
704444    269
704445    269
704446    269
704447    269
704448    269
704449    269
704450    269
704451    269
704452    269
704453    269
704454    269
704455    269
704456    269
704457    269
704458    269
704459    269
704460    269
704461    269
704462    269
704463    269
704464    269
704465    269
704466    269
704467    269
704468    269
Name: day, dtype: int64

In [52]:
# Largest ids seen in either table; used below to size the dense user x category matrix.
# Series.max() is used instead of the builtin max(), which would iterate every element in Python.
number_of_categories = max(clicks.category_id.max(), views.category_id.max())
number_of_users = max(clicks.user_id.max(), views.user_id.max())

In [11]:
# Clicks for days strictly between 2016-09-19 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
last_clicks = clicks[(pd.Timestamp('2016-09-19') < clicks.day) & (clicks.day < pd.Timestamp('2016-09-26'))]

In [53]:
# Dense (user, category) matrix of time-decayed click weights.
# Each click contributes exp((day_of_year - 269) / 7), so a click on day 269
# weighs 1 and earlier clicks weigh exponentially less.
user_clicks = np.zeros((number_of_users + 1, number_of_categories + 1))
click_weights = np.exp((clicks.day.dt.dayofyear.values - 269) / 7)
# np.add.at accumulates repeated (user, category) pairs, unlike plain fancy-index `+=`,
# and replaces the per-row Python loop over DataFrame.iterrows().
np.add.at(user_clicks, (clicks.user_id.values, clicks.category_id.values), click_weights)

In [54]:
user_clicks.sum()

102138.34312102833

In [55]:
# Total decayed click weight of the test users.
# Index with the underlying array: Series.reshape is not available in current pandas.
user_clicks[test_users.user_id.values, :].sum()

71384.012439027283

In [56]:
user_clicks.shape

(200000, 2683)

# Scale every user's row to sum to 1. Rows whose sum is not positive get a
# divisor of 1, so an all-zero row stays all-zero instead of dividing by zero.
divider = user_clicks.sum(axis=1, keepdims=True)
divider[~(divider > 0)] = 1

user_clicks /= divider

In [57]:
average_clicks = user_clicks.sum(axis=0)

In [58]:
average_clicks /= user_clicks.shape[0]

In [59]:
user_clicks += average_clicks / 10

In [60]:
# Largest ids seen in either table (same values as computed before the click matrix was built).
# Series.max() is used instead of the builtin max(), which would iterate every element in Python.
number_of_categories = max(clicks.category_id.max(), views.category_id.max())
number_of_users = max(clicks.user_id.max(), views.user_id.max())

In [20]:
# Views for days strictly between 2016-09-19 and 2016-09-26.
# Bounds are pd.Timestamp: recent pandas refuses to order a datetime64 column against datetime.date.
last_views = views[(pd.Timestamp('2016-09-19') < views.day) & (views.day < pd.Timestamp('2016-09-26'))]

In [61]:
# Add category views on top of the click scores with the same time decay,
# but at one tenth of a click's weight.
view_weights = np.exp((views.day.dt.dayofyear.values - 269) / 7) / 10
# np.add.at accumulates repeated (user, category) pairs, unlike plain fancy-index `+=`,
# and replaces the per-row Python loop over DataFrame.iterrows().
np.add.at(user_clicks, (views.user_id.values, views.category_id.values), view_weights)

In [62]:
user_clicks.sum()

124101.3417096655

In [63]:
test_categories = np.argsort(-user_clicks[test_users.user_id.values.reshape(-1), :], axis=1)[:, :5]

In [64]:
test_categories[:5]

array([[1079,  429,  755, 2330,  426],
       [1898,  789,  672,   66,  163],
       [ 200,  672,   66,  163,  424],
       [2138,  160,  672,   66,  163],
       [2273,  672,   66,  163,  424]])

In [65]:
def join_categories(row):
    """Format one row of category ids as a space-separated, fixed-width string.

    The ids are converted with ``str`` and joined by single spaces; the result
    is right-padded with spaces to 25 characters. Strings that are already
    longer than 25 characters are returned unchanged (never truncated).

    NOTE(review): the padding presumably keeps ``np.apply_along_axis`` from
    sizing its string output by a shorter first row -- confirm.
    """
    return ' '.join(str(category) for category in row).ljust(25)

In [66]:
test_users['categories'] = np.apply_along_axis(join_categories, 1, test_categories)

In [67]:
test_users.head()

Unnamed: 0,user_id,categories
0,8,1079 429 755 2330 426
1,12,1898 789 672 66 163
2,27,200 672 66 163 424
3,39,2138 160 672 66 163
4,40,2273 672 66 163 424


In [68]:
test_users.to_csv('csv/baseline_7_decay.csv', index=None)

Пока есть два параметра -- затухание и сглаживание средним