In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tnrange, tqdm_notebook
import gc
import operator

In [2]:
import warnings
warnings.filterwarnings('ignore', message='Changing the shape of non-C contiguous array')

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
sns.set_context('talk')

In [5]:
import average_precision

In [6]:
import datetime

# Thoughts

Для каждого пользователя нужно предсказать пять категорий, по товарным предложениям которых он чаще всего кликал в течение восьмой недели, упорядоченных по убыванию числа кликов.

Можно выделить седьмую неделю и валидироваться по агрегированным кликам за неё.

Для начала будем работать только с train_clicks и train_category_views, причём только с полями user_id, category_id и day.

In [7]:
# Load the category-view log, parsing the `day` column as dates.
views = pd.read_csv('data/train_category_views.csv', parse_dates=['day'])
# Load the click log the same way, then keep only the columns that `views` has
# (in the same order) so both frames share one layout.
clicks = pd.read_csv('data/train_clicks.csv', parse_dates=['day'])[views.columns]

In [8]:
clicks.head()

Unnamed: 0,user_id,category_id,day
0,46,672,2016-08-04
1,48,170,2016-08-04
2,48,170,2016-08-04
3,53,1190,2016-08-04
4,93,56,2016-08-04


In [9]:
clicks.day.max() - clicks.day.min()

Timedelta('52 days 00:00:00')

In [10]:
clicks.category_id.max()

2653

Переведём данные в "клики (просмотры) в день пользователя по категории"

In [11]:
# Views per (user, day, category): tag every row with a 1 in a `count` column,
# then count the rows that fall into each group. `views` itself is left untouched.
agg_views = (
    views
    .assign(count=1)
    .groupby(['user_id', 'day', 'category_id'])
    .count()
    .sort_index()
)

In [12]:
# Clicks per (user, day, category): tag every row with a 1 in a `count` column,
# then count the rows that fall into each group. `clicks` itself is left untouched.
agg_clicks = (
    clicks
    .assign(count=1)
    .groupby(['user_id', 'day', 'category_id'])
    .count()
    .sort_index()
)

In [13]:
(clicks.groupby('user_id').day.max() - clicks.groupby('user_id').day.min()).median()

Timedelta('0 days 00:00:00')

In [14]:
(views.groupby('user_id').day.max() - views.groupby('user_id').day.min()).median()

Timedelta('0 days 00:00:00')

Выделим седьмую неделю

In [None]:
def split_by_date(df, date):
    """Split a frame into the rows dated before `date` and the rows dated on or after it.

    The dates are read from a ``day`` column when the frame has one, otherwise
    from an index level named ``day`` (the aggregated frames in this notebook
    are indexed by ``user_id``, ``day``, ``category_id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    date : datetime-like
        First day that belongs to the second (later) part.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(before, on_or_after)``; row order within each part follows `df`.

    Raises
    ------
    KeyError
        If `df` has neither a ``day`` column nor a ``day`` index level.
    """
    if 'day' in df.columns:
        days = df['day']
    elif 'day' in df.index.names:
        days = df.index.get_level_values('day')
    else:
        raise KeyError("split_by_date needs a 'day' column or index level")
    # Plain boolean array so the same mask works for both the column and the index case.
    is_late = np.asarray(days >= date)
    return df.loc[~is_late], df.loc[is_late]

In [None]:
# The cutoff sits six days before the last day present in `views`.
validation_start = views.day.max() - datetime.timedelta(days=6)
train_agg_views, validation_agg_views = split_by_date(agg_views, validation_start)

In [14]:
user_profile = pd.read_csv('data/train_user_profile.csv', parse_dates=['day'])

In [15]:
user_profile.user_id.nunique()

53428

In [11]:
test_users = pd.read_csv('data/test_users.csv')

In [12]:
test_users.shape

(31712, 1)

In [16]:
len(set(test_users.user_id) - set(user_profile.user_id))

22120

Пока всё получается плохо. Возьмём бейзлайн и потюним его.

In [25]:
clicks.day.dt.dayofyear

0         217
1         217
2         217
3         217
4         217
5         217
6         217
7         217
8         217
9         217
10        217
11        217
12        217
13        217
14        217
15        217
16        217
17        217
18        217
19        217
20        217
21        217
22        217
23        217
24        217
25        217
26        217
27        217
28        217
29        217
         ... 
704439    269
704440    269
704441    269
704442    269
704443    269
704444    269
704445    269
704446    269
704447    269
704448    269
704449    269
704450    269
704451    269
704452    269
704453    269
704454    269
704455    269
704456    269
704457    269
704458    269
704459    269
704460    269
704461    269
704462    269
704463    269
704464    269
704465    269
704466    269
704467    269
704468    269
Name: day, dtype: int64

In [28]:
# Largest ids present in the click log. Series.max() reduces inside pandas
# instead of iterating the column element by element through the builtin max().
number_of_categories = clicks.category_id.max()
number_of_users = clicks.user_id.max()

In [29]:
# Recency-weighted click matrix: one row per user id, one column per category id.
# The +1 on each dimension lets the raw ids be used directly as array indices.
user_clicks = np.zeros((number_of_users + 1, number_of_categories + 1))
# Each click weighs exp((day_of_year - 269) / 14). 269 is the day-of-year of the
# last day present in `clicks`, so the newest clicks weigh 1 and the weight
# shrinks by a factor of e for every 14 days of age.
click_weights = np.exp((clicks.day.dt.dayofyear.values - 269) / 14)
# np.add.at is unbuffered: repeated (user, category) pairs all accumulate, which a
# plain fancy-indexed `+=` would not do. This avoids a Python-level iterrows loop.
np.add.at(
    user_clicks,
    (clicks.user_id.values, clicks.category_id.values),
    click_weights,
)

In [30]:
user_clicks.sum()

192507.89925929767

In [31]:
user_clicks[test_users.values.reshape(-1), :].sum()

101437.18588566112

In [32]:
user_clicks.shape

(200000, 2654)

# Per-user total weight, shaped as a column vector so it broadcasts across categories.
divider = user_clicks.sum(axis=1)[:, np.newaxis]
# Rows without a positive total are divided by 1 and therefore stay as they are.
divider[~(divider > 0)] = 1

# Normalize every user's row in place.
np.divide(user_clicks, divider, out=user_clicks)

In [33]:
# Column totals: the sum of all users' rows, one value per category.
average_clicks = user_clicks.sum(axis=0)

In [34]:
# Divide by the number of rows in user_clicks (every row counts, including all-zero ones)
# to turn the column totals into a per-category average.
average_clicks /= user_clicks.shape[0]

In [35]:
# Add a tenth of the per-category average to every user's row (broadcast over rows).
# NOTE(review): presumably this lets globally popular categories fill the ranking
# for users with few or no clicks of their own — confirm the intended weight of 1/10.
user_clicks += average_clicks / 10

In [36]:
# Score rows of the test users, picked by using their ids as row indices.
test_user_scores = user_clicks[test_users.user_id.values]
# Sorting the negated scores ascending puts the highest-scoring categories first;
# keep the first five column indices (category ids) per user.
test_categories = np.argsort(-test_user_scores, axis=1)[:, :5]

In [37]:
test_categories[:5]

array([[ 429, 1079,  755, 2330, 1409],
       [1898,  672,   66,  424,  163],
       [ 200,  672,   66,  424,  163],
       [2138,  160,  672,   66,  424],
       [2273,  672,   66,  424,  163]])

In [38]:
def join_categories(row, width=25):
    """Join category ids into one space-separated string, right-padded with spaces.

    Parameters
    ----------
    row : iterable
        Category ids; each element is converted with ``str``.
    width : int, optional
        Minimum length of the result (default 25). Shorter strings are padded
        with trailing spaces; longer strings are returned unchanged, never cut.

    Returns
    -------
    str
        The joined ids, at least `width` characters long.
    """
    # NOTE(review): the fixed width is presumably there so every result has the same
    # length when the function is mapped with np.apply_along_axis — confirm.
    return ' '.join(map(str, row)).ljust(width)

In [39]:
# Turn each test user's row of top category ids into one submission string.
# NOTE(review): np.apply_along_axis appears to size its string output from the first
# result, which is presumably why join_categories pads to a fixed width — confirm.
test_users['categories'] = np.apply_along_axis(join_categories, 1, test_categories)

In [40]:
test_users.head()

Unnamed: 0,user_id,categories
0,8,429 1079 755 2330 1409
1,12,1898 672 66 424 163
2,27,200 672 66 424 163
3,39,2138 160 672 66 424
4,40,2273 672 66 424 163


In [41]:
# Write the submission without the row index.
test_users.to_csv('csv/baseline_agg_exp.csv', index=False)