In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня

from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k, money_recall_at_k, money_precision_at_k
from src.metrics import postfilter_items
from src.utils_i import prefilter_items, new_user_feat, new_item_feat, get_important_features, add_feat_user,\
add_feat_itm  
from src.recommenders_i import MainRecommender

In [2]:
data = pd.read_csv('./retail_train.csv')
item_features = pd.read_csv('./product.csv')
user_features = pd.read_csv('./hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, user_features=user_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [5]:
data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1['user_id'].isin(data_train_lvl_1['user_id'].unique())]

In [6]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [7]:
result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda x:  recommender.get_als_recommendations(x, N=200))
result_lvl_1['get_own'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))
result_lvl_1['get_similar_items'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=200))

In [8]:
# result_lvl_1['get_similar_users'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x, N=200))

In [9]:
recommender.user_item_matrix

<2495x5001 sparse matrix of type '<class 'numpy.float64'>'
	with 314750 stored elements in COOrdinate format>

# Измеряем recall_K 

In [10]:
result_lvl_1.apply(lambda row: recall_at_k(row['actual'], row['als']), axis=1).mean()

0.0027708042770803864

In [11]:
result_lvl_1.apply(lambda row: recall_at_k(row['actual'], row['get_own']), axis=1).mean()

0.004453742445374231

In [12]:
result_lvl_1.apply(lambda row: recall_at_k(row['actual'], row['get_similar_items']), axis=1).mean()

0.00286610878661084

In [13]:
# result_lvl_1.apply(lambda row: recall_at_k(row['actual'], row['get_similar_users']), axis=1).mean()

# ПОДГОТОВКА ДАННЫХ - НАЧАЛО

In [14]:
data['price'] = data['sales_value']/np.maximum(data['quantity'], 1.0) 

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_lvl_2
- Обучаем *только* на выбранных кандидатах
- Я *для примера* сгенерирую топ-200 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 200 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [15]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

In [16]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']
# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

In [17]:
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))
# users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=200))
# users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=500))

In [18]:
users_lvl_2.head(3)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."
2,1753,"[967041, 963686, 948640, 1057168, 942475, 9421..."


In [19]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['drop'] = 1  # фиктивная пересенная

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,drop
0,2070,1105426,1
0,2070,1097350,1
0,2070,879194,1
0,2070,948640,1


In [20]:
users_lvl_2.shape[0]

430200

In [21]:
users_lvl_2['user_id'].nunique()

2151

In [22]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('drop', axis=1, inplace=True)

In [23]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0


In [24]:
data_val_lvl_2

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.00,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.30,112,92,0.0,0.0
2277418,338,41260573635,636,5592737,2,1.58,369,-0.20,112,92,0.0,0.0
2277419,338,41260573635,636,7441679,1,3.69,369,0.00,112,92,0.0,0.0
2277420,338,41260573635,636,7442317,1,2.69,369,0.00,112,92,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2396799,1613,41655820646,663,16102849,1,2.00,3262,-1.15,1231,95,0.0,0.0
2396800,1001,41655829421,663,13217063,1,1.69,3131,0.00,2231,95,0.0,0.0
2396801,1001,41655829421,663,13217800,1,1.69,3131,0.00,2231,95,0.0,0.0
2396802,1167,41656790510,663,6410462,22451,43.98,3385,-0.65,1059,95,0.0,0.0


In [25]:
targets_lvl_2['target']==1

0         False
1         False
2         False
3         False
4         False
          ...  
437597    False
437598    False
437599    False
437600    False
437601    False
Name: target, Length: 437602, dtype: bool

(!) На каждого юзера 200 item_id-кандидатов

In [26]:
targets_lvl_2['target'].mean()

0.06198326333060635

# Обработка user feat и ITEM FEAT

In [27]:
user_features = new_user_feat(data, user_features)
item_features = new_item_feat(data, item_features)
user_features = add_feat_user(data, user_features)
item_features= add_feat_itm(data, item_features)

  out=out, **kwargs)


In [28]:
item_features.rename(columns={'day_x': 'day'}, inplace=True)

In [29]:
item_factors = recommender.item_factors
user_factors = recommender.user_factors
items_emb_df = recommender.items_emb_df
users_emb_df = recommender.users_emb_df

user_features = user_features.merge(users_emb_df, on='user_id', how='left')
item_features = item_features.merge(items_emb_df, on='item_id', how='left')

user_features=user_features.fillna(0)
item_features= item_features.fillna(0)

In [30]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,day,...,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.084211,...,-6.822336,4.273782,7.011938,3.751617,-0.735883,11.030344,-1.53002,0.99007,-6.74267,-7.342715
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,0.536842,...,-6.822336,4.273782,7.011938,3.751617,-0.735883,11.030344,-1.53002,0.99007,-6.74267,-7.342715
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,0.484211,...,-6.822336,4.273782,7.011938,3.751617,-0.735883,11.030344,-1.53002,0.99007,-6.74267,-7.342715


In [31]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [32]:
cat_feats = ['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age',
 'marital_status_code',
 'income',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kids'
            ]

X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age',
 'marital_status_code',
 'income',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kids']

In [33]:
X_train.head(2)
X_train.columns

Index(['user_id', 'item_id', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'day',
       'coupon_disc', 'quantity_of_sales', 'quantity_of_sales_per_week',
       'qnt_of_sales_per_dep', 'qnt_of_sales_per_item_per_dep_per_week',
       'qnt_of_sales_per_sub_commodity_desc',
       'qnt_of_sales_per_item_per_sub_commodity_desc_per_week',
       'item_sales_for_dep', 'average_freq_itm', 'nd_from_la_pur_itm', '0_x',
       '1_x', '2_x', '3_x', '4_x', '5_x', '6_x', '7_x', '8_x', '9_x', '10_x',
       '11_x', '12_x', '13_x', '14_x', '15_x', '16_x', '17_x', '18_x', '19_x',
       'marital_status_code', 'homeowner_desc', 'hh_comp_desc',
       'household_size_desc', 'mean_time', 'age', 'income', 'kids',
       'average_basket', 'sum_per_week', 'avg_check', 'quantity', 'day_qaunt',
       'sales_value', 'check_day', 'nd_from_la_pur', 'average_freq_user',
       'user_avg_coupon_disc', 'user_avg_chek', 'user_purch_week',
       'use

In [34]:
X_train.head(2)
X_train['avg_check'] = X_train['avg_check'].astype('float')
X_train['day'] = X_train['day'].astype('float')


X_train['quantity'] = X_train['quantity'].astype('float')
X_train['day_qaunt'] = X_train['day_qaunt'].astype('float')
X_train['sales_value'] = X_train['sales_value'].astype('float')
X_train['check_day'] = X_train['check_day'].astype('float')

X_train['age'] = X_train['age'].astype('float')
X_train['income'] = X_train['income'].astype('float')
X_train['kids'] = X_train['kids'].astype('float')


In [35]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Wall time: 3.44 s


In [36]:
basic_feats = get_important_features(lgb, X_train, y_train)
lgb.fit(X_train[basic_feats], y_train)



LGBMClassifier(boosting_type='gbdt',
               categorical_column=['manufacturer', 'department', 'brand',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product', 'age',
                                   'marital_status_code', 'income',
                                   'homeowner_desc', 'hh_comp_desc',
                                   'household_size_desc', 'kids'],
               class_weight=None, colsample_bytree=1.0, importance_type='split',
               learning_rate=0.1, max_depth=7, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [37]:
preds = lgb.predict(X_train[basic_feats])

In [38]:
preds

array([0., 0., 0., ..., 0., 0., 0.])

In [39]:
df = pd.concat([X_train[['user_id', 'item_id']], y_train, pd.Series(preds)], axis=1)
df['prob'] = df[0]
df_p = df.sort_values(['user_id', 'prob'],ascending=False).groupby('user_id').head(5)

In [40]:
df_p['target'].sum()/len(df_p)

0.2775453277545328

In [41]:
df_p

Unnamed: 0,user_id,item_id,target,0,prob
281162,2500,1063739,0.0,0.0,0.0
281163,2500,12262832,0.0,0.0,0.0
281164,2500,1086061,0.0,0.0,0.0
281165,2500,879393,0.0,0.0,0.0
281166,2500,1065538,1.0,0.0,0.0
...,...,...,...,...,...
179740,1,9655212,1.0,1.0,1.0
179741,1,9655212,1.0,1.0,1.0
179746,1,8293439,1.0,1.0,1.0
179747,1,8293439,1.0,1.0,1.0


In [42]:
user_recommendation = { u:[] for u in df_p['user_id'].unique()}
df_p.apply(lambda x : user_recommendation[x['user_id']].append(int(x['item_id'])), axis=1)

281162    None
281163    None
281164    None
281165    None
281166    None
          ... 
179740    None
179741    None
179746    None
179747    None
179748    None
Length: 10755, dtype: object

In [43]:
item_prices = data.groupby('item_id')['price'].mean().reset_index()
items_info = item_features.merge(item_prices, on = 'item_id', how='left')
items_info.fillna(0.0, inplace=True)

In [44]:
most_popular_items = data.groupby('item_id').agg({'user_id':len}).sort_values('user_id', ascending=False).head(2000).index.tolist()

In [45]:
train_bought_by_users = {}
for u, i in data_train_lvl_2.groupby('user_id')['item_id']:
    train_bought_by_users[u] = i.values

val_bought_by_users = {}
for u, i in data_train_lvl_2.groupby('user_id')['item_id']:
    val_bought_by_users[u] = i.values

In [46]:
item_features.columns
data.columns

Index(['user_id', 'basket_id', 'day', 'item_id', 'quantity', 'sales_value',
       'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       'coupon_match_disc', 'price'],
      dtype='object')

# ФИНАЛЬНАЯ ОБРАБОТКА

In [47]:
item_department = item_features[['item_id', 'department']]
user_department_price = data.merge(item_department, on='item_id', how='left').groupby(['user_id', 'department'])['price'].mean().reset_index()
user_department_quantity = data.merge(item_department, on='item_id', how='left').groupby(['user_id', 'department'])['day'].mean().reset_index()

In [48]:
def postfilter_items(items, items_info, bought, most_popular_items , department_price, department_quantity, N=5):
    def get_department(i):
        department = items_info[items_info['item_id'] == i]['department'].values[0]
        return department
    
    ## NOT RARE PURCH ITEM:
    

#     not_cheap_items = items_info[items_info['price'] > 1.0]
#     not_cheap_items = items_info[(items_info['price'] > 1.0) & (items_info['price']  < 1.7) & (items_info['day']  > 22.0)]
    not_cheap_items = items_info[(items_info['price'] > 1.0) & (items_info['price']  < 1.7)]
    not_rare_items = items_info[(items_info['average_freq_itm']  < 5.0) & (items_info['nd_from_la_pur_itm']  < 4.0) & (items_info['average_freq_itm']  >= 1.0)] 
    not_cheap_items =  not_cheap_items.merge(not_rare_items, on='item_id', how='right')
    item_ids = pd.DataFrame(items, columns=['item_id'])
    never_bought_items = item_ids[~item_ids['item_id'].isin(bought)]['item_id'].values.tolist()
#     never_bought_items= never_bought_items(never_bought_items['price'] > 1.0) & (never_bought_items['price']  < 1.7)])
    expensive_items =  item_ids.merge(items_info[(items_info['price']>7) & (items_info['price']<15)], on='item_id', how='right').head(5)['item_id'].values.tolist()
   
    categories_used = []
    final_recommendations = []
   
    expensive_item_id = expensive_items[0]
    final_recommendations.append(expensive_item_id)
    categories_used.append(get_department(expensive_item_id))

   
    for i in items:
        if any(not_cheap_items[not_cheap_items['item_id'] == i]['price_y']):
            if  get_department(i) not in categories_used:
                vals = department_price[department_price['department'] == get_department(i)].values
                quantity = department_quantity[department_quantity['department'] == get_department(i)].values
                if vals.shape[0] == 1 and quantity.shape[0] ==1:
                    average_price = vals[0,1]
                    average_quantity = quantity[0,1]
#                     print('average_quantity:i ', average_quantity, i)
                    if average_price <= float(not_cheap_items[not_cheap_items['item_id'] == i]['price_y']) and average_quantity >= 250:
                        final_recommendations.append(i)
                        categories_used.append( get_department(i))
        if len(final_recommendations) == 3:
            break
            ### and average_quantity >=float(not_cheap_items[not_cheap_items['item_id'] == i]['day'])
# average_quantity <=float(not_cheap_items.groupby([not_cheap_items['item_id']==i, 'department'])['day'].mean().reset_index()) \
#                                              and 
    for i in never_bought_items:
        if any(not_cheap_items[not_cheap_items['item_id'] == i]['price_y']):
            if  get_department(i) not in categories_used:
                final_recommendations.append(i)
                categories_used.append( get_department(i))
        if len(final_recommendations) == 5:
            break
    

    
    if len(final_recommendations) < 5:      
        for i in most_popular_items:
            if True:
                if  get_department(i) not in categories_used:
                    final_recommendations.append(i)
                    categories_used.append( get_department(i))
            if len(final_recommendations) == 5:
                break            
       
    assert len(final_recommendations) == N, 'Количество рекомендаций = {}, должно быть {}'.format(len(final_recommendations), N)
    assert len(categories_used) == len(set(categories_used)), '{} уникальных категорий'.format(len(set(categories_used)))
    return final_recommendations

In [49]:
precisions_recommend = {}
precisions = {}
money_precisions = {}
for user, items in user_recommendation.items():
    bought = train_bought_by_users.get(user, [])
    department_price = user_department_price[user_department_price['user_id'] == user][['department','price']]
    department_quantity = user_department_quantity[user_department_quantity['user_id'] == user][['department','day']]
    final_recommendations = postfilter_items(items, items_info, bought, most_popular_items, department_price, department_quantity, N=5)
    prices_recommended = pd.DataFrame(final_recommendations, columns=['item_id']).merge(items_info, on = 'item_id', how='left')['price'].values
    m_p_at_k = money_precision_at_k(final_recommendations, val_bought_by_users.get(user, []), prices_recommended, k=5)
    p_at_k = precision_at_k(final_recommendations, val_bought_by_users.get(user, []), k=5)
    precisions[user] = p_at_k
    money_precisions[user] = m_p_at_k
    precisions_recommend[user] = final_recommendations
    print('.', end = '')

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

# ИТОГОВЫЕ ЦИФРЫ ПО МОДЕЛИ:
MONEY_PR_K (11-13%)
PR_K (40%) 


In [50]:
print('Mean money_precion @ 5 = ', np.mean(list(money_precisions.values())))
print('Mean precion @ 5 = ', np.mean(list(precisions.values())))

Mean money_precion @ 5 =  0.11840339011857348
Mean precion @ 5 =  0.4355183635518363


In [51]:
with open ('precisions.pkl', 'wb') as file:
    import pickle
    pickle.dump(precisions, file)

In [52]:
with open ('precisions_recommend.pkl', 'wb') as file:
    import pickle
    pickle.dump(precisions_recommend, file)

In [53]:
import csv
pd.DataFrame([precisions.keys(), precisions_recommend.values()]).T.to_csv('result.csv', header=0, index=False, quoting=csv.QUOTE_MINIMAL)