# Two-level recommendation system

## Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender
# Функция генерации фичей
from src.features import generate_fs
from src.recommendation import tlvl_recommender

In [3]:
data = pd.read_csv('../raw_data/retail_train.csv')
item_features = pd.read_csv('../raw_data/product.csv')
user_features = pd.read_csv('../raw_data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


## Feature filltering

In [4]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cost'] = data['sales_value'] / data['quantity']


Decreased # items from 83685 to 5001


In [5]:
recommender = MainRecommender(data_train_lvl_1, weighting='bm25', fake_id=999999)



  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

## 1 Level filter items

In [6]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(
    lambda x: recommender.get_als_recommendations(x, N=100))
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1081177, 1004906, 9527290, 871756, 951190, 88..."
1,2021,"[844179, 1004906, 951590, 1013928, 1000753, 65..."


## 2 Level range items

### Spliting interactions user-item

In [7]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1081177,1
0,2070,1004906,1
0,2070,9527290,1
0,2070,871756,1


In [8]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1081177,0.0
1,2070,1004906,0.0


### Adding new features

In [10]:
user_emb = pd.DataFrame(recommender.model.user_factors[:-10], columns=[f'users_{i}' for i in range(350)])

pca = PCA(n_components=4, random_state=42)
us_emb = pd.DataFrame(pca.fit_transform(user_emb))

kmeans = KMeans(n_clusters=7, random_state=4, n_init=15)
user_emb['user_cluster'] = kmeans.fit_predict(user_emb)

user_emb['user_id'] = data_train_lvl_1.sort_values('user_id').user_id.unique()
us_emb['user_id'] = data_train_lvl_1.sort_values('user_id').user_id.unique()

user_emb = user_emb[['user_id', 'user_cluster']]

user_emb = user_emb.merge(us_emb, on='user_id', how='inner')
user_emb.head(3)

Unnamed: 0,user_id,user_cluster,0,1,2,3
0,1,3,-4.90688,2.40839,2.20597,-2.36317
1,2,6,-1.523228,1.14009,2.956241,-0.184863
2,3,6,6.809963,5.794197,1.858696,1.845454


In [11]:
item_emb = pd.DataFrame(recommender.model.item_factors, columns=[f'item_{i}' for i in range(350)])

pca = PCA(n_components=4, random_state=42)
it_emb = pd.DataFrame(pca.fit_transform(item_emb))

kmeans = KMeans(n_clusters=7, random_state=4, n_init=15)
item_emb['item_cluster'] = kmeans.fit_predict(item_emb)

item_emb['item_id'] = data_train_lvl_1.sort_values('item_id').item_id.unique()
it_emb['item_id'] = data_train_lvl_1.sort_values('item_id').item_id.unique()

item_emb = item_emb[['item_id', 'item_cluster']]

item_emb = item_emb.merge(it_emb, on='item_id', how='inner')
item_emb.head(3)

Unnamed: 0,item_id,item_cluster,0,1,2,3
0,818980,5,0.002968,-0.012494,-0.00368,0.004811
1,818981,2,-0.016989,-0.010519,0.00211,-0.011907
2,819063,1,-0.040984,0.010901,-0.007933,0.004065


In [12]:
targets_lvl_2 = targets_lvl_2.merge(item_emb, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_emb, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,item_cluster,0_x,1_x,2_x,3_x,user_cluster,0_y,1_y,2_y,3_y
0,2070,1081177,0.0,6,0.042191,0.089091,0.030114,0.006331,4,2.836835,-0.303434,0.833537,3.104106
1,2070,1004906,0.0,6,0.05234,0.101875,-0.007157,0.01696,4,2.836835,-0.303434,0.833537,3.104106


In [13]:
data_train_lvl_2 = generate_fs(data_train_lvl_2, item_features, user_features)
data_train_lvl_2

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,age_cat_q,income_cat_q,kid_cat_q,age_val_pcat,income_val_pcat,kid_val_pcat
0,1753,40618809138,594,1022537,1,1.39,345,0.00,8,86,...,Homeowner,Unknown,1,None/Unknown,0.001143,0.001163,0.001113,3.808629,3.778206,3.836401
1,1753,40618809138,594,1128422,1,1.45,345,-0.14,8,86,...,Homeowner,Unknown,1,None/Unknown,0.001143,0.001163,0.001113,3.808629,3.778206,3.836401
2,1753,40618809138,594,6773055,1,29.99,345,0.00,8,86,...,Homeowner,Unknown,1,None/Unknown,0.001143,0.001163,0.001113,3.808629,3.778206,3.836401
3,1753,40618809138,594,10285106,1,1.00,345,-0.49,8,86,...,Homeowner,Unknown,1,None/Unknown,0.001143,0.001163,0.001113,3.808629,3.778206,3.836401
4,1753,40618809138,594,17105257,1,1.00,345,-0.49,8,86,...,Homeowner,Unknown,1,None/Unknown,0.001143,0.001163,0.001113,3.808629,3.778206,3.836401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169706,1257,41260156777,635,13511916,1,0.89,309,0.00,1655,91,...,Homeowner,2 Adults Kids,3,1,0.006802,0.006555,0.009400,2.503938,2.510717,2.494477
169707,1784,41260412740,635,960503,1,5.59,404,-2.40,1810,91,...,,,,,,,,,,
169708,1784,41260412740,635,962568,1,1.89,404,0.00,1810,91,...,,,,,,,,,,
169709,1784,41260412740,635,979707,1,1.65,404,0.00,1810,91,...,,,,,,,,,,


In [14]:
targets_lvl_2 = targets_lvl_2.merge(data_train_lvl_2, on=['user_id', 'item_id'], how='left')

In [15]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [16]:
cat_feats = ['manufacturer', 'department', 'brand', 'commodity_desc', 'sub_commodity_desc', 
             'curr_size_of_product', 'age_desc', 'marital_status_code', 'income_desc', 
             'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc',
            'daytime', 'pop_cat', 'item_cluster', 'user_cluster']
X_train[cat_feats] = X_train[cat_feats].astype('category')

### Training model

In [17]:
lgb = LGBMClassifier(objective='binary', 
                     max_depth=5, 
                     n_estimators=75,
                     categorical_column=cat_feats)
lgb.fit(X_train, y_train)

pred_proba = lgb.predict_proba(X_train)
pred_proba

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([[9.99858442e-01, 1.41557789e-04],
       [9.99858442e-01, 1.41557789e-04],
       [9.99858442e-01, 1.41557789e-04],
       ...,
       [9.99858442e-01, 1.41557789e-04],
       [9.99858442e-01, 1.41557789e-04],
       [9.99858442e-01, 1.41557789e-04]])

### Features sorting

In [18]:
X_train['proba'] = pred_proba[:, 1]

res = pd.DataFrame(targets_lvl_2[targets_lvl_2['target'] == 1]\
                   .groupby('user_id')['item_id'].unique())\
                    .rename(columns={'item_id': 'actual'})

res['items'] = X_train.groupby('user_id')['item_id'].agg(lambda x: [x.values])
res['proba'] = X_train.groupby('user_id')['proba'].agg(lambda x: [x.values.argsort()[::-1]])
res['pred'] = res.apply(lambda row: 
                              pd.Series(row['items'][0][row['proba'][0]].flatten()).unique(), axis=1)
res = res.reset_index()
res

Unnamed: 0,user_id,actual,items,proba,pred
0,1,"[940947, 856942, 10149640, 865456, 5577022, 10...","[[1033142, 865178, 9527290, 940947, 940947, 94...","[[128, 47, 32, 95, 33, 36, 37, 38, 39, 42, 45,...","[871570, 9655212, 5577022, 1082212, 1004906, 1..."
1,2,"[1133018, 1106523, 916122, 1053690, 901062, 10...","[[5569230, 1133018, 1106523, 8090521, 916122, ...","[[8, 1, 2, 4, 93, 98, 58, 63, 43, 23, 67, 26, ...","[1053690, 1133018, 1106523, 916122, 1035843, 9..."
2,4,"[883932, 891423, 962229, 908283, 6773204]","[[840361, 883932, 883932, 883932, 883932, 5569...","[[13, 7, 41, 44, 32, 43, 42, 4, 3, 2, 1, 74, 6...","[962229, 891423, 6773204, 908283, 883932, 9085..."
3,6,"[878996, 962568, 1024306, 1098844, 845208, 895...","[[904360, 961554, 878996, 1041796, 934639, 866...","[[80, 77, 102, 103, 104, 105, 106, 107, 108, 1...","[900802, 1119051, 994928, 1037863, 1024306, 96..."
4,7,"[1106523, 1122358, 1133018, 1022003, 5591154, ...","[[987724, 1106523, 826249, 1122358, 849843, 11...","[[43, 5, 50, 51, 32, 14, 91, 13, 63, 94, 3, 1,...","[836281, 1133018, 993638, 1110572, 1126899, 55..."
...,...,...,...,...,...
1929,2496,"[979707, 12810393, 883404, 916122, 1106523, 99...","[[1040371, 859075, 1044078, 840361, 1133018, 8...","[[117, 70, 20, 21, 22, 23, 88, 24, 45, 25, 44,...","[957741, 1041796, 1106523, 820122, 995785, 105..."
1930,2497,"[1055646, 862349, 834484, 845208, 860776, 1038...","[[826249, 957951, 972665, 1042907, 951590, 999...","[[116, 28, 90, 30, 87, 31, 33, 82, 34, 35, 36,...","[970202, 860776, 900802, 870515, 965719, 10382..."
1931,2498,"[1053690, 1070820, 1106523, 1100379, 961554, 9...","[[1053690, 1070820, 1070820, 1070820, 1070820,...","[[0, 9, 34, 38, 66, 12, 10, 11, 4, 3, 2, 1, 80...","[1053690, 1106523, 1100379, 961554, 951197, 10..."
1932,2499,"[883404, 826249, 5568378, 899624, 904129, 1070...","[[1004906, 952163, 965766, 1110572, 883404, 82...","[[21, 18, 89, 90, 91, 20, 17, 16, 15, 83, 82, ...","[904129, 5568378, 1060872, 899624, 5569327, 84..."


In [19]:
res.apply(lambda row: precision_at_k(row['pred'], row['actual'], k=5), axis=1).mean()

0.8653567735263707

In [20]:
res.apply(lambda row: recall_at_k(row['pred'], row['actual'], k=5), axis=1).mean()

0.5941285831037535

In [21]:
res_test = res[['user_id', 'pred']]

result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']

res_test = res_test.merge(result_lvl_2, on='user_id', how='inner')
res_test.head(3)

Unnamed: 0,user_id,pred,actual
0,1,"[871570, 9655212, 5577022, 1082212, 1004906, 1...","[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[900802, 1119051, 994928, 1037863, 1024306, 96...","[920308, 926804, 946489, 1006718, 1017061, 107..."
2,7,"[836281, 1133018, 993638, 1110572, 1126899, 55...","[840386, 889774, 898068, 909714, 929067, 95347..."


In [22]:
res_test.apply(lambda row: precision_at_k(row['pred'], row['actual'], k=5), axis=1).mean()

0.3002280501710355

In [23]:
res_test.apply(lambda row: recall_at_k(row['pred'], row['actual'], k=5), axis=1).mean()

0.03607363342085396