### Импорты библиотек

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix
from implicit import als

from lightgbm import LGBMClassifier, Dataset
import lightgbm

from sklearn.model_selection import GridSearchCV

import os, sys
# Put the parent of the current working directory on the import path,
# presumably so the `src.*` modules imported below resolve when the notebook
# is run from its own folder.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender
from src.preprocessing import new_item_features, new_user_features, train_test_preprocessing
from src.preprocessing import popularity_recommendation, get_important_features, get_final_recomendation

import warnings
warnings.simplefilter('ignore')

### Загрузка данных

In [2]:
# Locations of the raw CSV inputs, relative to the notebook's working directory.
DATA_PATH = '../data/retail_train.csv'
ITEM_FEATURES_PATH = '../data/product.csv'
USER_FEATURES_PATH = '../data/hh_demographic.csv'
TEST_1_PATH = '../data/retail_test1.csv'
TEST_2_PATH = '../data/retail_test2.csv'  # declared, but not read in the cells shown here

data = pd.read_csv(DATA_PATH)

# Lower-case every column name, then align the key columns with the
# `item_id` / `user_id` naming used by the interaction log.
item_features = (
    pd.read_csv(ITEM_FEATURES_PATH)
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)
user_features = (
    pd.read_csv(USER_FEATURES_PATH)
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)

test_1 = pd.read_csv(TEST_1_PATH)

### Разбиение датасета: обучающая выборка первого уровня и отложенная выборка (она же обучающая для второго уровня)

In [3]:
VAL_SIZE = 5

# First week number that belongs to the hold-out period. The hold-out runs
# from this week through the last week inclusive, so it can span up to
# VAL_SIZE + 1 distinct week numbers.
split_week = data['week_no'].max() - VAL_SIZE

train_1 = data.loc[data['week_no'] < split_week]
val_1 = data.loc[data['week_no'] >= split_week]

# The second-level model is trained on an independent copy of the hold-out rows.
train_2 = val_1.copy()

In [4]:
# Distinct-user counts for the full log, both splits and the first test file.
print(
    f'Общее количество юзеров {data["user_id"].nunique()}',
    f'Число юзеров в train_1  {train_1["user_id"].nunique()}',
    f'Число юзеров в val_1    {val_1["user_id"].nunique()}',
    f'Число юзеров в test_1    {test_1["user_id"].nunique()}',
    sep='\n',
)

Общее количество юзеров 2499
Число юзеров в train_1  2499
Число юзеров в val_1    2154
Число юзеров в test_1    1885


## Первый уровень рекомендаций

### Применяем фильтрацию к датасету, оставляем только нужных кандидатов

In [5]:
# Record the catalogue size before and after filtering so the reduction can be
# reported. `train_1` is replaced by its filtered version from here on.
n_items_before = train_1.item_id.nunique()
train_1 = prefilter_items(
    train_1,
    item_features=item_features,
    take_n_popular=190,
)
n_items_after = train_1.item_id.nunique()

print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85828 to 191


In [6]:
# Build the first-level recommender from the pre-filtered `train_1` interactions.
# Its `items_emb_df` / `users_emb_df` attributes are read in the next cell.
recommender = MainRecommender(train_1)



HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=191.0), HTML(value='')))




#### Эмбеддинги

In [7]:
# Pull the item- and user-embedding tables exposed by the recommender.
items_emb_df, users_emb_df = recommender.items_emb_df, recommender.users_emb_df

### Добавим фичи

In [8]:
# Rebuild the item feature table from the `train_2` interactions, the product
# attributes loaded earlier and the item embeddings.
# NOTE(review): `item_features` is overwritten with the helper's own output, so
# re-running this cell feeds already-enriched features back in — reload the CSV
# (or restart the kernel) before re-running.
item_features = new_item_features(train_2, item_features, items_emb_df)
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,1,2,...,12,13,14,coupon_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,qnt_of_sales_per_sub_commodity_desc,qnt_of_sales_per_item_per_sub_commodity_desc_per_week
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,,,,...,,,,,0,0.0,112255,0.137313,101,0.134667
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,,,,...,,,,,0,0.0,226,0.05858,225,0.064433


In [9]:
# Rebuild the user feature table from the `train_2` interactions, the household
# demographics loaded earlier and the user embeddings.
# NOTE(review): `user_features` is overwritten with the helper's own output, so
# re-running this cell feeds already-enriched features back in — reload the CSV
# (or restart the kernel) before re-running.
user_features = new_user_features(train_2, user_features, users_emb_df)
user_features.head(2)

Unnamed: 0,marital_status_code,homeowner_desc,hh_comp_desc,household_size_desc,user_id,0,1,2,3,4,...,11,12,13,14,mean_time,age,income,kids,average_basket,sum_per_week
0,A,Homeowner,2 Adults No Kids,2,1,-3.054558,14.229646,9.226919,21.877134,-8.62224,...,9.871156,6.480928,-2.626936,-8.42864,1324.803711,70,45,0,2.48729,44.356667
1,A,Homeowner,2 Adults No Kids,2,7,-1.402323,0.131635,14.23359,10.636305,11.728805,...,17.191568,4.310338,7.719049,24.581388,1622.862915,50,70,0,2.547257,74.295


### Предобработка обучающего и валидационного датасетов

In [10]:
# Assemble the second-level training matrix and labels for the `train_2` period.
# NOTE(review): the argument roles are defined in src.preprocessing (not shown
# here); presumably candidates come from `recommender`/`train_1` and the labels
# from `train_2` — confirm against the helper.
X_train, y_train = train_test_preprocessing(train_2, train_1, recommender, item_features, user_features)

In [11]:
# Names of every object-dtype column in the training matrix; these are the
# columns cast to pandas `category` in the next cell.
cat_feats = list(X_train.select_dtypes(include=['object']).columns)
cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'marital_status_code',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc']

In [12]:
# Columns cast to pandas `category`: the object-typed features plus both id columns.
category_cols = cat_feats + ['user_id', 'item_id']

X_train[category_cols] = X_train[category_cols].astype('category')

# Build the matching matrix for the first test file and cast it the same way.
# (An earlier variant of this cell prepared a `val_2` hold-out as well; `val_2`
# is not defined in the cells shown, so that dead code is omitted.)
X_test_1, y_test_1 = train_test_preprocessing(test_1, train_1, recommender, item_features, user_features)
X_test_1[category_cols] = X_test_1[category_cols].astype('category')

## Второй уровень рекомендаций с использованием LGBMClassifier

In [13]:
# Untuned base classifier: used for feature selection and as the estimator
# handed to the grid search below.
# NOTE(review): LightGBM documents `categorical_feature` as an argument of
# `fit()`; passed here it travels as an extra constructor keyword. `cat_feats`
# also names columns that are absent from the selected `basic_feats` — confirm
# this argument has the intended effect.
lgb = LGBMClassifier(objective='binary', max_depth=3, categorical_feature=cat_feats, random_state=1)

### Отбор признаков

In [14]:
# Select the feature subset used from here on; the helper receives the untuned
# classifier together with the training matrix and labels.
# NOTE(review): the selection rule lives in src.preprocessing — presumably it
# is based on the fitted model's feature importances; confirm.
basic_feats = get_important_features(lgb, X_train, y_train)
basic_feats

['user_id',
 'item_id',
 'sub_commodity_desc',
 '0_x',
 '1_x',
 '2_x',
 '3_x',
 '4_x',
 '5_x',
 '6_x',
 '7_x',
 '8_x',
 '9_x',
 '10_x',
 '11_x',
 '12_x',
 '13_x',
 '14_x',
 'coupon_disc',
 'quantity_of_sales',
 'qnt_of_sales_per_dep',
 'qnt_of_sales_per_item_per_dep_per_week',
 'qnt_of_sales_per_sub_commodity_desc',
 'qnt_of_sales_per_item_per_sub_commodity_desc_per_week',
 'marital_status_code',
 '0_y',
 '1_y',
 '2_y',
 '3_y',
 '4_y',
 '5_y',
 '6_y',
 '8_y',
 '11_y',
 '13_y',
 'average_basket',
 'sum_per_week']

### Подбор параметров и обучение модели

In [15]:
# Search space for the second-level classifier: five evenly spaced values for
# each continuous hyper-parameter and two tree depths (5 * 2 * 5 * 5 = 250 combinations).
param_grid = dict(
    learning_rate=np.linspace(0.005, 0.03, 5),
    max_depth=[3, 4],
    reg_alpha=np.linspace(1e-6, 1e-4, 5),
    reg_lambda=np.linspace(0.001, 0.05, 5),
)

In [16]:
%%time
gbm = GridSearchCV(lgb, param_grid, cv=5, n_jobs=-1)
gbm.fit(X_train[basic_feats], y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.017499999999999998, 'max_depth': 3, 'reg_alpha': 5.05e-05, 'reg_lambda': 0.001}
Wall time: 4min 31s


In [17]:
# Refit on the whole training matrix with the winning grid-search settings.
# NOTE(review): the search ran on an estimator that did not set `n_estimators`,
# while this final model uses 500 — confirm that the tuned learning rate is
# still appropriate at this tree count.
params = gbm.best_params_
lgb = LGBMClassifier(
    objective='binary',
    n_estimators=500,
    random_state=1,
    n_jobs=-1,
    categorical_feature=cat_feats,
    **params
)
lgb.fit(X_train[basic_feats], y_train)

LGBMClassifier(boosting_type='gbdt',
               categorical_feature=['manufacturer', 'department', 'brand',
                                    'commodity_desc', 'sub_commodity_desc',
                                    'curr_size_of_product',
                                    'marital_status_code', 'homeowner_desc',
                                    'hh_comp_desc', 'household_size_desc'],
               class_weight=None, colsample_bytree=1.0, importance_type='split',
               learning_rate=0.017499999999999998, max_depth=3,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=1, reg_alpha=5.05e-05, reg_lambda=0.001,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

### Предсказание на тестовой выборке

In [18]:
# Score every candidate row of the first test set with the tuned classifier;
# column index 1 of `predict_proba` is kept as the ranking score.
test_preds_proba = lgb.predict_proba(X_test_1[basic_feats])[:, 1]

# Turn the scores into the final recommendation table for `test_1`.
result = get_final_recomendation(X_test_1, test_preds_proba, test_1, train_1)

# Sanity check: the result should cover exactly as many users as `test_1`.
result['user_id'].nunique() == test_1['user_id'].nunique()

True

In [19]:
# precision@k (helper's default k) of each row's recommendations against its
# `actual` column, averaged over all rows of the test_1 result.
row_precision = result.apply(
    lambda row: precision_at_k(row['recomendations'], row['actual']),
    axis=1,
)
row_precision.mean()

0.3110875331564981

### Предсказание для трейна

In [20]:
# Build the candidate matrix for the complete interaction log (`data`) and cast
# the object-typed features plus both id columns to pandas `category`.
X_test_2, y_test_2 = train_test_preprocessing(data, train_1, recommender, item_features, user_features)

full_category_cols = cat_feats + ['user_id', 'item_id']
X_test_2[full_category_cols] = X_test_2[full_category_cols].astype('category')

In [21]:
# Score the candidates built from the complete log. This rebinds
# `test_preds_proba` and `result`, replacing the test_1 values from above.
test_preds_proba = lgb.predict_proba(X_test_2[basic_feats])[:, 1]

result = get_final_recomendation(X_test_2, test_preds_proba, data, train_1)

# Sanity check: the result should cover exactly as many users as `data`.
result['user_id'].nunique() == data['user_id'].nunique()

True

In [22]:
# Mean row-wise precision@k on the complete log. `data` contains the rows both
# models were fitted on, so this is an in-sample figure and is not comparable
# to the test_1 score above.
row_precision_full = result.apply(
    lambda row: precision_at_k(row['recomendations'], row['actual']),
    axis=1,
)
row_precision_full.mean()

0.947579031612645

In [23]:
# Remove the ground-truth column, presumably ahead of exporting the
# recommendations. Reassigning with errors='ignore' (instead of an in-place
# drop) keeps the cell idempotent: re-running it no longer raises KeyError
# once `actual` has already been removed.
result = result.drop(columns='actual', errors='ignore')

In [24]:
# result.to_csv('recommendations.csv', index=False)