# Финальный проект

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Бустинг
from lightgbm import LGBMRanker
from lightgbm import LGBMClassifier

# Самостоятельно написанные классы и  функции 
from src.metrics import precision_at_k
from src.recommenders import MainRecommender
from src.utils import prefilter_items

In [69]:
data_train = pd.read_csv('./retail_train.csv')
data_test = pd.read_csv('./retail_test1.csv')

item_features = pd.read_csv('./product.csv')
user_features = pd.read_csv('./hh_demographic.csv')

In [70]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [71]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

# берем данные для тренировки matching модели
data_train_matcher = data_train[data_train['week_no'] < data_train['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data_train[(data_train['week_no'] >= data_train['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data_train['week_no'] < data_train['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data_train[data_train['week_no'] >= data_train['week_no'].max() - VAL_RANKER_WEEKS]

In [72]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
Decreased # items from 83685 to 5001


In [73]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

In [74]:
ACTUAL_COL = 'actual'
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [75]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]

In [98]:
# weighting = [None, 'bm25', 'tfidf']
weighting = ['bm25', 'tfidf']
n_candidates = 50

### Проверим модели матчинга

In [99]:
%%time
for w in weighting:
    print(f'weighting: {w}...')
    recommender = MainRecommender(data_train_matcher, w)

    result_eval_matcher[f'own_rec {w}'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=n_candidates))
    result_eval_matcher[f'sim_item_rec {w}'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=n_candidates))
    result_eval_matcher[f'als_rec {w}'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=n_candidates))
    result_eval_matcher[f'sim_user_rec {w}'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_users_recommendation(x, N=n_candidates))

weighting: bm25...


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/4999 [00:00<?, ?it/s]

weighting: tfidf...


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/4999 [00:00<?, ?it/s]

CPU times: user 13min 5s, sys: 5min 57s, total: 19min 3s
Wall time: 6min 34s


In [78]:
TOP_K_PRECISION = 5

In [79]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [100]:
sorted(calc_precision(result_eval_matcher, TOP_K_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec bm25', 0.18872062663185377),
 ('als_rec tfidf', 0.14788511749347258),
 ('own_rec tfidf', 0.13357702349869452),
 ('als_rec bm25', 0.1296083550913838),
 ('sim_item_rec bm25', 0.06558746736292428),
 ('sim_item_rec tfidf', 0.06015665796344648),
 ('sim_user_rec tfidf', 0.04835509138381201),
 ('sim_user_rec bm25', 0.012845953002610967)]

Будем использовать own recommendation. На самом деле, я также проверял вариант без взвешивания weighting = None и он показал не самые плохие результаты (но не лучшие), но он ОЧЕНЬ  долго считается - около 7-8 часов, не знаю, с чем это связано. Когда оформлял ноутбук начисто - исключил этот вариант, чтобы не тратить время

In [81]:
recommender = MainRecommender(data_train_matcher, 'bm25')
items_embendings, user_embedings = recommender.get_embeddings()
top_popular = recommender.get_top_popular()
candidates = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=n_candidates))

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/4999 [00:00<?, ?it/s]

### Модель ранжирования

In [82]:
def get_extended_item_features(data_train_ranker, item_features, items_embendings):
    new_features = item_features.merge(data_train_ranker, on='item_id', how='left')

    item_features = item_features.merge(items_embendings, how='left')

    rare_manufacturer = item_features['manufacturer'] \
        .value_counts()[item_features['manufacturer'].value_counts() < 50].index
    item_features.loc[item_features['manufacturer'].isin(rare_manufacturer), 'manufacturer'] = 999999999
    item_features['manufacturer'] = item_features.manufacturer.astype('object')


    mean_disc = new_features \
        .groupby('item_id')['coupon_disc'] \
        .mean() \
        .reset_index() \
        .sort_values('coupon_disc')
    item_features = item_features.merge(mean_disc, on='item_id', how='left')    


    items_in_department = new_features \
        .groupby('department')['item_id'] \
        .count() \
        .reset_index() \
        .sort_values('item_id', ascending=False)
    items_in_department.rename(columns={'item_id': 'items_in_department'}, inplace=True)

    sales_count_per_dep = new_features \
        .groupby(['department'])['quantity'] \
        .count() \
        .reset_index() \
        .sort_values('quantity', ascending=False)
    sales_count_per_dep.rename(columns={'quantity': 'sales_count_per_dep'}, inplace=True)

    items_in_department = items_in_department.merge(sales_count_per_dep, on='department')
    items_in_department['qnt_of_sales_per_item_per_dep_per_week'] = (
            items_in_department['sales_count_per_dep'] /
            items_in_department['items_in_department'] /
            new_features['week_no'].nunique()
    )
    items_in_department = items_in_department.drop(['items_in_department'], axis=1)
    item_features = item_features.merge(items_in_department, on=['department'], how='left')

    item_qnt = new_features \
        .groupby(['item_id'])['quantity'] \
        .count() \
        .reset_index()
    item_qnt.rename(columns={'quantity': 'quantity_of_sales'}, inplace=True)

    item_qnt['sales_count_per_week'] = item_qnt['quantity_of_sales'] / new_features['week_no'].nunique()
    item_features = item_features.merge(item_qnt, on='item_id', how='left')

    items_in_department = new_features \
        .groupby('sub_commodity_desc')['item_id'] \
        .count() \
        .reset_index() \
        .sort_values('item_id', ascending=False)
    items_in_department.rename(columns={'item_id': 'items_in_sub_commodity_desc'}, inplace=True)

    sales_count_per_dep = new_features \
        .groupby(['sub_commodity_desc'])['quantity'] \
        .count() \
        .reset_index() \
        .sort_values('quantity', ascending=False)
    sales_count_per_dep.rename(columns={'quantity': 'qnt_of_sales_per_sub_commodity_desc'}, inplace=True)

    items_in_department = items_in_department.merge(sales_count_per_dep, on='sub_commodity_desc')
    items_in_department['qnt_of_sales_per_item_per_sub_commodity_desc_per_week'] = (
            items_in_department['qnt_of_sales_per_sub_commodity_desc'] /
            items_in_department['items_in_sub_commodity_desc'] /
            new_features['week_no'].nunique()
    )
    items_in_department = items_in_department.drop(['items_in_sub_commodity_desc'], axis=1)
    item_features = item_features.merge(items_in_department, on=['sub_commodity_desc'], how='left')

    return item_features

In [83]:
def get_extended_user_features(data_train_ranker, user_features, user_embedings):
    data_train_ranker['price'] = data_train_ranker['sales_value'] / data_train_ranker['quantity']
    new_user_features = user_features.merge(data_train_ranker, on='user_id', how='left')

    user_features = user_features.merge(user_embedings, how='left')

    time = new_user_features \
        .groupby('user_id')['trans_time'] \
        .mean() \
        .reset_index()
    time.rename(columns={'trans_time': 'mean_time'}, inplace=True)
    time = time.astype(np.float32)
    user_features = user_features.merge(time, how='left')


    # Age
    user_features['age'] = user_features['age_desc'].replace({
        '65+'  : 70, 
        '45-54': 50, 
        '25-34': 30, 
        '35-44': 40, 
        '19-24': 20, 
        '55-64': 60}
    )
    user_features = user_features.drop('age_desc', axis=1)


    # Income
    user_features['income'] = user_features['income_desc'].replace({
        '35-49K'   : 45,
        '50-74K'   : 70,
        '25-34K'   : 30,
        '75-99K'   : 95,
        'Under 15K': 15,
        '100-124K' : 120,
        '15-24K'   : 20,
        '125-149K' : 145,
        '150-174K' : 170,
        '250K+'    : 250,
        '175-199K' : 195,
        '200-249K' : 245}
    )
    user_features = user_features.drop('income_desc', axis=1)


    # Childrens
    user_features['children'] = 0
    user_features.loc[(user_features['kid_category_desc'] == '1'), 'children'] = 1
    user_features.loc[(user_features['kid_category_desc'] == '2'), 'children'] = 2
    user_features.loc[(user_features['kid_category_desc'] == '3'), 'children'] = 3
    user_features = user_features.drop('kid_category_desc', axis=1)


    # Средний чек, средний чек в неделю
    basket = new_user_features \
        .groupby(['user_id'])['price'] \
        .sum() \
        .reset_index()

    baskets = new_user_features.groupby('user_id')['basket_id'] \
        .count()\
        .reset_index()
    baskets.rename(columns={'basket_id': 'baskets'}, inplace=True)

    avg_basket = basket.merge(baskets)

    avg_basket['avg_basket'] = avg_basket['price'] / avg_basket['baskets']
    avg_basket['sum_per_week'] = avg_basket['price'] / new_user_features['week_no'].nunique()

    avg_basket = avg_basket.drop(['price', 'baskets'], axis=1)
    user_features = user_features.merge(avg_basket, how='left')

    return user_features

In [84]:
def get_ranker_targets(data_train_ranker, data_train_matcher, candidates, N):
    users_ranker = pd.DataFrame(data_train_ranker['user_id'].unique())

    users_ranker.columns = ['user_id']

    train_users = data_train_matcher['user_id'].unique()
    users_ranker = users_ranker[users_ranker['user_id'].isin(train_users)]

    # Рекомендации на основе собственных покупок
    users_ranker = users_ranker.copy()
    users_ranker['candidates'] = candidates

    s = users_ranker.apply(
        lambda x: pd.Series(x['candidates']), axis=1
    ).stack().reset_index(level=1, drop=True)

    s.name = 'item_id'

    users_ranker = users_ranker.drop('candidates', axis=1).join(s)

    users_ranker['flag'] = 1


    ranker_targets = data_train_ranker[['user_id', 'item_id']].copy()

    ranker_targets['target'] = 1 

    ranker_targets = users_ranker.merge(ranker_targets, on=['user_id', 'item_id'], how='left')

    ranker_targets['target'].fillna(0, inplace=True)
    ranker_targets.drop('flag', axis=1, inplace=True)

    return ranker_targets

In [85]:
def get_extended_user_item_features(data_train_ranker, data_train_matcher, candiadates, item_features, user_features, items_embendings, user_embedings, N=50):

    target = get_ranker_targets(data_train_ranker, data_train_matcher, candiadates, N)
    user_features = get_extended_user_features(data_train_ranker, user_features, user_embedings)
    item_features = get_extended_item_features(data_train_ranker, item_features, items_embendings)
    item_features = data_train_ranker.merge(item_features, on='item_id', how='left')

    new_data = item_features.merge(user_features, on='user_id', how='left')

    count_perch = new_data\
        .groupby(['user_id', 'commodity_desc', 'week_no']) \
        .agg({'quantity': 'mean'}) \
        .reset_index() \
        .rename(columns={'quantity': 'count_purchases_week_dep'})

    mean_count_perch = new_data.groupby(['commodity_desc', 'week_no']) \
        .agg({'quantity': 'sum'}) \
        .reset_index() \
        .rename(columns=({'quantity': 'mean_count_purchases_week_dep'}))

    coef = count_perch.merge(mean_count_perch, on=['commodity_desc', 'week_no'], how='left')
    coef['count_purchases_week_mean'] = coef['count_purchases_week_dep'] / coef['mean_count_purchases_week_dep']
    coef = coef[['user_id', 'commodity_desc', 'count_purchases_week_mean']]

    temp = coef\
        .groupby(['user_id', 'commodity_desc']) \
        .agg({'count_purchases_week_mean': 'mean'}) \
        .reset_index()

    new_data = new_data.merge(temp, on=['user_id', 'commodity_desc'], how='left')

    count_perch = new_data \
        .groupby(['user_id', 'commodity_desc', 'week_no']) \
        .agg({'price': 'sum'}) \
        .reset_index() \
        .rename(columns={'price': 'price_week'})

    mean_count_perch = new_data \
        .groupby(['commodity_desc', 'week_no'])\
        .agg({'price': 'sum'}) \
        .reset_index() \
        .rename(columns=({'price': 'mean_price_week'}))

    coef = count_perch.merge(mean_count_perch, on=['commodity_desc', 'week_no'], how='left')
    coef['sum_purchases_week_mean'] = coef['price_week'] / coef['mean_price_week']
    coef = coef[['user_id', 'commodity_desc', 'sum_purchases_week_mean']]

    temp = coef \
        .groupby(['user_id', 'commodity_desc']) \
        .agg({'sum_purchases_week_mean': 'mean'}) \
        .reset_index()

    new_data = new_data.merge(temp, on=['user_id', 'commodity_desc'], how='left')

    new_data = new_data.merge(target, on=['item_id', 'user_id'], how='left')
    new_data = new_data.fillna(0)

    return new_data

In [86]:
train = get_extended_user_item_features(data_train_ranker, data_train_matcher, candidates, item_features, user_features, items_embendings, user_embedings, n_candidates)
train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,19_y,mean_time,age,income,children,avg_basket,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,...,2.893883,1274.421509,50.0,70.0,0.0,2.290045,77.86153,0.000685,0.002868,0.0
1,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002571,0.00263,0.0
2,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002721,0.002794,0.0
3,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003986,0.005455,0.0
4,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01137,0.007395,0.0


In [87]:
X_train = train.drop(['target'], axis=1)
y_train = train[['target']]

In [88]:
cat_features = []
for col in X_train.columns:
    if(X_train[col].dtype == np.object):
          cat_features.append(col)
            
X_train[cat_features + ['user_id', 'item_id']] = X_train[cat_features + ['user_id', 'item_id']].astype('category')

In [89]:
test = get_extended_user_item_features(data_test, data_train_matcher, candidates, item_features, user_features, items_embendings, user_embedings, n_candidates)
X_test = test.drop(['target'], axis=1)
y_test = test[['target']]
X_test[cat_features + ['user_id', 'item_id']] = X_test[cat_features + ['user_id', 'item_id']].astype('category')

In [90]:
def get_important_features(model, X_train, y_train):
    model.fit(X_train, y_train)
    feature = list(zip(X_train.columns.tolist(), model.feature_importances_))
    feature = pd.DataFrame(feature, columns=['feature', 'value'])
    features = feature.loc[feature.value > 0, 'feature'].tolist()
    return features

In [91]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_features)
important_features = get_important_features(lgb, X_train, y_train)

  return f(*args, **kwargs)


In [92]:
lgb = LGBMClassifier(
    objective='binary',
    max_depth=7,
    categorical_feature=cat_features
)
lgb.fit(X_train[important_features], y_train)

  return f(*args, **kwargs)
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


LGBMClassifier(categorical_feature=['department', 'brand', 'commodity_desc',
                                    'sub_commodity_desc',
                                    'curr_size_of_product',
                                    'marital_status_code', 'homeowner_desc',
                                    'hh_comp_desc', 'household_size_desc'],
               max_depth=7, objective='binary')

In [93]:
preds = lgb.predict(X_test[important_features])
test_preds_proba = lgb.predict_proba(X_test[important_features])[:, 1]

In [94]:
def get_final_recomendations(X_test, test_preds_proba, data_train, top_popular, item_features):
    X_test['predict_proba'] = test_preds_proba
    X_test.sort_values(['user_id', 'predict_proba'], ascending=False, inplace=True)

    recomendations = []
    for user, preds in X_test.groupby('user_id')['item_id']:
        recomendations.append({'user_id': user, 'recomendations': preds.tolist()})
    recomendations = pd.DataFrame(recomendations)

    result = data_train.groupby('user_id')['item_id'].unique().reset_index()
    result.columns = ['user_id', 'actual']

    result = result.merge(recomendations, how='left')
    
    result.loc[result['recomendations'].isnull(), 'recomendations'] = pd.Series( [top_popular] * len(result))

    return result

In [95]:
result = get_final_recomendations(X_test, test_preds_proba, data_train, top_popular, item_features)

In [97]:
test_score = result.apply(lambda row: precision_at_k(row['recomendations'], row['actual']), axis=1).mean()
test_score

0.4720488195278111

По-моему, получился слишком хороший результат. Возможно я что-то упустил и не так посчитал