In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from surprise.model_selection import KFold
from tqdm.notebook import tqdm
from sklearn.metrics import ndcg_score
from surprise import dump
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
from surprise import SVD
import random
import pickle
from copy import deepcopy

import typing as tp
from sklearn.preprocessing import normalize
from lightfm import LightFM
from scipy.sparse import coo_matrix
from lightfm.data import Dataset as LFMDataset
from collections import Counter
from matplotlib.lines import Line2D
import time
import optuna
import os

np.random.seed(42)



In [2]:
from Helper import Helper
from CatBoostWrapper import CatBoostWrapper
from LightFMWrapper import LightFMWrapper
from NNWrapper import NNWrapper
from SurpriseWrapper import SurpriseWrapper

In [3]:
df = pd.read_csv('data/train_joke_df.csv')

df['UID'] = df['UID'] - 1
df['JID'] = df['JID'] - 1
# сделаем сортировку и перепишем index
df = df.sort_values(by=['UID', 'JID'])
df = df.reset_index(drop=True)

df

df_train, df_test = train_test_split(df, test_size=0.5, random_state=42)

In [4]:
df

Unnamed: 0,UID,JID,Rating
0,0,0,-7.82
1,0,1,8.79
2,0,2,-9.66
3,0,3,-8.16
4,0,4,-7.52
...,...,...,...
1448359,24982,66,6.21
1448360,24982,67,7.48
1448361,24982,68,5.15
1448362,24982,70,6.26


In [5]:
wrappers ={
    'svd':SurpriseWrapper(model_name='05_baseline_svd_train', folder_name='models', df_train=df_train, n_recommendations=10), 
    'lfm_cos':LightFMWrapper(model_name='', folder_name='models', df_train=df_train, n_recommendations=10),           
    'catboost':CatBoostWrapper(model_name='catboost_05', folder_name='models', df_train=df_train, n_recommendations=10), 
    'knn':SurpriseWrapper(model_name='05_baseline_knn_train', folder_name='models', df_train=df_train, n_recommendations=10), 
    'nn_bias':NNWrapper(model_name='epoch=23-step=8496.ckpt', folder_name='models', df_train=df_train, n_recommendations=10)
}

In [10]:
%%time
candidates = {model_name: wrapper.predict() for model_name, wrapper in wrappers.items()}

CPU times: total: 5min 14s
Wall time: 5min 13s


In [11]:
n_users = 24983
n_items = 100

In [38]:
def rank_candidates_for_user(user: int, params: dict, param_new_user:dict, candidates: dict, joke_quality:dict,
                         joke_volume:dict, svd: SurpriseWrapper):
    
    user_id = user - 1
    rec_list = None
    if user_id < n_users:
        
        res = {}
        for i, (model_name, value) in enumerate(candidates.items()):
            rank = {x: params[model_name] * (10-j) for j, x in enumerate(value[user_id])}

            for k, v in rank.items():
                if k in res:
                    res[k] += v
                else:
                    res[k] = v
                    
        for k in res:
            res[k] += joke_quality[k] * params['quality']
            res[k] += joke_volume[k] * params['volume']

        rec_list = [x[0] for x in sorted(res.items(), key=lambda item: item[1], reverse=True)][:10]
        first_rating = svd.model.test([[user_id, rec_list[0], 0]])[0].est
        
    else:
        res = {}
                    
        for k in range(n_items):
            if joke_quality[k] < 0.00:# or joke_volume[k] < 0.1:
                continue
                
            res[k] = joke_quality[k] * param_new_user['quality']
            res[k] += joke_volume[k] * param_new_user['volume']

        xx = np.array(
            [[x[0], x[1]] for x in sorted(res.items(), key=lambda item: item[1], reverse=True)])
        
        pp = xx[:, 1]
        pp += np.abs(np.min(pp))
        rec_list = list(np.random.choice(xx[:, 0], size=10, replace=False, p=pp/np.sum(pp)))        
        first_rating = 0
            
    rec_list = [int(joke + 1) for joke in rec_list]
    return [[first_rating], rec_list]
        

In [8]:

with open('models/joke_quality.pkl', 'rb') as f:
    joke_quality = pickle.load(f)
    
with open('models/joke_volume.pkl', 'rb') as f:
    joke_volume = pickle.load(f)
    
    
params = {'svd': 0.7,
          'lfm': -0, 
          'lfm_cos': 0.8, 
          'nn': 0.0, 
          'nn_bias': -0.1, 
          'catboost': 0.1, 
          'knn': 0.1, 'quality': 4, 'volume': 4
         }

param_new_user = {'quality': -0.6, 'volume': 4.2}

In [16]:
svd = wrappers['svd']
svd

<SurpriseWrapper.SurpriseWrapper at 0x27b604c67d0>

In [21]:
svd.model.test([[0,0,0]])

[Prediction(uid=0, iid=0, r_ui=0, est=-7.689099516191514, details={'was_impossible': False})]

In [24]:
svd.model.test([[0,0,0]])[0].est

-7.689099516191514

In [None]:
svd.model.test([[0,0,0]])

In [46]:
df_users = pd.DataFrame(np.unique(df['UID']), columns =['UID'])
df_users

Unnamed: 0,UID
0,0
1,1
2,2
3,3
4,4
...,...
24978,24978
24979,24979
24980,24980
24981,24981


In [30]:
xx = get_predict_for_user(user_id=0, params=params, 
                     param_new_user=param_new_user, 
                     candidates=candidates, 
                     joke_quality=joke_quality,
                         joke_volume=joke_volume, svd=wrappers['svd'])

xx

[[7.638983634094388], [27, 2, 14, 60, 62, 41, 42, 44, 31, 3]]

In [36]:
xx = get_predict_for_user(user_id=200000, params=params, 
                     param_new_user=param_new_user, 
                     candidates=candidates, 
                     joke_quality=joke_quality,
                     joke_volume=joke_volume, svd=wrappers['svd'] )

xx

[[0], [6, 56, 34, 27, 23, 10, 12, 3, 96, 63]]

In [43]:
users = list(range(100))
users.extend(range(50000, 50100))
users

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 50000,
 50001,
 50002,
 50003,
 50004,
 50005,
 50006,
 50007,
 50008,
 50009,
 50010,
 50011,
 50012,
 50013,
 50014,
 50015,
 50016,
 50017,
 50018,
 50019,
 50020,
 50021,
 50022,
 50023,
 50024,
 50025,
 50026,
 50027,
 50028,
 50029,
 50030,
 50031,
 50032,
 50033,
 50034,
 50035,
 50036,
 50037,
 50038,
 50039,
 50040,
 50041,
 50042,
 50043,
 50044,
 50045,
 50046,
 50047,
 50048,
 50049,
 50050,
 50051,
 50052,
 50053,
 50054,
 50055,
 50056,
 50057,
 50058,
 50059,
 50060,
 50061,
 50062,
 50063

In [44]:
dff = pd.DataFrame(users, columns =['UID'])
dff

Unnamed: 0,UID
0,0
1,1
2,2
3,3
4,4
...,...
195,50095
196,50096
197,50097
198,50098


In [47]:
df_users['recommendations'] = df_users['UID'].apply(get_predict_for_user, args=(
            params,
            param_new_user,
            candidates,
            joke_quality,
            joke_volume,
            wrappers['svd']
        ))

df_users

Unnamed: 0,UID,recommendations
0,0,"[[2.5362366732061368], [36, 54, 47, 21, 6, 51,..."
1,1,"[[7.638983634094388], [27, 2, 14, 60, 62, 41, ..."
2,2,"[[8.761300821406051], [44, 100, 41, 85, 40, 92..."
3,3,"[[8.690320630376444], [35, 69, 49, 45, 53, 28,..."
4,4,"[[7.099995237568542], [68, 39, 14, 2, 65, 32, ..."
...,...,...
24978,24978,"[[5.547459599094481], [31, 54, 61, 29, 72, 5, ..."
24979,24979,"[[8.998359001792785], [93, 27, 82, 24, 86, 72,..."
24980,24980,"[[8.359799515337246], [3, 1, 52, 100, 25, 10, ..."
24981,24981,"[[6.10638263371094], [69, 42, 56, 29, 48, 23, ..."


In [48]:
output = df.merge(df_users)
output

Unnamed: 0,UID,JID,Rating,recommendations
0,0,0,-7.82,"[[2.5362366732061368], [36, 54, 47, 21, 6, 51,..."
1,0,1,8.79,"[[2.5362366732061368], [36, 54, 47, 21, 6, 51,..."
2,0,2,-9.66,"[[2.5362366732061368], [36, 54, 47, 21, 6, 51,..."
3,0,3,-8.16,"[[2.5362366732061368], [36, 54, 47, 21, 6, 51,..."
4,0,4,-7.52,"[[2.5362366732061368], [36, 54, 47, 21, 6, 51,..."
...,...,...,...,...
1448359,24982,66,6.21,"[[8.103408850889217], [29, 53, 32, 27, 50, 5, ..."
1448360,24982,67,7.48,"[[8.103408850889217], [29, 53, 32, 27, 50, 5, ..."
1448361,24982,68,5.15,"[[8.103408850889217], [29, 53, 32, 27, 50, 5, ..."
1448362,24982,70,6.26,"[[8.103408850889217], [29, 53, 32, 27, 50, 5, ..."


In [None]:
assert False

In [None]:
#wrappers['svd'].predict(df_test)

In [None]:
#wrappers['lfm_cos'].predict(df_test)

In [None]:
#wrappers['catboost'].predict(df_test)

In [None]:
#wrappers['knn'].predict(df_test)

In [None]:
#wrappers['nn_bias'].predict(df_test)

In [None]:
#predictions.keys()

In [None]:
import sklearn
sklearn.__version__

In [None]:
xx = torch.load(r'E:\competitions\kaggle\magnit_recsys-in-practice\part2\deploy\magnit_recsys-in-practice\models\epoch=23-step=8496.ckpt')
xx

In [None]:
model = wrappers['nn_bias'].model
model.load_state_dict(xx)
model.eval()

In [None]:
model = wrappers['nn_bias'].model

In [None]:
assert False

In [None]:
n_users=24983
n_items=100

In [None]:

test_set_all = np.zeros((n_users * n_items, 3))
for u in tqdm(range(n_users)):
    for j in range(n_items):
        test_set_all[(n_items * u) + j, 0] = u
        test_set_all[(n_items * u) + j, 1] = j
        
        
        
df_test = pd.DataFrame(test_set_all, columns=['UID', 'JID', 'Rating'])
df_test

df_test['UID'] = df_test['UID'].astype(int)
df_test['JID'] = df_test['JID'].astype(int)

In [None]:
train_df, valid_df = train_test_split(df, test_size=0.5, random_state=42)

In [None]:
#sw = SurpriseWrapper(model_name='05_baseline_svd_train', folder_name='', df_train=train_df, n_recommendations=10)
#pred = sw.predict(valid_df)
#display(pred)
#pred[pred['UID'] == 0]

In [None]:
#with open('predictions_df_surprise.pkl', 'rb') as f:
#    predictions_df_surprise = pickle.load(f)
    
#print(predictions_df_surprise.keys())

#predictions_df_surprise['svd'][predictions_df_surprise['svd']['UID'] == 0]

In [None]:
#nnw = NNWrapper(model_name='epoch=23-step=8496.ckpt', 
#    folder_name='E:\\competitions\\kaggle\\magnit_recsys-in-practice\\part2\\lightning_logs\\version_1\\checkpoints\\', 
#                df_train=train_df, n_recommendations=10)

#pred = nnw.predict(valid_df)
#display(pred)
#pred[pred['UID'] == 0]

In [None]:
#with open('predictions_df_nn_bias.pkl', 'rb') as f:
#    predictions_df_nn_bias = pickle.load(f)
    
#print(predictions_df_nn_bias.keys())

#predictions_df_nn_bias['nn_bias'][predictions_df_nn_bias['nn_bias']['UID'] == 0]

In [None]:
#cbw = CatBoostWrapper(model_name='catboost_05', 
#    folder_name='', 
#                df_train=train_df, n_recommendations=10)

#pred = cbw.predict(valid_df)
#display(pred)
#pred[pred['UID'] == 0]

In [None]:


lfmw = LightFMWrapper(model_name='', 
    folder_name='', 
                df_train=train_df, n_recommendations=10)

pred = lfmw.predict(valid_df)
display(pred)
pred[pred['UID'] == 0]

In [None]:
with open('predictions_df_lfm.pkl', 'rb') as f:
    predictions_df_lfm = pickle.load(f)
    
predictions_df_lfm.keys()

xx = list(predictions_df_lfm['lfm_cos'].groupby('UID').agg({'JID':list})['JID'].values)
xx

In [None]:
#with open('predictions_df_catboost.pkl', 'rb') as f:
#    predictions_df_catboost = pickle.load(f)
    
#print(predictions_df_surprise.keys())

#xx = list(predictions_df_catboost['catboost'].groupby('UID').agg({'JID':list})['JID'].values)
#xx

In [None]:
assert False

In [None]:

test_set = Helper().create_test_set(n_users, n_items)                
df_test = pd.DataFrame(test_set, columns=['UID', 'JID', 'Rating'])

df_test['UID'] = df_test['UID'].astype(int)
df_test['JID'] = df_test['JID'].astype(int)

data_module_test = ContextualRankerData(df_test, df_test, df_test, features = ["UID", "JID"])
data_module_test.prepare_data()
data_module_test.setup()

predictions = []
for x, y in data_module_test.test_dataloader():
    predict = nnw.model(x)
    predictions.extend(predict.cpu().detach().numpy())


df_test['Rating_pred'] = predictions


In [None]:

df_rec = Helper().filter_viewed_items(train_df, df_test, n_users, 10)


In [None]:
train_df

In [None]:

df_rec = Helper().filter_viewed_items(train_df, df_test, n_users, 10)
df_rec

In [None]:
def load_models(path_list):
    for model_name, file_name in path_list.items():
        if model_name == 'knn':
            _, algo_knn = dump.load(file_name)
        if model_name == 'svd':
            _, algo_svd = dump.load(file_name)

In [None]:
model_path = {
    #final
'svd':'svd', 'lfm_cos':'lfm_cos', 'nn_bias':'nn_bias', 'catboost':'catboost', 'knn': 'knn'    
}

In [None]:
from uuu import mapk_plot, mapk, calculate_serendipity_per_user, prediction_coverage, \
catalog_coverage, long_tail_plot, weight_by_volume, get_frame_from_prediction_list, novelty, get_value_popularity_ranks

In [None]:
fig = plt.figure(figsize=(15, 7))
volume_df = long_tail_plot(df=df, 
             item_id_column="JID", 
             interaction_type="joke ratings", 
             percentage=0.4,
             x_labels=False)

volume_df['weight'] = volume_df['percent_of_total_volume'].apply(weight_by_volume)
joke_volume = {}
for row in volume_df.itertuples():
    joke_volume[int(row.JID)] = row.weight

In [None]:
df_top_joke = df[df['Rating'] > 5].groupby('JID').agg('count').rename(columns={'Rating':'count_plus'}) \
.sort_values('count_plus', ascending=False).drop(columns=['UID'])


df_bottom_joke = df[df['Rating'] < -5].groupby('JID').agg('count').rename(columns={'Rating':'count_minus'}) \
.sort_values('count_minus', ascending=False).drop(columns=['UID'])


df_counts = df_bottom_joke.merge(df_top_joke, left_index=True, right_index=True)


df_counts['diff'] = df_counts['count_plus'] - df_counts['count_minus']
df_counts = df_counts.sort_values('diff', ascending=False)


df_counts['weight'] = df_counts['diff'] / 10000

joke_quality = {}
for row in df_counts.itertuples():
    joke_quality[row.Index] = row.weight

In [None]:

with open('joke_quality.pkl', 'wb') as f:
    pickle.dump(joke_quality, f)
    
with open('joke_volume.pkl', 'wb') as f:
    pickle.dump(joke_volume, f)

In [None]:
with open('predictions_df_lfm.pkl', 'rb') as f:
    predictions_df_lfm = pickle.load(f)
    
predictions_df_lfm.keys()

In [None]:
with open('predictions_df_nn.pkl', 'rb') as f:
    predictions_df_nn = pickle.load(f)
    
predictions_df_nn.keys()

In [None]:
with open('predictions_df_nn_bias.pkl', 'rb') as f:
    predictions_df_nn_bias = pickle.load(f)
    
predictions_df_nn_bias.keys()

In [None]:
with open('predictions_df_catboost.pkl', 'rb') as f:
    predictions_df_catboost = pickle.load(f)
    
predictions_df_catboost.keys()

In [None]:
#with open('predictions_df_catboost_doc2vec.pkl', 'rb') as f:
#    predictions_df_catboost_doc2vec = pickle.load(f)
    
#predictions_df_catboost_doc2vec.keys()

In [None]:
svd_df = predictions_df_surprise['svd']
knn_df = predictions_df_surprise['knn']
lfm_df = predictions_df_lfm['lfm']
lfm_cos_df = predictions_df_lfm['lfm_cos']
nn_df = predictions_df_nn['nn']
nn_bias_df = predictions_df_nn_bias['nn_bias']
catboost_df = predictions_df_catboost['catboost']
#catboost_doc2vec_df = predictions_df_catboost_doc2vec['catboost_doc2vec']
actual_df = predictions_df_surprise['actual']

In [None]:
frames = {'svd':svd_df, 'lfm':lfm_df, 'lfm_cos':lfm_cos_df, 'nn':nn_df, 
          'catboost':catboost_df, 'knn':knn_df,  'nn_bias':nn_bias_df}#'catboost_doc2vec':catboost_doc2vec_df,

predictions = {k:list(v.groupby('UID').agg({'JID':list})['JID'].values) for k, v in frames.items()}
actual = list(predictions_df_surprise['actual'].groupby('UID').agg({'JID':list})['JID'].values)

In [None]:
users = df["UID"].unique()
n_users = users.size
n_users

In [None]:
# рекомендации по популярным 
#popularity_recs = df['JID'].value_counts().head(10).index.tolist()

#pop_recs = []
#for user in tqdm(range(n_users)):
#    pop_predictions = popularity_recs
#    pop_recs.append(pop_predictions)
    
    
#predictions['popular'] = pop_recs

In [None]:
pop = dict(df.JID.value_counts())

In [None]:
items = np.unique(df['JID'])
items

In [None]:
def union_predicts(params):
    result = []

    for u in range(24983):
        res = {}
        for i, (key, value) in enumerate(predictions.items()):
            rank = {x: params[key] * (10-j) for j, x in enumerate(value[u])}

            for k, v in rank.items():
                if k in res:
                    res[k] += v
                else:
                    res[k] = v
                    
        for k in res:
            res[k] += joke_quality[k] * params['quality']
            res[k] += joke_volume[k] * params['volume']

        result.append([x[0] for x in sorted(res.items(), key=lambda item: item[1], reverse=True)][:10])
        
    return result

In [None]:
def main_metric(params):
    t1 = time.time()
    result = union_predicts(params)
    res_df = get_frame_from_prediction_list(users, result)


    cat_coverage = catalog_coverage(result, items, 100)
    pred_coverage = prediction_coverage(result, items)
    nov, mselfinfo_list = novelty(result, pop, len(users), 10)

    res_df = get_frame_from_prediction_list(users, result)
    serendip = calculate_serendipity_per_user(res_df, df_train, df_test)
    map1 = mapk(actual, result, k=1)
    map10 = mapk(actual, result, k=10)
    
    price = np.mean([sum([joke_volume[x] for x in xx]) for xx in result])
    
    sm = sum([cat_coverage / 100, pred_coverage / 100, nov, serendip, map1, map10, price / 3])
    print('time',round(time.time() - t1, 3), 
        'cat_cov',  round(cat_coverage, 3), 
         'cov', round(pred_coverage, 3),
         'nov', round(nov, 3),
         'ser', round(serendip, 3),
        'm1',  round(map1 , 3),
         'm10', round(map10, 3),
         'price', round(price, 3),
         'sum', round(sm, 3))
    return sm

In [None]:
main_metric({'svd': 0.1, 
              'lfm': 0.1, 
              'lfm_cos': 0.1, 
              'nn': 0.1, 
              'nn_bias': 0.1, 
              'catboost': 0.1, 
              'knn': 0.1, 
              'catboost_doc2vec': 0.1,
              'quality': 0.1,
              'volume': 0.1
             })

In [None]:
result = union_predicts({'svd': 0.1, 
              'lfm': 0.1, 
              'lfm_cos': 0.1, 
              'nn': 0.1, 
              'nn_bias': 0.1, 
              'catboost': 0.1, 
              'knn': 0.1, 
              'catboost_doc2vec': 0.1,
              'quality': 0.1,
              'volume': 0.1
             })

In [None]:
def objective(trial):    
    
    params = {'svd': trial.suggest_float(name='svd', low=-5, high=5, step=0.1), 
              'lfm': trial.suggest_float(name='lfm', low=-5, high=5, step=0.1), 
              'lfm_cos': trial.suggest_float(name='lfm_cos', low=-5, high=2, step=0.1), 
              'nn': trial.suggest_float(name='nn', low=-5, high=5, step=0.1), 
              'nn_bias': trial.suggest_float(name='nn_bias', low=-5, high=5, step=0.1), 
              'catboost': trial.suggest_float(name='catboost', low=-5, high=2, step=0.1), 
              'knn': trial.suggest_float(name='knn', low=-5, high=5, step=0.1), 
              #'catboost_doc2vec': trial.suggest_float(name='catboost_doc2vec', low=-5, high=5, step=0.1),
              'quality': trial.suggest_float(name='quality', low=-5, high=5, step=0.1),
              'volume': trial.suggest_float(name='volume', low=-5, high=5, step=0.1)
             }
    
    return main_metric(params)
    


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20000)


In [None]:
print("Value:", study.best_value)
print("Parameters:", study.best_params)

In [None]:
main_metric(
{
    
'svd': 0.7, 'lfm': -0.2, 'lfm_cos': 0.8, 'nn': 0.0, 'nn_bias': -0.1, 'catboost': 0.1, 'knn': 0.1, 'quality': 4.6, 'volume': 3.90
})

In [None]:
main_metric(
{
    
'svd': 0.7, 'lfm': -0, 'lfm_cos': 0.8, 'nn': 0.0, 'nn_bias': -0.1, 'catboost': 0.1, 'knn': 0.1, 'quality': 4.6, 'volume': 3.90
})

In [None]:
main_metric(
{
    
'svd': 0.7, 'lfm': -0, 'lfm_cos': 0.8, 'nn': 0.0, 'nn_bias': -0.1, 'catboost': 0.1, 'knn': 0.1, 'quality': 2, 'volume': 2
})

In [None]:
main_metric(
{
    #final
'svd': 0.7, 'lfm': -0, 'lfm_cos': 0.8, 'nn': 0.0, 'nn_bias': -0.1, 'catboost': 0.1, 'knn': 0.1, 'quality': 4, 'volume': 4
})

In [None]:
main_metric(
{
    
'svd': 0.1, 'lfm': -0.1, 'lfm_cos': 0.1, 'nn': -3.8, 'nn_bias': -3.7, 'catboost': -3.9, 'knn': 0.0, 'quality': 4.8, 'volume': 3.3
})