In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from lib import *
from classes import *
from lightfm import LightFM
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the three input tables from data/ with the project helper get_data
# (available through the star-imports above).
# NOTE(review): presumably users, organisations and reviews — confirm in lib.
users, orgs, reviews = get_data('data/')

In [None]:
# Split the reviews into a training part and a held-out part.
# This call uses the train_test_split that the star-imports above provide; a
# later cell imports sklearn's train_test_split under the same name, so after
# that cell has run the name refers to sklearn's function instead.
# NOTE(review): 1116 is presumably a cut-off on the `ts` column — confirm in lib.
train_r, test_r = train_test_split(reviews, 1116)
# X_test is later passed to CatBoostSolver.predict and y_test to MNAP_N.
X_test, y_test = process_reviews(test_r)
# Second cut inside the training part: rows with ts <= g go to train_r1,
# rows with ts > g go to train_r2.
g = 1110
train_r1, train_r2 = train_r[train_r['ts'] <= g], train_r[train_r['ts'] > g]
# Number of most-reviewed organisations per city that get_comb_top_orgs adds
# as extra candidates.
CNT_TOP = 30

In [None]:
import os.path
from pandas.util import hash_pandas_object
from sklearn.model_selection import train_test_split
from catboost import CatBoostRanker, Pool, MetricVisualizer
import pickle

def predict_to_pandas(predictions):
    """Flatten per-user recommendation lists into one row per (user, org).

    Args:
        predictions: DataFrame with a ``user_id`` column and a ``target``
            column holding, for each user, an ordered sequence of org ids.

    Returns:
        DataFrame with columns ``user_id``, ``org_id`` and ``pos_lfm``, where
        ``pos_lfm`` is the 1-based position of the org in the user's list.
    """
    # Iterate the two needed columns directly instead of iterrows(): no
    # per-row Series is built, and passing `total` lets tqdm show real
    # progress (a bare generator has no length).
    user_targets = tqdm(
        zip(predictions['user_id'], predictions['target']),
        total=len(predictions),
    )
    rows = [
        (user_id, org_id, position)
        for user_id, targets in user_targets
        for position, org_id in enumerate(targets, start=1)
    ]
    return pd.DataFrame(rows, columns=['user_id', 'org_id', 'pos_lfm'])

def get_comb_top_orgs(_users, prediction, k_r, cnt_top=None):
    """Pair every predicted user with the most-reviewed orgs for their city.

    For each of the cities ``'msk'`` and ``'spb'`` the orgs are ranked by the
    number of rows in ``k_r`` whose ``user_city`` equals that city, the first
    ``cnt_top`` are kept, and they are cross-joined with the users of that
    city that occur in ``prediction``.

    Args:
        _users: DataFrame with ``user_id`` and ``city`` columns.
        prediction: DataFrame with a ``user_id`` column; only users present
            here are included in the result.
        k_r: DataFrame with ``org_id``, ``user_city`` and ``rating`` columns.
        cnt_top: How many orgs to keep per city. ``None`` (the default) uses
            the notebook-level constant ``CNT_TOP``.

    Returns:
        DataFrame with columns ``user_id`` and ``org_id``.
    """
    if cnt_top is None:
        cnt_top = CNT_TOP

    # Restrict the user table to users that actually appear in `prediction`.
    predicted_users = pd.DataFrame(prediction['user_id'].unique(), columns=['user_id'])
    users = _users.merge(predicted_users, on='user_id', how='right')

    per_city = []
    for city in ('msk', 'spb'):
        city_reviews = k_r[k_r['user_city'] == city][['org_id', 'rating']]
        # count() per org gives the number of reviews; keep the top ones.
        top_orgs = (
            city_reviews.groupby(by='org_id').count().reset_index()
            .sort_values(by='rating', ascending=False)[['org_id']][:cnt_top]
        )
        city_users = users[users['city'] == city][['user_id']]
        per_city.append(city_users.merge(top_orgs, how='cross'))

    return pd.concat(per_city)

def pred_to_proc(_users, _orgs, _predictions, r1, r2, topklfm):
    """Build the candidate table (features plus labels) used by the ranker.

    Args:
        _users: DataFrame with ``user_id`` and ``city`` columns.
        _orgs: DataFrame with an ``org_id`` column.
        _predictions: DataFrame with ``user_id``, ``org_id`` and ``pos_lfm``.
        r1, r2: Review frames with ``user_id``, ``org_id``, ``rating``,
            ``user_city`` and ``org_city`` columns.
        topklfm: Candidates that have no ``pos_lfm`` get ``topklfm + 1``.

    Returns:
        DataFrame sorted by ``user_id`` holding the columns of
        ``_predictions`` plus ``rating`` (taken from ``r2``, 0 when absent),
        ``user_city`` and ``cnt_pos``.

    NOTE(review): ``cnt_pos`` and the popular-org candidates are computed from
    ``r1`` and ``r2`` together while ``rating`` comes from ``r2`` — possible
    target leakage when this table is used for training; confirm intent.
    """
    # Private copies: the caller's frames are never modified.
    users, orgs, predictions = _users.copy(), _orgs.copy(), _predictions.copy()

    print("Proc creating... ", end="")

    # Reviews where the user's city differs from the org's city, rated >= 4.
    all_reviews = pd.concat([r1, r2])
    other_city = all_reviews[all_reviews['user_city'] != all_reviews['org_city']]
    positive = other_city[other_city['rating'] >= 4]

    # Candidates = given predictions plus the per-city popular orgs.
    popular = get_comb_top_orgs(users, predictions, positive[['org_id', 'user_city', 'rating']])
    candidates = predictions.merge(popular, on=['user_id', 'org_id'], how='outer')
    candidates['pos_lfm'] = candidates['pos_lfm'].fillna(topklfm + 1)

    # Attach the rating from r2; pairs without a review get 0.
    r2_ratings = r2[['user_id', 'org_id', 'rating']]
    proc = candidates.merge(r2_ratings, on=['user_id', 'org_id'], how='left')
    proc['rating'] = proc['rating'].fillna(0)

    # Feature: the user's city.
    user_cities = users[['user_id', 'city']]
    proc = proc.merge(user_cities, on=['user_id']).rename(columns={'city': 'user_city'})

    # Feature: number of positive other-city reviews per org (0 when none).
    positive_counts = (
        positive[['org_id', 'rating']]
        .groupby(by="org_id")
        .count()
        .rename(columns={'rating': 'cnt_pos'})
    )
    orgs = orgs.merge(positive_counts, on="org_id", how='left')
    orgs['cnt_pos'] = orgs['cnt_pos'].fillna(0)

    proc = proc.merge(orgs[['org_id', 'cnt_pos']], on=['org_id'])
    proc = proc.sort_values(by=['user_id'])

    print("Finish")

    return proc

class CatBoostSolver:
    """Two-stage recommender: LightFM candidates re-ranked by a CatBoostRanker.

    ``fit`` trains a ``SplitLightFMSolver`` on the first review slice, lets it
    propose candidates for the users of the second slice, and trains a
    ``CatBoostRanker`` on those candidates with labels taken from the second
    slice. Intermediate artefacts are cached in the ``tmp/`` directory.
    """

    def __init__(self, users, orgs):
        """Keep private copies of the tables with ids replaced by mapped ids.

        Args:
            users: DataFrame with at least ``user_id`` and ``city`` columns.
            orgs: DataFrame with at least an ``org_id`` column.
        """
        self.users = users.copy()
        self.orgs = orgs.copy()
        # Forward (ctoi) and inverse (itoc) id mappings from create_mappings.
        self.org_ctoi, self.org_itoc = create_mappings(self.orgs['org_id'])
        self.user_ctoi, self.user_itoc = create_mappings(self.users['user_id'])
        self.users['user_id'] = self.users['user_id'].map(self.user_ctoi)
        self.orgs['org_id'] = self.orgs['org_id'].map(self.org_ctoi)

    def _make_pool(self, frame):
        """Build a CatBoost Pool (features, binary label, user groups) from candidate rows."""
        return Pool(
            data=frame[self.features_to_catboost].values,
            # A candidate is relevant when its rating is at least 4.
            label=1 * (frame['rating'].values >= 4),
            group_id=frame['user_id'].values,
            cat_features=self.cat_features
        )

    def fit(self, _r1, _r2, topklfm=100, test_size=0.5, task_type='GPU'):
        """Train the LightFM candidate generator and the CatBoost ranker.

        Args:
            _r1: Reviews used to fit LightFM.
            _r2: Reviews whose users receive candidates and whose ratings
                label those candidates for the ranker.
            topklfm: Number of LightFM candidates requested per user.
            test_size: Share of users held out as the ranker's eval set
                (forwarded to sklearn's ``train_test_split``). A value
                ``<= 0`` trains on all users without an eval set.
            task_type: CatBoost ``task_type``; defaults to ``'GPU'``.
        """
        # Imported locally under an alias: an earlier cell calls a different
        # `train_test_split` (from the star-imported project modules), so the
        # bare global name depends on which cell ran last.
        from sklearn.model_selection import train_test_split as sk_train_test_split

        self.topklfm = topklfm

        r1 = _r1.copy()
        r2 = _r2.copy()
        r1['user_id'] = r1['user_id'].map(self.user_ctoi)
        r2['user_id'] = r2['user_id'].map(self.user_ctoi)
        r1['org_id'] = r1['org_id'].map(self.org_ctoi)
        r2['org_id'] = r2['org_id'].map(self.org_ctoi)
        self.r1 = r1
        self.r2 = r2

        # Cache key. It is built only from the (user_id, org_id) columns of
        # both slices and topklfm; the XOR makes it symmetric in r1/r2.
        arg_hash = str(abs(hash_pandas_object(r1[['user_id', 'org_id']]).sum() ^ hash_pandas_object(r2[['user_id', 'org_id']]).sum() ^ topklfm))

        # The cache directory must exist before anything is written to it.
        os.makedirs('tmp', exist_ok=True)

        path_lfm = 'tmp/lmf_' + arg_hash + '.pkl'
        if os.path.exists(path_lfm):
            # NOTE: pickle.load can execute arbitrary code; this is only
            # acceptable because the file is a cache written by this notebook.
            with open(path_lfm, 'rb') as f:
                self.lfm, lfm_predictions = pickle.load(f)
        else:
            self.lfm = SplitLightFMSolver(self.users, self.orgs)
            self.lfm.fit(r1, min_pos_rating=5)
            self.lfm.fit_partial(5)
            lfm_predictions = self.lfm.predict(pd.DataFrame(r2['user_id'].unique(), columns=['user_id']), topk=topklfm)
            with open(path_lfm, 'wb') as f:
                pickle.dump((self.lfm, lfm_predictions), f)

        path_train = 'tmp/train_' + arg_hash + '.csv'
        if os.path.exists(path_train):
            lfm_predictions_pd = pd.read_csv(path_train)
        else:
            lfm_predictions_pd = predict_to_pandas(lfm_predictions)
            lfm_predictions_pd.to_csv(path_train, index=False)

        path_proc = 'tmp/proc_' + arg_hash + '.csv'
        if os.path.exists(path_proc):
            proc = pd.read_csv(path_proc)
        else:
            proc = pred_to_proc(self.users, self.orgs, lfm_predictions_pd, r1, r2, topklfm)
            proc.to_csv(path_proc, index=False)

        print("Pools creating... ", end="")
        self.features_to_catboost = ['user_city', 'pos_lfm', 'cnt_pos']
        # Index into features_to_catboost: 'user_city' is categorical.
        self.cat_features = [0]

        user_id = proc['user_id'].unique()
        if test_size > 0:
            # Split by user so a user's candidates never straddle both pools.
            user_id_train, user_id_test = sk_train_test_split(user_id, test_size=test_size, random_state=42, shuffle=False)
            self.train = self._make_pool(proc[proc['user_id'].isin(user_id_train)])
            self.test = self._make_pool(proc[proc['user_id'].isin(user_id_test)])
        else:
            # No hold-out requested: skip the split call and train on all users.
            self.train = self._make_pool(proc)
            self.test = None
        print("Finish")

        self.model = CatBoostRanker(
            loss_function='YetiRankPairwise',
            learning_rate=0.05,
            depth=6,
            min_data_in_leaf=80,
            eval_metric='MAP:top=20',
            train_dir='tmp/f_'+arg_hash,
            verbose=False,
            random_seed=42,
            iterations=1000,
            task_type=task_type
        )

        if self.test is not None:
            self.model.fit(self.train, eval_set=self.test, verbose=True)
        else:
            self.model.fit(self.train, verbose=True)

    def predict(self, _X_test, path=None, topk=N):
        """Recommend up to ``topk`` orgs per user, ordered by ranker score.

        Args:
            _X_test: DataFrame with a ``user_id`` column holding raw user ids.
            path: When not ``None``, the predictions are also passed to
                ``save_predictions_to_file`` with this path.
            topk: Maximum number of orgs per user. The default ``N`` is a
                name expected from the notebook's imports — TODO confirm.

        Returns:
            DataFrame with ``user_id`` (raw ids) and ``target`` (list of raw
            org ids, best first).
        """
        X_test = _X_test.copy()
        X_test['user_id'] = X_test['user_id'].map(self.user_ctoi)

        lfm_predictions = self.lfm.predict(X_test, topk=self.topklfm)
        lfm_predictions_pd = predict_to_pandas(lfm_predictions)
        proc = pred_to_proc(self.users, self.orgs, lfm_predictions_pd, self.r1, self.r2, self.topklfm)
        # Replace the rating column filled in by pred_to_proc with the model score.
        proc['rating'] = self.model.predict(proc[self.features_to_catboost].values, verbose=True)

        def top_orgs(row):
            """Return the raw ids of the row's best-scored orgs, best first."""
            org_ids = np.array(row['org_id'])
            best = np.argsort(row['rating'])[::-1][:topk]
            # Plain lookup per element; tolist() hands native scalars to the mapping.
            return [self.org_itoc[org] for org in org_ids[best].tolist()]

        fp = proc[['user_id', 'org_id', 'rating']].groupby(by='user_id').agg(lambda x: list(x)).reset_index()
        fp['target'] = fp.progress_apply(top_orgs, axis=1)
        predictions = fp[['user_id', 'target']].copy()
        predictions['user_id'] = predictions['user_id'].map(self.user_itoc)
        if path is not None:
            save_predictions_to_file(predictions, path)

        return predictions

In [None]:
# Build the solver and train it: LightFM is fit on train_r1, the ranker is
# trained on candidates for the users of train_r2, with topklfm=30 candidates
# per user and 10% of the users held out as the ranker's eval set.
ct = CatBoostSolver(users, orgs)
ct.fit(train_r1, train_r2, 30, test_size=0.1)

In [None]:
# Recommendations for the users in the held-out set.
x = ct.predict(X_test)

In [None]:
# Score the recommendations against the held-out targets.
# MNAP_N and print_score are expected from the star-imported project modules.
print_score(MNAP_N(y_test, x))

In [None]:
# Feature importances evaluated on the ranker's eval pool; ct.test is only
# built by fit when test_size > 0.
ct.model.get_feature_importance(data=ct.test)

In [None]:
# Predict for the submission users; because a path is given, predict also
# passes the result to save_predictions_to_file with "answers.csv".
test_users = pd.read_csv('data/test_users.csv')
ct.predict(test_users, "answers.csv")