In [1]:
import pandas as pd  
from tqdm import tqdm  
from collections import defaultdict  
import math  

from joblib import Parallel, delayed

# Root directory of the competition data; the click logs are read from the
# local_2 sub-folders of the train and test directories derived from it.
path = '/root/kdd_cup_2020'
train_path = path + '/underexpose_train/local_2'  
test_path =  path + '/underexpose_test/local_2'  
# Phase upper bound passed to the loaders below (they iterate range(0, now_phase + 1)).
now_phase = 9
# Number of items recalled per user; df_constr also reads this global directly
# to size its popularity fallback list.
recall_num = 50



def rec_wrap(idx, recall_num):
    """Recall items for one user and return them as ``[user, item, score]`` rows.

    Reads the module-level ``cut_dict``, ``item_sim_list`` and
    ``user_item_list``, so those must be assigned before this is called
    (it is dispatched from ``recommend_pars``).

    Args:
        idx: user id to recommend for.
        recall_num: maximum number of items to return.

    Returns:
        list of ``[idx, item_id, score]`` lists, best score first.
    """
    ranked = recommend(cut_dict, item_sim_list, user_item_list, idx, 500, recall_num)
    return [[idx, pair[0], pair[1]] for pair in ranked]


def get_predict(df, pred_col, top_fill, rank_num):
    """Pad every user's candidates with popular items and pivot to one row per user.

    Args:
        df: candidates with at least ``user_id``, ``item_id`` and ``pred_col``.
        pred_col: name of the score column (higher is better).
        top_fill: comma-separated string of fallback item ids, most preferred
            first. They receive scores -1, -2, ... so, as long as real scores
            are above -1, they only fill the slots left after real candidates.
        rank_num: number of items kept per user.

    Returns:
        DataFrame with ``user_id`` plus integer-named columns ``0..rank_num-1``
        holding item ids as strings, in descending score order.
    """
    top_fill = [int(t) for t in top_fill.split(',')]
    scores = [-1 * i for i in range(1, len(top_fill) + 1)]
    ids = list(df['user_id'].unique())

    # One fallback row per (user, fallback item). After sorting, the rows of a
    # user are contiguous, so the repeated lists below line up positionally.
    fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id'])
    fill_df.sort_values('user_id', inplace=True)
    fill_df['item_id'] = top_fill * len(ids)
    fill_df[pred_col] = scores * len(ids)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, fill_df])
    df.sort_values(pred_col, ascending=False, inplace=True)
    # A real candidate always sorts ahead of its negative-scored fallback twin,
    # so keep='first' keeps the real score.
    df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False)
    df = df[df['rank'] <= rank_num]
    df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index()
    return df

import pandas as pd
import numpy as np

def ndcg_metric(tmp):
    """Average discounted hit score over the first 50 ranked columns.

    Args:
        tmp: DataFrame with integer-named columns ``0..49`` holding the ranked
            item ids (anything castable to int) and an ``item_id`` column with
            the true item of each row.

    Returns:
        Sum over ranks of ``hit / log2(rank + 2)``, divided by the row count.
    """
    total = 0
    for position in range(50):
        hits = tmp[position].astype(int) == tmp['item_id']
        total += np.sum(hits / np.log2(position + 2))
    return total / tmp.shape[0]


def get_sim_item(df_, user_col, item_col, use_iif=False): 
    """Build an item-to-item co-click similarity table from a click log.

    Args:
        df_: click log with ``user_col``, ``item_col`` and a ``time`` column.
            Rows are taken in the order they appear within each user.
        user_col: name of the user id column.
        item_col: name of the item id column.
        use_iif: when False (the default) each co-click is weighted by the
            positional gap (0.8 ** (gap - 1)), the time gap (1 - dt * 5000)
            and a factor 0.7; when True only 1 / log(1 + sequence length) is
            accumulated.

    Returns:
        (sim_item_corr, user_item_dict): nested dict ``item -> {item: score}``
        normalised by ``(count_i * count_j) ** 0.2``, and the dict
        ``user -> [items]`` the table was built from.
    """
    clicks = df_.copy()
    items_by_user = clicks.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(items_by_user[user_col], items_by_user[item_col]))

    # Click times, grouped the same way so positions line up with the item lists.
    times_by_user = clicks.groupby(user_col)['time'].agg(list).reset_index()
    user_time_dict = dict(zip(times_by_user[user_col], times_by_user['time']))

    sim_item = {}
    item_cnt = defaultdict(int)  # number of clicks per item
    for user, items in tqdm(user_item_dict.items()):
        times = user_time_dict[user]
        constant = math.log(1 + len(items))  # same for every pair of this user
        for loc1, item in enumerate(items):
            item_cnt[item] += 1
            neighbours = sim_item.setdefault(item, {})
            for loc2, relate_item in enumerate(items):
                if item == relate_item:
                    continue
                neighbours.setdefault(relate_item, 0)
                if use_iif:
                    neighbours[relate_item] += 1 / constant
                    continue
                # The two directions are mirror images: use the non-negative
                # positional gap and the matching time difference.
                if loc1 - loc2 > 0:
                    gap, dt = loc1 - loc2, times[loc1] - times[loc2]
                else:
                    gap, dt = loc2 - loc1, times[loc2] - times[loc1]
                neighbours[relate_item] += \
                    1 * 0.7 * (0.8 ** (gap - 1)) * (1 - dt * 5000) / constant

    # Popularity normalisation. The copy is shallow, so the inner dicts are
    # shared with sim_item and are updated in place.
    sim_item_corr = sim_item.copy()
    for i, related_items in tqdm(sim_item.items()):
        cnt_i = item_cnt[i]
        for j in related_items:
            related_items[j] = related_items[j] / ((cnt_i * item_cnt[j]) ** 0.2)
    return sim_item_corr, user_item_dict


from copy import deepcopy
def recommend(cut_dict, sim_item_corr, user_item_dict, user_id, top_k, item_num):  
    """Rank unseen items for one user from the neighbours of their clicked items.

    Every item in the user's history has related items; the related items of
    the most recent clicks are merged and ranked by accumulated similarity.

    Args:
        cut_dict: ``user_id -> n``; only the first ``n`` history items are used
            as seeds (the callers set ``n`` to the number of clicks at or
            before the query time).
        sim_item_corr: ``item -> {related_item: similarity}``.
        user_item_dict: ``user_id -> [item, ...]`` click history.
        user_id: user to recommend for.
        top_k: number of strongest neighbours considered per seed item.
        item_num: number of recommendations returned.

    Returns:
        list of ``(item, score)`` tuples, best first, at most ``item_num`` long.

    Raises:
        KeyError: if ``user_id`` is missing from ``cut_dict`` or
            ``user_item_dict``, or a seed item is missing from ``sim_item_corr``.
    """
    rank = {}
    item_all = cut_dict[user_id]
    interacted_items = user_item_dict[user_id]
    # Every item the user has ever clicked is excluded from the output. A set
    # gives O(1) membership tests in the inner loop, where the previous
    # deep-copied list was scanned for every neighbour.
    item_drop = set(interacted_items)
    # Seeds: clicks up to the cut, most recent first, so loc 0 weighs the most.
    seeds = interacted_items[:item_all][::-1]

    for loc, i in enumerate(seeds):
        neighbours = sorted(sim_item_corr[i].items(), key=lambda d: d[1], reverse=True)[0:top_k]
        for j, wij in neighbours:
            if j not in item_drop:
                # Halve the contribution for each step back in the history.
                rank[j] = rank.get(j, 0) + wij * (0.5 ** loc)
    return sorted(rank.items(), key=lambda d: d[1], reverse=True)[:item_num]





def df_constr(whole_click, qtime):
    """Build the popularity fallback string and the per-user history cut-offs.

    Args:
        whole_click: click log with ``user_id``, ``item_id`` and ``time``.
        qtime: two-column frame (``user_id``, query ``time``); it is turned
            into a dict positionally, so it must have exactly those columns.

    Returns:
        (top50_click, cut_dict): the ``recall_num`` most clicked item ids (the
        module-level ``recall_num``) joined by commas, and a dict
        ``user_id -> number of clicks with time <= that user's query time``.
        Users without any such click are absent from the dict.
    """
    popular = whole_click['item_id'].value_counts().index[:recall_num].values
    top50_click = ','.join(str(item) for item in popular)

    # Keep only the clicks of users that have a query time.
    has_query = whole_click['user_id'].isin(qtime['user_id'])
    click_cut = whole_click.loc[has_query, ['user_id', 'time']].reset_index(drop = True)

    qtime_dic = dict(qtime.values)
    click_cut['qtime'] = click_cut['user_id'].apply(lambda uid: qtime_dic[uid])
    before_query = click_cut[click_cut['time'] <= click_cut['qtime']].reset_index(drop = True)

    cut_item = before_query.groupby(['user_id'])[['time']].count().reset_index()
    return top50_click, dict(cut_item.values)

def recommend_pars(top50_click, qtime, recall_num):
    """Run ``rec_wrap`` for every query user in parallel and assemble the results.

    Args:
        top50_click: comma-separated fallback item ids passed to ``get_predict``.
        qtime: frame with a ``user_id`` column; each unique id is processed once.
        recall_num: number of items recalled and kept per user.

    Returns:
        (result, recom_df): the wide per-user frame produced by ``get_predict``
        and the long ``user_id`` / ``item_id`` / ``sim`` frame of raw recalls.
    """
    user_ids = qtime['user_id'].unique()
    pool = Parallel(n_jobs = 12, backend = 'multiprocessing')
    per_user = pool(delayed(rec_wrap)(uid, recall_num) for uid in tqdm(user_ids))

    # Flatten the per-user row lists into one list of [user, item, score] rows.
    recom_item = [row for rows in per_user for row in rows]
    recom_df = pd.DataFrame(recom_item, columns=['user_id', 'item_id', 'sim'])
    result = get_predict(recom_df, 'sim', top50_click, recall_num)
    return result, recom_df

def local_test_constr(phase, now_phase, recall_num, path = path):
    """Load the click logs used to recall items for the test users of one phase.

    Args:
        phase: phase whose own clicks and query times are returned.
        now_phase: last phase (inclusive) merged into the global click log.
        recall_num: unused; kept for call compatibility.
        path: unused; files are located through the module-level
            ``train_path`` / ``test_path``.

    Returns:
        (whole_click, all_click, qtime): de-duplicated clicks of phases
        ``0..now_phase`` sorted by user and time; the train+test clicks of
        ``phase`` in file order; and the ``user_id`` / ``time`` query frame of
        ``phase``.
    """
    click_cols = ['user_id', 'item_id', 'time']

    def _read_phase(c):
        # Train clicks followed by test clicks of phase c, original row order.
        click_train = pd.read_csv(train_path + '/underexpose_train_click-{}.csv'.format(c), header=None, names=click_cols)
        click_test = pd.read_csv(test_path + '/underexpose_test_click-{}.csv'.format(c), header=None, names=click_cols)
        return pd.concat([click_train, click_test])

    # Read each phase once and concatenate in a single call; growing a frame
    # with DataFrame.append per phase is quadratic and the method was removed
    # in pandas 2.0.
    phase_frames = {c: _read_phase(c) for c in range(0, now_phase + 1)}
    if phase_frames:
        whole_click = pd.concat(list(phase_frames.values()))
    else:
        whole_click = pd.DataFrame(columns=click_cols)

    whole_click = whole_click.drop_duplicates(click_cols).reset_index(drop = True)
    whole_click = whole_click.sort_values(['user_id', 'time']).reset_index(drop = True)

    # Reuse the already loaded frame when the phase lies in the merged range.
    phase_click = phase_frames[phase] if phase in phase_frames else _read_phase(phase)
    all_click = phase_click.reset_index(drop = True)
    qtime = pd.read_csv(test_path + '/underexpose_test_qtime-{}.csv'.format(phase), header=None,  names=['user_id', 'time'])
    return whole_click, all_click, qtime

def local_train_constr(phase, now_phase, recall_num, path = path):
    """Build the training click logs with each user's last click held out.

    The last click (by time) of every user in ``phase`` becomes the label, and
    every row with the same (user, item) pair is removed from both returned
    click logs.

    Args:
        phase: phase whose users provide the held-out answers.
        now_phase: last phase (inclusive) merged into the global click log.
        recall_num: unused; kept for call compatibility.
        path: unused; files are located through the module-level
            ``train_path`` / ``test_path``.

    Returns:
        (whole_click, all_click, train_answer): the merged clicks of phases
        ``0..now_phase`` without the answer pairs, sorted by user and time;
        the clicks of ``phase`` without the answer pairs; and one row per user
        holding the held-out ``item_id`` and ``time``.
    """
    click_cols = ['user_id', 'item_id', 'time']
    pair_cols = ['user_id', 'item_id']
    sort_cols = ['user_id', 'time']

    def _read_phase(c):
        # Train clicks followed by test clicks of phase c, original row order.
        click_train = pd.read_csv(train_path + '/underexpose_train_click-{}.csv'.format(c), header=None, names=click_cols)
        click_test = pd.read_csv(test_path + '/underexpose_test_click-{}.csv'.format(c), header=None, names=click_cols)
        return pd.concat([click_train, click_test])

    # Each phase is sorted on its own before merging, then everything is
    # concatenated once (DataFrame.append per phase is quadratic and was
    # removed in pandas 2.0).
    phase_frames = [
        _read_phase(c).sort_values(sort_cols).reset_index(drop = True)
        for c in range(0, now_phase + 1)
    ]
    whole_click = pd.concat(phase_frames) if phase_frames else pd.DataFrame(columns=click_cols)

    all_click = _read_phase(phase).reset_index(drop = True)
    print("phase series:", all_click.shape)
    all_click = all_click.sort_values(sort_cols).reset_index(drop = True)
    train_answer = all_click.groupby(['user_id']).last().reset_index()
    # (user, item) pairs to hide; built once and used for both filters below.
    answer_pairs = train_answer.set_index(pair_cols).index

    all_click = all_click.set_index(pair_cols)
    all_click = all_click[~all_click.index.isin(answer_pairs)].reset_index()
    print("phase series:", all_click.shape)

    whole_click = whole_click.drop_duplicates(click_cols).reset_index(drop = True)
    print("all click:",whole_click.shape)
    whole_click = whole_click.set_index(pair_cols)
    whole_click = whole_click[~whole_click.index.isin(answer_pairs)].reset_index()
    whole_click = whole_click.sort_values(sort_cols).reset_index(drop = True)
    print("all click:",whole_click.shape)
    return whole_click, all_click, train_answer




import lightgbm as lgb
import gc
from sklearn.metrics import roc_auc_score

# Module-level LightGBM parameter set (binary objective, AUC metric).
# NOTE(review): kfold_lgb defines its own local `params` (with num_leaves 31),
# and nothing in the visible cells reads this dict - confirm it is still needed.
params = {
            'num_leaves':  63,
#           'min_child_weight': 0.034,
          'feature_fraction': 0.5,
          'bagging_fraction':  0.5,
#           'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.01,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'random_state': 47
         } 

In [2]:
# Input locations used by the *_RUNXING loaders below; they point at the same
# local_2 folders as train_path / test_path in the first cell.
path_input_root = '/root/kdd_cup_2020'
path_input_train = path_input_root + '/underexpose_train/local_2'  
path_input_test = path_input_root + '/underexpose_test/local_2'  
path_input_train_feat = '../input/underexpose_train/' # the LB and LOCAL runs can share this same path

import numpy as np
import pandas as pd
# Show up to 100 rows / columns when a frame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import pandas as pd  
from tqdm import tqdm
from collections import defaultdict  
import math  

# ================================================================
# Hyper-parameters read as globals by get_sim_item_runxing:
#   W1     - exponent applied to the positional gap between two co-clicked items
#   W2, W3 - offset and base of the log that damps long click sequences
# The *_list variables are not referenced in the visible cells; presumably they
# are the search grids the chosen values came from - TODO confirm.
W1 = -0.3
W2 = 30
W3 = 2.7182818 
W1_list = [-0.15, -0.2, -0.25, -0.3, -0.35, -0.4]
W2_list = [1, 2, 5, 10, 20, 25, 30]
W3_list = [2,  2.7182818, 5]

# NOTE: these first W4 / W5 values are overwritten a few lines below.
W4 = 10
W5 = 14
W4_list = [1, 2, 5, 10, 20]
W5_list = [2, 5, 8, 10, 14, 18, 22]

# Effective values: W4, W5 and W6 shape the popularity term subtracted from
# sqrt(cnt_i * cnt_j) in the similarity denominator.
W4 = 25
W5 = 10
W6 = 0.4

def get_sim_item_runxing(df_, user_col, item_col, use_iif = False):
    """Build the position-weighted item co-occurrence similarity table.

    Reads the module-level weights ``W1`` .. ``W6``.

    Args:
        df_: click log; rows are taken in the order they appear per user.
        user_col: name of the user id column.
        item_col: name of the item id column.
        use_iif: when True each co-click adds
            ``1 / log_W3(W2 + sequence length) * gap ** W1``; when False it
            adds a plain 1.

    Returns:
        (sim_item_corr, user_item_dict): nested dict ``item -> {item: score}``
        divided by ``sqrt(cnt_i * cnt_j) - (log(W4 + W6 * |cnt_i - cnt_j|) - W5)``,
        and the dict ``user -> [items]``.
    """
    clicks = df_.copy()
    grouped = clicks.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(grouped[user_col], grouped[item_col]))

    sim_item = {}
    item_cnt = defaultdict(int)
    for user, items in tqdm(user_item_dict.items()):
        for idx_cur, i in enumerate(items):
            item_cnt[i] += 1
            neighbours = sim_item.setdefault(i, {})
            for idx_relate, relate_item in enumerate(items):
                if i == relate_item:
                    continue
                neighbours.setdefault(relate_item, 0)
                # Items further apart in the sequence contribute less (W1 < 0).
                w_time_inner = abs(idx_cur - idx_relate) ** (W1)
                if use_iif:
                    neighbours[relate_item] += 1 / math.log(W2 + len(items), W3) * w_time_inner
                else:
                    neighbours[relate_item] += 1

    # Shallow copy: the inner dicts are shared and rescaled in place.
    sim_item_corr = sim_item.copy()
    for i, related_items in tqdm(sim_item.items()):
        for j, cij in related_items.items():
            cnt_i, cnt_j = item_cnt[i], item_cnt[j]
            penalty = np.log(W4 + W6 * abs(cnt_i - cnt_j)) - W5
            sim_item_corr[i][j] = cij / (math.sqrt(cnt_i * cnt_j) - penalty)

    return sim_item_corr, user_item_dict


# ================================================================
# Hyper-parameters read as globals by recommend_runxing:
#   R1 - exponent on (history length - position), the recency weight of a seed item
#   R2 - exponent on the running count of accepted neighbours of a seed item
# The *_list variables are not referenced in the visible cells; presumably they
# are search grids - TODO confirm.
R1 = -0.5
R2 = -0.1
R1_list = [-0.1, -0.2, -0.3, -0.4, -0.5, -0.6, -0.7, -0.8, -0.9]
R2_list = [-0.005, -0.01, -0.03, -0.05, -0.08, -0.1]

# R3 .. R7 only appear in the commented-out re-weighting experiments inside
# recommend_runxing, so they currently have no effect.
R3 = 30
R4 = 5
R3_list = [1, 5,10, 20, 30, 40, 50, 100, 200]
R4_list = [0.1, 0.5, 1, 2, 5, 10]

R5 = 0.01
R6 = 0
R7 = 1
R5_list = [1, 5, 10, 20, 30, 50]
R6_list = [1, 2, 5, 10, 15, 20]
R7_list = [0.1, 0.5, 1, 2, 5, 10]

def recommend_runxing(T, NOT_item_dict, sim_item_corr, user_item_dict, user_id, top_k, item_num):
    """Rank unseen items for one user with recency and neighbour-rank weights.

    Reads the module-level exponents ``R1`` and ``R2``.

    Args:
        T: phase index; only used by the disabled experiments noted below.
        NOT_item_dict: ``user_id -> set`` of items that must not be recommended.
        sim_item_corr: ``item -> [(neighbour, similarity), ...]`` already
            sorted by descending similarity.
        user_item_dict: ``user_id -> [item, ...]`` click history.
        user_id: user to recommend for.
        top_k: number of leading neighbours inspected per history item.
        item_num: number of recommendations returned.

    Returns:
        list of ``(item, score)`` tuples, best first, at most ``item_num`` long.
    """
    history = user_item_dict[user_id]
    seen = set(history)
    n_history = len(history)

    rank = {}
    for pos, seed in enumerate(history):
        # Later positions in the history get a larger weight (R1 < 0).
        w_time = (n_history - pos) ** (R1)
        accepted = 0
        for cand, wij in sim_item_corr[seed][0:top_k]:
            if cand in NOT_item_dict[user_id] or cand in seen:
                continue
            # Only accepted neighbours advance the rank counter.
            accepted += 1
            w_item_rank = accepted ** (R2)
            # Disabled experiments (kept for reference): a feature-similarity
            # re-weighting using R3 / R4 that needs separate logic, and a
            # popularity damping using R5..R7 intended for the "half" setting.
            rank[cand] = rank.get(cand, 0) + wij * w_time * w_item_rank

    return sorted(rank.items(), key=lambda d: d[1], reverse=True)[:item_num]

def get_predict_runxing(df, pred_col, item_top_list, N_recall = 50):  
    """Pad each user's candidates with popular items; return wide and long views.

    Args:
        df: candidates with ``user_id``, ``item_id`` and ``pred_col``.
        pred_col: name of the score column (higher is better).
        item_top_list: fallback item ids, most preferred first. They receive
            scores -1, -2, ... so, as long as real scores are above -1, they
            rank after real candidates.
        N_recall: number of items kept per user.

    Returns:
        (df_pred, df_score): ``df_pred`` has ``user_id`` plus integer-named
        columns ``0..N_recall-1`` of integer item ids in rank order;
        ``df_score`` is the long frame of kept rows with an integer ``rank``
        column, sorted by user and rank.
    """
    # Force a real list: with an ndarray the `* len(ids)` below would multiply
    # the ids elementwise instead of repeating them.
    item_top_list = list(item_top_list)
    scores = [-1 * i for i in range(1, len(item_top_list) + 1)]
    ids = list(df['user_id'].unique())

    # One fallback row per (user, fallback item). After sorting, the rows of a
    # user are contiguous, so the repeated lists line up positionally.
    fill_df = pd.DataFrame(ids * len(item_top_list), columns=['user_id'])
    fill_df.sort_values('user_id', inplace=True)
    fill_df['item_id'] = item_top_list * len(ids)
    fill_df[pred_col] = scores * len(ids)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, fill_df])

    df.sort_values(pred_col, ascending=False, inplace=True)
    df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False).astype(int)

    df = df[df['rank'] <= N_recall]
    df_score = df.copy()
    df_score = df_score.sort_values(by = ['user_id', 'rank']).reset_index(drop = True)

    df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).\
        str.split(',', expand=True).reset_index()

    # The split yields strings; go through float so values like "12.0" also
    # convert to integers.
    for col in df:
        df[col] = df[col].astype(float).astype(int)
    return df, df_score

def CF_train(T, train_test_click):
    """Compute item similarities and pre-sort every item's neighbour list.

    Args:
        T: phase index (not used here).
        train_test_click: click log with ``user_id`` and ``item_id`` columns.

    Returns:
        (sim_item_corr, user_item): ``item -> [(neighbour, score), ...]``
        sorted by descending score, and ``user -> [items]``.
    """
    item_sim_list, user_item = get_sim_item_runxing(train_test_click, 'user_id', 'item_id', use_iif = True)
    sim_item_corr = {
        item: sorted(neighbours.items(), key = lambda pair: pair[1], reverse = True)
        for item, neighbours in item_sim_list.items()
    }
    return sim_item_corr, user_item

from joblib import Parallel, delayed

def rec_wrap_runxing(user_id, T, N_recall):
    """Recall items for one user and return them as ``[user, item, score]`` rows.

    Reads the module-level ``NOT_item_dict``, ``sim_item_corr`` and
    ``user_item``, so those must be assigned before this is called.

    Args:
        user_id: user to recommend for.
        T: phase index forwarded to ``recommend_runxing``.
        N_recall: maximum number of items to return.

    Returns:
        list of ``[user_id, item_id, score]`` lists, best score first.
    """
    ranked = recommend_runxing(T, NOT_item_dict, sim_item_corr, user_item, user_id, 500, N_recall)
    return [[user_id, pair[0], pair[1]] for pair in ranked]

def CF_predict(train_click_i, test_click_i, T, NOT_item_dict, test_qtime_i, sim_item_corr, user_item, N_recall = 50):
    """Recall ``N_recall`` items for every query user in parallel.

    NOTE: the worker ``rec_wrap_runxing`` reads ``NOT_item_dict``,
    ``sim_item_corr`` and ``user_item`` from module globals, not from the
    arguments given here; the caller must have assigned the same objects at
    module level.

    Args:
        train_click_i: train clicks of the phase (popularity fallback only).
        test_click_i: test clicks of the phase (popularity fallback only).
        T: phase index forwarded to the worker.
        NOT_item_dict, sim_item_corr, user_item: see the note above.
        test_qtime_i: frame whose rows are the query users (``user_id`` column).
        N_recall: number of items recalled and kept per user.

    Returns:
        (df_pred_test, df_score_test) as produced by ``get_predict_runxing``.
    """
    # The recall size used to be hard-coded to 50, silently ignoring N_recall.
    res = Parallel(n_jobs = 12, backend = 'multiprocessing')(
        delayed(rec_wrap_runxing)(test_qtime_i.iloc[i]['user_id'], T, N_recall)
        for i in tqdm(range(test_qtime_i.shape[0]))
    )
    # Flatten the per-user row lists.
    recom_item_test = [row for rows in res for row in rows]

    recom_df_test = pd.DataFrame(recom_item_test, columns=['user_id', 'item_id', 'sim'])
    # Most clicked items of this phase pad users with too few candidates.
    item_all = pd.concat([train_click_i['item_id'], test_click_i['item_id']])
    item_top_list = list(item_all.value_counts().index[:N_recall].values)
    df_pred_test, df_score_test = get_predict_runxing(recom_df_test, 'sim', item_top_list, N_recall)
    return df_pred_test, df_score_test


def local_test_constr_RUNXING(Ti, now_phase, N_recall = 50):
    """Load the data and train the similarity table for the test users of phase ``Ti``.

    Args:
        Ti: phase whose query users are served.
        now_phase: last phase (inclusive) that is read from disk.
        N_recall: unused; kept for call compatibility.

    Returns:
        (sim_item_corr, user_item, train_click_i, test_click_i, test_qtime_i,
        NOT_item_dict, train_test_click): the similarity table and user
        histories trained on phases ``0..Ti``; the sorted train / test clicks
        and query times of phase ``Ti``; for every query user the set of items
        clicked at or after the query time in any loaded phase; and the
        de-duplicated click log the table was trained on.
    """
    click_cols = ['user_id', 'item_id', 'time']
    sort_cols = ['user_id', 'time']

    def _merge_clicks(frames):
        # Concatenate, order by user and time, keep the latest click of every
        # (user, item) pair. One pd.concat replaces the DataFrame.append
        # chain, which was removed in pandas 2.0.
        merged = pd.concat(frames, axis = 0)
        merged = merged.sort_values(sort_cols)
        merged = merged.drop_duplicates(['user_id', 'item_id'], keep = 'last')
        return merged.reset_index(drop = True)

    def _future_items(all_clicks, qtime):
        # Group the clicks of the query users once instead of scanning the
        # whole click log for every user.
        subset = all_clicks[all_clicks['user_id'].isin(qtime['user_id'])]
        clicks_of = {uid: grp for uid, grp in subset.groupby('user_id')}
        blocked = {}
        for user_id, time in qtime.values:
            grp = clicks_of.get(int(user_id))
            if grp is None:
                blocked[user_id] = set()
            else:
                blocked[user_id] = set(grp.loc[grp['time'] >= time, 'item_id'].values)
        return blocked

    train_click_list = []
    test_click_list = []
    test_qtime_list = []
    for i in range(now_phase + 1):
        train_click_i = pd.read_csv(path_input_train + '/' + 'underexpose_train_click-{}.csv'.format(i),
                                    header = None, names = click_cols)
        test_click_i = pd.read_csv(path_input_test + '/' + 'underexpose_test_click-{}.csv'.format(i),
                                   header = None, names = click_cols)
        test_qtime_i = pd.read_csv(path_input_test + '/' + 'underexpose_test_qtime-{}.csv'.format(i),
                                   header = None, names = ['user_id', 'time'])

        train_click_list.append(train_click_i.sort_values(sort_cols))
        test_click_list.append(test_click_i.sort_values(sort_cols))
        test_qtime_list.append(test_qtime_i.sort_values(sort_cols))

    # All loaded phases: only used to find what each query user clicks later.
    train_test_click_ALL = _merge_clicks(train_click_list + test_click_list)

    train_click_i = train_click_list[Ti]
    test_click_i = test_click_list[Ti]
    test_qtime_i = test_qtime_list[Ti]

    NOT_item_dict = _future_items(train_test_click_ALL, test_qtime_i)

    # The similarity table is trained on phases 0..Ti only.
    TA1, TA2 = 0, Ti
    TB1, TB2 = 0, Ti
    train_test_click = _merge_clicks(train_click_list[TA1: TA2 + 1] + test_click_list[TB1: TB2 + 1])
    sim_item_corr, user_item = CF_train(Ti, train_test_click)
    return sim_item_corr, user_item, train_click_i, test_click_i, test_qtime_i, NOT_item_dict, train_test_click

def local_train_constr_RUNXING(Ti, now_phase, N_recall = 50):
    """Load the data and train the similarity table with last clicks held out.

    The last click (by time) of every user in phase ``Ti`` becomes the answer;
    all rows with the same (user, item) pair are removed from every phase
    before anything else is computed.

    Args:
        Ti: phase whose users provide the held-out answers.
        now_phase: last phase (inclusive) that is read from disk.
        N_recall: unused; kept for call compatibility.

    Returns:
        (sim_item_corr, user_item, train_click_i, test_click_i, test_qtime_i,
        NOT_item_dict, train_test_click, train_answer): as in
        ``local_test_constr_RUNXING``, with the answer time playing the role
        of the query time, plus the held-out answers themselves.
    """
    click_cols = ['user_id', 'item_id', 'time']
    pair_cols = ['user_id', 'item_id']
    sort_cols = ['user_id', 'time']

    def _read(folder, stem, phase_no):
        return pd.read_csv(folder + '/' + 'underexpose_{}_click-{}.csv'.format(stem, phase_no),
                           header = None, names = click_cols)

    def _future_items(all_clicks, qtime):
        # Group the clicks of the query users once instead of scanning the
        # whole click log for every user.
        subset = all_clicks[all_clicks['user_id'].isin(qtime['user_id'])]
        clicks_of = {uid: grp for uid, grp in subset.groupby('user_id')}
        blocked = {}
        for user_id, time in qtime.values:
            grp = clicks_of.get(int(user_id))
            if grp is None:
                blocked[user_id] = set()
            else:
                blocked[user_id] = set(grp.loc[grp['time'] >= time, 'item_id'].values)
        return blocked

    # Held-out answers: the last click of every user of phase Ti.
    all_click = pd.concat([_read(path_input_train, 'train', Ti),
                           _read(path_input_test, 'test', Ti)], axis = 0).reset_index(drop = True)
    all_click = all_click.sort_values(sort_cols).reset_index(drop = True)
    train_answer = all_click.groupby(['user_id']).last().reset_index()
    # Built once; it used to be rebuilt twice per phase inside the loop.
    answer_pairs = train_answer.set_index(pair_cols).index

    def _without_answers(frame):
        indexed = frame.sort_values(sort_cols).set_index(pair_cols)
        return indexed[~indexed.index.isin(answer_pairs)].reset_index()

    train_click_list = []
    test_click_list = []
    for i in range(now_phase + 1):
        train_click_list.append(_without_answers(_read(path_input_train, 'train', i)))
        test_click_list.append(_without_answers(_read(path_input_test, 'test', i)))

    # One pd.concat replaces the DataFrame.append chain (removed in pandas 2.0).
    train_test_click_ALL = pd.concat(train_click_list + test_click_list, axis = 0)
    train_test_click_ALL = train_test_click_ALL.sort_values(sort_cols)

    print("before:", train_test_click_ALL.shape)
    train_test_click_ALL = train_test_click_ALL.drop_duplicates(pair_cols, keep = 'last')
    # Same label as above, but this is the shape after de-duplication.
    print("before:", train_test_click_ALL.shape)

    train_click_i = train_click_list[Ti]
    test_click_i = test_click_list[Ti]
    test_qtime_i = train_answer[['user_id', 'time']]

    NOT_item_dict = _future_items(train_test_click_ALL, test_qtime_i)

    # The similarity table is trained on phases 0..Ti only.
    TA1, TA2 = 0, Ti
    TB1, TB2 = 0, Ti
    train_test_click = pd.concat(train_click_list[TA1: TA2 + 1] + test_click_list[TB1: TB2 + 1], axis = 0)
    train_test_click = train_test_click.sort_values(sort_cols)
    train_test_click = train_test_click.drop_duplicates(pair_cols, keep = 'last')
    train_test_click = train_test_click.reset_index(drop = True)

    sim_item_corr, user_item = CF_train(Ti, train_test_click)
    return (sim_item_corr, user_item, train_click_i, test_click_i, 
            test_qtime_i, NOT_item_dict, train_test_click, train_answer)





In [3]:
print('[+] re-transform time')
import datetime
def stamp_to_datetime(x):
    """Format a Unix timestamp (seconds) as a ``YYYY-MM-DD HH:MM:SS`` UTC string.

    Uses a timezone-aware conversion; ``datetime.utcfromtimestamp`` is
    deprecated since Python 3.12 and the formatted text is the same.
    """
    moment = datetime.datetime.fromtimestamp(x, datetime.timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

# Offset and scale that feature_engineering applies to the query times
# (time * N2 + N1) before formatting them as dates.
N1 = 53251
N2 = 1585067656

def feature_engineering(df, clicks, recom, qtime):
    """Assemble the ranking features for every (user, item) candidate pair.

    Args:
        df: candidates; only ``user_id`` and ``item_id`` are read.
        clicks: click log with ``user_id``, ``item_id`` and ``time``.
        recom: recall scores keyed by ``user_id`` and ``item_id``.
        qtime: one query ``time`` per ``user_id``.

    Returns:
        Feature frame in the row order of ``df`` (all joins are left joins),
        without the ``user_id`` / ``item_id`` key columns.
    """
    def _flatten(prefix, frame):
        # ("time", "mean") -> prefix + "time_mean"
        frame.columns = [prefix + "_".join(col) for col in frame.columns]
        return frame

    def _last_click(key, prefix):
        # Latest click per key; the remaining columns get the given prefix.
        latest = clicks.sort_values([key, 'time']).groupby([key]).last()
        latest.columns = [f'{prefix}{col}' for col in latest.columns]
        return latest

    def _left_join(left, right, keys):
        return pd.merge(left, right, on = keys, how = 'left')

    feat = df[['user_id', 'item_id']].copy()

    # User activity statistics.
    user_stats = clicks.groupby(['user_id']).agg(
        {"item_id": ['count', 'nunique'], "time": ['mean', 'std', 'min', 'max']})
    feat = _left_join(feat, _flatten("user_", user_stats), ['user_id'])

    # Item popularity statistics.
    item_stats = clicks.groupby(['item_id']).agg(
        {"user_id": ['count', 'nunique'], "time": ['mean', 'std', 'min', 'max']})
    feat = _left_join(feat, _flatten("item_", item_stats), ['item_id'])

    # Most recent click of the item, then of the user.
    feat = _left_join(feat, _last_click('item_id', 'last_'), ['item_id'])
    feat = _left_join(feat, _last_click('user_id', 'last_us_'), ['user_id'])

    # Time statistics over the user's five most recent clicks.
    recent = clicks.sort_values(["user_id", 'time']).groupby(['user_id']).tail(5)
    recent_stats = recent.groupby(['user_id']).agg({"time": ['mean', 'std', 'min', 'max']})
    feat = _left_join(feat, _flatten("user_tail5", recent_stats), ['user_id'])

    # Day-of-month and hour of the rescaled query time.
    qtmp = qtime.copy()
    qtmp['time'] = (qtmp['time'] * N2 + N1).astype(int)
    qtmp['date'] = pd.to_datetime(qtmp['time'].apply(stamp_to_datetime))
    qtmp['day'] = qtmp['date'].dt.day
    qtmp['hour'] = qtmp['date'].dt.hour
    feat = _left_join(feat, qtmp[['user_id', 'day', 'hour']], ['user_id'])

    # Raw query time and the recall scores.
    feat = _left_join(feat, qtime, ['user_id'])
    feat = _left_join(feat, recom, ['user_id', 'item_id'])

    # Gap between the query time and the last click of the item / the user.
    feat['t_d1'] = feat['time'] - feat['last_time']
    feat['t_d2'] = feat['time'] - feat['last_us_time']
    feat.drop(['user_id', 'item_id'], axis = 1, inplace = True)

    return feat


def kfold_lgb(train, test):
    """Train LightGBM with 5-fold stratified CV and return averaged test scores.

    Args:
        train: frame with a binary ``label`` column; every column except
            ``label`` and ``user_id`` is used as a feature.
        test: frame containing the same feature columns.

    Returns:
        numpy array with one score per row of ``test``: the mean of the five
        fold models' predictions.
    """
    from sklearn.model_selection import StratifiedKFold
    import lightgbm as lgb
    import gc
    from sklearn.metrics import roc_auc_score

    params = {
            'num_leaves':  31,
#           'min_child_weight': 0.034,
          'feature_fraction': 0.5,
          'bagging_fraction':  0.5,
#           'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.01,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'random_state': 47
         } 

    import datetime
    n_kfolds = 5
    feats = [col for col in train.columns if col not in ['label', 'user_id']]
    # StratifiedKFold yields row positions, so rows are selected with iloc.
    # The previous label-based .loc was only correct for a default RangeIndex.
    feat_pos = train.columns.get_indexer(feats)
    labels = train['label']

    n_train, n_test = train.shape[0], test.shape[0]
    oof_train, oof_test= np.zeros((n_train,)), np.zeros((n_test,))
    score_list = []
    skf = StratifiedKFold(n_splits = n_kfolds, 
                          shuffle = True, random_state = 777).split(train[feats], train['label'])
    for i, (train_idx, valid_idx) in enumerate(skf):
        print('############################################################ fold = {} / {}'.format(i + 1, n_kfolds))
        print('####### cur time = ' + str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        X_train, y_train = train.iloc[train_idx, feat_pos], labels.iloc[train_idx]
        dtrain = lgb.Dataset(X_train, y_train, free_raw_data = True)
        del X_train, y_train; gc.collect();

        X_valid, y_valid = train.iloc[valid_idx, feat_pos], labels.iloc[valid_idx]
        dvalid = lgb.Dataset(X_valid, y_valid, reference = dtrain, free_raw_data = True)
        model = lgb.train(
            params = params,
            train_set = dtrain,
            valid_sets = [dvalid],
            num_boost_round = 10000,
            early_stopping_rounds = 100,
            verbose_eval = 100,
        )

        oof_train[valid_idx] = model.predict(X_valid)
        # Each fold contributes an equal share of the final test prediction.
        oof_test += model.predict(test[feats])/n_kfolds
        fold_auc = roc_auc_score(y_valid, oof_train[valid_idx])
        score_list.append(fold_auc)
        print("period:", valid_idx,", the score is", fold_auc)
        del X_valid, y_valid, dtrain, dvalid, model; gc.collect();
    return oof_test

[+] re-transform time


In [4]:
# Driver: for every phase, recall candidates with three sources (the "ZW"
# item-CF, the "RUNXING" item-CF and pre-computed embedding scores read from
# CSV), build features for the union of candidates, train the LightGBM ranker
# on the held-out last click and keep the top items per test user.
now_phase = 9
result = pd.DataFrame()
file_save_path = path + '/local_result/'
local_pred = pd.DataFrame()
for phase in range(7, now_phase + 1):
    print(f"phase {phase}:")
    # --- ZW recall for the test users.
    # NOTE: rec_wrap reads item_sim_list, user_item_list and cut_dict from
    # module globals, so they must be assigned right before recommend_pars.
    recall_num = 50
    recall_click, test_click_ZW, test_qtime = local_test_constr(phase, now_phase, recall_num)
    item_sim_list, user_item_list = get_sim_item(recall_click, 'user_id', 'item_id', use_iif=False)
    top50_click, cut_dict = df_constr(recall_click, test_qtime)
    test_res_ZW, test_recom_ZW = recommend_pars(top50_click, test_qtime, recall_num)
    
    # --- ZW recall for the training users (last click held out as the answer).
    recall_num = 50
    recall_click, train_click_ZW, train_answer_ZW = local_train_constr(phase, now_phase, recall_num)
    train_qtime = train_answer_ZW[['user_id','time']]
    item_sim_list, user_item_list = get_sim_item(recall_click, 'user_id', 'item_id', use_iif=False)

    top50_click, cut_dict = df_constr(recall_click, train_qtime)
    train_res_ZW, train_recom_ZW = recommend_pars(top50_click, train_qtime, recall_num)
    print(train_res_ZW.shape)
    
    
    # --- RUNXING recall for the test users.
    # NOTE: rec_wrap_runxing reads sim_item_corr, user_item and NOT_item_dict
    # from module globals; the assignments below provide them.
    sim_item_corr, user_item, train_click_i, test_click_i, \
    test_qtime_i, NOT_item_dict, test_click_RX = local_test_constr_RUNXING(phase, now_phase)
    test_res_RX, test_recom_RX = CF_predict(train_click_i, test_click_i, phase, NOT_item_dict, 
                                             test_qtime_i, sim_item_corr, user_item, N_recall = 50)


    # --- RUNXING recall for the training users.
    # NOTE(review): N_recall is 25 here but 50 for the test users above -
    # confirm the asymmetry is intended.
    sim_item_corr, user_item, train_click_i, test_click_i, \
    test_qtime_i, NOT_item_dict, train_click_RX, train_answer_RX = local_train_constr_RUNXING(phase, now_phase)
    train_res_RX, train_recom_RX = CF_predict(train_click_i, test_click_i, phase, NOT_item_dict, 
                                             test_qtime_i, sim_item_corr, user_item, N_recall = 25)


    
    # --- Embedding-based candidates, pre-computed per phase and read from disk.
    train_rec = pd.read_csv(file_save_path + f'hybrid_emb_update_v3_train_phase{phase}.csv')
    test_rec = pd.read_csv(file_save_path + f'hybrid_emb_update_v3_test_phase{phase}.csv')
    
    # Blend the individual similarity columns into one score.
    test_rec['score'] = \
            (test_rec['sim_all'] + test_rec['sim_txt']) * 0.3 + \
            (test_rec['sim_w2v'] + test_rec['sim_w2v_10w']) * 0.8 + test_rec['sim_w2v_gr'] * 0.1
    train_rec['score'] = \
            (train_rec['sim_all'] + train_rec['sim_txt']) * 0.3 + \
            (train_rec['sim_w2v'] + train_rec['sim_w2v_10w']) * 0.8 + train_rec['sim_w2v_gr'] * 0.1
    
    # Keep the best 100 (train) / 500 (test) embedding candidates per user,
    # both as a wide frame and as long score rows.
    train_res_new = get_predict(train_rec, 'score', top50_click, 100)
    test_res_new = get_predict(test_rec, 'score', top50_click, 500)
    train_rec = train_rec.sort_values(['user_id', 'score'], ascending  = False).groupby(['user_id']).head(100)
    test_rec = test_rec.sort_values(['user_id', 'score'], ascending  = False).groupby(['user_id']).head(500)
#     train_rec.fillna(0, inplace = True)
#     test_rec.fillna(0, inplace = True)
    

    train_click = train_click_ZW
    train_answer = train_answer_ZW
    test_click = test_click_ZW

    # Wide frames: inner join keeps only users present in all three sources.
    # Long score frames: outer join keeps every (user, item) from any source.
    train_res = pd.merge(train_res_ZW, train_res_RX, on = ['user_id'], how = 'inner')
    train_res = pd.merge(train_res, train_res_new, on = ['user_id'], how = 'inner')
    train_recom = pd.merge(train_recom_ZW, train_recom_RX, on = ['user_id', 'item_id'], 
                            how = 'outer').reset_index(drop = True)
    train_recom = pd.merge(train_recom, train_rec, on = ['user_id', 'item_id'], 
                            how = 'outer').reset_index(drop = True)

    test_res = pd.merge(test_res_ZW, test_res_RX, on = ['user_id'], how = 'inner')
    test_res = pd.merge(test_res, test_res_new, on = ['user_id'], how = 'inner')

    test_recom = pd.merge(test_recom_ZW, test_recom_RX, on = ['user_id', 'item_id'], 
                                how = 'outer').reset_index(drop = True)
    test_recom = pd.merge(test_recom, test_rec, on = ['user_id', 'item_id'], 
                                how = 'outer').reset_index(drop = True)

    test_res.columns = ['user_id'] + [i for i in range(1, test_res.shape[1])]
 

    # Wide -> long (user_id, item_id) pairs; a training pair is labelled 1 when
    # it equals the user's held-out answer, else 0.
    train = train_res.set_index(['user_id']).unstack().reset_index()
    train = train.iloc[:, 1:].astype(int)
    train.columns = ['user_id', 'item_id']
    train = pd.merge(train, train_answer.reset_index()[['user_id', 'item_id', 'time']],
                       on = ['user_id', 'item_id'], how = 'left')
    train['time'].fillna(-1, inplace = True)
    train['time'] = np.where(train['time'] != -1, 1, 0)
    train.columns = ['user_id', 'item_id', 'label']

    test = test_res.set_index("user_id").unstack().reset_index()
    test = test.iloc[:, 1:].astype(int)
    test.columns = ['user_id', 'item_id']

    print("train:",train.shape)
    print("test:",test.shape)
    
    # The recall sources overlap, so drop repeated pairs.
    print(train.shape)
    train = train.drop_duplicates(['user_id', 'item_id']).reset_index(drop = True)
    print(train.shape)
    test = test.drop_duplicates(['user_id', 'item_id']).reset_index(drop = True)
    
    
    # Features are attached by index, which relies on feature_engineering
    # returning rows in the same order as its first argument.
    train = train[['user_id', 'item_id', 'label']]
    print(train.shape)
    qtime = train_answer[['user_id', 'time']]
    tr_feat = feature_engineering(train, train_click, train_recom, qtime)
    train[tr_feat.columns] = tr_feat
    print(train.shape)
    print("over")

    test = test[['user_id', 'item_id']]
    print(test.shape)
    qtime = pd.read_csv(test_path + '/underexpose_test_qtime-{}.csv'.format(phase), 
                      header = None, names = ['user_id', 'time'])

    ts_feat = feature_engineering(test, test_click, test_recom, qtime)
    test[ts_feat.columns] = ts_feat
    print(test.shape)
    print("over")
    
    # "rank" is presumably a column of the hybrid_emb CSV files - TODO confirm.
    train.drop(columns=["rank"], inplace = True)
    test.drop(columns=["rank"], inplace = True)
    
    
    # Rank the test candidates with the CV model; the first 51 columns of the
    # wide result are user_id plus the 50 best items.
    preds = kfold_lgb(train, test)
    test['pred'] = preds
    local_pred = local_pred.append(test[['user_id', 'item_id', 'pred']])
    sort_res = test.sort_values(['user_id', 'pred'], ascending = False).groupby('user_id')['item_id'].apply(
    lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index()
    sort_res = sort_res.iloc[:, :51]
    test.drop(["pred"], axis = 1, inplace = True)
    
    result = result.append(sort_res)
    print("sorting is over.")
#     break

phase 7:


100%|██████████| 35346/35346 [01:43<00:00, 340.49it/s]
100%|██████████| 117720/117720 [00:34<00:00, 3399.46it/s]
100%|██████████| 1701/1701 [00:01<00:00, 1254.91it/s]


phase series: (294763, 3)
phase series: (275093, 3)
all click: (1243109, 3)
all click: (1218677, 3)


100%|██████████| 35346/35346 [01:41<00:00, 349.62it/s]
100%|██████████| 117716/117716 [00:35<00:00, 3304.91it/s]
100%|██████████| 19670/19670 [03:02<00:00, 107.68it/s]


(19670, 51)


100%|██████████| 32747/32747 [00:59<00:00, 547.36it/s]
100%|██████████| 105221/105221 [02:08<00:00, 819.65it/s] 
100%|██████████| 1701/1701 [00:11<00:00, 153.04it/s]


before: (2798139, 3)
before: (1133251, 3)


100%|██████████| 32747/32747 [00:58<00:00, 562.39it/s]
100%|██████████| 105210/105210 [02:04<00:00, 846.56it/s] 
100%|██████████| 19670/19670 [01:28<00:00, 221.19it/s]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


train: (3442250, 3)
test: (1020600, 2)
(3442250, 3)
(2690147, 3)
(2690147, 3)
(2690147, 38)
over
(900275, 2)
(900275, 37)
over
############################################################ fold = 1 / 5
####### cur time = 2020/06/11 17:19:43
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.905662
[200]	valid_0's auc: 0.908337
[300]	valid_0's auc: 0.909631
[400]	valid_0's auc: 0.909954
[500]	valid_0's auc: 0.910203
[600]	valid_0's auc: 0.910583
[700]	valid_0's auc: 0.910249
Early stopping, best iteration is:
[602]	valid_0's auc: 0.910588
period: [      8      15      17 ... 2690098 2690115 2690143] , the score is 0.910587916911499
############################################################ fold = 2 / 5
####### cur time = 2020/06/11 17:20:44
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.899638
[200]	valid_0's auc: 0.903131
[300]	valid_0's auc: 0.905304
[400]	valid_0's auc: 0.906879
[500]	valid_0's auc: 0.906988
E

100%|██████████| 35346/35346 [01:43<00:00, 342.38it/s]
100%|██████████| 117720/117720 [00:35<00:00, 3290.27it/s]
100%|██████████| 1716/1716 [00:22<00:00, 75.89it/s] 


phase series: (288652, 3)
phase series: (268916, 3)
all click: (1243109, 3)
all click: (1218904, 3)


100%|██████████| 35346/35346 [01:40<00:00, 350.77it/s]
100%|██████████| 117715/117715 [00:33<00:00, 3557.09it/s]
100%|██████████| 19736/19736 [03:28<00:00, 94.56it/s] 


(19736, 51)


100%|██████████| 33955/33955 [01:08<00:00, 496.76it/s]
100%|██████████| 111208/111208 [02:26<00:00, 760.51it/s] 
100%|██████████| 1716/1716 [00:12<00:00, 137.86it/s]


before: (2808566, 3)
before: (1133185, 3)


100%|██████████| 33955/33955 [01:07<00:00, 505.07it/s]
100%|██████████| 111200/111200 [02:22<00:00, 780.23it/s] 
100%|██████████| 19736/19736 [01:42<00:00, 193.44it/s]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


train: (3453800, 3)
test: (1029600, 2)
(3453800, 3)
(2692396, 3)
(2692396, 3)
(2692396, 38)
over
(907705, 2)
(907705, 37)
over
############################################################ fold = 1 / 5
####### cur time = 2020/06/11 17:51:00
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.898851
[200]	valid_0's auc: 0.901261
[300]	valid_0's auc: 0.902934
[400]	valid_0's auc: 0.904063
[500]	valid_0's auc: 0.904806
[600]	valid_0's auc: 0.904916
[700]	valid_0's auc: 0.90516
[800]	valid_0's auc: 0.905116
[900]	valid_0's auc: 0.905339
[1000]	valid_0's auc: 0.905411
[1100]	valid_0's auc: 0.905423
Early stopping, best iteration is:
[1065]	valid_0's auc: 0.905501
period: [      8      15      17 ... 2692374 2692379 2692386] , the score is 0.9055014372541699
############################################################ fold = 2 / 5
####### cur time = 2020/06/11 17:52:41
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.89807

100%|██████████| 35346/35346 [01:43<00:00, 340.92it/s]
100%|██████████| 117720/117720 [00:36<00:00, 3253.85it/s]
100%|██████████| 1651/1651 [00:23<00:00, 70.18it/s] 


phase series: (278066, 3)
phase series: (258161, 3)
all click: (1243109, 3)
all click: (1219742, 3)


100%|██████████| 35346/35346 [01:41<00:00, 349.89it/s]
100%|██████████| 117714/117714 [00:34<00:00, 3372.50it/s]
100%|██████████| 19905/19905 [03:43<00:00, 88.93it/s] 


(19905, 51)


100%|██████████| 35346/35346 [01:19<00:00, 442.54it/s]
100%|██████████| 117720/117720 [02:50<00:00, 689.20it/s] 
100%|██████████| 1651/1651 [00:14<00:00, 115.97it/s]


before: (2823149, 3)
before: (1133016, 3)


100%|██████████| 35346/35346 [01:18<00:00, 447.58it/s]
100%|██████████| 117714/117714 [02:49<00:00, 695.81it/s] 
100%|██████████| 19905/19905 [01:51<00:00, 178.88it/s]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


train: (3483375, 3)
test: (990600, 2)
(3483375, 3)
(2688677, 3)
(2688677, 3)
(2688677, 38)
over
(869749, 2)
(869749, 37)
over
############################################################ fold = 1 / 5
####### cur time = 2020/06/11 18:24:36
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.896027
[200]	valid_0's auc: 0.899755
[300]	valid_0's auc: 0.902617
[400]	valid_0's auc: 0.904
[500]	valid_0's auc: 0.904396
[600]	valid_0's auc: 0.90463
[700]	valid_0's auc: 0.904627
Early stopping, best iteration is:
[648]	valid_0's auc: 0.904791
period: [      9      16      18 ... 2688648 2688656 2688665] , the score is 0.9047909946864708
############################################################ fold = 2 / 5
####### cur time = 2020/06/11 18:25:41
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.891584
[200]	valid_0's auc: 0.895337
[300]	valid_0's auc: 0.897617
[400]	valid_0's auc: 0.899429
[500]	valid_0's auc: 0.900366
[600]

In [7]:
# Order the accumulated predictions by user_id and pred (both descending),
# renumber the rows, and write them to disk without the index column.
local_pred = (
    local_pred
    .sort_values(by = ['user_id', 'pred'], ascending = [False, False])
    .reset_index(drop = True)
)
local_pred.to_csv("{}/local_200611_final_result.csv".format(file_save_path), index = None)

In [6]:
# Offline evaluation of the re-ranked `result` table against the held-out labels,
# one phase at a time. Prints NDCG@50 and recall@50 per phase and the sum of the
# per-phase NDCG values at the end.
print("*" * 50)
score = 0
for phase in range(7, now_phase + 1):
    print("phase", phase)
    # Ground-truth (user_id, item_id, time) rows for this phase.
    ans_tmp = pd.read_csv(test_path + '/underexpose_test_label-{}.csv'.format(phase),
                          header = None, names = ['user_id', 'item_id', 'time'])
    # Inner join keeps only users that have both a label in this phase and a ranked list.
    tmp = pd.merge(ans_tmp, result, on = ['user_id'], how = 'inner')

    # Compute the metric once and reuse it for both the printout and the running total.
    phase_ndcg = ndcg_metric(tmp)
    print("update score:", phase_ndcg)

    # Count users whose true item appears in any of the 50 ranked columns.
    # NOTE: this counter must not be called `recall_num`; that name is the
    # notebook-level recall-size setting and would be overwritten here.
    hit_count = 0
    for rank in range(50):
        hit_count += np.sum(tmp[rank].astype(int) == tmp['item_id'])
    print("update recall:", hit_count/(tmp.shape[0]))
    score += phase_ndcg

print("final update score:", score)



**************************************************
phase 7
update score: 0.11508925051116507
update recall: 0.2551440329218107
phase 8
update score: 0.10408997307205409
update recall: 0.23076923076923078
phase 9
update score: 0.09244287475767646
update recall: 0.2113870381586917
final update score: 0.3116220983408956


In [9]:
# Re-score `result` per phase with get_score_LB (defined elsewhere in the notebook)
# and sum the second element of its return value, reported below as the "half" score.
print("*" * 50)
score = 0
for phase in range(7, now_phase + 1):
    print("phase", phase)
    # Ground-truth (user_id, item_id, time) rows for this phase.
    ans_tmp = pd.read_csv(test_path + '/underexpose_test_label-{}.csv'.format(phase),
                          header = None, names = ['user_id', 'item_id', 'time'])
    # Ranked lists for the users labelled in this phase (only user_id is taken from the labels).
    tmp = pd.merge(ans_tmp[['user_id']], result, on = ['user_id'], how = 'inner')
    click_train = pd.read_csv(train_path + '/underexpose_train_click-{}.csv'.format(phase),
                              header = None, names = ['user_id', 'item_id', 'time'])
    click_test = pd.read_csv(test_path + '/underexpose_test_click-{}.csv'.format(phase),
                             header = None, names = ['user_id', 'item_id', 'time'])
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0; for two
    # frames with identical columns the result is the same.
    all_click = pd.concat([click_train, click_test]).reset_index(drop = True)

    # click_train.head(0) passes an empty frame with the click schema as the third
    # argument — TODO confirm against get_score_LB why no training clicks are supplied.
    LB_CF = get_score_LB(ans_tmp, tmp.astype(int), click_train.head(0), all_click)
    score += LB_CF[1]

print("half score:", score)

**************************************************
phase 7
[+] NDCG@FULL = 0.115089, NDCG@HALF = 0.103834
[+]  HIT@FULL = 0.255144,  HIT@HALF = 0.235103
phase 8
[+] NDCG@FULL = 0.104090, NDCG@HALF = 0.100652
[+]  HIT@FULL = 0.230769,  HIT@HALF = 0.218653
phase 9
[+] NDCG@FULL = 0.092443, NDCG@HALF = 0.073283
[+]  HIT@FULL = 0.211387,  HIT@HALF = 0.177313
half score: 0.2777687981724739
