In [None]:
import os
import pandas as pd

from datetime import datetime
from tqdm import tqdm

from collections import defaultdict
import math
import numpy as np
import pickle

In [None]:
# Mount Google Drive into the Colab filesystem so the files under
# /content/drive are reachable; force_remount=True requests a fresh mount
# even when the drive is already mounted.
from google.colab import drive 
drive.mount('/content/drive', force_remount=True) 

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the input CSVs and receives the result files.
data_dir = '/content/drive/My Drive/recommendation/data'
# Project root folder; not referenced by the other cells visible in this notebook.
path = '/content/drive/My Drive/recommendation/'

In [None]:
def _read_input(file_name):
    """Load one CSV file located in ``data_dir`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_dir, file_name))


# Training logs: browsed items per session and the purchased item per session.
train_sessions = _read_input('train_sessions.csv')
train_purchases = _read_input('train_purchases.csv')
# NOTE: the variable names do not mirror the file names -- `test_sessions`
# holds the *leaderboard* sessions and `test_leaderboard` holds the *final*
# sessions. The inference cells below rely on exactly this mapping.
test_sessions = _read_input('test_leaderboard_sessions.csv')
test_leaderboard = _read_input('test_final_sessions.csv')

In [None]:
# Stack every interaction log (train views, train purchases and both test
# sets) into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; a single `pd.concat` gives the
# same result and copies the data only once.
# NOTE: this rebinds `train_sessions`, so re-running the cell without
# re-running the loading cell stacks the extra frames a second time.
train_sessions = pd.concat(
    [train_sessions, train_purchases, test_sessions, test_leaderboard],
    ignore_index=True,
)

In [None]:
# Inspect the combined interaction table (the rendered output is truncated to the first and last rows).
train_sessions

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
6199307,4439648,7154,2021-06-14 08:03:19.024
6199308,4439675,23067,2021-06-01 12:21:07.959
6199309,4439868,26085,2021-06-16 22:18:27.509
6199310,4439966,19483,2021-06-06 20:05:06.457


In [None]:
#Utils functions
# Reference instant used to turn a parsed wall-clock time into epoch seconds.
_UNIX_EPOCH = datetime(1970, 1, 1)


def parse(x):
    """Convert a ``YYYY-MM-DD HH:MM:SS[.ffffff]`` string to epoch seconds.

    The wall-clock time is interpreted as UTC. Calling ``timestamp()`` on a
    naive ``datetime`` would instead apply the local timezone (and DST rules)
    of the machine running the notebook, so the same input could produce
    different numbers -- and different time gaps across DST changes -- on
    different hosts.

    Args:
        x: Date string, with or without a fractional-seconds part.

    Returns:
        Seconds since 1970-01-01 00:00:00 as a float.

    Raises:
        ValueError: If ``x`` matches neither supported format.
    """
    # A dot only occurs in the fractional-seconds variant of the format.
    fmt = "%Y-%m-%d %H:%M:%S.%f" if "." in x else "%Y-%m-%d %H:%M:%S"
    return (datetime.strptime(x, fmt) - _UNIX_EPOCH).total_seconds()

def make_user_time_tuple(group_df, user_col='session_id', item_col='item_id', time_col='ts'):
    """Pair each row's user id with its timestamp, keeping row order.

    Args:
        group_df: Table (or mapping of column name to values) to read from.
        user_col: Name of the column holding the user/session id.
        item_col: Accepted so the signature matches ``make_item_time_tuple``;
            not read here.
        time_col: Name of the column holding the timestamp.

    Returns:
        A list of ``(user, time)`` tuples, one per row.
    """
    users = group_df[user_col]
    times = group_df[time_col]
    return [*zip(users, times)]


def make_item_time_tuple(group_df, user_col='session_id', item_col='item_id', time_col='ts'):
    """Pair each row's item id with its timestamp, keeping row order.

    Repeated rows for the same item are kept as they are; no de-duplication
    is applied.

    Args:
        group_df: Table (or mapping of column name to values) to read from.
        user_col: Accepted so the signature matches ``make_user_time_tuple``;
            not read here.
        item_col: Name of the column holding the item id.
        time_col: Name of the column holding the timestamp.

    Returns:
        A list of ``(item, time)`` tuples, one per row.
    """
    items = group_df[item_col]
    times = group_df[time_col]
    return [*zip(items, times)]
    
def get_user_item_time_dict(df, user_col='session_id', item_col='item_id', time_col='ts'):
    """Map every user to its time-ordered list of ``(item, time)`` pairs.

    The groups are iterated directly instead of going through
    ``groupby.apply(...).reset_index().rename(columns={0: ...})``: that chain
    depends on pandas naming the result column ``0``, fails on an empty
    frame, and triggers the "apply operated on the grouping columns"
    deprecation warning on recent pandas releases.

    Args:
        df: Interaction table containing ``user_col``, ``item_col`` and
            ``time_col``.
        user_col: Name of the column holding the user/session id.
        item_col: Name of the column holding the item id.
        time_col: Name of the column holding the timestamp.

    Returns:
        ``{user: [(item, time), ...]}`` with each list sorted by ``time_col``
        and the keys in ascending ``user_col`` order. An empty frame yields
        an empty dict.
    """
    # Sorting first means every group is already in chronological order.
    ordered = df.sort_values(by=[user_col, time_col])
    return {
        user: list(zip(group[item_col], group[time_col]))
        for user, group in ordered.groupby(user_col)
    }


def get_item_user_time_dict(df, user_col='session_id', item_col='item_id', time_col='ts'):
    """Map every item to its time-ordered list of ``(user, time)`` pairs.

    The groups are iterated directly instead of going through
    ``groupby.apply(...).reset_index().rename(columns={0: ...})``: that chain
    depends on pandas naming the result column ``0``, fails on an empty
    frame, and triggers the "apply operated on the grouping columns"
    deprecation warning on recent pandas releases.

    Args:
        df: Interaction table containing ``user_col``, ``item_col`` and
            ``time_col``.
        user_col: Name of the column holding the user/session id.
        item_col: Name of the column holding the item id.
        time_col: Name of the column holding the timestamp.

    Returns:
        ``{item: [(user, time), ...]}`` with each list sorted by ``time_col``
        and the keys in ascending ``item_col`` order. An empty frame yields
        an empty dict.
    """
    # Sorting first means every group is already in chronological order.
    ordered = df.sort_values(by=[item_col, time_col])
    return {
        item: list(zip(group[user_col], group[time_col]))
        for item, group in ordered.groupby(item_col)
    }

In [None]:
# Turn every raw date string into a numeric timestamp (see `parse`), then put
# each session's events in chronological order; the position-based weights
# computed later depend on this row order.
train_sessions['ts'] = [parse(raw_date) for raw_date in tqdm(train_sessions['date'])]
train_sessions = train_sessions.sort_values(by=['session_id', 'ts'], ascending=True)

100%|██████████| 6199312/6199312 [01:39<00:00, 62372.07it/s]


In [None]:
#Recall function
def get_sim2(df, user_col, item_col, alpha, time_col='ts',
             forward_weight=1.0, backward_weight=0.7,
             position_decay=0.8, time_scale=259200):
    """Build item-to-item co-occurrence scores from session logs.

    For every ordered pair of distinct items ``(item, relate_item)`` that
    occur in the same session, the score ``sim[item][relate_item]`` is
    increased by::

        direction * position_decay ** (|pos2 - pos1| - 1)
                  * exp(-|t2 - t1| / time_scale)
                  / log(1 + session_length)

    where ``direction`` is ``forward_weight`` when ``relate_item`` comes after
    ``item`` in the session and ``backward_weight`` otherwise. Positions are
    the row order inside each session as found in ``df``, so rows are
    expected to be sorted chronologically within a session beforehand.

    Args:
        df: Interaction table with ``user_col``, ``item_col`` and ``time_col``.
        user_col: Name of the session/user id column.
        item_col: Name of the item id column.
        alpha: Currently unused. It only fed a popularity normalisation step
            that is disabled; kept so existing calls remain valid.
        time_col: Name of the numeric timestamp column (default ``'ts'``).
        forward_weight: Direction factor when the related item occurs later.
        backward_weight: Direction factor when the related item occurs earlier.
        position_decay: Per-step decay applied to the positional distance.
        time_scale: Scale of the exponential time decay, in the units of
            ``time_col`` (259200 corresponds to three days when the
            timestamps are in seconds).

    Returns:
        ``(sim_item, item_cnt)``: ``sim_item`` is ``{item: {related: score}}``
        and ``item_cnt`` is a ``defaultdict(int)`` counting how many rows
        each item appears in.
    """
    grouped = df.groupby(user_col)
    items_by_user = grouped[item_col].agg(list)
    # Timestamps are collected alongside the items to add the time factor.
    times_by_user = grouped[time_col].agg(list)
    sessions = list(zip(items_by_user, times_by_user))

    sim_item = {}
    item_cnt = defaultdict(int)
    for items, times in tqdm(sessions):
        # The length penalty is constant for the whole session.
        length_norm = math.log(1 + len(items))
        for loc1, item in enumerate(items):
            item_cnt[item] += 1
            related = sim_item.setdefault(item, {})
            t1 = times[loc1]
            for loc2, relate_item in enumerate(items):
                if item == relate_item:
                    continue
                loc_alpha = forward_weight if loc2 > loc1 else backward_weight
                loc_weight = loc_alpha * (position_decay ** (abs(loc2 - loc1) - 1))
                time_weight = np.exp(-abs(times[loc2] - t1) / time_scale)
                related[relate_item] = (
                    related.get(relate_item, 0) + loc_weight * time_weight / length_norm
                )

    return sim_item, item_cnt
# Build the item-to-item scores and per-item row counts from all sessions.
sim_item_corr, item_cnt = get_sim2(
    train_sessions,
    user_col='session_id',
    item_col='item_id',
    alpha=1,
)

In [None]:
# Per-session list of (item, time) pairs; not referenced by the later cells
# visible in this notebook.
user_item_time_dict = get_user_item_time_dict(
    train_sessions,
    user_col='session_id',
    item_col='item_id',
    time_col='ts',
)

In [None]:
# Items ordered from most to least frequent; used as the fallback when a
# session yields fewer than `top_k` candidates.
# `value_counts()` already sorts by descending count and keeps the item ids in
# its index, so the index is read directly. The previous
# `reset_index()` / `sort_values('item_id')` / `order['index']` chain relied on
# the column names produced by pandas < 2.0; on pandas >= 2.0 the columns are
# `item_id` / `count`, which makes it sort by item id and then fail with a
# KeyError on 'index'.
item_counts = train_sessions['item_id'].value_counts()
popular_items = item_counts.index.tolist()

In [None]:
#Rank function
def recommend(sim_item_corr, popular_items, top_k, session_item_list, item_num=300):
    """Rank candidate items for one session.

    Each item of the session contributes the scores of its ``item_num``
    highest-scoring neighbours from ``sim_item_corr``. Scores are summed per
    candidate, items already in the session are excluded, and the ``top_k``
    best candidates are returned. If fewer than ``top_k`` candidates exist,
    the list is topped up from ``popular_items`` in order, skipping items
    that are already in the list.

    Args:
        sim_item_corr: ``{item: {related_item: score}}`` mapping.
        popular_items: Fallback items, most popular first.
        top_k: Number of items to return.
        session_item_list: Items of the session (repeats contribute again).
        item_num: Number of neighbours considered per session item.

    Returns:
        A list of at most ``top_k`` item ids, best first. It is shorter than
        ``top_k`` only when ``popular_items`` cannot supply enough distinct
        items. Ids are returned as stored in the inputs; they are no longer
        passed through a float array and cast to ``int32``.
    """
    session_items = set(session_item_list)
    rank = {}
    for i in session_item_list:
        neighbours = sim_item_corr.get(i)
        if not neighbours:
            continue
        best_neighbours = sorted(neighbours.items(), key=lambda d: d[1], reverse=True)[:item_num]
        for j, wij in best_neighbours:
            if j not in session_items:
                rank[j] = rank.get(j, 0) + wij

    ranked = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:top_k]
    # Plain list extraction also works when no candidate was found, where
    # slicing column 0 of an empty array used to raise IndexError.
    item_list = [j for j, _ in ranked]

    if len(item_list) < top_k:
        chosen = set(item_list)
        # Iterating (instead of indexing with a manual counter) simply stops
        # when the popularity list is exhausted.
        for candidate in popular_items:
            if len(item_list) >= top_k:
                break
            if candidate not in chosen:
                item_list.append(candidate)
                chosen.add(candidate)

    return item_list

In [None]:
# Produce the top-k recommendations for every session held in `test_sessions`
# and store them in long format: one row per (session, item, rank).
top_k = 100
test_session_dict = test_sessions.groupby('session_id')['item_id'].agg(list).to_dict()

session_id_list, item_id_list, rank_list = [], [], []
for session_id, session_item_list in tqdm(test_session_dict.items()):
    item_list = recommend(sim_item_corr, popular_items, top_k, session_item_list)
    n_items = len(item_list)
    session_id_list.extend([session_id] * n_items)
    item_id_list.extend(item_list)
    # Ranks are 1-based and follow the order returned by `recommend`.
    rank_list.extend(range(1, n_items + 1))

res_df = pd.DataFrame({
    'session_id': session_id_list,
    'item_id': item_id_list,
    'rank': rank_list,
})

res_df.to_csv(os.path.join(data_dir, 'leaderboard_result.csv'), index=False)

100%|██████████| 50000/50000 [10:13<00:00, 81.51it/s]


In [None]:
# Produce the top-k recommendations for every session held in
# `test_leaderboard` and store them in long format: one row per
# (session, item, rank).
top_k = 100
test_session_dict = test_leaderboard.groupby('session_id')['item_id'].agg(list).to_dict()

session_id_list, item_id_list, rank_list = [], [], []
for session_id, session_item_list in tqdm(test_session_dict.items()):
    item_list = recommend(sim_item_corr, popular_items, top_k, session_item_list)
    n_items = len(item_list)
    session_id_list.extend([session_id] * n_items)
    item_id_list.extend(item_list)
    # Ranks are 1-based and follow the order returned by `recommend`.
    rank_list.extend(range(1, n_items + 1))

res_df = pd.DataFrame({
    'session_id': session_id_list,
    'item_id': item_id_list,
    'rank': rank_list,
})

res_df.to_csv(os.path.join(data_dir, 'final_result.csv'), index=False)