In [22]:
import numpy as np
import pandas as pd
import os
import sys
import zipfile
import subprocess

from gradio.monitoring_dashboard import unique_users
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from tqdm.notebook import tqdm
from copy import deepcopy

import json

In [23]:
DATASET = 'MIND_small'
RAW_PATH = os.path.join('./', DATASET)

RANDOM_SEED = 42
NEG_ITEMS = 99

# Load data

1. Load interaction data and item metadata
2. Filter out items with less than 5 interactions
3. Calculate basic statistics

In [3]:
# Please download the training and validation set from https://msnews.github.io/
# and copy MINDlarge.zip and MINDlarge_dev.zip to the *MIND_large* dir
print('Unzip files...')
f = zipfile.ZipFile(os.path.join(RAW_PATH,'MINDsmall_dev.zip'),'r')
os.makedirs(os.path.join(RAW_PATH,'train'),exist_ok=True)
for file in f.namelist():
    print("Extract %s"%(file))
    f.extract(file,os.path.join(RAW_PATH,'train'))
f.close()

Unzip files...
Extract MINDsmall_dev/
Extract MINDsmall_dev/behaviors.tsv
Extract MINDsmall_dev/news.tsv
Extract MINDsmall_dev/entity_embedding.vec
Extract MINDsmall_dev/relation_embedding.vec


In [4]:
f = zipfile.ZipFile(os.path.join(RAW_PATH,'MINDsmall_dev.zip'),'r')
os.makedirs(os.path.join(RAW_PATH,'dev'),exist_ok=True)
for file in f.namelist():
    print("Extract %s"%(file))
    f.extract(file,os.path.join(RAW_PATH,'dev'))
f.close()

Extract MINDsmall_dev/
Extract MINDsmall_dev/behaviors.tsv
Extract MINDsmall_dev/news.tsv
Extract MINDsmall_dev/entity_embedding.vec
Extract MINDsmall_dev/relation_embedding.vec


In [24]:
# read interaction data
interactions = []
user_freq, item_freq = dict(), dict()
for d in [os.path.join(RAW_PATH,'train'),os.path.join(RAW_PATH,'dev')]:
    file = os.path.join(d,"behaviors.tsv")
    with open(file) as F:
        for line in tqdm(F):
            line = line.strip().split("\t")
            sid, uid, time = line[0], line[1], line[2]
            impressions = line[4].split(" ")
            for imp in impressions:
                iid, label = imp.split("-")
                interactions.append([sid,uid,time,iid,label])
                if int(label)==1:
                    user_freq[uid] = user_freq.get(uid,0)+1
                    item_freq[iid] = item_freq.get(iid,0)+1

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [25]:
interactions_original = interactions.copy()

In [26]:
# 5-core filtering
select_uid, select_iid = [],[]
for u in user_freq:
    if user_freq[u]>=5:
        select_uid.append(u)
for i in item_freq:
    if item_freq[i]>=5:
        select_iid.append(i)
print("User: %d/%d, Item: %d/%d"%(len(select_uid),len(user_freq),len(select_iid),len(item_freq)))

while len(select_uid)<len(user_freq) or len(select_iid)<len(item_freq):
    select_uid = set(select_uid)
    select_iid = set(select_iid)
    user_freq, item_freq = dict(), dict()
    interactions_5core = []
    for line in tqdm(interactions):
        uid, iid, label = line[1], line[3], line[-1]
        if uid in select_uid and iid in select_iid:
            interactions_5core.append(line)
            if int(label)==1:
                user_freq[uid] = user_freq.get(uid,0)+1
                item_freq[iid] = item_freq.get(iid,0)+1
    interactions = interactions_5core
    select_uid, select_iid = [],[]
    for u in user_freq:
        if user_freq[u]>=5:
            select_uid.append(u)
    for i in item_freq:
        if item_freq[i]>=5:
            select_iid.append(i)
    print("User: %d/%d, Item: %d/%d"%(len(select_uid),len(user_freq),len(select_iid),len(item_freq)))

print("Selected Interactions: %d, Users: %d, Items: %d"%(len(interactions),len(select_uid),len(select_iid)))

User: 13388/50000, Item: 1171/2212


  0%|          | 0/5481996 [00:00<?, ?it/s]

User: 13194/13385, Item: 960/1156


  0%|          | 0/2729292 [00:00<?, ?it/s]

User: 13118/13194, Item: 951/960


  0%|          | 0/2662724 [00:00<?, ?it/s]

User: 13114/13118, Item: 951/951


  0%|          | 0/2649752 [00:00<?, ?it/s]

User: 13114/13114, Item: 951/951
Selected Interactions: 2649242, Users: 13114, Items: 951


In [27]:
# exclude illegal interactions
for i in range(len(interactions)):
    if len(interactions[i])>5:
        interactions[i] = interactions[i][:-1]

In [28]:
# Get timestamp
format_t = '%m/%d/%Y %I:%M:%S %p'
ts, time = [], []
for i in tqdm(range(len(interactions))):
    t = datetime.strptime(interactions[i][2],format_t)
    ts.append(t)
    time.append(t.timestamp())

  0%|          | 0/2649242 [00:00<?, ?it/s]

In [29]:
# Construct 5 core results with situation context
interaction_df = pd.DataFrame(interactions,columns = ["session_id","user_id","time_str","news_id","label"])
interaction_df['time'] = time
interaction_df['timestamp'] = ts
interaction_df['hour'] = interaction_df['timestamp'].apply(lambda x: x.hour)
interaction_df['weekday'] = interaction_df['timestamp'].apply(lambda x: x.weekday())
interaction_df['date'] = interaction_df['timestamp'].apply(lambda x: x.date())

def get_time_range(hour): # according to the Britannica dictionary
    # https://www.britannica.com/dictionary/eb/qa/parts-of-the-day-early-morning-late-morning-etc
    if hour>=5 and hour<=8:
        return 0
    if hour>8 and hour<11:
        return 1
    if hour>=11 and hour<=12:
        return 2
    if hour>12 and hour<=15:
        return 3
    if hour>15 and hour<=17:
        return 4
    if hour>=18 and hour<=19:
        return 5
    if hour>19 and hour<=21:
        return 6
    if hour>21:
        return 7
    return 8 # 0-4 am

interaction_df['period'] = interaction_df.hour.apply(lambda x: get_time_range(x))
min_date = interaction_df.date.min()
interaction_df['day'] = (interaction_df.date - min_date).apply(lambda x: x.days)

In [30]:
# Save 5-core interactions
interaction_df.to_csv("interaction_5core.csv",index=False)

----
# Prepare data for CTR & Reranking task

1. Rename and organize all interaction features
2. Split dataset into training, validation, and test; Save interaction files (same time indicates same impression)
3. Organize item metadata

In [31]:
CTR_PATH='./MINDCTR/'
os.makedirs(CTR_PATH,exist_ok=True)

In [32]:
# copy interaction file, rename and re-id all features
interaction_ctr = interaction_df.copy()
interaction_ctr.rename(columns={'hour':'c_hour_c','weekday':'c_weekday_c','period':'c_period_c','day':'c_day_f',
                              'user_id':'original_user_id'},
                     inplace=True)
user2newid_ctr = dict(zip(sorted(interaction_ctr.original_user_id.unique()), 
                      range(1,interaction_ctr.original_user_id.nunique()+1)))
interaction_ctr['user_id'] = interaction_ctr.original_user_id.apply(lambda x: user2newid_ctr[x])

item2newid_ctr = dict(zip(sorted(interaction_ctr.news_id.unique()), 
                      range(1,interaction_ctr.news_id.nunique()+1)))
interaction_ctr['item_id'] = interaction_ctr['news_id'].apply(lambda x: item2newid_ctr[x])
interaction_ctr.sort_values(by=['user_id','time'],inplace=True)
interaction_ctr = interaction_ctr.reset_index(drop=True)

json.dump(user2newid_ctr,open(os.path.join(CTR_PATH,"user2newid.json"),'w'))
json.dump(item2newid_ctr,open(os.path.join(CTR_PATH,"item2newid.json"),'w'))

In [33]:
# Count statistics
for col in interaction_ctr.columns:
    if col in ['user_id','item_id'] or col.startswith('c_'):
        print(col, interaction_ctr[col].nunique())

c_hour_c 24
c_weekday_c 1
c_period_c 9
c_day_f 1
user_id 13114
item_id 951


In [40]:
# split training, validation, and test sets.
print("=== MIND_small 只有一天数据，使用用户划分方案 ===")

# split_time1 = 5
# train = interaction_ctr.loc[interaction_ctr.c_day_f<=split_time1].copy()
# val_test = interaction_ctr.loc[(interaction_ctr.c_day_f>split_time1)].copy()

# 根据用户划分 train 和 val_test
np.random.seed(RANDOM_SEED)
unique_users = interaction_ctr['user_id'].unique()
np.random.shuffle(unique_users)

train_ratio = 0.7
n_train = int(len(unique_users)*train_ratio)
train_users = unique_users[:n_train]
val_test_users = unique_users[n_train:]

train = interaction_ctr.loc[interaction_ctr.user_id.isin(train_users)].copy()
val_test = interaction_ctr.loc[interaction_ctr.user_id.isin(val_test_users)].copy()

# 根据会话划分验证集和测试集的代码不变
val_test.sort_values(by='time',inplace=True)
sessionbyTime = []
last_s = -1
for s in val_test.session_id:
    if s!=last_s:
        sessionbyTime.append(s)
        last_s = s
val = val_test.loc[val_test.session_id.isin(sessionbyTime[:len(sessionbyTime)//2])].copy()
test = val_test.loc[val_test.session_id.isin(sessionbyTime[len(sessionbyTime)//2:])].copy()

# Delete user&item in validation&test sets that not exist in training set
train_u, train_i = set(train.user_id.unique()), set(train.item_id.unique())

# 此处需要修改，因为用户没有重合，过滤后会变成空
# val_sel = val.loc[(val.user_id.isin(train_u))&(val.item_id.isin(train_i))].copy()
# test_sel = test.loc[(test.user_id.isin(train_u))&(test.item_id.isin(train_i))].copy()

# 改为只确保物品在训练集中出现过
val_sel = val.loc[val.item_id.isin(train_i)].copy()
test_sel = test.loc[test.item_id.isin(train_i)].copy()

print("Train user: %d, item: %d"%(len(train_u),len(train_i)))
print("Validation user: %d, item: %d"%(val_sel.user_id.nunique(),val_sel.item_id.nunique()))
print("Test user: %d, item: %d"%(test_sel.user_id.nunique(),test_sel.item_id.nunique()))

=== MIND_small 只有一天数据，使用用户划分方案 ===
Train user: 9179, item: 951
Validation user: 2771, item: 825
Test user: 2825, item: 876


In [41]:
# Save interaction data
select_columns = ['user_id','item_id','time','label','c_hour_c','c_weekday_c','c_period_c','c_day_f']
train[select_columns].to_csv(os.path.join(CTR_PATH,'train.csv'),sep="\t",index=False)
val_sel[select_columns].to_csv(os.path.join(CTR_PATH,'dev.csv'),sep="\t",index=False)
test_sel[select_columns].to_csv(os.path.join(CTR_PATH,'test.csv'),sep="\t",index=False)

In [42]:
# organize & save item metadata
item_meta_train = pd.read_csv(os.path.join(RAW_PATH,'train',"news.tsv"),sep="\t",header=None)
item_meta_train.columns = ['news_id','category','subcategory','title','abstract','url','title_entitiy','abstract_entity']
item_select = item_meta_train.loc[item_meta_train.news_id.isin(interaction_ctr.news_id.unique())].copy()
item_select['item_id'] = item_select.news_id.apply(lambda x: item2newid_ctr[x])
category2id = dict(zip(sorted(item_select.category.unique()),range(1,item_select.category.nunique()+1)))
subcategory2id = dict(zip(sorted(item_select.subcategory.unique()),range(1,item_select.subcategory.nunique()+1)))
item_select['i_category_c'] = item_select['category'].apply(lambda x: category2id[x])
item_select['i_subcategory_c'] = item_select['subcategory'].apply(lambda x: subcategory2id[x])
item_select[['item_id','i_category_c','i_subcategory_c']].to_csv(
    os.path.join(CTR_PATH,'item_meta.csv'),sep="\t",index=False)

# Prepare data for Top-k Recommendation Task
1. Rename all interaction features
2. Split dataset into training, validation, and test
3. Re-assign IDs to user, item, and context; Save interaction files
4. Organize item metadata

In [18]:
TOPK_PATH='./MINDTOPK/'
os.makedirs(TOPK_PATH,exist_ok=True)

In [19]:
interaction_df = pd.read_csv("interaction_5core.csv")

In [20]:
interaction_df.head(2)

Unnamed: 0,session_id,user_id,time_str,news_id,label,time,timestamp,hour,weekday,date,period,day
0,1,U87243,11/10/2019 11:30:54 AM,N78206,0,1573357000.0,2019-11-10 11:30:54,11,6,2019-11-10,2,1
1,1,U87243,11/10/2019 11:30:54 AM,N26368,0,1573357000.0,2019-11-10 11:30:54,11,6,2019-11-10,2,1


In [21]:
# copy & rename columns
interaction_pos = interaction_df.loc[interaction_df.label==1].copy() # retain positive interactions
interaction_pos.rename(columns={'hour':'c_hour_c','weekday':'c_weekday_c','period':'c_period_c','day':'c_day_f',
                              'user_id':'original_user_id'}, inplace=True)

In [22]:
# split training, validation, and test sets.
split_time1 = 5
train = interaction_pos.loc[interaction_pos.c_day_f<=split_time1].copy()
val_test = interaction_pos.loc[(interaction_pos.c_day_f>split_time1)].copy()
val_test.sort_values(by='time',inplace=True)
sessionbyTime = []
last_s = -1
for s in val_test.session_id:
    if s!=last_s:
        sessionbyTime.append(s)
        last_s = s
val = val_test.loc[val_test.session_id.isin(sessionbyTime[:len(sessionbyTime)//2])].copy()
test = val_test.loc[val_test.session_id.isin(sessionbyTime[len(sessionbyTime)//2:])].copy()

# Delete user&item in validation&test sets that not exist in training set
train_u, train_i = set(train.original_user_id.unique()), set(train.news_id.unique())
val_sel = val.loc[(val.original_user_id.isin(train_u))&(val.news_id.isin(train_i))].copy()
test_sel = test.loc[(test.original_user_id.isin(train_u))&(test.news_id.isin(train_i))].copy()
print("Train user: %d, item: %d"%(len(train_u),len(train_i)))
print("Validation user: %d, item:%d"%(val_sel.original_user_id.nunique(),val_sel.news_id.nunique()))
print("Test user: %d, item:%d"%(test_sel.original_user_id.nunique(),test_sel.news_id.nunique()))

Train user: 267742, item: 8572
Validation user: 84477, item:1303
Test user: 58023, item:930


In [23]:
# Assign ids for users and items (to generate continous ids)
all_df = pd.concat([train,val_sel,test_sel],axis=0)
user2newid_topk = dict(zip(sorted(all_df.original_user_id.unique()), 
                      range(1,all_df.original_user_id.nunique()+1)))
 
for df in [train,val_sel,test_sel]:
    df['user_id'] = df.original_user_id.apply(lambda x: user2newid_topk[x])

item2newid_topk = dict(zip(sorted(all_df.news_id.unique()), 
                      range(1,all_df.news_id.nunique()+1)))
for df in [train,val_sel,test_sel]:
    df['item_id'] = df['news_id'].apply(lambda x: item2newid_topk[x])

all_df['user_id'] = all_df.original_user_id.apply(lambda x: user2newid_topk[x])
all_df['item_id'] = all_df['news_id'].apply(lambda x: item2newid_topk[x])

json.dump(user2newid_topk,open(os.path.join(TOPK_PATH,"user2newid.json"),'w'))
json.dump(item2newid_topk,open(os.path.join(TOPK_PATH,"item2newid.json"),'w'))

In [24]:
# generate negative items
def generate_negative(data_df,all_items,clicked_item_set,random_seed,neg_item_num=99):
    np.random.seed(random_seed)
    neg_items = np.random.choice(all_items, (len(data_df),neg_item_num))
    for i, uid in tqdm(enumerate(data_df['user_id'].values)):
        user_clicked = clicked_item_set[uid]
        for j in range(len(neg_items[i])):
            while neg_items[i][j] in user_clicked|set(neg_items[i][:j]):
                neg_items[i][j] = np.random.choice(all_items, 1)
    return neg_items.tolist()

clicked_item_set = dict()
for user_id, seq_df in all_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
all_items = all_df.item_id.unique()
val_sel['neg_items'] = generate_negative(val_sel,all_items,clicked_item_set,random_seed=1)
test_sel['neg_items'] = generate_negative(test_sel,all_items,clicked_item_set,random_seed=2)

0it [00:00, ?it/s]

  neg_items[i][j] = np.random.choice(all_items, 1)


0it [00:00, ?it/s]

  neg_items[i][j] = np.random.choice(all_items, 1)


In [25]:
select_columns = ['user_id','item_id','time','c_hour_c','c_weekday_c','c_period_c','c_day_f']
train[select_columns].to_csv(os.path.join(TOPK_PATH,'train.csv'),sep="\t",index=False)
val_sel[select_columns+['neg_items']].to_csv(os.path.join(TOPK_PATH,'dev.csv'),sep="\t",index=False)
test_sel[select_columns+['neg_items']].to_csv(os.path.join(TOPK_PATH,'test.csv'),sep="\t",index=False)

In [26]:
# organize & save item metadata
item_meta_train = pd.read_csv(os.path.join(RAW_PATH,'train',"news.tsv"),sep="\t",header=None)
item_meta_train.columns = ['news_id','category','subcategory','title','abstract','url','title_entitiy','abstract_entity']
item_select = item_meta_train.loc[item_meta_train.news_id.isin(all_df.news_id.unique())].copy()
item_select['item_id'] = item_select.news_id.apply(lambda x: item2newid_topk[x])
category2id = dict(zip(sorted(item_select.category.unique()),range(1,item_select.category.nunique()+1)))
subcategory2id = dict(zip(sorted(item_select.subcategory.unique()),range(1,item_select.subcategory.nunique()+1)))
item_select['i_category_c'] = item_select['category'].apply(lambda x: category2id[x])
item_select['i_subcategory_c'] = item_select['subcategory'].apply(lambda x: subcategory2id[x])
item_select[['item_id','i_category_c','i_subcategory_c']].to_csv(
    os.path.join(TOPK_PATH,'item_meta.csv'),sep="\t",index=False)