In [130]:
import os, csv
import pandas as pd
import json
from collections import Counter
import numpy as np

MAGAZINE_DATASET = 'Magazine_Subscriptions'
BEAUTY_DATASET = 'All_Beauty'
IN_PATH = './input'
OUT_PATH = './processed'
KEY_COLUMNS = ['user_id', 'item_id', 'timestamp']
UID, IID = 'user_id', 'item_id'
UMAP_FILE, IMAP_FILE,  = '{dataset}_u_map.tsv', '{dataset}_i_map.tsv'
U_I_PAIR_FILE = '{dataset}_u_i_pairs.tsv'
POS_NEG_FILE = '{dataset}_user_items_negs.tsv'

def get_input_file(dataset, meta=False):
    fname = f'{dataset}.jsonl'
    if meta:
        fname = 'meta_' + fname
    return os.path.join(IN_PATH, fname)

def read_jsonl_as_pd(fname):
    with open(fname, encoding="utf8") as f:
        lines = [json.loads(x) for x in f.read().splitlines()]
    df = pd.DataFrame(lines)
    return df

def prep_for_kcore(df):
    print(f'Before dropped: {df.shape}')
    df = df.drop(columns=['title', 'text', 'images', 'asin', 'helpful_vote', 'verified_purchase', 'rating'])
    df = df.rename(columns={'parent_asin':'item_id'})
    df.dropna(subset=KEY_COLUMNS, inplace=True)
    df.drop_duplicates(subset=KEY_COLUMNS, inplace=True)
    print(f'After dropped: {df.shape}')
    return df


def find_invalid_freq_ids(df, field, max_num=np.inf, min_num=-1):
    inter_cnt = Counter(df[field].values)
    blocklist = {x for x,cnt in inter_cnt.items() if not (min_num <= cnt <= max_num)}
    return blocklist


def filter_by_k_core(df, min_u_num, min_i_num):
    iteration = 0
    df = df.copy()
    print('Calculating k-core...')
    while True:
        ban_users = find_invalid_freq_ids(df, field=UID, min_num=min_u_num)
        ban_items = find_invalid_freq_ids(df, field=IID, min_num=min_i_num)
        if len(ban_users) == 0 and len(ban_items) == 0:
            print(f"{len(df.index)} rows left in (u={min_u_num},i={min_i_num})-core")
            break

        dropped_inter = pd.Series(False, index=df.index)
        dropped_inter |= df[UID].isin(ban_users)
        dropped_inter |= df[IID].isin(ban_items)
        print(f'\titeration {iteration}: {len(dropped_inter)} dropped interactions',
             f"with {len(ban_users)} users banned and {len(ban_items)} items banned")
        df.drop(df.index[dropped_inter], inplace=True)
        iteration += 1
    return df

def get_input_file(dataset, meta=False):
    fname = f'{dataset}.jsonl'
    if meta:
        fname = 'meta_' + fname
    return os.path.join(IN_PATH, fname)

def reindex(df):
    df.reset_index(drop=True, inplace=True)

    uniq_users = pd.unique(df[UID])
    uniq_items = pd.unique(df[IID])

    # start from 0
    u_map = {k: i for i, k in enumerate(uniq_users)}
    i_map = {k: i for i, k in enumerate(uniq_items)}

    df[UID] = df[UID].map(u_map)
    df[IID] = df[IID].map(i_map)
    df[UID] = df[UID].astype(int)
    df[IID] = df[IID].astype(int)
    df.sort_values(by=[IID, 'timestamp'], inplace=True)
    return df, u_map, i_map

def neg_samples(df, neg=5, neg_multiplier=3):
    all_items = list(df[IID].unique())
    items_per_user = df.groupby(UID)[IID].unique().reset_index().rename(columns={IID: 'items'})
    items_per_user['samples'] = list(np.random.choice(all_items, size=(len(items_per_user.index), neg_multiplier*neg), replace=True))
    user_neg = []
    for user, row in items_per_user.iterrows():
        samples = row['samples']
        items = row['items']
        neg_samples = set(samples) - set(items)
        if len(neg_samples) < neg:
            print(f"FIXME(peijunz): not enough negative samples for user {user}")
        user_neg.append(','.join(str(i) for i in list(neg_samples)[:5]))
    items_per_user['neg'] = user_neg
    items_per_user.drop(columns=['samples', 'items'], inplace=True)
    return items_per_user

def pos_samples(df, pos=6):
    # need to sort positive items by timestamp...
    item_frequency = df.groupby(IID).size().reset_index(name='frequency')
    freq = df.merge(item_frequency, on=IID).sort_values(by=[UID, 'frequency'], ascending=False)
    pos_df = freq[freq.groupby(UID)['frequency'].rank(method="dense", ascending=False) <= pos]
    return pos_df.groupby(UID)[IID].agg(lambda x:','.join(str(i) for i in x)).reset_index().rename(columns={IID: 'pos'})


def user_items_negs(df, pos=6, neg=5):
    pos = pos_samples(df)
    neg = neg_samples(df)
    return pos.merge(neg, on=UID)


def save_to_csv(dataset, df, u_map, i_map):
    # save interactions
    u_i_pair_path = os.path.join(OUT_PATH, U_I_PAIR_FILE.format(dataset=dataset))
    df.to_csv(u_i_pair_path, sep='\t', index=False)
    print(f"saved file {u_i_pair_path}")
   
    # save u_map
    u_path = os.path.join(OUT_PATH, UMAP_FILE.format(dataset=dataset))
    (pd.DataFrame(list(u_map.items()), columns=['original', UID])
            .to_csv(u_path, sep='\t', index=False))
    print(f"saved file {u_path}")

    # save i_map
    i_path = os.path.join(OUT_PATH, IMAP_FILE.format(dataset=dataset))
    (pd.DataFrame(list(i_map.items()), columns=['original', IID])
            .to_csv(i_path, sep='\t', index=False))
    print(f"saved file {i_path}")
    
    # save user item negs
    
    pos_neg_path = os.path.join(OUT_PATH, POS_NEG_FILE.format(dataset=dataset))
    user_items_negs(df, pos=6, neg=5).to_csv(pos_neg_path, sep='\t', index=False)
    print(f"saved file {pos_neg_path}")

def process_dataset(dataset=MAGAZINE_DATASET, core_req=(6,6)):
    df = read_jsonl_as_pd(get_input_file(dataset))
    df = prep_for_kcore(df)
    df = filter_by_k_core(df, *core_req)
    df, u_map, i_map = reindex(df)
    save_to_csv(dataset, df, u_map, i_map)
    return df

In [131]:
process_dataset(MAGAZINE_DATASET, (3,3))

Before dropped: (71497, 10)
After dropped: (70922, 3)
Calculating k-core...
	iteration 0: 70922 dropped interactions with 58199 users banned and 1361 items banned
	iteration 1: 7670 dropped interactions with 50 users banned and 722 items banned
	iteration 2: 6629 dropped interactions with 368 users banned and 10 items banned
	iteration 3: 5954 dropped interactions with 8 users banned and 63 items banned
	iteration 4: 5832 dropped interactions with 52 users banned and 3 items banned
	iteration 5: 5726 dropped interactions with 3 users banned and 8 items banned
	iteration 6: 5706 dropped interactions with 5 users banned and 0 items banned
	iteration 7: 5696 dropped interactions with 0 users banned and 2 items banned
	iteration 8: 5692 dropped interactions with 1 users banned and 0 items banned
5690 rows left in (u=3,i=3)-core
saved file ./processed/Magazine_Subscriptions_u_i_pairs.tsv
saved file ./processed/Magazine_Subscriptions_u_map.tsv
saved file ./processed/Magazine_Subscriptions_i_

Unnamed: 0,item_id,user_id,timestamp
4630,0,1150,1316789603000
466,0,95,1354819669000
3213,0,776,1410651210000
4279,0,1054,1415671770000
0,0,0,1648232610565
...,...,...,...
4888,458,1222,1315182026000
5587,459,1409,1004136316000
5030,459,1252,1039983406000
5095,459,1271,1128047433000


In [128]:
process_dataset(BEAUTY_DATASET, (5,5))

Before dropped: (701528, 10)
After dropped: (694252, 3)
Calculating k-core...
	iteration 0: 694252 dropped interactions with 630509 users banned and 85333 items banned
	iteration 1: 10446 dropped interactions with 576 users banned and 3203 items banned
	iteration 2: 5137 dropped interactions with 298 users banned and 69 items banned
	iteration 3: 4164 dropped interactions with 23 users banned and 133 items banned
	iteration 4: 3640 dropped interactions with 60 users banned and 16 items banned
	iteration 5: 3362 dropped interactions with 8 users banned and 58 items banned
	iteration 6: 3128 dropped interactions with 31 users banned and 9 items banned
	iteration 7: 2981 dropped interactions with 4 users banned and 24 items banned
	iteration 8: 2881 dropped interactions with 12 users banned and 5 items banned
	iteration 9: 2814 dropped interactions with 1 users banned and 8 items banned
	iteration 10: 2778 dropped interactions with 3 users banned and 1 items banned
	iteration 11: 2762 dro

Unnamed: 0,item_id,user_id,timestamp
709,0,56,1621346359769
247,0,16,1621894083603
813,0,65,1623685953622
0,0,0,1627391044559
1215,0,111,1628864954140
...,...,...,...
2055,367,211,1618794367742
2562,367,254,1619398427533
2203,367,223,1621107418771
2326,367,233,1621831743566
