In [1]:
import numpy as np
import pandas as pd
import os
import random
import pickle
from collections import defaultdict

In [2]:
def load_pickle(filename):
    """Read the pickle file at ``filename`` and return the object it contains.

    Note: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filename, mode="rb") as pickle_file:
        payload = pickle.load(pickle_file)
    return payload


def save_pickle(data, filename):
    """Serialize ``data`` into the file ``filename`` with the highest pickle protocol."""
    newest_protocol = pickle.HIGHEST_PROTOCOL
    with open(filename, mode="wb") as pickle_file:
        pickle.dump(data, pickle_file, newest_protocol)
        
def check_k_core(dataframe, user_core, item_core):
    """Count interactions per user and per item and test the k-core condition.

    Args:
        dataframe: frame with a ``uid`` column and an ``iid`` column.
        user_core: minimum number of rows every user must have.
        item_core: minimum number of rows every item must have.

    Returns:
        ``(user_count, item_count, is_k_core)`` where the two counts are
        ``defaultdict(int)`` objects mapping id -> number of rows, and
        ``is_k_core`` is ``True`` only when no user falls below ``user_core``
        and no item falls below ``item_core``.
    """
    def tally(column):
        # Occurrences of each distinct value in the given column.
        occurrences = defaultdict(int)
        for key in dataframe[column].tolist():
            occurrences[key] += 1
        return occurrences

    user_count = tally('uid')
    item_count = tally('iid')

    has_sparse_user = any(n < user_core for n in user_count.values())
    has_sparse_item = any(n < item_core for n in item_count.values())
    is_k_core = not has_sparse_user and not has_sparse_item
    return user_count, item_count, is_k_core

def delete_users_from_df(dataframe, user_list):
    """Return ``dataframe`` without the rows whose ``uid`` is in ``user_list``.

    Rows are selected with a positional boolean mask rather than dropped by
    index label, so a frame with duplicate index labels only loses the rows
    that actually match. The input frame is not modified.

    Args:
        dataframe: frame with a ``uid`` column.
        user_list: iterable of user ids to remove.

    Returns:
        A new frame with the remaining rows in their original order and a
        fresh 0..n-1 index.
    """
    keep_mask = ~dataframe['uid'].isin(user_list)
    # .values makes the selection purely positional (no index alignment).
    return dataframe.loc[keep_mask.values].reset_index(drop=True)

def delete_items_from_df(dataframe, item_list):
    """Return ``dataframe`` without the rows whose ``iid`` is in ``item_list``.

    Rows are selected with a positional boolean mask rather than dropped by
    index label, so a frame with duplicate index labels only loses the rows
    that actually match. The input frame is not modified.

    Args:
        dataframe: frame with an ``iid`` column.
        item_list: iterable of item ids to remove.

    Returns:
        A new frame with the remaining rows in their original order and a
        fresh 0..n-1 index.
    """
    keep_mask = ~dataframe['iid'].isin(item_list)
    # .values makes the selection purely positional (no index alignment).
    return dataframe.loc[keep_mask.values].reset_index(drop=True)

def filter_k_core(dataframe, user_core, item_core):
    """Repeatedly prune sparse users and items until the k-core condition holds.

    Each round counts interactions with ``check_k_core``; if some user has
    fewer than ``user_core`` rows or some item fewer than ``item_core`` rows,
    all such users and items (according to that round's counts) are removed
    and the check is repeated on the reduced frame.

    Returns:
        The filtered frame (the input itself if it already satisfied the
        condition).
    """
    while True:
        user_count, item_count, is_k_core = check_k_core(dataframe, user_core, item_core)
        if is_k_core:
            return dataframe

        sparse_users = [uid for uid, n_rows in user_count.items() if n_rows < user_core]
        sparse_items = [iid for iid, n_rows in item_count.items() if n_rows < item_core]
        dataframe = delete_users_from_df(dataframe, sparse_users)
        dataframe = delete_items_from_df(dataframe, sparse_items)

In [3]:
def get_data_name(raw_data_name):
    """Translate a raw Amazon category name into its short dataset name.

    Raises:
        NotImplementedError: if ``raw_data_name`` is not one of the known
            categories listed below.
    """
    known_names = (
        ('Books', 'Books'),
        ('Electronics', 'Electronics'),
        ('Beauty', 'Beauty'),
        ('CDs_and_Vinyl', 'CDs'),
        ('Movies_and_TV', 'Movies'),
        ('Toys_and_Games', 'Toys'),
        ('Clothing_Shoes_and_Jewelry', 'Clothing'),
    )
    for raw_name, short_name in known_names:
        if raw_data_name == raw_name:
            return short_name
    raise NotImplementedError
        
def load_data(raw_data_folder, raw_data_name, user_core, item_core):
    """Load a ratings CSV, apply k-core filtering and build per-user item sequences.

    The file ``ratings_<raw_data_name>.csv`` inside ``raw_data_folder`` is read
    with its columns named ``uid``, ``iid``, ``rating``, ``timestamp`` (in that
    order), filtered with ``filter_k_core`` and sorted by ``timestamp``.

    Args:
        raw_data_folder: directory containing the ratings CSV.
        raw_data_name: dataset name used in the CSV file name.
        user_core: minimum number of interactions required per user.
        item_core: minimum number of interactions required per item.

    Returns:
        ``defaultdict(list)`` mapping each user id to the list of item ids the
        user interacted with, ordered by ascending timestamp.

    Raises:
        FileNotFoundError: if the ratings CSV does not exist; the message
            contains the path that was looked up.
    """
    raw_data_path = os.path.join(raw_data_folder, 'ratings_{}.csv'.format(raw_data_name))
    if not os.path.exists(raw_data_path):
        raise FileNotFoundError(f'ratings file not found: {raw_data_path}')
    name_cols = ['uid', 'iid', 'rating', 'timestamp']
    print(f'load data from {raw_data_name} dataset')
    # Read the id columns as text: dtype inference could turn all-digit ids
    # into integers, which drops leading zeros and yields mixed-type columns.
    df = pd.read_csv(raw_data_path, names=name_cols, dtype={'uid': str, 'iid': str})
    print(f'apply user {user_core} core and item {item_core} core filters')
    df = filter_k_core(df, user_core, item_core)
    # A stable sort keeps rows with equal timestamps in their existing order;
    # the default quicksort gives no such guarantee for ties.
    df = df.sort_values('timestamp', kind='mergesort')
    users = df['uid'].tolist()
    items = df['iid'].tolist()
    user_num = len(df['uid'].unique())
    item_num = len(df['iid'].unique())
    print(f'The {raw_data_name} dataset has {len(users)} data with {user_num} users and {item_num} items')
    user_sequence = defaultdict(list)
    for user, item in zip(users, items):
        user_sequence[user].append(item)
    return user_sequence

def write_sequence_into_file(data_path, raw_data_name, user_sequence):
    """Write the user sequences to ``<data_path>/<data_name>/user_sequence.txt``.

    ``data_name`` is the short name returned by ``get_data_name``. Each output
    line is the user id followed by that user's item ids, separated by single
    spaces.

    Args:
        data_path: root directory for processed datasets.
        raw_data_name: raw category name; must be known to ``get_data_name``.
        user_sequence: mapping of user id -> iterable of item ids.
    """
    data_name = get_data_name(raw_data_name)
    data_folder = os.path.join(data_path, data_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_folder, exist_ok=True)
    data_file = os.path.join(data_folder, 'user_sequence.txt')
    with open(data_file, 'w') as out:
        for user, items in user_sequence.items():
            # str() keeps non-string ids (e.g. ints) from raising TypeError.
            out.write(str(user) + ' ' + ' '.join(map(str, items)) + '\n')

In [4]:
# Shared locations: the rating CSVs are read from raw_data_folder and the
# processed per-dataset folders are created under data_path.
raw_data_folder = '../raw_data/Amazon/'
raw_data_name = 'Books'
data_path = '../data/'
# Build the per-user item sequences for Books with a 5-core filter on both
# users and items, then write them out as user_sequence.txt.
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from Books dataset
apply user 5 core and item 5 core filters
The Books dataset has 8898041 data with 603668 users and 367982 items


In [5]:
# Same export for Electronics with the same 5-core thresholds; reuses the
# raw_data_folder and data_path values assigned in the Books cell.
raw_data_name = 'Electronics'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from Electronics dataset
apply user 5 core and item 5 core filters
The Electronics dataset has 1689188 data with 192403 users and 63001 items


In [6]:
# Same export for CDs_and_Vinyl (5-core users and items); get_data_name maps
# this category to the output folder name 'CDs'.
raw_data_name = 'CDs_and_Vinyl'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from CDs_and_Vinyl dataset
apply user 5 core and item 5 core filters
The CDs_and_Vinyl dataset has 1097592 data with 75258 users and 64443 items


In [7]:
# Same export for Movies_and_TV (5-core users and items); get_data_name maps
# this category to the output folder name 'Movies'.
raw_data_name = 'Movies_and_TV'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from Movies_and_TV dataset
apply user 5 core and item 5 core filters
The Movies_and_TV dataset has 1697533 data with 123960 users and 50052 items


In [8]:
# Same export for Toys_and_Games (5-core users and items); get_data_name maps
# this category to the output folder name 'Toys'.
raw_data_name = 'Toys_and_Games'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from Toys_and_Games dataset
apply user 5 core and item 5 core filters
The Toys_and_Games dataset has 167597 data with 19412 users and 11924 items


In [9]:
# Same export for Clothing_Shoes_and_Jewelry (5-core users and items);
# get_data_name maps this category to the output folder name 'Clothing'.
raw_data_name = 'Clothing_Shoes_and_Jewelry'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from Clothing_Shoes_and_Jewelry dataset
apply user 5 core and item 5 core filters
The Clothing_Shoes_and_Jewelry dataset has 278677 data with 39387 users and 23033 items


In [5]:
# Scratch inspection: read a ratings CSV with the first two columns named
# (iid, uid), i.e. the opposite order to the one used inside load_data.
# NOTE(review): raw_data_path is not assigned in any visible cell (it is only
# a local variable inside load_data), so this cell relies on leftover kernel
# state and will not survive Restart & Run All.
name_cols = ['iid', 'uid', 'rating', 'timestamp'] 
df = pd.read_csv(raw_data_path, names = name_cols)
# NOTE(review): df.count is referenced without being called, so the cell
# displays the bound-method repr; df.count() or df.shape was probably intended.
df.count

<bound method DataFrame.count of                  iid             uid  rating   timestamp
0         0000031895  A23K73OVXJ04EG     5.0  1391212800
1         0000031895  A2681T699HV6H1     4.0  1384905600
2         0000031895  A374PA18DCGS5Y     1.0  1477008000
3         0000031895  A14PVW2N5YBWSA     5.0  1476748800
4         0000031895  A2KWBC44QI2567     1.0  1476662400
...              ...             ...     ...         ...
12980832  B01HJHHBHG  A33DFHRKGPDEF7     3.0  1502323200
12980833  B01HJHHBHG   AT5N4QPWM1GKL     4.0  1501372800
12980834  B01HJHHBHG  A3ONWSRNZFNC3U     5.0  1497139200
12980835  B01HJHHBHG  A1OJNTT9ZTT82A     5.0  1496966400
12980836  B01HJFUCHO  A2SHB9ITG7EALR     5.0  1526515200

[12980837 rows x 4 columns]>

In [8]:
# Count interactions per user and per item in df and report whether every
# user and every item has at least 5 rows.
u_count, i_count, check = check_k_core(df, 5, 5)
check

True

In [7]:
# Iteratively remove users and items with fewer than 5 interactions.
# This overwrites df, so re-running the cell operates on the filtered frame.
df = filter_k_core(df, 5, 5)
# NOTE(review): df.count is not called here; the output is the bound-method repr.
df.count

<bound method DataFrame.count of                 iid             uid  rating   timestamp
0        0000032034  A180LQZBUWVOLF     5.0  1433289600
1        0000032034   ATMFGKU5SVEYY     1.0  1427846400
2        0000032034  A1QE70QBJ8U6ZG     5.0  1421107200
3        0000032034  A22CP6Z73MZTYU     5.0  1419292800
4        0000032034  A22L28G8NRNLLN     4.0  1418601600
...             ...             ...     ...         ...
2835120  B01HJGAJ9O  A3TN0U64HONOPB     5.0  1522886400
2835121  B01HJGAJ9O   AJDQLM8PT3YWT     5.0  1519862400
2835122  B01HJHHBHG  A3QK5ZLRE2KHLL     5.0  1521244800
2835123  B01HJHHBHG  A3VDML80KNR9QQ     5.0  1509148800
2835124  B01HJHHBHG  A3ONWSRNZFNC3U     5.0  1497139200

[2835125 rows x 4 columns]>

In [9]:
# Order the interactions by ascending timestamp (original index labels are kept).
df = df.sort_values('timestamp')
# NOTE(review): df.count is not called here; the output is the bound-method repr.
df.count

<bound method DataFrame.count of                 iid             uid  rating   timestamp
2489     B00004U9IY  A1M2T0J45TTE64     5.0   968976000
57986    B0007VZVGK  A1DJ7PH72PMJIO     5.0   995155200
2779     B000051WSI   A1RPTVW5VEOSI     5.0  1014249600
1685     B00004S9I0  A1M2T0J45TTE64     5.0  1015459200
89474    B000B545E6  A3FS8HDE2BTD5Z     5.0  1020211200
...             ...             ...     ...         ...
2834348  B01HCKR9ZE   AW7CIDUOZLIVJ     4.0  1538438400
2834907  B01HHB2HK0  A1P3ROFFUTEWKG     2.0  1538524800
2834347  B01HCKR9ZE  A11AT3PC06Y5WT     5.0  1538524800
2834595  B01HEGK5U2   A13H2ZRENPV9E     5.0  1538524800
2835067  B01HIY7NPU  A30DF0WFD9U1PT     1.0  1538611200

[2835125 rows x 4 columns]>

In [10]:
# Parallel Python lists of user ids and item ids, in the current row order of df.
users = df['uid'].tolist()
items = df['iid'].tolist()

In [13]:
# Number of distinct values in the uid column of df.
len(df['uid'].unique())

331844

In [31]:
import json
def load_json(file_path):
    """Parse the JSON document stored at ``file_path`` and return the result."""
    with open(file_path, mode="r") as json_file:
        parsed = json.load(json_file)
    return parsed
    
def ReadLineFromFile(path):
    """Return the lines of the text file at ``path`` with trailing newlines removed.

    Only ``'\\n'`` characters are stripped from the end of each line; other
    trailing whitespace is kept.
    """
    with open(path, 'r') as text_file:
        return [raw_line.rstrip('\n') for raw_line in text_file]

In [30]:
# Load the id-mapping tables stored with the P5 "beauty_old" data and show
# which mappings the file contains.
datamaps = load_json("../../P5/data/beauty_old/datamaps.json")
print(datamaps.keys())

dict_keys(['user2id', 'item2id', 'id2user', 'id2item', 'attribute2id', 'id2attribute', 'attributeid2num'])


In [32]:
# Read the P5 "beauty_old" sequence file as a list of lines (newlines stripped).
sequential_data = ReadLineFromFile('../../P5/data/beauty_old/sequential_data.txt')

In [36]:
# Translate every sequence line through the datamaps lookups: the first
# whitespace-separated token is looked up in id2user and each remaining token
# in id2item. The result is one list per line: [user, item, item, ...].
original_data = []
id2u = datamaps['id2user']
id2i = datamaps['id2item']
for line in sequential_data:
    # `line` is rebound from the raw string to its list of tokens.
    line= line.split()
    data = [id2u[line[0]]] + [id2i[line[i]] for i in range(1, len(line))]
    original_data.append(data)

In [37]:
# Write the translated sequences, one space-separated line per entry of
# original_data. The target folder must already exist; open() does not create it.
with open('../data/Beauty/user_sequence.txt', 'w') as out:
    for data in original_data:
        out.write(' '.join(data) + '\n')

In [40]:
# Re-read the Beauty file that was just written and index it by user:
# the first space-separated token of each line is the key, the rest the value.
# Note that sequential_data is rebound here to the Beauty file's lines.
sequential_data = ReadLineFromFile('../data/Beauty/user_sequence.txt')
user_seq_dict = dict()
for line in sequential_data:
    user_seq = line.split(" ")
    user_seq_dict[user_seq[0]] = user_seq[1:]

In [41]:
# Spot check: first raw line of the Beauty sequence file.
sequential_data[0]

'A1YJEY40YUW4SE B004756YJA B004ZT0SSG B0020YLEYK 7806397051 B002WLWX82'

In [42]:
# Spot check: item list stored for one user id.
user_seq_dict['A1YJEY40YUW4SE']

['B004756YJA', 'B004ZT0SSG', 'B0020YLEYK', '7806397051', 'B002WLWX82']

In [43]:
# NOTE(review): this cell repeats, statement for statement, the earlier cell
# that builds user_seq_dict from the Beauty user_sequence.txt file.
sequential_data = ReadLineFromFile('../data/Beauty/user_sequence.txt')
user_seq_dict = dict()
for line in sequential_data:
    user_seq = line.split(" ")
    user_seq_dict[user_seq[0]] = user_seq[1:]

In [46]:
# Give every user a 1-based integer id, assigned in the iteration order of
# user_seq_dict (the first key gets 1, the next 2, ...).
user_map = dict()
for user in user_seq_dict.keys():
    user_map[user] = len(user_map) + 1

In [47]:
# Spot check: integer id assigned to one user.
user_map['A1YJEY40YUW4SE']

1

In [50]:
def WriteDictToFile(path, write_dict):
    """Write one ``key value`` line per entry of ``write_dict`` to ``path``.

    A list value is written as its elements separated by single spaces; any
    other value is written as ``str(value)``. Keys and list elements are
    converted with ``str`` so that non-string ids (e.g. ints) do not raise
    ``TypeError``, and list subclasses are treated like plain lists.

    Args:
        path: output text file; it is overwritten if it exists.
        write_dict: mapping to serialize, written in iteration order.
    """
    with open(path, 'w') as out:
        for user, items in write_dict.items():
            if isinstance(items, list):
                value_text = ' '.join(map(str, items))
            else:
                value_text = str(items)
            out.write(str(user) + ' ' + value_text + '\n')

In [51]:
# Persist the user -> integer id mapping, one "user id" pair per line.
WriteDictToFile('../data/Beauty/user_map.txt', user_map)

In [3]:
# Scratch check of integer floor division.
2 // 3

0

In [None]:
import torch.distributed as dist
def cal(rank):
    