In [3]:
import pandas as pd
import numpy as np
import random
import json

from tqdm import tqdm
from collections import defaultdict

## Making `ml-1m.train.rating`, `ml-1m.test.rating`

In [4]:
# Load the raw interaction log. Later cells read its "user", "item" and "time" columns.
train_df = pd.read_csv("./train_ratings.csv")

In [5]:
# Distinct raw ids in ascending order; each id's position in its list is what
# the reindexing step later uses as the dense id.
user_list = train_df["user"].unique().tolist()
user_list.sort()

item_list = train_df["item"].unique().tolist()
item_list.sort()

In [4]:
# Peek at the first ten raw ids of each kind.
print (f"user_list : {user_list[:10]}\nitem_list : {item_list[:10]}")

user_list : [11, 14, 18, 25, 31, 35, 43, 50, 58, 60]
item_list : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [6]:
# Map every raw id to a dense, zero-based index given by its position in the
# sorted id lists. Plain dicts are used on purpose: a defaultdict(int) would
# silently hand out index 0 (a real user/item) for any unknown id, and insert
# it into the mapping, instead of raising KeyError.
user2reindex = {user: user_reindex for user_reindex, user in enumerate(user_list)}
item2reindex = {item: item_reindex for item_reindex, item in enumerate(item_list)}

# example
# user 11 -> 0
# item 1  -> 0

# users: 0 ~ 31359
# items: 0 ~ 6806

31360it [00:00, 937067.64it/s]
6807it [00:00, 581953.27it/s]


In [6]:
# Translate every raw item id through the mapping so the dense ids can be inspected.
item_reindex = [item2reindex[raw_item] for raw_item in tqdm(item_list)]

100%|██████████| 6807/6807 [00:00<00:00, 919386.47it/s]


In [7]:
# Largest dense item id handed out by the mapping.
max(item_reindex)

6806

In [8]:
# Leave-one-out split: the first row seen for each user becomes that user's
# test interaction, every other row goes to the training set.
# NOTE(review): in the sample output the held-out row for user 0 carries the
# smallest timestamp, i.e. the earliest interaction is held out rather than
# the latest -- confirm this is intended.
IMPLICIT_RATING = 3.0  # constant value written to the "rating" field of every row

train_ratings = list()
test_ratings = list()

# Track users by membership instead of comparing against the previous row, so
# a user whose rows are not contiguous still gets exactly one test row.
held_out_users = set()
for user, item, timestamp in tqdm(
    zip(train_df["user"], train_df["item"], train_df["time"]),
    total=len(train_df),
):
    row = [user2reindex[user], item2reindex[item], IMPLICIT_RATING, timestamp]
    if user in held_out_users:
        train_ratings.append(row)
    else:
        held_out_users.add(user)
        test_ratings.append(row)


100%|██████████| 5154471/5154471 [00:09<00:00, 552176.13it/s]


In [9]:
# Show the first five rows of each split.
print (f"train_ratings : {train_ratings[:5]}\ntest_ratings : {test_ratings[:5]}")

train_ratings : [[0, 109, 3.0, 1230782534], [0, 319, 3.0, 1230782539], [0, 368, 3.0, 1230782542], [0, 1183, 3.0, 1230782563], [0, 1510, 3.0, 1230782583]]
test_ratings : [[0, 2505, 3.0, 1230782529], [1, 4101, 3.0, 1225308746], [2, 1039, 3.0, 1195573195], [3, 162, 3.0, 1277961618], [4, 161, 3.0, 1424733433]]


In [10]:
# Wrap both splits in DataFrames sharing one column layout, then write each
# one to CSV without the positional index.
rating_columns = ["user", "item", "rating", "timestamp"]

train_ratings_df = pd.DataFrame(train_ratings, columns=rating_columns)
test_ratings_df = pd.DataFrame(test_ratings, columns=rating_columns)

train_ratings_df.to_csv("./new_train_ratings.csv", index=False)
test_ratings_df.to_csv("./new_test_ratings.csv", index=False)

## Making `ml-1m.test.negative`

In [7]:
# Per-user list of every item the user interacted with, indexed by raw user id.
user_seen = train_df.groupby("user")["item"].apply(list)

In [12]:
def get_negative_sample(user: int):
    """Draw one random item the user has not interacted with.

    Args:
        user: Raw (not reindexed) user id; must be a key of ``user_seen``.

    Returns:
        The dense index (looked up in ``item2reindex``) of an item chosen at
        random from ``item_list`` that is absent from ``user_seen[user]``.

    Raises:
        ValueError: If the user has already seen every item in ``item_list``;
            the rejection-sampling loop would otherwise never terminate.
    """
    # Materialise the history once as a set: O(1) membership per retry instead
    # of re-fetching and linearly scanning the list on every loop iteration.
    seen = set(user_seen[user])
    # The cheap length test short-circuits the full scan for ordinary users
    # (item_list holds distinct ids, so a smaller set cannot cover it).
    if len(seen) >= len(item_list) and seen.issuperset(item_list):
        raise ValueError(f"user {user} has no unseen items to sample from")

    item = random.choice(item_list)
    while item in seen:
        item = random.choice(item_list)
    return item2reindex[item]

In [8]:
# Set of every raw item id that occurs in the interaction log.
all_item = set(train_df["item"])

def get_all_negative_sample(user: int):
    """Return every item the user has not interacted with, as dense indices.

    The result is expressed in the reindexed item id space (``item2reindex``),
    the same space used by the exported rating rows and by
    ``get_negative_sample``, rather than in raw item ids.

    Args:
        user: Raw (not reindexed) user id; must be a key of ``user_seen``.

    Returns:
        Ascending list of dense item indices for all items in ``all_item``
        that are absent from ``user_seen[user]``.
    """
    unseen_item = all_item - set(user_seen[user])
    # Sorting makes the output independent of set iteration order.
    return sorted(item2reindex[item] for item in unseen_item)
    

In [9]:
# Collect, for each user (keyed by dense user index), the full list of items
# that user never interacted with.
negative_list = defaultdict(list)

for raw_user in tqdm(user_list):
    dense_user = user2reindex[raw_user]
    negative_list[dense_user] += get_all_negative_sample(raw_user)

100%|██████████| 31360/31360 [00:16<00:00, 1863.97it/s]


In [10]:
# Persist the per-user negatives as indented JSON (the integer user keys are
# serialised as strings by the json module).
with open("./new_all_test_negative.json", "w") as out_file:
    json.dump(negative_list, fp=out_file, indent=4)