In [16]:
import os
import sys
import numpy as np
import pandas as pd
import scipy.sparse as sp
import pickle
import statistics
from matplotlib import pyplot as plt
from collections import Counter
import random
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
def fix_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``torch.manual_seed`` plus the CUDA seeding calls), sets the
    ``PYTHONHASHSEED`` environment variable, and turns cuDNN's
    ``deterministic`` flag on and its ``benchmark`` flag off.

    Args:
        seed: Value passed to each seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
# Seed all random number generators once, before any of the cells below draw random values.
fix_seed(123)

In [4]:
### Print the number of users, bundles, items

def get_counters(dataset_name):
    """Read and print the user / item / bundle counts of a dataset.

    The counts come from the first line of
    ``../data/<dataset_name>/<dataset_name>_data_size.txt``, which is split on
    tabs; its first three integers are read as users, bundles, items (in that
    order).

    Args:
        dataset_name: Name of the dataset directory under ``../data``.

    Returns:
        Tuple ``(num_users, num_items, num_bundles)``. Note that items come
        before bundles here, unlike the order in the file.
    """
    print("\n" + dataset_name)
    # os.path.join instead of a hand-written "..\data\..." literal: the old
    # f-string relied on invalid escape sequences and only resolved on Windows.
    size_file = os.path.join("..", "data", dataset_name, f"{dataset_name}_data_size.txt")
    with open(size_file) as f:
        num_users, num_bundles, num_items = [int(s) for s in f.readline().split('\t')][:3]
    print(f"num users: {num_users}")
    print(f"num_items: {num_items}")
    print(f"num_bundles: {num_bundles}")
    return num_users, num_items, num_bundles

In [5]:
# Create an item-item file listing, for each item, the items it co-occurs with in at least one bundle
def create_cooccurences_file(dataset_name, num_items):
    """Write ``item_item.txt``: every ordered pair of items sharing a bundle.

    Reads ``../data/<dataset_name>/bundle_item.txt`` (tab-separated
    ``bundle``/``item`` rows) and writes one ``item<TAB>other_item`` line for
    each item in ``range(num_items)`` and each *other* item that appears in at
    least one of the same bundles. Lines are grouped by the left item; within
    a group the partners are written in ascending order so that the output is
    deterministic.

    Args:
        dataset_name: Name of the dataset directory under ``../data``.
        num_items: Items ``0 .. num_items - 1`` are processed; items that are
            in no bundle produce no lines.
    """
    data_dir = os.path.join("..", "data", dataset_name)
    df = pd.read_csv(os.path.join(data_dir, "bundle_item.txt"), delimiter="\t", names=["bundle", "item"])

    # Index the bundle/item relation once in both directions instead of
    # re-filtering the whole frame twice per item.
    bundles_by_item = {}
    items_by_bundle = {}
    for bundle, item in zip(df["bundle"].tolist(), df["item"].tolist()):
        bundles_by_item.setdefault(item, set()).add(bundle)
        items_by_bundle.setdefault(bundle, set()).add(item)

    with open(os.path.join(data_dir, "item_item.txt"), 'w') as output_file:
        for item in range(num_items):
            cooccuring_items = set()
            for bundle in bundles_by_item.get(item, ()):
                cooccuring_items |= items_by_bundle[bundle]
            # An item is never listed as co-occurring with itself.
            cooccuring_items.discard(item)
            for citem in sorted(cooccuring_items):
                output_file.write(f"{item}\t{citem}\n")

In [None]:
# Take a subset of the positive item pairs and create a negative match for each of them
def Create_negative_item_item_file_subset(dataset_name, total_num_items, num_pos_interactions_to_take):
    """Sample positive item pairs and write a negative partner for each.

    Reads ``../data/<dataset_name>/item_item.txt`` (tab-separated
    ``item1``/``item2`` positive pairs), draws ``num_pos_interactions_to_take``
    row indices with replacement (so a pair can be picked more than once),
    and for each drawn pair picks a random item that is neither ``item1``
    itself nor one of ``item1``'s positive partners.

    Two files are written next to the input:
    ``item_item_negatives_10M.txt`` (``item1<TAB>negative``) and
    ``item_item_positives_10M.txt`` (``item1<TAB>item2``), line-aligned.

    Args:
        dataset_name: Name of the dataset directory under ``../data``.
        total_num_items: Negatives are drawn from ``0 .. total_num_items - 1``.
        num_pos_interactions_to_take: Number of positive pairs to sample.

    NOTE(review): the resampling loop does not terminate if some item is
    positive with every other item — confirm that cannot happen in the data.
    """
    data_dir = os.path.join("..", "data", dataset_name)
    netease_item_item = pd.read_csv(os.path.join(data_dir, 'item_item.txt'), delimiter="\t",
                                    names=["item1", "item2"])
    num_positive_samples = len(netease_item_item)
    fix_seed(123)

    # Passing the population size as an int samples from arange(n) without
    # first converting a Python range into an array.
    chosen_positive_interactions = np.random.choice(num_positive_samples, num_pos_interactions_to_take, replace=True)
    # Sorted so that equal item1 values end up adjacent whenever the input
    # file is grouped by item1, which lets the cache below be reused.
    chosen_positive_interactions.sort()
    small_netease_item_item = netease_item_item.iloc[chosen_positive_interactions]

    negative_items = np.random.choice(total_num_items, num_pos_interactions_to_take, replace=True)
    item1_list = small_netease_item_item["item1"].values
    item2_list = small_netease_item_item["item2"].values

    # Cache the positive set of the most recent item1 to avoid re-filtering
    # the full frame for consecutive rows that share the same item1.
    latest_item1_index = -1
    latest_positives = set()
    for i in tqdm(range(num_pos_interactions_to_take)):
        current_item1_index = item1_list[i]
        if current_item1_index != latest_item1_index:
            latest_positives = set(netease_item_item[netease_item_item["item1"] == current_item1_index]["item2"])
            latest_item1_index = current_item1_index
        # Redraw until the candidate is a true negative for this item1.
        while negative_items[i] == current_item1_index or negative_items[i] in latest_positives:
            negative_items[i] = np.random.choice(total_num_items)

    # Output paths are built with os.path.join, like the input path, so they
    # resolve on every platform instead of only on Windows.
    with open(os.path.join(data_dir, "item_item_negatives_10M.txt"), 'w') as output_file:
        for item1, neg in zip(item1_list, negative_items):
            output_file.write(f"{item1.item()}\t{neg.item()}\n")

    with open(os.path.join(data_dir, "item_item_positives_10M.txt"), 'w') as output_file:
        for item1, pos in zip(item1_list, item2_list):
            output_file.write(f"{item1.item()}\t{pos.item()}\n")

In [19]:
# Pandas version - all items
def Create_negative_item_item_file(dataset_name, total_num_items):
    """Write one negative partner for every positive pair (pandas version).

    Reads ``../data/<dataset_name>/item_item.txt`` (tab-separated
    ``item1``/``item2`` positive pairs) and, for each row, picks a random item
    that is neither ``item1`` itself nor one of ``item1``'s positive partners.
    The result is written to ``item_item_negatives3.txt`` in the same
    directory as ``item1<TAB>negative`` lines, in input row order.

    Args:
        dataset_name: Name of the dataset directory under ``../data``.
        total_num_items: Negatives are drawn from ``0 .. total_num_items - 1``.

    NOTE(review): a later cell defines another function with this exact name
    (the CSR version); whichever definition was executed last is the one that
    actually gets called.
    """
    data_dir = os.path.join("..", "data", dataset_name)
    netease_item_item = pd.read_csv(os.path.join(data_dir, 'item_item.txt'), delimiter="\t", names=["item1", "item2"])
    num_positive_samples = len(netease_item_item)
    fix_seed(123)
    # An int population samples from arange(n) without building a range array.
    negative_items = np.random.choice(total_num_items, num_positive_samples, replace=True)
    # Use the underlying NumPy array: iterating a pandas Series yields plain
    # Python ints, which have no .item() method, so the write loop below
    # would fail after all the sampling work was already done.
    item1_list = netease_item_item["item1"].values

    # Cache the positive set of the most recent item1 so consecutive rows
    # with the same item1 do not re-filter the whole frame.
    latest_item1_index = -1
    latest_positives = set()
    for i in tqdm(range(num_positive_samples)):
        current_item1_index = item1_list[i]
        if current_item1_index != latest_item1_index:
            latest_positives = set(netease_item_item[netease_item_item["item1"] == current_item1_index]["item2"])
            latest_item1_index = current_item1_index
        # Redraw until the candidate is a true negative for this item1.
        while negative_items[i] in latest_positives or negative_items[i] == current_item1_index:
            negative_items[i] = np.random.choice(total_num_items)

    with open(os.path.join(data_dir, "item_item_negatives3.txt"), 'w') as output_file:
        for item1, neg in zip(item1_list, negative_items):
            output_file.write(f"{item1.item()}\t{neg.item()}\n")

In [17]:
# CSR version - all items
def Create_negative_item_item_file(dataset_name, total_num_items):
    """Write one negative partner for every positive pair (CSR version).

    Reads ``../data/<dataset_name>/item_item.txt`` (tab-separated integer
    pairs), loads the pairs into a ``total_num_items x total_num_items`` CSR
    matrix, and for every non-zero entry ``(item1, item2)`` picks a random
    item that is neither ``item1`` itself nor a non-zero column of row
    ``item1``. The result is written to ``item_item_negatives2.txt`` in the
    same directory as ``item1<TAB>negative`` lines.

    Args:
        dataset_name: Name of the dataset directory under ``../data``.
        total_num_items: Matrix dimension; negatives are drawn from
            ``0 .. total_num_items - 1``.
    """
    data_dir = os.path.join("..", "data", dataset_name)
    with open(os.path.join(data_dir, 'item_item.txt'), 'r') as f:
        # int() ignores surrounding whitespace, so there is no need to slice
        # off the last character (which dropped a digit when the final line
        # had no trailing newline). Blank lines are skipped.
        items_paires = [tuple(int(i) for i in line.split('\t')) for line in f if line.strip()]
    # reshape keeps the array two-dimensional even when the file is empty.
    indice = np.array(items_paires, dtype=np.int32).reshape(-1, 2)
    values = np.ones(len(items_paires), dtype=np.float32)
    item_item_metrix = sp.coo_matrix((values, (indice[:, 0], indice[:, 1])), shape=(total_num_items, total_num_items)).tocsr()

    item1_list, positive_items = item_item_metrix.nonzero()
    num_positive_samples = item1_list.shape[0]
    # Seed here as the other negative-sampling functions in this notebook do,
    # so the output does not depend on which cells ran before this one.
    fix_seed(123)
    # A negative item2 for each item
    negative_items = np.random.choice(total_num_items, num_positive_samples, replace=True)

    # Rebuilding the positive set for every single pair dominated the run
    # time; keep the set of the most recent row and rebuild only on change.
    cached_row_index = -1
    cached_positives = set()
    for i in tqdm(range(num_positive_samples)):
        left_item_index = item1_list[i]
        if left_item_index != cached_row_index:
            cached_positives = set(item_item_metrix[left_item_index].nonzero()[1])
            cached_row_index = left_item_index
        # Redraw until the candidate is a true negative for this row.
        while negative_items[i] in cached_positives or negative_items[i] == left_item_index:
            negative_items[i] = np.random.choice(total_num_items)

    with open(os.path.join(data_dir, "item_item_negatives2.txt"), 'w') as output_file:
        for item1, neg in zip(item1_list, negative_items):
            output_file.write(f"{item1.item()}\t{neg.item()}\n")

In [None]:
def get_coocurrences_per_item(dataset_name):
    """Count, for every item, how many co-occurrence pairs it starts.

    Reads ``../data/<dataset_name>/item_item.txt`` (tab-separated
    ``item1``/``item2`` pairs) and counts the rows per ``item1``.

    NOTE(review): the original cell was left unfinished (it did not parse and
    ended in code pasted from the negative-sampling function). This
    implementation reproduces the ``item1`` / ``counts`` table shown in the
    cell's saved output — confirm that this is the intended result.

    Args:
        dataset_name: Name of the dataset directory under ``../data``.

    Returns:
        DataFrame with columns ``item1`` and ``counts``, one row per distinct
        ``item1`` value, ordered by ``item1``. Items that never appear as
        ``item1`` have no row.
    """
    item_item = pd.read_csv(os.path.join("..", "data", dataset_name, 'item_item.txt'), delimiter="\t",
                            names=["item1", "item2"])
    counts = item_item.groupby("item1")["item2"].count().reset_index(name="counts")
    return counts

Unnamed: 0,item1,counts
0,0,2
1,1,1
2,2,1
3,3,47
4,4,21
...,...,...
2812,2814,1
2813,2815,2
2814,2816,1
2815,2817,23


In [None]:
#for dataset_name in ["Steam", "NetEase", "Youshu"]:
#for dataset_name in ["NetEase"]:
#    print(dataset_name)
#    num_items = counters_dict[dataset_name][1]
#    Create_negative_item_item_file(dataset_name, num_items)

NetEase


 14%|████████▋                                                      | 32640894/237323182 [9:19:19<63:53:55, 889.79it/s]

In [None]:
# Split the data into train/val/test (80/10/10, with val+test capped at 60,000 rows per class)
def split_train_val_test(dataset_name):
    """Split the sampled positive and negative pairs into train / val / test files.

    Loads ``item_item_positives_10M.txt`` and ``item_item_negatives_10M.txt``
    from ``../data/<dataset_name>/``, labels them 1 and 0 respectively, and
    splits each class separately: 20% is held out, or exactly 60000 rows when
    20% of the positives would exceed 60000. The held-out part is then halved
    into validation and test. For each split the positives are followed by
    the negatives (no shuffling of the concatenation) and the result is
    written as a header-less, tab-separated file
    (``item_item_train.txt``, ``item_item_val.txt``, ``item_item_test.txt``).

    Args:
        dataset_name: Name of the dataset directory under ``../data``.
    """
    print(dataset_name)
    fix_seed(123)

    # The files hold item/item pairs; the column names are kept as they were.
    pair_columns = ["bundle", "item"]
    positives = pd.read_csv(f"../data/{dataset_name}/item_item_positives_10M.txt", delimiter="\t", names=pair_columns)
    positives["label"] = 1
    negatives = pd.read_csv(f"../data/{dataset_name}/item_item_negatives_10M.txt", delimiter="\t", names=pair_columns)
    negatives["label"] = 0

    # Absolute cap for large datasets, otherwise a 20% fraction.
    if len(positives) * 0.2 > 60000:
        holdout_size = 60000
    else:
        holdout_size = 0.2

    # The four splits are issued in this exact order; no random_state is
    # passed, so they presumably depend on the global seed set by fix_seed.
    positives_train, positives_holdout = train_test_split(positives, test_size=holdout_size)
    negatives_train, negatives_holdout = train_test_split(negatives, test_size=holdout_size)
    positives_val, positives_test = train_test_split(positives_holdout, test_size=0.5)
    negatives_val, negatives_test = train_test_split(negatives_holdout, test_size=0.5)

    # Positives first, then negatives, for every split.
    outputs = (
        (f"../data/{dataset_name}/item_item_train.txt", [positives_train, negatives_train]),
        (f"../data/{dataset_name}/item_item_val.txt", [positives_val, negatives_val]),
        (f"../data/{dataset_name}/item_item_test.txt", [positives_test, negatives_test]),
    )
    for target_path, class_parts in outputs:
        pd.concat(class_parts).to_csv(target_path, sep="\t", header=False, index=False)