In [1]:
import pandas as pd
import numpy as np
import sklearn.model_selection
from tqdm import tqdm

In [2]:
original_dataset = pd.read_csv('../daytrip_click.csv', names=['user_id', 'item_id'],
                               usecols=[0, 1], dtype={0: np.int32, 1: np.int32}, header=None)

original_dataset['user_id'] = original_dataset['user_id'].apply(lambda x: x - 1)
original_dataset['item_id'] = original_dataset['item_id'].apply(lambda x: x - 1)

In [3]:
row_ind = original_dataset['user_id']
col_ind = original_dataset['item_id']
label = np.ones(len(row_ind))

user_number = row_ind.max() + 1
item_number = col_ind.max() + 1

In [4]:
# Split the data into training and validation sets (80% training, 20% validation)
train_dataset, val_dataset = sklearn.model_selection.train_test_split(original_dataset, test_size=0.2)

# Get the list of all unique items
all_items = original_dataset['item_id'].unique()

# Create a dictionary for each user with the items they have clicked
user_clicked_items = original_dataset.groupby('user_id')['item_id'].apply(set).to_dict()

# Function to get negative samples
def get_negative_samples(user_id, num_samples=4):
    clicked_items = user_clicked_items.get(user_id, set())
    possible_items = [item for item in all_items if item not in clicked_items]
    
    negative_samples = np.random.choice(
        possible_items,
        size=num_samples if num_samples <= len(possible_items) else len(possible_items), 
        replace=False
    )
    
    return '|'.join(map(str, negative_samples))

# Create the validation dataset with negative samples
tqdm.pandas()
val_dataset['negative_item_ids'] = val_dataset['user_id'].progress_apply(get_negative_samples)

# Save the datasets to CSV files (optional)
train_dataset.to_csv('../daytrip.train.csv', index=False, header=False)
train_dataset.to_csv('../daytrip.val.csv', index=False, header=False)

  2%|▏         | 15820/964161 [02:15<2:15:05, 117.01it/s]


KeyboardInterrupt: 