In [17]:
import pandas as pd
import numpy as np
import sklearn.model_selection
import scipy.sparse as sp

In [18]:
# Load only the first two columns (user id, item id) of the tab-separated
# ml-100k interaction file; the remaining columns are not read.
dataset = pd.read_csv(
    'ml-100k/u.data',
    sep="\t",
    header=None,
    names=['user_id', 'item_id'],
    usecols=[0, 1],
    # Key the dtypes by the assigned column names rather than by position.
    dtype={'user_id': np.int32, 'item_id': np.int32},
)

# Shift both id columns down by one so they can be used directly as
# zero-based row/column indices (presumably the raw ids start at 1 — verify
# against the data file). Vectorized subtraction replaces the per-element
# lambda and keeps the int32 dtype.
dataset['user_id'] = dataset['user_id'] - 1
dataset['item_id'] = dataset['item_id'] - 1

In [19]:
# Build a dense user x item indicator matrix from the observed interactions
# and list every (user, item) cell that has no interaction.
row_ind = dataset['user_id']
col_ind = dataset['item_id']
data = np.ones(len(row_ind))

# Size the matrix from the zero-based ids actually present instead of a
# hard-coded shape: a shape larger than the data adds all-zero rows that show
# up as "negatives" for users that do not exist, and a smaller one makes
# csr_matrix fail.
n_users = int(row_ind.max()) + 1
n_items = int(col_ind.max()) + 1

# Repeated (user, item) pairs are summed by csr_matrix, so any observed pair
# ends up non-zero.
dataset_array = sp.csr_matrix((data, (row_ind, col_ind)), shape=(n_users, n_items)).toarray()

# One row per zero cell: column 0 is the user index, column 1 the item index.
negative_pairs = np.argwhere(dataset_array == 0)
negative_dataset = pd.DataFrame(negative_pairs, columns=["user_id", "item_id"])

In [20]:
# Hold out 20% of the observed interactions for validation. The fixed
# random_state makes the split identical on every run (e.g. after a kernel
# restart), so the written train/val files are reproducible.
train_dataset, val_dataset = sklearn.model_selection.train_test_split(
    dataset, test_size=0.2, random_state=42
)

In [21]:
def create_val_dataset(positive_df, negative_df, n, random_state=None):
    """Pair every positive interaction with ``n`` sampled negative items.

    Parameters
    ----------
    positive_df : pandas.DataFrame
        Needs ``user_id`` and ``item_id`` columns. One output row is produced
        per input row, in the same order.
    negative_df : pandas.DataFrame
        ``user_id`` / ``item_id`` pairs that negatives are drawn from; the
        items are grouped per user.
    n : int
        Number of distinct negative items to draw (without replacement) for
        each positive row.
    random_state : None, int or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global random state, so a prior
        ``np.random.seed(...)`` still applies. Any other value is passed to
        ``np.random.default_rng`` to make the sampling repeatable.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``pos_item_id`` and ``neg_item_id``, where
        ``neg_item_id`` is the sampled item ids joined with ``"|"``. The
        columns are present even when ``positive_df`` is empty.

    Raises
    ------
    ValueError
        If a user in ``positive_df`` has fewer than ``n`` negative items.
    """
    columns = ['user_id', 'pos_item_id', 'neg_item_id']
    sampler = np.random if random_state is None else np.random.default_rng(random_state)

    # Convert each user's candidate list to an array once, instead of on
    # every draw.
    grouped = negative_df.groupby('user_id')['item_id'].apply(list).to_dict()
    negative_pool = {user: np.asarray(items) for user, items in grouped.items()}

    rows = []
    # Iterating the two columns directly avoids building a Series per row.
    for user_id, pos_item_id in zip(positive_df['user_id'], positive_df['item_id']):
        candidates = negative_pool.get(user_id, ())

        if len(candidates) < n:
            raise ValueError(f"Not enough negative samples for user {user_id}")

        sampled_neg_items = sampler.choice(candidates, n, replace=False)
        rows.append({
            'user_id': user_id,
            'pos_item_id': pos_item_id,
            'neg_item_id': "|".join(map(str, sampled_neg_items)),
        })

    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

In [25]:
# Write the training interactions to disk as bare rows: no header line and
# no index column.
train_dataset.to_csv(
    path_or_buf='ml-100k.train.csv',
    header=False,
    index=False,
)

In [24]:
# create_val_dataset samples negatives with NumPy's global random state;
# seeding it here makes the written validation file identical on every run.
np.random.seed(42)

# Each validation positive is paired with 99 sampled negatives and written as
# a headerless CSV without an index column.
create_val_dataset(positive_df=val_dataset, negative_df=negative_dataset, n=99).to_csv('ml-100k.val.csv', index=False, header=False)