In [6]:
# Imports
import os
# Switch the working directory to `code` before the project-local imports below run.
# NOTE(review): presumably `world` and `dataloader` live in that directory — confirm.
try:
    os.chdir('code')
except FileNotFoundError:
    # There is no `code` directory relative to the current working directory
    # (e.g. the cell was already run once): stay where we are.
    pass

from world import Config, FakeArgs
from dataloader import DataLoader
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd

In [7]:
def _build_interaction_matrix(interactions, shape):
    """Build a sparse user x item matrix from an interaction table.

    Parameters
    ----------
    interactions : table with 'user_id' and 'item_id' columns (e.g. a DataFrame)
        The two columns are used as the row and column indices of the matrix.
    shape : tuple
        ``(number of users, number of items)`` of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix built from a unit weight for every row of `interactions`.
    """
    user_ids = interactions['user_id']
    item_ids = interactions['item_id']
    return csr_matrix((np.ones(len(user_ids)), (user_ids, item_ids)), shape=shape)


def analyse_dataset(config, dataset):
    """Load a dataset, print interaction statistics and summarise item expected values.

    Parameters
    ----------
    config : Config
        Configuration handed to ``DataLoader``. Its ``dataset`` attribute is
        overwritten with `dataset`, i.e. the caller's object is mutated.
    dataset
        Value stored in ``config.dataset`` before loading (the notebook passes
        dataset names such as ``'gowalla'``).

    Returns
    -------
    dict
        ``'dataset'`` (the loaded ``DataLoader`` instance) plus
        ``'min_expected_value'``, ``'mean_expected_value'``,
        ``'max_expected_value'`` and ``'std_expected_value'``.
    """
    # The shared config object is updated in place so the loader reads the requested dataset
    config.dataset = dataset
    loader = DataLoader(config)
    shape = (loader.n_user, loader.m_item)

    # Compute the degree arrays of both splits before anything is printed
    split_degrees = []
    for label, interactions in (('training', loader.df_train), ('testing', loader.df_test)):
        user_item_matrix = _build_interaction_matrix(interactions, shape)
        # Row sums: total interaction weight per user
        per_user = np.array(user_item_matrix.sum(axis=1)).squeeze()
        # Column sums: total interaction weight per item
        per_item = np.array(user_item_matrix.sum(axis=0)).squeeze()
        split_degrees.append((label, len(interactions['user_id']), per_user, per_item))

    for label, n_edges, per_user, per_item in split_degrees:
        print(f'In the {label} dataset:')
        print(f'- There are {n_edges} edges.')
        print(f'- The users have a minimum, mean, median, maximum, std of {get_array_statistics(per_user)} edges.')
        print(f'- The items have a minimum, mean, median, maximum, std of {get_array_statistics(per_item)} users.\n')

    # Every user adds mean_item_per_user / (number of their items) to each of their items.
    # NOTE(review): presumably this models how often an item is drawn by a sampler that
    # picks users' items uniformly — confirm against the sampling code.
    expected_n_item = {item_id: 0 for item_id in loader.df_train['item_id'].unique()}
    for user_id in loader.df_train['user_id'].unique():
        user_items = loader.all_pos[user_id]
        n_user_items = len(user_items)
        if n_user_items == 0:
            # Nothing to distribute; also keeps the hoisted division below safe
            continue
        share = loader.mean_item_per_user / n_user_items
        for item_id in user_items:
            expected_n_item[item_id] += share

    # Materialise the values once instead of rebuilding the list for every statistic
    expected_values = list(expected_n_item.values())
    min_expected_value = min(expected_values)
    mean_expected_value = np.mean(expected_values)
    max_expected_value = max(expected_values)
    std_expected_value = np.std(expected_values)

    print(f'Minimum item expected value: {min_expected_value}')
    print(f'Mean item expected value: {mean_expected_value}')
    print(f'Maximum item expected value: {max_expected_value}')
    print(f'Std item expected value: {std_expected_value}')

    # NOTE(review): 'dataset' holds the loader object, not the dataset name — confirm this
    # is what the downstream DataFrame is meant to contain.
    data_dic = {
        'dataset': loader,
        'min_expected_value': min_expected_value,
        'mean_expected_value': mean_expected_value,
        'max_expected_value': max_expected_value,
        'std_expected_value': std_expected_value,
    }
    return data_dic
    

def get_array_statistics(array):
    """Return the minimum, mean, median, maximum and standard deviation of `array`.

    Parameters
    ----------
    array : numpy.ndarray, list or tuple
        Numeric values. Lists and tuples are converted to a NumPy array first;
        any other object is used as-is and must provide ``min``, ``mean``,
        ``max`` and ``std`` methods.

    Returns
    -------
    tuple
        ``(min, mean, median, max, std)``. For NumPy input the standard
        deviation is the population one (NumPy's default ``ddof=0``).

    Raises
    ------
    ValueError
        If a NumPy array (or converted sequence) is empty.
    """
    if isinstance(array, (list, tuple)):
        # Plain sequences have no .min()/.mean()/... methods
        array = np.asarray(array)
    return array.min(), array.mean(), np.median(array), array.max(), array.std()

In [5]:
# Argument object carrying the default values needed to build a Config
args = FakeArgs()

# Instantiate the config class (arguments are passed positionally, so order matters)
config = Config(
    args.dataset,
    args.model,
    args.bpr_batch,
    args.recdim,
    args.layer,
    args.dropout,
    args.keepprob,
    args.a_fold,
    args.testbatch,
    args.multicore,
    args.lr,
    args.decay,
    args.pretrain,
    args.seed,
    args.epochs,
    args.load,
    args.checkpoint_path,
    args.results_path,
    args.topks,
    args.tensorboard,
    args.comment,
    args.sampling,
)

# Analyse each dataset in turn and keep the returned summary dictionaries
data_list = []
for dataset in ('gowalla', 'yelp2018', 'amazon-book'):  # 'lastfm' is left out of this run
    data_list += [analyse_dataset(config, dataset)]

# One row per analysed dataset
df_analysis = pd.DataFrame(data_list)


[0;30;43mloading [../data/gowalla][0m
0 training samples and 0 test samples were dropped during the data cleaning.
The user ids were not updated.
The item ids were not updated.
810128 interactions for training
217242 interactions for testing
gowalla Sparsity : 0.0008396216228570436
gowalla is ready to go
In the training dataset:
- There are 810128 edges.
- The users have a minimum, mean, median, maximum, std of (8.0, 27.132694755174494, 16.0, 811.0, 36.85818812689325) edges.
- The items have a minimum, mean, median, maximum, std of (1.0, 19.76838046899783, 12.0, 1415.0, 33.11268050158492) users.

In the testing dataset:
- There are 217242 edges.
- The users have a minimum, mean, median, maximum, std of (1.0, 7.275838971130016, 4.0, 203.0, 9.217030630034714) edges.
- The items have a minimum, mean, median, maximum, std of (0.0, 5.301041946267783, 3.0, 895.0, 13.345033564190903) users.

Minimum item expected value: 0.14594594594594595
Mean item expected value: 19.671701520216686
Maximu

NameError: name 'pd' is not defined

In [None]:
# NOTE(review): this cell expects `dataset` to be a loaded object exposing
# df_train / df_test / n_user / m_item — confirm it is defined that way before running.

# Sparse user x item matrix of the training interactions (unit weight per row)
train_user_item_matrix = csr_matrix(
    (
        np.ones(len(dataset.df_train['user_id'])),
        (dataset.df_train['user_id'], dataset.df_train['item_id']),
    ),
    shape=(dataset.n_user, dataset.m_item),
)
# Row sums: number of items by users
train_user = np.squeeze(np.array(train_user_item_matrix.sum(axis=1)))
# Column sums: number of users by items
train_item = np.squeeze(np.array(train_user_item_matrix.sum(axis=0)))

# Same construction for the test interactions
test_user_item_matrix = csr_matrix(
    (
        np.ones(len(dataset.df_test['user_id'])),
        (dataset.df_test['user_id'], dataset.df_test['item_id']),
    ),
    shape=(dataset.n_user, dataset.m_item),
)
# Row sums: number of items by users
test_user = np.squeeze(np.array(test_user_item_matrix.sum(axis=1)))
# Column sums: number of users by items
test_item = np.squeeze(np.array(test_user_item_matrix.sum(axis=0)))

In [None]:
# NOTE(review): a function with this name is also defined in an earlier cell;
# whichever cell runs last wins — consider keeping a single definition.
def get_array_statistics(array):
    """Return the minimum, mean, median, maximum and standard deviation of `array`.

    Parameters
    ----------
    array : numpy.ndarray, list or tuple
        Numeric values. Lists and tuples are converted to a NumPy array first;
        any other object is used as-is and must provide ``min``, ``mean``,
        ``max`` and ``std`` methods.

    Returns
    -------
    tuple
        ``(min, mean, median, max, std)``. For NumPy input the standard
        deviation is the population one (NumPy's default ``ddof=0``).

    Raises
    ------
    ValueError
        If a NumPy array (or converted sequence) is empty.
    """
    if isinstance(array, (list, tuple)):
        # Plain sequences have no .min()/.mean()/... methods
        array = np.asarray(array)
    return array.min(), array.mean(), np.median(array), array.max(), array.std()

In [None]:
# NOTE(review): relies on `dataset` being a loaded object exposing df_train, all_pos
# and mean_item_per_user — confirm it is defined that way before running this cell.

# One accumulator per distinct training item, all starting at zero
excepted_n_item = dict.fromkeys(dataset.df_train['item_id'].unique(), 0)
sample_list = []
for user_id in dataset.df_train['user_id'].unique():
    user_items = dataset.all_pos[user_id]
    user_items_len = len(user_items)
    for i in user_items:
        # Each user adds mean_item_per_user / (number of their items) to every item they have
        excepted_n_item[i] = excepted_n_item[i] + dataset.mean_item_per_user / user_items_len

In [None]:
# Summary statistics over the per-item expected values accumulated in `excepted_n_item`
min_expected_value = min(excepted_n_item.values())
mean_expected_value = np.mean(list(excepted_n_item.values()))
max_expected_value = max(excepted_n_item.values())
std_expected_value = np.std(list(excepted_n_item.values()))

# Report the four statistics, one per line
print(
    f'Minimum item expected value: {min_expected_value}',
    f'Mean item expected value: {mean_expected_value}',
    f'Maximum item expected value: {max_expected_value}',
    f'Std item expected value: {std_expected_value}',
    sep='\n',
)