# Валидация

## Imports

In [1]:
import numpy as np
import pandas as pd
from rectools import Columns

In [2]:
class UsersKFoldPOut():
    def __init__(self, n_folds, p, user_column=Columns.User, random_seed=23):
        self.n_folds = n_folds
        self.p = p
        self.user_column = user_column
        self.random_seed = random_seed
    
    def split(self, df):
        df['target'] = df['order'] < self.p
        users = df[self.user_column].unique()
        users_count = len(users)

        np.random.seed(self.random_seed)
        np.random.shuffle(users)
        
        fold_sizes = np.full(self.n_folds, users_count // self.n_folds, dtype=int)
        fold_sizes[: users_count % self.n_folds] += 1
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_fold_users = users[start:stop]

            test_mask = df[self.user_column].isin(test_fold_users) & df['target']
            train_mask = ~df[self.user_column].isin(test_fold_users)
            yield train_mask, test_mask

## Read data

Данные - https://disk.yandex.ru/d/SI1aAooPn9i8TA

In [3]:
!unzip -o likes_data.zip

Archive:  likes_data.zip
  inflating: test                    
  inflating: __MACOSX/._test         
  inflating: track_artists.csv       
  inflating: train                   
  inflating: __MACOSX/._train        


In [4]:
with open('train') as f:
    train = f.read().split('\n')[:-1]

In [5]:
with open('test') as f:
    test = f.read().split('\n')[:-1]

In [6]:
def generate_item_sequence(record_id, record):
    items = list(map(int, record.split(' ')[::-1]))
    return (
        [record_id] * len(items),
        items,
        list(range(len(items))),
    )

In [7]:
common = train + test

with open('interim.csv', 'w') as interim_file:
    for i in range(len(common)):
        users, items, orders = generate_item_sequence(i, common[i])
        for user, item, order in zip(users, items, orders):
            interim_file.write(
                '{0},{1},{2}\n'.format(
                    user, item, order,
                ),
            )

del common, train, test

In [8]:
df_common = pd.read_csv(
    'interim.csv',
    dtype=np.int32,
    header=None,
    names=[Columns.User, Columns.Item, 'order'],
)

<hr />

In [9]:
cv = UsersKFoldPOut(n_folds=3, p=2)

for i, (train_mask, test_mask) in enumerate(cv.split(df_common)):
    train = df_common[train_mask]
    test = df_common[test_mask]
    # Проверим корректность работы
    common_users = set(train[cv.user_column].unique()).intersection(
        set(test[cv.user_column].unique()),
    )
    # Между test и train нет общих пользователей
    assert len(common_users) == 0
    # Количество записей для пользователя в test не больше p
    np.all(test.groupby(cv.user_column).size() <= cv.p)
    
    print(f'Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}')

Fold#0 | Train: 78277333, Test: 966666
Fold#1 | Train: 78277333, Test: 966666
Fold#2 | Train: 78277584, Test: 966664
