# Валидация

## Imports

In [1]:
import numpy as np
import pandas as pd
from rectools import Columns

In [2]:
class UsersKFoldPOut():
    def __init__(self, n_folds, p, user_column=Columns.User, random_seed=23):
        self.n_folds = n_folds
        self.p = p
        self.user_column = user_column
        self.random_seed = random_seed
    
    def split(self, df):
        df['target'] = df['order'] < self.p
        users = df[self.user_column].unique()
        users_count = len(users)

        np.random.seed(self.random_seed)
        np.random.shuffle(users)
        
        fold_sizes = np.full(self.n_folds, users_count // self.n_folds, dtype=int)
        fold_sizes[: users_count % self.n_folds] += 1
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_fold_users = users[start:stop]

            test_mask = df[self.user_column].isin(test_fold_users) & df['target']
            train_mask = ~df[self.user_column].isin(test_fold_users)
            yield train_mask, test_mask

## Read data

Данные - https://disk.yandex.ru/d/SI1aAooPn9i8TA

In [3]:
!unzip -o likes_data.zip

Archive:  likes_data.zip
  inflating: test                    
  inflating: __MACOSX/._test         
  inflating: track_artists.csv       
  inflating: train                   
  inflating: __MACOSX/._train        


In [4]:
with open('train') as f:
    train = f.read().split('\n')[:-1]

In [5]:
with open('test') as f:
    test = f.read().split('\n')[:-1]

In [6]:
def generate_item_sequence(record_id, record):
    items = record.split(' ')[::-1]
    result = []
    for i in range(0, len(items)):
        result.append([record_id, int(items[i]), i])
    return result

In [7]:
common = train + test
# df_size = len(common)
# Ограничим размер для скорости работы
df_size = 200
df_common = []
for i in range(0, df_size): 
    df_common += generate_item_sequence(i, common[i]) 

df_common = pd.DataFrame(data=df_common, columns=['user_id', 'item', 'order'])
df_common

Unnamed: 0,user_id,item,order
0,0,388242,0
1,0,278503,1
2,0,102795,2
3,0,470957,3
4,0,159637,4
...,...,...,...
15515,199,429513,10
15516,199,96020,11
15517,199,333300,12
15518,199,338744,13


<hr />

In [8]:
cv = UsersKFoldPOut(n_folds=3, p=1)

for i, (train_mask, test_mask) in enumerate(cv.split(df_common)):
    train = df_common[train_mask]
    test = df_common[test_mask]
    print(f'Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}')

Fold#0 | Train: 9605, Test: 67
Fold#1 | Train: 9605, Test: 67
Fold#2 | Train: 9618, Test: 66
