# Generate Folds

Let's make GropKFold(by pois) to avoid leakage of the data between train and test split. According to the fact that we have two poi columns for pairs there is no way to make cross-validation split using GroupKFold from sklearn. So let's write split function by ourselves. At the end we will have train\test splits 

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
import time
from joblib import dump, load
from tqdm import tqdm

# Import data

In [2]:
pairsMatched = pd.read_csv('./pairs/custom_pairs.csv')
pairsNonMatched = pd.read_csv('./pairs/nonmatching_pairs.csv')

In [3]:
#convert poi column to poi1 and poi2 columns for pairs Matched to concatenate with other dfs
pairsMatched.insert(5,'poi1',pairsMatched['poi'].values)
pairsMatched = pairsMatched.rename(columns = {"poi": "poi2"})

In [4]:
pairsFull = pd.concat([pairsMatched, pairsNonMatched], ignore_index = True)
pairsFull.shape

(2241955, 13)

# Splitting functions

In [6]:
class Bunch():
    def __init__(self, columns, out_dir, fold, purpose = 'train'):
        self.pois_set = set()
        self.df = pd.DataFrame(columns = columns)
        self.match_count = 1
        self.nonmatch_count = 1
        self.length = 1
        self.outpath = out_dir + 'Fold_' + str(fold) + '_' + purpose + '.csv'
    
    def add(self, row_to_add):
        self.pois_set.add(row_to_add.poi1)
        self.pois_set.add(row_to_add.poi2)
        self.length += 1
        if row_to_add.match:
            self.match_count += 1
        else:
            self.nonmatch_count += 1

        self.df = pd.concat([self.df,row_to_add.to_frame().transpose()], ignore_index = True)
        if len(self.df)>100:
            self.to_csv(self.df)

    def to_csv(self, df):
        df.to_csv(self.outpath, mode='a', header=not os.path.exists(self.outpath), index = False)
        self.df = pd.DataFrame(columns = list(self.df.columns))
            
        
        
def customGroupKFOLD(data, n_splits, test_size = 0.3, out_dir  = './folds/'):
    for fold in range(n_splits):
        start_time = time.time()
        data = data.sample(frac = 1).reset_index(drop = True)
        train = Bunch(list(data.columns), out_dir, fold, 'train')
        test = Bunch(list(data.columns),out_dir, fold, 'test')
        for row_id in tqdm(range(len(data))):
            current_row = data.iloc[row_id]
            if test.length/(train.length + test.length) > test_size:
                #trying to add to the train
                if current_row.poi1 not in test.pois_set and current_row.poi2 not in test.pois_set:
                    train.add(current_row)
                elif current_row.poi1 not in train.pois_set and current_row.poi2 not in train.pois_set:
                    test.add(current_row)
            else:
                #trying to add to the test
                if current_row.poi1 not in train.pois_set and current_row.poi2 not in train.pois_set:
                    test.add(current_row)
                elif current_row.poi1 not in test.pois_set and current_row.poi2 not in test.pois_set:
                    train.add(current_row)
        train.to_csv(train.df)
        test.to_csv(test.df)
        display(f'Fold {fold} finished!')
        display(f'Train length: {train.length - 1} Test length: {test.length - 1}')
        display(f'Train match/nonmatch: {train.match_count/train.nonmatch_count} Test match/nonmatch: {test.match_count/test.nonmatch_count}')

In [7]:
#input df and number of folds we want. also we could specify directory to save folds 
#and test_size, but I'm happy with default ones
customGroupKFOLD(pairsFull, 1)

100%|██████████████████████████████████████████████████████████████████████| 2241955/2241955 [24:19<00:00, 1536.08it/s]


'Fold 0 finished!'

'Train length: 1415363 Test length: 602448'

'Train match/nonmatch: 0.8262915583862585 Test match/nonmatch: 1.0609545833960508'