# Train/Test Set Construction
In this document, we construct training and test sets from already computed feature sets. The sets are created according to the layout:

data<br>
├── train_features<br>
├── test_features<br>

In [1]:
import json
import multiprocessing
import os
import pickle
import re
from collections import Counter
from os import listdir
from os.path import join
from random import shuffle

import numpy as np
import pandas as pd

# our common utility functions that are used in different notebooks
from classes.Utils import *

# Ask pandas to use its optional accelerated compute backends
# (the `compute.use_bottleneck` and `compute.use_numexpr` options).
pd.options.compute.use_bottleneck = True
pd.options.compute.use_numexpr = True

## Game state

In [3]:
if __name__ == '__main__':

    # Inspect a single match's feature file to see which `game_state`
    # labels occur in the data.
    match_id = 60565
    match_csv = '../data/general/feature-set/match_{}.csv'.format(match_id)

    try:
        features_df = pd.read_csv(match_csv)
    except FileNotFoundError:
        # Silently ignoring the error would only defer the failure to a
        # NameError on `features_df` below (or, worse, reuse a stale frame
        # left in the kernel), so report it and skip the inspection instead.
        print('Feature file not found, skipping game state inspection: {}'.format(match_csv))
    else:
        # distinct labels, sorted for a stable display
        game_state = np.sort(features_df['game_state'].unique())
        print('Game  states: {}'.format(game_state))

        print('Size: {}'.format(len(game_state)))

Game  states: [0 1]
Size: 2


The null index (0) represents all event categories other than the ones we are interested in, i.e. those that occur when the game stops.

## Dataset Construction

In this section, we construct a combined shuffled dataset for the train and test sets from the feature data.


### Parameters

When obtaining feature data for a given index set, we should reduce the number of frames of the null ball (game stops) events in our data set. But considering the fact that they are the most common, we should keep their number close to that of the other most frequently occurring ball location indexes.

* $-$: Unlabeled position intervals would lead to greater label noise.

### Construction
Here we construct the combined dataset from all match data we have in the given data directories.

In [4]:
    # file based constants
    features_dir     = '../data/general/feature-set/'
    # Per-match feature files are named match_<id>.csv. The dot is escaped and
    # the pattern is end-anchored so that look-alike names such as
    # "match_1xcsv" or "match_1.csv.bak" are rejected even when the pattern is
    # applied with re.match(), which only anchors at the start.
    features_regex   = re.compile(r'match_\d+\.csv$')

We speed up the computation by using all the CPU cores via the `multiprocessing` module.

In [5]:
    # Build the train and test DataFrames from the per-match feature files.
    # The split is made per match file (not per frame): a random 10% of the
    # files go to the test set and the rest to the train set.

    # split parameters
    test_fraction = 0.1   # keep 10% of matches for testing
    split_seed = 42       # fixed seed so that the split is reproducible

    # Get all feature file names. listdir() returns entries in arbitrary
    # order, so sort them: together with the fixed seed this makes the
    # split identical on every run and every machine.
    features_all_csv = np.array(sorted(f for f in listdir(features_dir) if features_regex.match(f)))
    n_matches = len(features_all_csv)
    if n_matches < 2:
        # Fail early with a clear message instead of an opaque
        # "No objects to concatenate" error from pd.concat further down.
        raise FileNotFoundError(
            'Need at least 2 match feature files in {!r} to build a train/test split, found {}'
            .format(features_dir, n_matches))

    # Never let the test set come out empty when there are only a few files.
    n_test = max(1, int(n_matches * test_fraction))

    # One seeded permutation yields disjoint index sets without the
    # quadratic "index not in test_idx" scan.
    shuffled_idx = np.random.RandomState(split_seed).permutation(n_matches)
    test_idx = shuffled_idx[:n_test]
    # rest is for train (kept in file order)
    train_idx = np.sort(shuffled_idx[n_test:])

    print("train size: {} \ntest size: {}".format(len(train_idx), len(test_idx)))

    # full paths of the selected files (tolist() yields plain Python strings)
    test_csv = [join(features_dir, name) for name in features_all_csv[test_idx].tolist()]
    train_csv = [join(features_dir, name) for name in features_all_csv[train_idx].tolist()]

    # Construct the datasets from all csv files via multiprocessing.
    # `construct_data_set` is not defined in this notebook (presumably it is
    # provided by classes.Utils -- TODO confirm); each call's result is
    # concatenated into a single frame. The context manager shuts the worker
    # processes down once both maps have returned, so they are not leaked.
    with multiprocessing.Pool() as pool:
        df_test_data = pd.concat(pool.map(construct_data_set, test_csv))
        df_train_data = pd.concat(pool.map(construct_data_set, train_csv))

    print('\nTrain set shape: samples={}, features={}'.format(*df_train_data.shape))
    print('Test set shape: samples={}, features={}\n'.format(*df_test_data.shape))

    # class balance of the test set
    header='Game state counts on Test set'
    print(header + '\n' + '-'*len(header))
    for state in (0, 1):
        print('{}\t: {}'.format(state, (df_test_data['game_state'] == state).sum()))

train size: 271 
test size: 30

Train set shape: samples=1563656, features=306
Test set shape: samples=173600, features=306

Game state counts on Test set
-----------------------------
0	: 75971
1	: 97629


## Exporting

In [6]:
    # test
    df_train_data.to_csv('dataset/data_{dataset}.csv'.format(dataset='train'), index=False)
    del df_train_data

In [9]:
    # train
    df_test_data.to_csv('dataset/data_{dataset}.csv'.format(dataset='test'), index=False)
    display(df_test_data.head(10))
    del df_test_data

Unnamed: 0,match_id,half,minute,second,game_state,x,y,home_LB_all_avrg_x,home_LB_all_avrg_y,home_LB_all_avrg_speed,...,bothteams_inner_dis_to_avrg_pos,bothteams_dbscan_avrg_x,bothteams_dbscan_avrg_y,bothteams_dbscan_avrg_speed,bothteams_inner_dis_to_dbscan_pos,referee_x,referee_y,referee_speed,referee_direction_x,referee_direction_y
0,60821,1,0,0,0,-1.0,-1.0,36.17,13.4,0.41,...,361.57,49.06,37.69,0.83,159.21,53.11,43.39,0.77,1,1
1,60821,1,0,1,1,58.28,22.56,36.19,13.34,0.21,...,361.27,50.5,36.83,1.06,141.07,52.64,43.42,0.58,-1,1
2,60821,1,0,2,1,58.24,22.03,36.64,13.82,1.18,...,362.78,52.75,38.36,2.3,124.65,51.77,43.54,1.13,-1,1
3,60821,1,0,3,1,63.82,35.75,37.83,15.07,2.02,...,369.76,55.98,40.32,3.08,108.29,50.64,43.73,1.16,-1,1
4,60821,1,0,4,1,79.51,48.53,39.74,16.23,2.54,...,377.92,56.15,41.36,3.04,135.93,50.03,44.63,1.57,-1,1
5,60821,1,0,5,1,80.64,49.34,42.56,16.79,3.03,...,382.69,58.88,40.59,2.95,122.9,50.86,46.36,2.23,1,1
6,60821,1,0,6,1,101.99,35.28,45.23,16.45,2.41,...,388.09,58.27,35.34,2.51,157.05,52.94,47.04,2.49,1,1
7,60821,1,0,7,1,102.22,34.35,46.44,15.54,2.11,...,392.55,59.21,32.27,2.57,135.56,54.81,46.74,2.94,1,-1
8,60821,1,0,8,1,97.49,29.53,49.07,12.75,2.2,...,408.52,59.58,33.03,2.96,192.96,60.43,42.18,4.26,1,-1
9,60821,1,0,9,1,96.75,28.7,49.19,12.54,0.47,...,407.87,59.66,28.85,0.81,145.14,60.78,41.48,1.37,1,-1
