In [1]:
# Copyright 2018 Anar Amirli
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train/Test Set Construction
In this document, we construct training and test sets from already computed feature sets. The sets are computed according to the layout:

data<br>
├── train_features<br>
├── test_features<br>

In [2]:
from os import listdir
from os.path import join
import pickle
import numpy as np
import pandas as pd
import multiprocessing
from collections import Counter
from random import shuffle
import json
import re

# our common utility functions that are used in different notebooks
from utils import *

pd.set_option('compute.use_bottleneck', True)
pd.set_option('compute.use_numexpr', True)

In [3]:
def describe_dataset(dataset):
    """Print the shape of ``dataset`` and the row count of each label value.

    The label is read from the last column of the frame. Every distinct value
    is printed as an integer, in ascending order, followed by the number of
    rows that carry it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Two-dimensional frame whose last column holds numeric labels.
    """
    print('Shape: {}\n'.format(dataset.shape))

    # Read only the last column; going through `dataset.values` would first
    # materialise the whole frame as a single ndarray to extract one column.
    labels = dataset.iloc[:, -1]

    heading = 'Pitch index counts'
    print(heading + '\n' + '-'*len(heading))
    for key, val in sorted(Counter(labels).items()):
        print('{}\t: {}'.format(int(key), val))

## Pitch IDs

In [4]:
if __name__ == '__main__':

    # Load the feature set of one sample match and list the distinct
    # pitch indexes that occur in it.
    match_id = 60565
    match_features_path = '../data/general/feature-set/match{}_features.csv'.format(match_id)
    try:
        features_df = pd.read_csv(match_features_path)
    except FileNotFoundError as err:
        # Fail here with the offending path: silently ignoring the error only
        # defers the failure to a misleading NameError on `features_df` below.
        raise FileNotFoundError(
            'Feature file for match {} not found: {}'.format(match_id, match_features_path)
        ) from err

    ball_idxs = np.sort(features_df['pitch_index'].unique())
    print('Ball  index: {}'.format(ball_idxs))

    print('Size: {}'.format(len(ball_idxs)))

Ball  index: [  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.]
Size: 27


The null index (0) represents every event category that occurs while the game is stopped, apart from the ones we are interested in.

## Dataset Construction

In this section, we construct combined, shuffled datasets for the train and test sets from the feature data.


### Parameters

When obtaining feature data for a given index set, we should reduce the number of frames of the null ball events in our data set. But considering the fact that they are the most common, we should keep their number close to that of the other most frequently occurring ball location indexes.

* $-$: Non-labeled position intervals would lead to greater label noise.

### Construction
Here we construct the combined dataset from all match data we have in the given data directories.

In [5]:
    # File-based constants.
    # Directory that holds one `match<ID>_features.csv` file per match.
    features_dir     = '../data/general/feature-set/'
    # The dot is escaped and the end is anchored so that only exact
    # `match<digits>_features.csv` names are accepted by `.match()`, not
    # backups or look-alikes such as `match1_features.csv.bak`.
    features_regex   = re.compile(r'match\d+_features\.csv\Z')

We speed up the computation by using all the CPU cores via the `multiprocessing` module.

In [20]:
    # Number of matches assigned to the training split; the rest form the test split.
    n_train_matches = 259
    # Fixed seed so the match-to-split assignment is the same on every run.
    split_seed = 42

    # listdir() returns entries in arbitrary order, so sort first: the seeded
    # shuffle then yields the same split regardless of the filesystem.
    features_csv_files = sorted(f for f in listdir(features_dir) if features_regex.match(f))
    np.random.RandomState(split_seed).shuffle(features_csv_files)
    print('All matches: {}'.format(len(features_csv_files)))
    features_train_csv = features_csv_files[:n_train_matches]
    features_test_csv  = features_csv_files[n_train_matches:]
    print('Train matches: {}'.format(len(features_train_csv)))
    print('Test matches: {}'.format(len(features_test_csv)))

    # pd.concat rejects an empty list with an opaque message; report the real cause.
    if not features_train_csv or not features_test_csv:
        raise ValueError(
            'Need more than {} feature files in {} to build both splits, found {}'.format(
                n_train_matches, features_dir, len(features_csv_files)))

    # Build the per-match frames in parallel on all CPU cores. The context
    # manager shuts the worker processes down once both maps have returned.
    with multiprocessing.Pool() as pool:
        df_train = pd.concat(pool.map(construct_train_set, features_train_csv))
        df_test = pd.concat(pool.map(construct_test_set, features_test_csv))


    print('Train-set shape: {}'.format(df_train.shape))
    print('Test-set shape: {}'.format(df_test.shape))

All matches: 298
Train matches: 259
Test matches: 39
Train-set shape: (223465, 228)
Test-set shape: (89919, 228)


In [7]:
    # Shuffle the rows of both data sets again and renumber them from zero.
    df_train, df_test = (
        frame.sample(frac=1).reset_index(drop=True)
        for frame in (df_train, df_test)
    )

### Train Set

In [22]:
    # Print the shape and per-label row counts of the training set,
    # then render its first rows.
    describe_dataset(df_train)
    display(df_train.head())

Shape: (223465, 228)

Pitch index counts
------------------
0	: 10335
1	: 4307
2	: 11274
3	: 4400
4	: 12094
5	: 5516
6	: 7694
7	: 5676
8	: 8744
9	: 10696
10	: 8706
11	: 11007
12	: 12279
13	: 11382
14	: 10812
15	: 9859
16	: 11316
17	: 9124
18	: 7108
19	: 9729
20	: 7468
21	: 5346
22	: 8589
23	: 6858
24	: 3656
25	: 8170
26	: 1320


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,teamsDBSCANx,teamsDBSCANy,homeDBSCANx,homeDBSCANy,homeMaxX,...,awayCenterForwardAvgY,awayCenterForwardAvgSpeed,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,74.75,20.453125,1.129883,-1.0,-1.0,72.125,27.171875,69.875,12.132812,77.4375,...,20.28125,0.97998,62.9375,20.28125,0.0,0.0,0.0,0.0,0.0,17.0
1,78.0,33.53125,1.110352,1.0,-1.0,80.125,41.625,70.75,40.6875,88.75,...,30.40625,1.259766,52.34375,30.40625,0.0,0.0,0.0,0.0,0.0,19.0
2,49.875,34.8125,1.879883,-1.0,1.0,41.4375,29.4375,34.8125,38.09375,70.25,...,41.78125,3.179688,0.0,0.0,0.0,0.0,32.03125,41.78125,3.179688,11.0
3,53.625,26.75,4.898438,-1.0,1.0,41.1875,24.078125,40.40625,25.890625,70.125,...,16.671875,5.800781,0.0,0.0,0.0,0.0,50.40625,5.0,6.519531,9.0
4,58.5625,25.75,5.078125,-1.0,1.0,46.125,26.671875,44.0625,26.359375,73.1875,...,19.1875,5.398438,0.0,0.0,0.0,0.0,44.71875,19.1875,5.398438,12.0


### Test Set

In [23]:
    # Print the shape and per-label row counts of the test set,
    # then render its first rows.
    describe_dataset(df_test)
    display(df_test.head())

Shape: (89919, 228)

Pitch index counts
------------------
0	: 57448
1	: 570
2	: 1605
3	: 617
4	: 1684
5	: 829
6	: 1122
7	: 852
8	: 1394
9	: 1638
10	: 1354
11	: 1851
12	: 1978
13	: 1648
14	: 1610
15	: 1612
16	: 1636
17	: 1381
18	: 1196
19	: 1418
20	: 1213
21	: 942
22	: 1222
23	: 855
24	: 800
25	: 1148
26	: 296


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,teamsDBSCANx,teamsDBSCANy,homeDBSCANx,homeDBSCANy,homeMaxX,...,awayCenterForwardAvgY,awayCenterForwardAvgSpeed,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,39.0625,32.84375,1.830078,1.0,-1.0,47.59375,26.328125,26.375,28.0,51.0625,...,39.375,0.890137,26.4375,39.375,0.0,0.0,0.0,0.0,0.0,0.0
1,40.5,31.640625,1.870117,1.0,-1.0,28.015625,26.265625,25.6875,26.765625,51.71875,...,38.78125,1.150391,25.828125,38.78125,0.0,0.0,0.0,0.0,0.0,0.0
2,41.9375,30.46875,1.879883,1.0,-1.0,47.15625,24.359375,29.921875,11.421875,52.65625,...,37.1875,1.839844,0.0,0.0,25.671875,37.1875,0.0,0.0,0.0,0.0
3,43.375,29.34375,1.740234,1.0,-1.0,47.0625,26.875,35.625,27.9375,53.5,...,35.59375,1.400391,25.5,35.59375,0.0,0.0,0.0,0.0,0.0,0.0
4,44.75,28.4375,1.629883,1.0,-1.0,47.375,26.203125,36.5,27.15625,54.1875,...,34.28125,1.290039,25.421875,34.28125,0.0,0.0,0.0,0.0,0.0,0.0


## Exporting

In [24]:
    # Both splits are written to the same layout: <dataset>/all_<dataset>.csv,
    # without the DataFrame index.
    export_path_template = '../data/general/{dataset}/all_{dataset}.csv'
    df_train.to_csv(export_path_template.format(dataset='train'), index=False)
    df_test.to_csv(export_path_template.format(dataset='test'), index=False)

In [25]:
    # Display rows 1 through 9 of the test set (row slice, end exclusive).
    df_test[1:10]

Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,teamsDBSCANx,teamsDBSCANy,homeDBSCANx,homeDBSCANy,homeMaxX,...,awayCenterForwardAvgY,awayCenterForwardAvgSpeed,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
1,40.5,31.640625,1.870117,1.0,-1.0,28.015625,26.265625,25.6875,26.765625,51.71875,...,38.78125,1.150391,25.828125,38.78125,0.0,0.0,0.0,0.0,0.0,0.0
2,41.9375,30.46875,1.879883,1.0,-1.0,47.15625,24.359375,29.921875,11.421875,52.65625,...,37.1875,1.839844,0.0,0.0,25.671875,37.1875,0.0,0.0,0.0,0.0
3,43.375,29.34375,1.740234,1.0,-1.0,47.0625,26.875,35.625,27.9375,53.5,...,35.59375,1.400391,25.5,35.59375,0.0,0.0,0.0,0.0,0.0,0.0
4,44.75,28.4375,1.629883,1.0,-1.0,47.375,26.203125,36.5,27.15625,54.1875,...,34.28125,1.290039,25.421875,34.28125,0.0,0.0,0.0,0.0,0.0,0.0
5,46.15625,27.46875,1.740234,1.0,-1.0,39.65625,29.96875,37.34375,26.53125,54.625,...,32.9375,1.419922,25.453125,32.9375,0.0,0.0,0.0,0.0,0.0,0.0
6,47.46875,26.6875,1.360352,1.0,-1.0,41.5,25.671875,38.125,25.859375,55.0625,...,31.53125,1.400391,25.515625,31.53125,0.0,0.0,0.0,0.0,0.0,0.0
7,48.625,25.953125,1.509766,1.0,-1.0,44.8125,19.96875,38.84375,25.1875,55.6875,...,30.109375,1.44043,25.734375,30.109375,0.0,0.0,0.0,0.0,0.0,0.0
8,49.875,24.96875,1.490234,1.0,-1.0,50.53125,21.890625,39.59375,24.5,56.3125,...,28.765625,1.339844,26.125,28.765625,0.0,0.0,0.0,0.0,0.0,0.0
9,50.90625,24.359375,1.049805,1.0,-1.0,47.59375,21.015625,40.40625,23.90625,56.71875,...,27.5625,1.230469,26.5625,27.5625,0.0,0.0,0.0,0.0,0.0,0.0
