In [1]:
# Copyright 2018 Anar Amirli
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train/Test Set Construction
In this document, we construct training and test sets from already computed feature sets. The sets are computed according to the layout:

data<br>
├── train_features<br>
├── test_features<br>

In [2]:
from os import listdir
from os.path import join
import pickle
import numpy as np
import pandas as pd
import multiprocessing
from collections import Counter
from random import shuffle
import json
import re

# our common utility functions that are used in different notebooks
from utils import *

pd.set_option('compute.use_bottleneck', True)
pd.set_option('compute.use_numexpr', True)

In [3]:
def describe_dataset(dataset):
    """Print the shape of ``dataset`` and how often each label value occurs.

    The label is read from the last column of the frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose last column holds numeric labels (each value must be
        convertible with ``int``).

    Notes
    -----
    Only the label column is accessed, so the whole frame is never converted
    to a NumPy array. Missing labels (NaN) are left out of the counts instead
    of raising on ``int(nan)``.
    """
    print('Shape: {}\n'.format(dataset.shape))

    heading = 'Pitch index counts'
    print(heading + '\n' + '-'*len(heading))

    # value_counts() drops NaN by default; sort_index() lists labels in ascending order
    label_counts = dataset.iloc[:, -1].value_counts().sort_index()
    for key, val in label_counts.items():
        print('{}\t: {}'.format(int(key), val))

## Pitch IDS

In [4]:
if __name__ == '__main__':

    # Load the features of one sample match to list the distinct pitch_index values.
    match_id = 60565
    sample_features_path = '../data/general/feature-set/match{}_features.csv'.format(match_id)
    try:
        features_df = pd.read_csv(sample_features_path)
    except FileNotFoundError as err:
        # Swallowing this error would only defer the failure to a confusing
        # NameError on `features_df` below, so fail here with the offending path.
        raise FileNotFoundError(
            'Feature file for match {} not found: {}'.format(match_id, sample_features_path)
        ) from err

    # Sorted unique label values present in the sample match
    ball_idxs = np.sort(features_df['pitch_index'].unique())
    print('Ball  index: {}'.format(ball_idxs))

    print('Size: {}'.format(len(ball_idxs)))

Ball  index: [  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.]
Size: 27


The null index (0) represents all event categories, apart from the ones we are interested in, that occur when the game stops.

## Dataset Construction

In this section, we construct combined, shuffled datasets for the train and test sets from the feature data.


### Parameters

When obtaining feature data for a given index set, we should reduce the number of frames of the null ball events in our dataset. But considering that they are the most common, we should keep their number close to that of the other most frequently occurring ball location indexes.

* $-$: Unlabeled position intervals would lead to greater label noise.

### Construction
Here we construct the combined dataset from all match data we have in the given data directories.

In [5]:
    # file based constants
    features_dir     = '../data/general/feature-set/'
    # The dot is escaped and the end is anchored so that only names of the form
    # 'match<digits>_features.csv' match; the unanchored pattern with a bare '.'
    # also accepted names such as 'match1_featuresXcsv' or 'match1_features.csv.bak'.
    features_regex   = re.compile(r'match\d+_features\.csv$')

We speed up the computation by using all available CPU cores via the `multiprocessing` module.

In [6]:
    # Number of matches assigned to the train set; the remaining matches form the test set.
    n_train_matches = 259
    # Fixed seed so the match-level train/test split is the same on every run.
    split_seed = 42

    # listdir() returns names in arbitrary order, so sort before the seeded
    # shuffle; otherwise the same seed could still yield different splits.
    features_csv_files = sorted(f for f in listdir(features_dir) if features_regex.fullmatch(f))
    np.random.RandomState(split_seed).shuffle(features_csv_files)
    print('All matches: {}'.format(len(features_csv_files)))
    features_train_csv = features_csv_files[:n_train_matches]
    features_test_csv  = features_csv_files[n_train_matches:]
    print('Train matches: {}'.format(len(features_train_csv)))
    print('Test matches: {}'.format(len(features_test_csv)))

    # pd.concat() on an empty list fails with an opaque message; report the real cause instead.
    if not features_train_csv or not features_test_csv:
        raise ValueError(
            'Need more than {} feature files in {} to build both sets, found {}'.format(
                n_train_matches, features_dir, len(features_csv_files)))

    # The context manager shuts the worker processes down once both maps have
    # returned, instead of leaving the pool alive for the rest of the session.
    with multiprocessing.Pool() as pool:
        df_train = pd.concat(pool.map(construct_train_set, features_train_csv))
        df_test = pd.concat(pool.map(construct_test_set, features_test_csv))

    print('Train-set shape: {}'.format(df_train.shape))
    print('Test-set shape: {}'.format(df_test.shape))

All matches: 299
Train matches: 259
Test matches: 40
Train-set shape: (718043, 250)
Test-set shape: (191505, 250)


In [7]:
    # Shuffle the rows of both sets again; the fixed seed keeps the resulting
    # row order reproducible across runs.
    shuffle_seed = 42
    df_train = df_train.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    df_test = df_test.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

### Train Set

In [8]:
    # Print the shape and per-label counts of the train set, then show its first rows.
    describe_dataset(df_train)
    display(df_train.head())

Shape: (718043, 250)

Pitch index counts
------------------
0	: 36260
1	: 18168
2	: 22431
3	: 16606
4	: 27013
5	: 20165
6	: 20341
7	: 19112
8	: 27928
9	: 27359
10	: 26620
11	: 33899
12	: 34454
13	: 33567
14	: 33768
15	: 34416
16	: 34232
17	: 28553
18	: 27987
19	: 29444
20	: 21500
21	: 22414
22	: 23049
23	: 19761
24	: 31513
25	: 21915
26	: 25568


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,homeMaxX,homeMinX,homeMaxY,homeMinY,homeAvgX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardSlowDensity,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardHirDensity,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,47.125,40.53125,0.790039,-1.0,1.0,67.6875,40.25,59.3125,30.6875,6.769531,...,0.0,0.0,0.0,45.03125,57.0625,1.0,48.6875,49.6875,4.699219,13.0
1,75.9375,10.1875,0.509766,-1.0,-1.0,75.4375,0.0,42.0,0.0,7.546875,...,61.125,32.9375,1.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
2,86.375,22.65625,2.039062,-1.0,-1.0,96.75,0.0,41.34375,0.0,9.671875,...,0.0,0.0,0.0,0.0,0.0,0.0,81.5,40.34375,4.738281,23.0
3,45.625,49.03125,0.340088,-1.0,-1.0,65.9375,37.71875,58.75,33.96875,6.59375,...,0.0,0.0,0.0,46.09375,51.28125,0.138062,0.0,0.0,0.0,15.0
4,69.875,32.75,1.389648,-1.0,-1.0,79.9375,45.625,64.5625,14.296875,7.996094,...,51.46875,40.46875,1.0,0.0,0.0,0.0,58.0,48.625,4.011719,15.0


### Test Set

In [9]:
    # Print the shape and per-label counts of the test set, then show its first rows.
    describe_dataset(df_test)
    display(df_test.head())

Shape: (191505, 250)

Pitch index counts
------------------
0	: 84442
1	: 2614
2	: 3356
3	: 2685
4	: 4373
5	: 2931
6	: 3397
7	: 2983
8	: 4197
9	: 4718
10	: 4299
11	: 5195
12	: 5564
13	: 5144
14	: 5134
15	: 5693
16	: 5158
17	: 4329
18	: 4447
19	: 4418
20	: 3161
21	: 3597
22	: 3737
23	: 3092
24	: 5141
25	: 3467
26	: 4233


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,homeMaxX,homeMinX,homeMaxY,homeMinY,homeAvgX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardSlowDensity,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardHirDensity,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,56.1875,5.609375,0.340088,1.0,1.0,75.8125,0.0,67.9375,0.0,7.582031,...,55.46875,3.960938,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,81.875,64.5,1.179688,1.0,1.0,77.0,41.875,64.25,28.09375,7.695312,...,46.78125,41.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,31.421875,36.15625,1.25,-1.0,1.0,49.1875,19.015625,40.25,12.71875,4.921875,...,21.59375,27.078125,1.0,29.0625,38.6875,1.0,0.0,0.0,0.0,9.0
3,47.0625,25.703125,0.819824,1.0,-1.0,52.75,0.0,58.84375,0.0,5.273438,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,71.5,38.03125,1.980469,1.0,1.0,77.4375,0.0,63.46875,0.0,7.742188,...,0.0,0.0,0.0,52.84375,40.9375,1.0,0.0,0.0,0.0,19.0


## Exporting

In [10]:
    # Write each set to ../data/general/<name>/all_<name>.csv without the row index.
    for dataset_name, dataset_df in (('train', df_train), ('test', df_test)):
        dataset_df.to_csv(
            '../data/general/{dataset}/all_{dataset}.csv'.format(dataset=dataset_name),
            index=False,
        )