In [1]:
# Copyright 2018 Anar Amirli
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train/Test Set Construction
In this document, we construct training and test sets from already computed feature sets. The sets are computed according to the layout:

data<br>
├── train_features<br>
├── test_features<br>

In [1]:
import json
import multiprocessing
import pickle
import re
from collections import Counter
from os import listdir
from os import makedirs
from os.path import join
from random import shuffle

import numpy as np
import pandas as pd

# our common utility functions that are used in different notebooks
from utils import *

pd.set_option('compute.use_bottleneck', True)
pd.set_option('compute.use_numexpr', True)

In [7]:
def describe_dataset(dataset):
    """Print the shape of ``dataset`` and a tally of its last column.

    The distinct values found in the last column are listed in ascending
    order under the heading 'Pitch index counts'; each value is shown as an
    integer next to the number of rows that carry it.

    Nothing is returned; all output goes to stdout.
    """
    print('Shape: {}\n'.format(dataset.shape))

    _, n_cols = dataset.shape

    title = 'Pitch index counts'
    underline = '-' * len(title)
    print(title + '\n' + underline)

    # Tally the last column, then report the labels smallest-first.
    label_counts = Counter(dataset.values[:, n_cols - 1])
    for label in sorted(label_counts):
        print('{}\t: {}'.format(int(label), label_counts[label]))

## Pitch IDs

In [3]:
if __name__ == '__main__':

    # Inspect a single match to list the distinct pitch-index labels present
    # in its feature file.
    match_id = 60565
    features_path = '../data/general/feature-set/match{}_features.csv'.format(match_id)
    try:
        features_df = pd.read_csv(features_path)
    except FileNotFoundError:
        # Previously the error was swallowed and the next statement failed
        # with a NameError (or silently reused a stale `features_df`).
        # Report the missing file and skip this purely informational summary.
        print('Feature file not found, skipping pitch index summary: {}'.format(features_path))
    else:
        ball_idxs = np.sort(features_df['pitch_index'].unique())
        print('Ball  index: {}'.format(ball_idxs))

        print('Size: {}'.format(len(ball_idxs)))

Ball  index: [  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.]
Size: 27


The null index (0) represents all event categories, other than the ones we are interested in, that occur when the game stops.

## Dataset Construction

In this section, we construct combined, shuffled datasets for the train and test sets from the feature data.


### Parameters

When obtaining feature data for a given index set, we should reduce the number of frames of the null ball events in our data set. But considering the fact that they are the most common, we should keep their numbers close to the number of the other most frequently occurring ball location indexes.

* $-$: Unlabeled position intervals would lead to greater label noise.

### Construction
Here we construct the combined dataset from all match data we have in the given data directories.

In [5]:
    # File-based constants.
    # Directory that holds one feature CSV per match.
    features_dir     = '../data/general/feature-set/'
    # Matches exactly `match<digits>_features.csv`. The dot is escaped and the
    # pattern is end-anchored so that look-alikes such as
    # `match1_features.csv.bak` or `match1_featuresXcsv` are not picked up
    # when the pattern is applied with `.match()`.
    features_regex   = re.compile(r'match\d+_features\.csv$')

We speed up the computation by using all the CPU cores via the `multiprocessing` module.

In [6]:
    # Number of (shuffled) match files assigned to the training split; every
    # remaining match file goes to the test split.
    n_train_matches = 224

    # NOTE(review): `df` is not referenced by any cell shown here; it is kept
    # in case other cells rely on the name.
    df = pd.DataFrame()

    # Collect the per-match feature files and split them by match, so that no
    # single match contributes rows to both sets.
    features_csv_files = [f for f in listdir(features_dir) if features_regex.match(f)]
    # NOTE: the shuffle is unseeded, so the split differs from run to run.
    shuffle(features_csv_files)
    print('All matches: {}'.format(len(features_csv_files)))
    features_train_csv = features_csv_files[:n_train_matches]
    features_test_csv  = features_csv_files[n_train_matches:]
    print('Train matches: {}'.format(len(features_train_csv)))
    print('Test matches: {}'.format(len(features_test_csv)))

    # Fail early with a clear message: pd.concat raises ValueError on an empty
    # list anyway, but only after the expensive work has already been done.
    if not features_train_csv or not features_test_csv:
        raise ValueError(
            'Need more than {} feature files in {!r} to build both sets, found {}'.format(
                n_train_matches, features_dir, len(features_csv_files)))

    # The context manager shuts the worker processes down once both blocking
    # map calls have returned (or if one of them raises), so no workers leak.
    # construct_train_set / construct_test_set are not defined in this
    # notebook; presumably they come from `utils` -- verify.
    with multiprocessing.Pool() as pool:
        df_train = pd.concat(pool.map(construct_train_set, features_train_csv))
        df_test = pd.concat(pool.map(construct_test_set, features_test_csv))


    print('Train-set shape: {}'.format(df_train.shape))
    print('Test-set shape: {}'.format(df_test.shape))

All matches: 299
Train matches: 224
Test matches: 75
Train-set shape: (624549, 250)
Test-set shape: (354516, 250)


In [8]:
    # Shuffle both data sets again: draw every row in random order
    # (frac=1 keeps all rows), then renumber the index from zero.
    df_train = df_train.sample(frac=1)
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.sample(frac=1)
    df_test = df_test.reset_index(drop=True)

### Train Set

In [12]:
    # Summarise the training set, then render its first five rows.
    describe_dataset(df_train)
    display(df_train.head(n=5))

Shape: (624549, 250)

Pitch index counts
------------------
0	: 31360
1	: 15767
2	: 19454
3	: 14725
4	: 23701
5	: 17459
6	: 17977
7	: 16731
8	: 24361
9	: 24166
10	: 23324
11	: 29306
12	: 30208
13	: 29139
14	: 29133
15	: 30467
16	: 29645
17	: 24711
18	: 24621
19	: 25325
20	: 18153
21	: 19423
22	: 20064
23	: 17083
24	: 27355
25	: 18803
26	: 22088


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,homeMaxX,homeMinX,homeMaxY,homeMinY,homeAvgX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardSlowDensity,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardHirDensity,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,37.46875,46.1875,3.609375,1.0,-1.0,59.46875,20.203125,64.5,7.691406,5.945312,...,28.046875,55.3125,1.0,28.203125,42.59375,1.0,0.0,0.0,0.0,0.0
1,38.8125,35.125,7.269531,-1.0,1.0,67.4375,29.234375,53.4375,25.859375,6.742188,...,0.0,0.0,0.0,0.0,0.0,0.0,26.265625,27.953125,5.378906,9.0
2,21.375,34.375,0.330078,-1.0,1.0,32.46875,4.449219,51.25,16.21875,3.246094,...,0.0,0.0,0.0,0.0,0.0,0.0,11.65625,39.78125,3.5,1.0
3,48.03125,20.03125,1.480469,1.0,1.0,56.53125,20.890625,55.71875,2.470703,5.652344,...,34.0,35.40625,1.0,34.75,29.53125,1.0,0.0,0.0,0.0,5.0
4,58.375,25.1875,3.230469,-1.0,-1.0,71.4375,41.9375,49.65625,7.46875,7.140625,...,0.0,0.0,0.0,41.375,31.6875,1.0,62.65625,14.898438,7.378906,14.0


### Test Set

In [13]:
    # Summarise the test set, then render its first five rows.
    describe_dataset(df_test)
    display(df_test.head(n=5))

Shape: (354516, 250)

Pitch index counts
------------------
0	: 158859
1	: 5015
2	: 6333
3	: 4566
4	: 7685
5	: 5637
6	: 5761
7	: 5364
8	: 7764
9	: 7911
10	: 7595
11	: 9788
12	: 9810
13	: 9572
14	: 9769
15	: 9642
16	: 9745
17	: 8171
18	: 7813
19	: 8537
20	: 6508
21	: 6588
22	: 6722
23	: 5770
24	: 9299
25	: 6579
26	: 7713


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,homeMaxX,homeMinX,homeMaxY,homeMinY,homeAvgX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardSlowDensity,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardHirDensity,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,89.375,23.984375,1.620117,-1.0,1.0,100.5,0.0,42.15625,0.0,10.046875,...,0.0,0.0,0.0,91.3125,35.75,1.0,0.0,0.0,0.0,26.0
1,71.75,25.84375,2.560547,-1.0,1.0,92.6875,51.0,45.375,13.84375,9.265625,...,65.9375,36.96875,0.072571,61.78125,19.09375,1.0,0.0,0.0,0.0,26.0
2,47.3125,44.90625,2.820312,-1.0,-1.0,62.375,27.5,60.0625,21.203125,6.238281,...,0.0,0.0,0.0,31.8125,35.53125,1.0,39.5,50.5625,5.480469,11.0
3,72.0625,18.609375,3.359375,-1.0,1.0,79.0,47.21875,47.6875,2.480469,7.898438,...,0.0,0.0,0.0,0.0,0.0,0.0,49.625,20.703125,4.390625,17.0
4,58.3125,23.78125,0.070007,1.0,1.0,78.875,50.3125,57.375,5.789062,8.765625,...,59.3125,30.4375,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Exporting

In [16]:
    # Write each split to ../data/general/<split>/all_<split>.csv without the
    # index column. The target directory is created first so that a missing
    # folder cannot abort the export once the data sets have been built.
    for split_name, split_df in (('train', df_train), ('test', df_test)):
        split_dir = '../data/general/{dataset}'.format(dataset=split_name)
        makedirs(split_dir, exist_ok=True)
        split_df.to_csv('{}/all_{}.csv'.format(split_dir, split_name), index=False)