In [1]:
# Copyright 2018 Anar Amirli
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train/Test Set Construction
In this document, we construct training and test sets from already computed feature sets. The sets are computed according to the layout:

data<br>
├── train_features<br>
├── test_features<br>

In [2]:
import json
import multiprocessing
import pickle
import re
from collections import Counter
from os import listdir
from os.path import join
from random import Random, shuffle

import numpy as np
import pandas as pd

# our common utility functions that are used in different notebooks
from utils import *

# switch on both optional pandas compute accelerators (bottleneck and numexpr)
for accel_option in ('compute.use_bottleneck', 'compute.use_numexpr'):
    pd.set_option(accel_option, True)

In [3]:
def describe_dataset(dataset):
    """Print a short summary of ``dataset``.

    The summary consists of the frame's shape followed by a table that lists,
    in ascending order, every distinct value found in the last column together
    with the number of rows carrying it. The values are printed as integers.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose last column holds the label to be counted.
    """
    print('Shape: {}\n'.format(dataset.shape))

    # the label is read from the right-most column of the frame
    last_col = dataset.shape[1] - 1

    title = 'Pitch index counts'
    underline = '-' * len(title)
    print(title + '\n' + underline)

    label_counts = Counter(dataset.values[:, last_col])
    for label in sorted(label_counts):
        print('{}\t: {}'.format(int(label), label_counts[label]))

## Pitch IDS

In [4]:
if __name__ == '__main__':

    # Load the feature set of a single match and list the distinct values of
    # its 'pitch_index' column.
    match_id = 60565
    features_path = '../data/general/feature-set/match{}_features.csv'.format(match_id)
    try:
        features_df = pd.read_csv(features_path)
    except FileNotFoundError as err:
        # Fail right here with the offending path. Silently ignoring the error
        # would only defer the failure to the next statement (NameError), or
        # worse, let a stale `features_df` from the kernel be used.
        raise FileNotFoundError(
            'Feature file not found: {}'.format(features_path)) from err

    ball_idxs = np.sort(features_df['pitch_index'].unique())
    print('Ball  index: {}'.format(ball_idxs))

    print('Size: {}'.format(len(ball_idxs)))

Ball  index: [  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.]
Size: 27


The null index (0) represents all event categories that occur when the game stops, apart from the ones we are interested in.

## Dataset Construction

In this section, we construct combined, shuffled datasets for the train and test sets from the feature data.


### Parameters

When obtaining feature data for a given index set, we should reduce the number of frames of the null ball events in our dataset. But considering the fact that they are the most common, we should keep their number close to that of the other most frequently occurring ball location indexes.

* $-$: Unlabeled position intervals would lead to greater label noise.

### Construction
Here we construct the combined dataset from all match data we have in the given data directories.

In [5]:
    # file based constants
    features_dir     = '../data/general/feature-set/'
    # Matches exactly 'match<digits>_features.csv'. The dot is escaped and the
    # pattern is anchored at the end so that, when used with .match(), names
    # such as 'match1_features.csv.bak' or 'match1_featuresXcsv' are rejected.
    features_regex   = re.compile(r'match\d+_features\.csv\Z')

We speed up the computation by using all the CPU cores via the `multiprocessing` module.

In [6]:
    # Split the match files into train and test matches, then build both sets
    # in parallel (one task per match file).
    n_train_matches = 259   # number of matches used for training; the rest form the test set
    split_seed = 42         # fixed seed so the match-level split is the same on every run

    # listdir() returns names in arbitrary order, so sort before the seeded
    # shuffle; otherwise the same seed could still yield a different split.
    features_csv_files = sorted(f for f in listdir(features_dir) if features_regex.match(f))
    Random(split_seed).shuffle(features_csv_files)
    print('All matches: {}'.format(len(features_csv_files)))

    # Without this check an empty test split would only surface later as an
    # opaque "No objects to concatenate" error from pd.concat.
    if len(features_csv_files) <= n_train_matches:
        raise ValueError(
            'Need more than {} feature files in {} to build a test set, found {}'.format(
                n_train_matches, features_dir, len(features_csv_files)))

    features_train_csv = features_csv_files[:n_train_matches]
    features_test_csv  = features_csv_files[n_train_matches:]
    print('Train matches: {}'.format(len(features_train_csv)))
    print('Test matches: {}'.format(len(features_test_csv)))

    # construct_train_set / construct_test_set are not defined in this notebook
    # (presumably provided by utils); each is called with one file name and its
    # results are concatenated. The context manager releases the worker
    # processes once both maps have returned.
    with multiprocessing.Pool() as pool:
        df_train = pd.concat(pool.map(construct_train_set, features_train_csv))
        df_test = pd.concat(pool.map(construct_test_set, features_test_csv))

    print('Train-set shape: {}'.format(df_train.shape))
    print('Test-set shape: {}'.format(df_test.shape))

All matches: 299
Train matches: 259
Test matches: 40
Train-set shape: (720963, 250)
Test-set shape: (188376, 250)


In [7]:
    # Shuffle the rows of both data sets again and renumber the index from 0.
    # A fixed random_state keeps the row order (and therefore the exported
    # CSV files) identical across runs.
    shuffle_seed = 42
    df_train = df_train.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    df_test = df_test.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

### Train Set

In [8]:
    # shape and per-label row counts of the train set, then its first five rows
    describe_dataset(df_train)
    display(df_train.head(5))

Shape: (720963, 250)

Pitch index counts
------------------
0	: 36260
1	: 17915
2	: 22567
3	: 16459
4	: 26775
5	: 19711
6	: 20625
7	: 18784
8	: 27588
9	: 27742
10	: 26656
11	: 33906
12	: 34807
13	: 33584
14	: 33723
15	: 34690
16	: 34340
17	: 28625
18	: 28117
19	: 29744
20	: 21538
21	: 22771
22	: 23706
23	: 19970
24	: 32108
25	: 22457
26	: 25795


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,homeMaxX,homeMinX,homeMaxY,homeMinY,homeAvgX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardSlowDensity,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardHirDensity,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,66.5,29.65625,3.019531,1.0,-1.0,72.9375,42.84375,64.1875,3.839844,60.5625,...,0.0,0.0,0.0,43.5625,45.375,1.0,0.0,0.0,0.0,15.0
1,80.875,24.8125,3.289062,1.0,1.0,93.9375,50.25,51.40625,2.910156,71.5,...,49.59375,37.34375,1.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0
2,51.21875,37.4375,1.820312,1.0,1.0,79.5,40.96875,63.59375,12.796875,59.53125,...,0.0,0.0,0.0,0.0,0.0,0.0,39.96875,23.59375,3.480469,22.0
3,79.125,13.640625,0.529785,1.0,1.0,88.5,57.28125,32.46875,4.730469,73.1875,...,67.75,7.929688,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0
4,19.046875,54.3125,0.919922,1.0,1.0,49.0,8.9375,64.5,23.1875,24.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0


### Test Set

In [9]:
    # shape and per-label row counts of the test set, then its first five rows
    describe_dataset(df_test)
    display(df_test.head(5))

Shape: (188376, 250)

Pitch index counts
------------------
0	: 84233
1	: 2867
2	: 3220
3	: 2832
4	: 4611
5	: 3385
6	: 3113
7	: 3311
8	: 4537
9	: 4335
10	: 4263
11	: 5188
12	: 5211
13	: 5127
14	: 5179
15	: 5419
16	: 5050
17	: 4257
18	: 4317
19	: 4118
20	: 3123
21	: 3240
22	: 3080
23	: 2883
24	: 4546
25	: 2925
26	: 4006


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,homeMaxX,homeMinX,homeMaxY,homeMinY,homeAvgX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardSlowDensity,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardHirDensity,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
0,61.1875,24.140625,4.179688,-1.0,1.0,72.25,47.375,48.25,6.859375,61.5,...,0.0,0.0,0.0,43.53125,32.25,1.0,39.375,18.671875,3.210938,17.0
1,17.484375,52.625,1.299805,1.0,0.0,39.34375,3.669922,63.6875,25.265625,20.625,...,23.296875,62.6875,0.231445,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57.71875,27.40625,2.720703,-1.0,1.0,77.6875,45.21875,54.96875,10.273438,57.4375,...,0.0,0.0,0.0,47.53125,25.828125,0.865723,0.0,0.0,0.0,18.0
3,90.4375,10.296875,1.860352,-1.0,1.0,100.375,54.84375,43.625,2.929688,78.375,...,0.0,0.0,0.0,0.0,0.0,0.0,71.75,24.984375,4.019531,0.0
4,39.75,26.40625,1.610352,1.0,1.0,58.0625,0.0,51.5,0.0,39.65625,...,0.0,0.0,0.0,22.390625,32.34375,1.0,0.0,0.0,0.0,0.0


## Exporting

In [10]:
    # write every split to ../data/general/<split>/all_<split>.csv, leaving out the row index
    for dataset, frame in (('train', df_train), ('test', df_test)):
        frame.to_csv('../data/general/{dataset}/all_{dataset}.csv'.format(dataset=dataset), index=False)

In [15]:
    # show test-set rows at positions 1 through 9
    df_test.iloc[1:10]

Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,homeMaxX,homeMinX,homeMaxY,homeMinY,homeAvgX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardSlowDensity,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardHirDensity,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
1,17.484375,52.625,1.299805,1.0,0.0,39.34375,3.669922,63.6875,25.265625,20.625,...,23.296875,62.6875,0.231445,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57.71875,27.40625,2.720703,-1.0,1.0,77.6875,45.21875,54.96875,10.273438,57.4375,...,0.0,0.0,0.0,47.53125,25.828125,0.865723,0.0,0.0,0.0,18.0
3,90.4375,10.296875,1.860352,-1.0,1.0,100.375,54.84375,43.625,2.929688,78.375,...,0.0,0.0,0.0,0.0,0.0,0.0,71.75,24.984375,4.019531,0.0
4,39.75,26.40625,1.610352,1.0,1.0,58.0625,0.0,51.5,0.0,39.65625,...,0.0,0.0,0.0,22.390625,32.34375,1.0,0.0,0.0,0.0,0.0
5,35.28125,47.125,1.549805,-1.0,-1.0,51.40625,18.953125,51.84375,19.1875,31.09375,...,0.0,0.0,0.0,18.125,27.5625,1.0,0.0,0.0,0.0,7.0
6,63.46875,24.5,1.490234,-1.0,1.0,70.4375,38.21875,45.0,19.203125,53.0625,...,36.875,24.28125,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,27.953125,34.3125,0.399902,-1.0,-1.0,22.765625,0.0,40.09375,0.0,14.570312,...,0.0,0.0,0.0,23.796875,28.90625,1.0,12.117188,43.96875,3.279297,0.0
8,57.28125,24.9375,4.21875,1.0,1.0,66.1875,29.953125,54.65625,2.990234,48.59375,...,0.0,0.0,0.0,0.0,0.0,0.0,51.3125,47.125,7.738281,0.0
9,28.1875,36.84375,3.390625,1.0,-1.0,44.9375,0.0,46.46875,0.0,24.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,38.53125,34.0625,4.921875,0.0
