In [1]:
# Copyright 2018 Anar Amirli
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train/Test Set Construction
In this document, we construct training and test sets from already computed feature sets. The sets are computed according to the layout:

data<br>
├── train_features<br>
├── test_features<br>

In [1]:
from os import listdir
from os.path import join
import pickle
import numpy as np
import pandas as pd
import multiprocessing
from collections import Counter
from random import shuffle
import json
import re

# our common utility functions that are used in different notebooks
from utils import *

# Switch on pandas' bottleneck and numexpr compute options.
for accel_option in ('compute.use_bottleneck', 'compute.use_numexpr'):
    pd.set_option(accel_option, True)

In [2]:
def describe_dataset(dataset, label_col=None):
    """Print the shape of ``dataset`` and how often each pitch index occurs.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame in which one column holds the pitch index label.
    label_col : str, optional
        Name of the column holding the pitch index. When omitted, the
        column three positions from the end is used, which is how this
        function has always located the label.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    print('Shape: {}\n'.format(dataset.shape))

    if label_col is None:
        # Positional fallback (third column from the end). Only this one
        # column is read, instead of converting the whole frame to an array.
        labels = dataset.iloc[:, dataset.shape[1] - 3]
    else:
        labels = dataset[label_col]

    heading = 'Pitch index counts'
    print(heading + '\n' + '-'*len(heading))
    for key, val in sorted(Counter(labels.tolist()).items()):
        print('{}\t: {}'.format(int(key), val))

## Pitch IDs

In [3]:
if __name__ == '__main__':

    # Inspect a single match to list the pitch indexes present in the features.
    match_id = 60565
    # Deliberately not wrapped in try/except: swallowing a missing file here
    # only postponed the failure to a NameError on `features_df` below.
    # Letting FileNotFoundError propagate reports the offending path directly.
    features_df = pd.read_csv('../data/general/feature-set/match{}_features_2.csv'.format(match_id))

    # Sorted distinct values of the pitch_index column.
    ball_idxs = np.sort(features_df['pitch_index'].unique())
    print('Ball  index: {}'.format(ball_idxs))

    print('Size: {}'.format(len(ball_idxs)))

Ball  index: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
Size: 10


The null index (0) represents all event categories that occur when the game stops, apart from the ones we are interested in.

## Dataset Construction

In this section, we construct combined, shuffled datasets for the train and test sets from the feature data.


### Parameters

When obtaining feature data for a given index set, we should reduce the number of frames of the null ball events in our dataset. But considering the fact that they are the most common, we should keep their number close to that of the other most frequently occurring ball location indexes.

* $-$: Unlabeled position intervals would lead to greater label noise.

### Construction
Here we construct the combined dataset from all match data we have in the given data directories.

In [11]:
    # file based constants
    features_dir     = '../data/general/feature-set/'
    # The dot is escaped and the end is anchored so that only names of the form
    # match<digits>_features_2.csv are accepted; an unescaped '.' would match
    # any character and an unanchored pattern would accept trailing suffixes.
    features_regex   = re.compile(r'match\d+_features_2\.csv$')

We speed up the computation by using all the CPU cores via the `multiprocessing` module.

In [12]:
    # Number of matches assigned to the training split; the rest form the test split.
    N_TRAIN_MATCHES = 248
    # Fixed seed so the match-level train/test split is the same on every run.
    SPLIT_SEED = 42

    # NOTE(review): `df` is not used by any visible cell; kept in case another cell relies on it.
    df = pd.DataFrame()

    # listdir() makes no ordering guarantee, so sort first; the seeded shuffle
    # then yields the same split regardless of the file system.
    features_csv_files = sorted(f for f in listdir(features_dir) if features_regex.match(f))
    np.random.RandomState(SPLIT_SEED).shuffle(features_csv_files)
    print('All matches: {}'.format(len(features_csv_files)))
    features_train_csv = features_csv_files[:N_TRAIN_MATCHES]
    features_test_csv  = features_csv_files[N_TRAIN_MATCHES:]
    print('Train matches: {}'.format(len(features_train_csv)))
    print('Test matches: {}'.format(len(features_test_csv)))

    # construct_data_set comes from the star-import of utils; presumably it turns
    # one feature file name into a DataFrame -- TODO confirm against utils.
    # The context manager shuts the worker processes down once both splits are
    # built instead of leaving them alive for the rest of the session.
    with multiprocessing.Pool() as pool:
        df_train = pd.concat(pool.map(construct_data_set, features_train_csv))
        df_test = pd.concat(pool.map(construct_data_set, features_test_csv))

    print('Train-set shape: {}'.format(df_train.shape))
    print('Test-set shape: {}'.format(df_test.shape))

All matches: 298
Train matches: 248
Test matches: 50
Train-set shape: (569363, 230)
Test-set shape: (110019, 230)


In [13]:
    # Shuffle the rows of both data sets again and renumber the index from 0.
    # A fixed random_state keeps the row order, and therefore the exported
    # files, identical across runs.
    df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
    df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)

### Train Set

In [14]:
    # Print the shape and per-pitch-index row counts of the training set,
    # then render its first rows.
    describe_dataset(df_train)
    display(df_train.head())

Shape: (569363, 230)

Pitch index counts
------------------
0	: 174376
1	: 16287
2	: 39363
3	: 15413
4	: 48708
5	: 142437
6	: 49261
7	: 17806
8	: 44841
9	: 20871


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,teamsDBSCANx,teamsDBSCANy,homeDBSCANx,homeDBSCANy,homeMaxX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index,xpos,ypos
0,62.34375,31.109375,1.269531,-1.0,-1.0,57.21875,39.75,55.40625,40.03125,81.25,...,0.0,0.0,43.96875,36.9375,0.0,0.0,0.0,5.0,67.0,28.734375
1,15.65625,26.234375,0.199951,1.0,1.0,14.96875,32.59375,14.882812,31.796875,17.3125,...,14.25,29.3125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19.046875,46.59375,4.839844,-1.0,1.0,35.5625,38.84375,32.5625,40.0625,63.46875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,9.46875,49.03125
3,58.375,33.28125,1.44043,1.0,-1.0,56.28125,41.28125,52.3125,36.625,68.75,...,0.0,0.0,45.3125,32.625,0.0,0.0,0.0,6.0,54.40625,68.0
4,85.0,31.484375,0.640137,1.0,-1.0,87.0625,33.4375,86.5,31.203125,89.5625,...,84.5625,52.28125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Test Set

In [15]:
    # Print the shape and per-pitch-index row counts of the test set,
    # then render its first rows.
    describe_dataset(df_test)
    display(df_test.head())

Shape: (110019, 230)

Pitch index counts
------------------
0	: 35361
1	: 3510
2	: 7838
3	: 2816
4	: 9098
5	: 26959
6	: 9202
7	: 3350
8	: 7944
9	: 3941


Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,teamsDBSCANx,teamsDBSCANy,homeDBSCANx,homeDBSCANy,homeMaxX,...,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index,xpos,ypos
0,64.0,42.8125,2.210938,-1.0,1.0,65.0,40.1875,62.40625,40.4375,81.6875,...,48.0625,40.15625,59.03125,63.1875,0.0,0.0,0.0,8.0,90.0625,48.59375
1,39.5625,44.53125,0.930176,-1.0,1.0,34.28125,43.96875,34.1875,44.28125,67.9375,...,31.421875,56.375,0.0,0.0,0.0,0.0,0.0,6.0,60.5625,63.40625
2,88.75,23.40625,0.049988,-1.0,-1.0,97.4375,30.0625,97.75,27.640625,101.8125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,66.25,22.65625,1.769531,-1.0,1.0,59.375,32.5625,53.15625,31.609375,74.4375,...,0.0,0.0,59.21875,24.21875,49.625,37.9375,3.820312,5.0,44.1875,45.53125
4,31.3125,27.96875,1.839844,1.0,-1.0,18.78125,29.421875,17.046875,28.1875,53.71875,...,28.203125,26.109375,16.625,45.59375,0.0,0.0,0.0,0.0,0.0,0.0


## Exporting

In [16]:
    # Write each split to ../data/general/<split>/all_<split>_3.csv, leaving out the index column.
    export_template = '../data/general/{dataset}/all_{dataset}_3.csv'
    for split_name, split_df in (('train', df_train), ('test', df_test)):
        split_df.to_csv(export_template.format(dataset=split_name), index=False)

In [14]:
    # Show rows 1 through 9 of the training set.
    # NOTE(review): this cell's execution count is out of sequence and its saved
    # output ends at pitch_index (no xpos/ypos), unlike the other previews, so the
    # saved output appears to predate the current data -- re-run or remove.
    df_train.iloc[1:10]

Unnamed: 0,# refX,refY,refSpeed,refDirectX,refDirectY,teamsDBSCANx,teamsDBSCANy,homeDBSCANx,homeDBSCANy,homeMaxX,...,awayCenterForwardAvgY,awayCenterForwardAvgSpeed,awayCenterForwardSlowAvgX,awayCenterForwardSlowAvgY,awayCenterForwardHirAvgX,awayCenterForwardHirAvgY,awayCenterForwardMaxSprintX,awayCenterForwardMaxSprintY,awayCenterForwardMaxSprintSpeed,pitch_index
1,52.1875,21.234375,5.558594,1.0,-1.0,41.40625,23.109375,39.21875,17.34375,71.0625,...,23.09375,5.523438,0.0,0.0,0.0,0.0,66.3125,10.15625,7.511719,4.0
2,48.21875,34.03125,2.640625,-1.0,-1.0,52.625,23.109375,50.6875,13.921875,68.0625,...,10.351562,3.839844,0.0,0.0,0.0,0.0,50.75,10.351562,3.839844,4.0
3,23.015625,32.40625,1.450195,-1.0,1.0,12.015625,25.515625,11.921875,25.9375,52.3125,...,34.0,2.259766,0.0,0.0,11.226562,34.0,0.0,0.0,0.0,5.0
4,85.0625,40.8125,1.410156,1.0,1.0,92.4375,44.40625,82.0,45.0,100.125,...,44.4375,0.040009,54.53125,44.4375,0.0,0.0,0.0,0.0,0.0,0.0
5,77.5625,23.515625,1.110352,1.0,-1.0,83.375,32.5625,80.5625,29.828125,87.875,...,47.125,1.344727,68.1875,47.125,0.0,0.0,0.0,0.0,0.0,8.0
6,36.15625,25.296875,0.939941,-1.0,-1.0,40.75,20.4375,33.4375,20.21875,56.6875,...,25.234375,0.930176,18.953125,25.234375,0.0,0.0,0.0,0.0,0.0,2.0
7,36.625,29.65625,3.609375,1.0,-1.0,36.03125,29.859375,22.390625,31.890625,58.375,...,34.46875,2.669922,0.0,0.0,17.046875,34.46875,0.0,0.0,0.0,1.0
8,82.4375,43.15625,0.370117,-1.0,-1.0,92.75,44.96875,88.8125,51.375,101.9375,...,57.75,1.540039,0.0,0.0,77.125,57.75,0.0,0.0,0.0,0.0
9,63.1875,27.0625,1.669922,1.0,-1.0,63.625,34.5625,56.34375,33.6875,72.4375,...,30.90625,3.449219,0.0,0.0,55.5625,40.15625,56.4375,21.625,3.919922,5.0


## Data Set Construction for RF-Regression Model

In [4]:
    # file based constants
    features_dir     = '../data/general/feature-set/'
    # The dot is escaped and the end is anchored so that only names of the form
    # match<digits>_features_2.csv are accepted; an unescaped '.' would match
    # any character and an unanchored pattern would accept trailing suffixes.
    features_regex   = re.compile(r'match\d+_features_2\.csv$')

In [5]:
    # Fixed seed so the order in which matches are concatenated is reproducible.
    SHUFFLE_SEED = 42

    # NOTE(review): `df` is not used by any visible cell; kept in case another cell relies on it.
    df = pd.DataFrame()

    # listdir() makes no ordering guarantee, so sort before the seeded shuffle.
    features_csv_files = sorted(f for f in listdir(features_dir) if features_regex.match(f))
    np.random.RandomState(SHUFFLE_SEED).shuffle(features_csv_files)

    print('All matches: {}'.format(len(features_csv_files)))
    # Every match is used here; copy the list rather than slicing its full range.
    features_all_csv = list(features_csv_files)

    # construct_data_set comes from the star-import of utils; presumably it turns
    # one feature file name into a DataFrame -- TODO confirm against utils.
    # The context manager shuts the worker processes down once the data is built.
    with multiprocessing.Pool() as pool:
        df_all_data = pd.concat(pool.map(construct_data_set, features_all_csv))

    print('All-data-set shape: {}'.format(df_all_data.shape))

    describe_dataset(df_all_data)

All matches: 298
All-data-set shape: (679382, 230)
Shape: (679382, 230)

Pitch index counts
------------------
0	: 209737
1	: 19797
2	: 47201
3	: 18229
4	: 57806
5	: 169396
6	: 58463
7	: 21156
8	: 52785
9	: 24812


## Exporting

In [8]:
    # Write the combined data set to ../data/general/all/data_all_3.csv, leaving out the index column.
    all_data_csv = '../data/general/{dataset}/data_{dataset}_3.csv'.format(dataset='all')
    df_all_data.to_csv(all_data_csv, index=False)