# Build the training and test dataset
The training dataset is split between two dataframes. We combine them to build the training data, and write the labels to a separate file.

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Load the match table and the player-attribute table from their pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_match, df_player = map(
    pd.read_pickle,
    ("match_field_plapi2.pkl", "player_attribs.pkl"),
)

In [3]:
# Preview the first five rows of the match table.
df_match.head(n=5)

Unnamed: 0,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_2,home_player_3,home_player_4,...,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
0,25,2009-03-07,493027,8635,10000,2,0,38388.0,26458.0,13423.0,...,37886.0,37903.0,37889.0,94030.0,37893.0,37981.0,131531.0,130027.0,38231.0,131530.0
1,26,2009-03-13,493034,8203,8635,2,1,67950.0,67958.0,38801.0,...,38388.0,38389.0,31316.0,164694.0,30949.0,38378.0,38383.0,38393.0,38253.0,37069.0
2,29,2009-04-12,493061,8635,8342,1,0,33620.0,38388.0,13423.0,...,39578.0,21812.0,11736.0,37858.0,37983.0,27364.0,38336.0,38366.0,27423.0,38440.0
3,32,2009-05-02,493089,10000,9985,0,0,37886.0,37100.0,37903.0,...,39580.0,30692.0,38800.0,47411.0,35412.0,39631.0,39591.0,148335.0,37262.0,148315.0
4,34,2009-05-16,493107,9991,9985,0,1,38337.0,38255.0,12473.0,...,30692.0,38800.0,37861.0,156551.0,35412.0,26224.0,39631.0,39591.0,37262.0,38369.0


In [4]:
# Preview the first five rows of the player-attribute table.
df_player.head(n=5)

Unnamed: 0,player_api_id,date,overall_rating,potential,preferred_foot,crossing,finishing,heading_accuracy,short_passing,volleys,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,505942,2016-02-18,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
1,505942,2015-11-19,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
2,505942,2015-09-21,0.62,0.66,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.63,0.41,0.45,0.54,0.65,0.66,0.69
3,505942,2015-03-20,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66
4,505942,2007-02-22,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66


In [5]:
# Sanity check: print True for each table that contains no missing value
# (match table first, then player table).
for frame in (df_match, df_player):
    print(not frame.isna().values.any())

True
True


In [6]:
# Columns of the match table that store player api ids: the name contains
# "_player_" and contains neither "_X" nor "_Y".
player_api_cols = [
    col for col in df_match.columns
    if "_player_" in col and not ("_X" in col or "_Y" in col)
]

# Every player id referenced by a match must also exist in the player table.
# Report the problem once, on the first column that contains an unknown id.
if any(
    (~np.isin(df_match[api_col], df_player['player_api_id'])).any()
    for api_col in player_api_cols
):
    print("API for Match is not a subset of Player!!")
print("Done.")

Done.


In [7]:
# Show the column labels, then the (rows, columns) shape, of the match table.
for match_summary in (df_match.columns, df_match.shape):
    print(match_summary)

Index([u'stage', u'date', u'match_api_id', u'home_team_api_id',
       u'away_team_api_id', u'home_team_goal', u'away_team_goal',
       u'home_player_2', u'home_player_3', u'home_player_4', u'home_player_5',
       u'home_player_6', u'home_player_7', u'home_player_8', u'home_player_9',
       u'home_player_10', u'home_player_11', u'away_player_2',
       u'away_player_3', u'away_player_4', u'away_player_5', u'away_player_6',
       u'away_player_7', u'away_player_8', u'away_player_9', u'away_player_10',
       u'away_player_11'],
      dtype='object')
(18251, 27)


The code below is a really inefficient way to insert the player attributes, but I could not find a vectorized implementation, mainly because:

1. For each match, we need the player attributes recorded closest to the date of the match, so for every player we have to find the record that minimizes the absolute difference in dates.

In [8]:
from __future__ import print_function

samples = df_match.shape[0]
input_features = torch.zeros(samples, 20, 29)

for index, row in df_match.iterrows():
    print(index, end='\r')
    
    field_cols = df_match.columns[7:]
    for rnum_col_name, p_api_id in row[field_cols].iteritems():
        if p_api_id == 0.0:
            continue
                
        pa = rnum_col_name.split('_')
        rnum = int(pa[-1]) - 2
        if pa[0] =='away':
            rnum += 10
        
        match_date = row['date']
        min_diff = np.timedelta64(365 * 100, 'D')
        min_index = None
        
        for row_index, row in df_player.loc[df_player['player_api_id'] == p_api_id, :].iterrows():
            player_diff = abs(row['date'] - match_date)
            if player_diff < min_diff:
                min_diff = player_diff
                min_index = row_index
        
        # Knowing this min_index, store the player attributes from this index
        input_features[index, rnum, :] = torch.FloatTensor(df_player.iloc[min_index].values[2:].astype(np.double))     

18250

In [13]:
# Persist the feature tensor to disk so later runs can load it directly.
# Only input_features is written by this cell.
torch.save(
    input_features,
    "match_wplayer_attrib_features2.pkl",
)

Unnamed: 0,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_2,home_player_3,home_player_4,...,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,match_result
0,25,2009-03-07,493027,8635,10000,2,0,38388.0,26458.0,13423.0,...,37903.0,37889.0,94030.0,37893.0,37981.0,131531.0,130027.0,38231.0,131530.0,1
1,26,2009-03-13,493034,8203,8635,2,1,67950.0,67958.0,38801.0,...,38389.0,31316.0,164694.0,30949.0,38378.0,38383.0,38393.0,38253.0,37069.0,1
2,29,2009-04-12,493061,8635,8342,1,0,33620.0,38388.0,13423.0,...,21812.0,11736.0,37858.0,37983.0,27364.0,38336.0,38366.0,27423.0,38440.0,1
3,32,2009-05-02,493089,10000,9985,0,0,37886.0,37100.0,37903.0,...,30692.0,38800.0,47411.0,35412.0,39631.0,39591.0,148335.0,37262.0,148315.0,0
4,34,2009-05-16,493107,9991,9985,0,1,38337.0,38255.0,12473.0,...,38800.0,37861.0,156551.0,35412.0,26224.0,39631.0,39591.0,37262.0,38369.0,2
