### Setup

In [13]:
import os
import pandas as pd

dataset_input = "Datasets/"

### Utilities

In [46]:
def read_directory(path, extension='.csv'):
    return [f for f in os.listdir(path) if f.endswith(extension)]

def write_csv(path, dataset):
    dataset.to_csv(path, index=False)

def read_data(path, columns_to_keep=[]):
    files = read_directory(path)
    dfs = []
    keys = []
    for file in files:
        file_path = os.path.join(path, file)
        year = str(file).split('.')[0].split('_')[2]
        
        df = pd.read_csv(file_path)
        df.fillna(0, inplace=True)
        filtered_df = df[columns_to_keep]
        
        keys.append(year)
        dfs.append(filtered_df)

    result = pd.concat(dfs, keys=keys, axis=0)
    result.reset_index(level=0, inplace=True)
    result.rename(columns={'level_0': 'season'}, inplace=True)
    result.reset_index(drop=True, inplace=True)
    return result

### Preprocessing

In [52]:
df_ratings = read_data(
        path=os.path.join(dataset_input, 'ratings'),
        columns_to_keep=[
            'sofifa_id',
            'attacking_finishing',
            'attacking_heading_accuracy',
            'attacking_short_passing',
            'attacking_volleys',
            'defending_marking',
            'defending_sliding_tackle',
            'defending_standing_tackle',
            'gk_diving',
            'gk_handling',
            'gk_positioning',
            'gk_reflexes',
            'gk_speed',
            'mentality_interceptions',
            'power_long_shots',
            'skill_long_passing'
        ])

df_ratings.head()

Unnamed: 0,season,sofifa_id,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,defending_marking,defending_sliding_tackle,defending_standing_tackle,gk_diving,gk_handling,gk_positioning,gk_reflexes,gk_speed,mentality_interceptions,power_long_shots,skill_long_passing
0,20152016,158023,93,71,88,85,13.0,21,23,0.0,0.0,0.0,0.0,0.0,22,88,79
1,20152016,20801,95,86,81,87,22.0,23,31,0.0,0.0,0.0,0.0,0.0,29,93,72
2,20152016,9014,85,51,85,86,29.0,26,26,0.0,0.0,0.0,0.0,0.0,39,90,74
3,20152016,167495,13,25,48,11,10.0,11,10,85.0,87.0,90.0,86.0,60.0,30,16,47
4,20152016,176580,90,77,82,87,30.0,38,45,0.0,0.0,0.0,0.0,0.0,41,85,64


In [331]:
df_matches = read_data(
        path=os.path.join(dataset_input, 'matches'),
        columns_to_keep=[
            'match_url', 
            'home_formation', 
            'home_team', 
            'home_goals', 
            'away_goals', 
            'away_team', 
            'away_formation', 
            'home_sequence', 
            'away_sequence', 
            'home_xi_sofifa_ids', 
            'away_xi_sofifa_ids'
        ])

df_matches['match_id'] = df_matches['match_url'].apply(lambda x: str(x).split('/')[-1])
df_matches = df_matches[['match_id'] + df_matches.columns[:-1].to_numpy().tolist()]
df_matches = df_matches.drop(columns=['match_url'])
df_matches

Unnamed: 0,match_id,season,home_formation,home_team,home_goals,away_goals,away_team,away_formation,home_sequence,away_sequence,home_xi_sofifa_ids,away_xi_sofifa_ids
0,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,"C,R,CR,CL,L,R,CR,CL,L,CR,CL","C,R,CR,CL,L,CR,CL,RL,C,LR,C","170084.0,186392.0,157665.0,159577.0,193475.0,2...","192119.0,178372.0,204311.0,13732.0,184432.0,16..."
1,12284,20152016,4-2-3-1,Arsenal,2,1,Manchester City,4-4-1-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,R,CR,CL,L,C,C","48940.0,203747.0,53612.0,165229.0,177604.0,156...","150724.0,163419.0,192366.0,190531.0,185103.0,1..."
2,12346,20152016,4-2-3-1,Crystal Palace,1,2,AFC Bournemouth,4-1-4-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,C,R,CR,CL,L,C","170084.0,186392.0,157665.0,159577.0,193475.0,1...","105846.0,190885.0,135883.0,193011.0,169638.0,1..."
3,12141,20152016,4-2-3-1,Everton,0,2,Manchester City,4-2-3-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,CR,CL,RL,C,LR,C","16254.0,180216.0,203574.0,53914.0,209852.0,188...","150724.0,163419.0,139720.0,190531.0,185103.0,1..."
4,12232,20152016,4-2-3-1,Arsenal,1,1,Tottenham Hotspur,4-2-3-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,CR,CL,RL,C,LR,C","48940.0,158626.0,53612.0,165229.0,177604.0,146...","167948.0,188377.0,184087.0,172871.0,169595.0,2..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1515,38419,20182019,4-1-4-1,Cardiff City,2,1,Brighton and Hove Albion,4-4-1-1,"C,R,CR,CL,L,C,R,CR,CL,L,C","C,R,CR,CL,L,R,CR,CL,L,C,C","193186.0,188270.0,187033.0,172425.0,198190.0,1...","199005.0,194644.0,192622.0,199915.0,177766.0,2..."
1516,38588,20182019,5-4-1,AFC Bournemouth,0,1,Manchester City,4-3-3,"C,R,CR,C,CL,L,R,CR,CL,L,C","C,R,CR,CL,L,RL,C,LR,RL,C,LR","105846.0,190456.0,233201.0,232057.0,208920.0,1...","210257.0,188377.0,203574.0,192366.0,227813.0,1..."
1517,38485,20182019,5-4-1,Newcastle United,0,0,Fulham,3-4-3,"C,R,CR,C,CL,L,R,CR,CL,L,C","C,RL,C,LR,R,CR,CL,L,RL,C,LR","220407.0,210972.0,210047.0,203487.0,204355.0,1...","206652.0,185349.0,208534.0,198261.0,200778.0,2..."
1518,38657,20182019,3-5-2,Wolverhampton Wanderers,0,0,Brighton and Hove Albion,4-3-3,"C,RL,C,LR,R,CR,C,CL,L,CR,CL","C,R,CR,CL,L,RL,C,LR,RL,C,LR","178005.0,183774.0,202048.0,202750.0,201417.0,2...","199005.0,145047.0,192622.0,199915.0,177766.0,1..."


In [356]:
# preprocess team formation
def process_formation(row):
    home_formation     = row['home_formation'].split('-')
    away_formation     = row['away_formation'].split('-')
    home_formation_len = len(home_formation)
    away_formation_len = len(away_formation)
    formation_len_diff = abs(away_formation_len - home_formation_len)
    max_formation_len  = max(away_formation_len, home_formation_len)
    formation_grid = ['F', 'MF'] + ['M' + str(i) for i in range(0, max_formation_len - 4)] + ['MD', 'D']

    if formation_len_diff == 0:
        # equal len
        home_formation_position = ','.join([x for x, y in zip(formation_grid, home_formation) for _ in range(int(y))])
        away_formation_position = ','.join([x for x, y in zip(formation_grid, away_formation) for _ in range(int(y))])
    else:
        if away_formation_len > home_formation_len:
            away_formation_position = ','.join([x for x, y in zip(formation_grid, away_formation) for _ in range(int(y))])
            # stretch home defensively
            expanded_formation = home_formation[:1] + ['0' * formation_len_diff] + home_formation[1:]
            home_formation_position = ','.join([x for x, y in zip(formation_grid, expanded_formation) for _ in range(int(y))])
        else:
            home_formation_position = ','.join([x for x, y in zip(formation_grid, home_formation) for _ in range(int(y))])
            # stretch away aggressively
            expanded_formation = away_formation[:-1] + ['0' * formation_len_diff] + away_formation[-1:]
            away_formation_position = ','.join([x for x, y in zip(formation_grid, expanded_formation) for _ in range(int(y))])

    row['formation_grid'] = formation_grid
    row['home_formation_position'] = 'K,' + home_formation_position
    row['away_formation_position'] = 'K,' + away_formation_position
    return row

df_matches_processed = df_matches.copy()
df_matches_processed[['home_formation', 'away_formation', 'formation_grid', 'home_formation_position', 'away_formation_position']] = df_matches[['home_formation', 'away_formation']].apply(process_formation, axis=1)
df_matches_processed[df_matches_processed['away_formation'] == '4-1-2-1-2']

Unnamed: 0,match_id,season,home_formation,home_team,home_goals,away_goals,away_team,away_formation,home_sequence,away_sequence,home_xi_sofifa_ids,away_xi_sofifa_ids,formation_grid,home_formation_position,away_formation_position
385,14257,20162017,4-3-3,Southampton,3,0,Leicester City,4-1-2-1-2,"C,R,CR,CL,L,RL,C,LR,RL,C,LR","C,R,CR,CL,L,C,RL,LR,C,CR,CL","172203.0,201118.0,191648.0,203376.0,169705.0,2...","163587.0,169600.0,136113.0,137785.0,139313.0,2...","[F, MF, M0, MD, D]","K,F,F,F,F,M0,M0,M0,MD,MD,MD","K,F,F,F,F,MF,M0,M0,MD,D,D"
403,14099,20162017,4-2-3-1,West Ham United,0,3,Southampton,4-1-2-1-2,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,C,RL,LR,C,CR,CL","194911.0,182888.0,176285.0,183855.0,146741.0,1...","172203.0,201118.0,171791.0,203376.0,169705.0,1...","[F, MF, M0, MD, D]","K,F,F,F,F,M0,M0,MD,MD,MD,D","K,F,F,F,F,MF,M0,M0,MD,D,D"
453,14052,20162017,4-2-3-1,Manchester United,2,0,Southampton,4-1-2-1-2,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,C,RL,LR,C,CR,CL","193080.0,167905.0,225508.0,190815.0,205988.0,1...","172203.0,201118.0,203376.0,171791.0,218659.0,1...","[F, MF, M0, MD, D]","K,F,F,F,F,M0,M0,MD,MD,MD,D","K,F,F,F,F,MF,M0,M0,MD,D,D"
457,14103,20162017,4-4-2,Leicester City,0,0,Southampton,4-1-2-1-2,"C,R,CR,CL,L,R,CR,CL,L,CR,CL","C,R,CR,CL,L,C,RL,LR,C,CR,CL","163587.0,169600.0,136113.0,137785.0,139313.0,2...","172203.0,201118.0,171791.0,203376.0,169705.0,1...","[F, MF, M0, MD, D]","K,F,F,F,F,M0,M0,M0,M0,MD,MD","K,F,F,F,F,MF,M0,M0,MD,D,D"
473,14409,20162017,3-4-2-1,West Ham United,0,4,Liverpool,4-1-2-1-2,"C,RL,C,LR,R,CR,CL,L,CR,CL,C","C,R,CR,CL,L,C,RL,LR,C,CR,CL","194911.0,171791.0,176285.0,19292.0,208892.0,21...","173426.0,190456.0,197061.0,195086.0,138412.0,2...","[F, MF, M0, MD, D]","K,F,F,F,M0,M0,M0,M0,MD,MD,D","K,F,F,F,F,MF,M0,M0,MD,D,D"
474,14241,20162017,3-5-2,Everton,4,0,Manchester City,4-1-2-1-2,"C,RL,C,LR,R,CR,C,CL,L,CR,CL","C,R,CR,CL,L,C,RL,LR,C,CR,CL","195668.0,225024.0,163824.0,199667.0,180216.0,2...","174543.0,163419.0,192366.0,203574.0,152554.0,2...","[F, MF, M0, MD, D]","K,F,F,F,M0,M0,M0,M0,M0,MD,MD","K,F,F,F,F,MF,M0,M0,MD,D,D"
564,14234,20162017,4-3-3,Middlesbrough,0,0,Leicester City,4-1-2-1-2,"C,R,CR,CL,L,RL,C,LR,RL,C,LR","C,R,CR,CL,L,C,RL,LR,C,CR,CL","164505.0,169426.0,205989.0,200408.0,189680.0,2...","163587.0,169600.0,136113.0,137785.0,139313.0,1...","[F, MF, M0, MD, D]","K,F,F,F,F,M0,M0,M0,MD,MD,MD","K,F,F,F,F,MF,M0,M0,MD,D,D"
730,14066,20162017,4-1-2-1-2,Southampton,1,1,Sunderland,4-1-2-1-2,"C,R,CR,CL,L,C,RL,LR,C,CR,CL","C,R,CR,CL,L,C,RL,LR,C,CR,CL","172203.0,201118.0,171791.0,203376.0,218659.0,1...","204935.0,210972.0,184082.0,197937.0,186190.0,1...","[F, MF, M0, MD, D]","K,F,F,F,F,MF,M0,M0,MD,D,D","K,F,F,F,F,MF,M0,M0,MD,D,D"
875,22489,20172018,4-4-2,Stoke City,2,1,Swansea City,4-1-2-1-2,"C,R,CR,CL,L,R,CR,CL,L,CR,CL","C,R,CR,CL,L,C,RL,LR,C,CR,CL","203042.0,204311.0,169596.0,199550.0,178567.0,1...","164835.0,184456.0,207725.0,208534.0,184472.0,1...","[F, MF, M0, MD, D]","K,F,F,F,F,M0,M0,M0,M0,MD,MD","K,F,F,F,F,MF,M0,M0,MD,D,D"
925,22518,20172018,4-3-3,Manchester City,4,1,Tottenham Hotspur,4-1-2-1-2,"C,R,CR,CL,L,RL,C,LR,RL,C,LR","C,R,CR,CL,L,C,RL,LR,C,CR,CL","210257.0,188377.0,192366.0,190531.0,183427.0,1...","167948.0,186345.0,202335.0,172871.0,169595.0,1...","[F, MF, M0, MD, D]","K,F,F,F,F,M0,M0,M0,MD,MD,MD","K,F,F,F,F,MF,M0,M0,MD,D,D"


In [358]:
# explode 'home_sequence', 'away_sequence', 'home_xi_sofifa_ids', 'away_xi_sofifa_ids'
df_matches_exploded = df_matches_processed.copy()
df_matches_exploded['home_sequence'] = df_matches_exploded['home_sequence'].str.split(',')
df_matches_exploded['away_sequence'] = df_matches_exploded['away_sequence'].str.split(',')
df_matches_exploded['home_xi_sofifa_ids'] = df_matches_exploded['home_xi_sofifa_ids'].str.split(',')
df_matches_exploded['away_xi_sofifa_ids'] = df_matches_exploded['away_xi_sofifa_ids'].str.split(',')
df_matches_exploded['home_formation_position'] = df_matches_exploded['home_formation_position'].str.split(',')
df_matches_exploded['away_formation_position'] = df_matches_exploded['away_formation_position'].str.split(',')

df_matches_exploded = df_matches_exploded.explode(['home_sequence', 'away_sequence', 'home_xi_sofifa_ids', 'away_xi_sofifa_ids', 'home_formation_position', 'away_formation_position'])
df_matches_exploded.reset_index(drop=True, inplace=True)
df_matches_exploded[df_matches_exploded['match_id'] == '12314']


Unnamed: 0,match_id,season,home_formation,home_team,home_goals,away_goals,away_team,away_formation,home_sequence,away_sequence,home_xi_sofifa_ids,away_xi_sofifa_ids,formation_grid,home_formation_position,away_formation_position
1133,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,C,C,16254.0,167948.0,"[F, MF, MD, D]",K,K
1134,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,R,R,180216.0,188377.0,"[F, MF, MD, D]",F,F
1135,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,CR,CR,203574.0,184087.0,"[F, MF, MD, D]",F,F
1136,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,CL,CL,198140.0,172871.0,"[F, MF, MD, D]",F,F
1137,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,L,L,163631.0,205923.0,"[F, MF, MD, D]",F,F
1138,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,RL,CR,199189.0,202335.0,"[F, MF, MD, D]",MD,MF
1139,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,C,CL,6826.0,202491.0,"[F, MF, MD, D]",MD,MF
1140,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,LR,RL,189725.0,190460.0,"[F, MF, MD, D]",MD,MD
1141,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,RL,C,152747.0,211117.0,"[F, MF, MD, D]",D,MD
1142,12314,20152016,4-3-3,Everton,1,1,Tottenham Hotspur,4-2-3-1,C,LR,192505.0,170368.0,"[F, MF, MD, D]",D,MD


In [361]:
# Build a realtion table
df_match_ratings = pd.DataFrame()
df_match_ratings['match_id']  = pd.concat([df_matches_exploded['match_id'], df_matches_exploded['match_id']])
df_match_ratings['team']      = pd.concat([df_matches_exploded['home_team'], df_matches_exploded['away_team']])
df_match_ratings['sofifa_id'] = pd.concat([df_matches_exploded['home_xi_sofifa_ids'], df_matches_exploded['home_xi_sofifa_ids']])
df_match_ratings['formation_position'] = pd.concat([df_matches_exploded['home_formation_position'], df_matches_exploded['away_formation_position']])
df_match_ratings['position']  = pd.concat([df_matches_exploded['home_sequence'], df_matches_exploded['away_sequence']])
df_match_ratings.reset_index(drop=True, inplace=True)
df_match_ratings[df_match_ratings['match_id'] == '12313']


Unnamed: 0,match_id,team,sofifa_id,formation_position,position
0,12313,Crystal Palace,170084.0,K,C
1,12313,Crystal Palace,186392.0,F,R
2,12313,Crystal Palace,157665.0,F,CR
3,12313,Crystal Palace,159577.0,F,CL
4,12313,Crystal Palace,193475.0,F,L
5,12313,Crystal Palace,212014.0,MD,R
6,12313,Crystal Palace,181483.0,MD,CR
7,12313,Crystal Palace,171972.0,MD,CL
8,12313,Crystal Palace,155355.0,MD,L
9,12313,Crystal Palace,169586.0,D,CR
