### Setup

In [13]:
import os
import pandas as pd

# Root directory of the input datasets; the loading cells join it with the
# 'ratings' and 'matches' sub-directory names.
dataset_input = "Datasets/"

### Utilities

In [46]:
def read_directory(path, extension='.csv'):
    """Return the names of the entries in ``path`` that end with ``extension``.

    Only bare names are returned (they are not joined with ``path``).
    The result is sorted alphabetically: ``os.listdir`` yields entries in
    arbitrary order, so sorting keeps anything built from this list
    reproducible across runs and machines.
    """
    return sorted(f for f in os.listdir(path) if f.endswith(extension))

def write_csv(path, dataset):
    """Write ``dataset`` to ``path`` as CSV, leaving the index out of the file."""
    dataset.to_csv(path_or_buf=path, index=False)

def read_data(path, columns_to_keep=()):
    """Load every CSV file in ``path`` and stack them into one DataFrame.

    The season label of each file is taken from its name: the part before
    the first ``.`` is split on ``_`` and the third token is used, so file
    names need at least three underscore-separated tokens (an ``IndexError``
    is raised otherwise).

    Parameters
    ----------
    path : str
        Directory containing the CSV files.
    columns_to_keep : sequence of str, optional
        Columns retained from every file. With the default (empty) value no
        data column is kept and the result only holds ``season``.

    Returns
    -------
    pandas.DataFrame
        ``season`` as the first column followed by ``columns_to_keep``, with
        missing values replaced by 0 and a fresh ``0..n-1`` index. Files are
        stacked in alphabetical order of their names.

    Raises
    ------
    ValueError
        If ``path`` contains no CSV file.
    """
    # Sort so the row order does not depend on the directory listing order.
    files = sorted(read_directory(path))
    if not files:
        raise ValueError(f"No CSV files found in {path!r}")

    columns = list(columns_to_keep)
    seasons = []
    frames = []
    for file in files:
        seasons.append(str(file).split('.')[0].split('_')[2])
        season_df = pd.read_csv(os.path.join(path, file))
        # Select first, then fill: only the retained columns need the NaN -> 0
        # replacement, and the frame read from disk is left untouched.
        frames.append(season_df[columns].fillna(0))

    # `keys` adds an unnamed outer index level holding the season; once moved
    # into the columns pandas names it 'level_0', which is renamed to 'season'.
    return (
        pd.concat(frames, keys=seasons, axis=0)
        .reset_index(level=0)
        .rename(columns={'level_0': 'season'})
        .reset_index(drop=True)
    )

### Preprocessing

In [52]:
# Player attribute columns retained from the per-season ratings files.
rating_columns = [
    'sofifa_id',
    'attacking_finishing',
    'attacking_heading_accuracy',
    'attacking_short_passing',
    'attacking_volleys',
    'defending_marking',
    'defending_sliding_tackle',
    'defending_standing_tackle',
    'gk_diving',
    'gk_handling',
    'gk_positioning',
    'gk_reflexes',
    'gk_speed',
    'mentality_interceptions',
    'power_long_shots',
    'skill_long_passing',
]
ratings_dir = os.path.join(dataset_input, 'ratings')
df_ratings = read_data(path=ratings_dir, columns_to_keep=rating_columns)

df_ratings.head()

Unnamed: 0,season,sofifa_id,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,defending_marking,defending_sliding_tackle,defending_standing_tackle,gk_diving,gk_handling,gk_positioning,gk_reflexes,gk_speed,mentality_interceptions,power_long_shots,skill_long_passing
0,20152016,158023,93,71,88,85,13.0,21,23,0.0,0.0,0.0,0.0,0.0,22,88,79
1,20152016,20801,95,86,81,87,22.0,23,31,0.0,0.0,0.0,0.0,0.0,29,93,72
2,20152016,9014,85,51,85,86,29.0,26,26,0.0,0.0,0.0,0.0,0.0,39,90,74
3,20152016,167495,13,25,48,11,10.0,11,10,85.0,87.0,90.0,86.0,60.0,30,16,47
4,20152016,176580,90,77,82,87,30.0,38,45,0.0,0.0,0.0,0.0,0.0,41,85,64


In [136]:
# Columns retained from the per-season match files.
match_columns = [
    'match_url',
    'home_formation',
    'home_team',
    'home_goals',
    'away_goals',
    'away_team',
    'away_formation',
    'home_sequence',
    'away_sequence',
    'home_xi_sofifa_ids',
    'away_xi_sofifa_ids',
]
matches_dir = os.path.join(dataset_input, 'matches')
df_matches = read_data(path=matches_dir, columns_to_keep=match_columns)

# The match id is the last '/'-separated segment of the match URL; it takes
# the URL's place in the table and is moved to the first column.
match_ids = df_matches['match_url'].map(lambda url: str(url).rsplit('/', 1)[-1])
df_matches = df_matches.drop(columns=['match_url'])
df_matches.insert(0, 'match_id', match_ids)
df_matches

Unnamed: 0,match_id,season,home_formation,home_team,home_goals,away_goals,away_team,away_formation,home_sequence,away_sequence,home_xi_sofifa_ids,away_xi_sofifa_ids
0,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,"C,R,CR,CL,L,R,CR,CL,L,CR,CL","C,R,CR,CL,L,CR,CL,RL,C,LR,C","170084.0,186392.0,157665.0,159577.0,193475.0,2...","192119.0,178372.0,204311.0,13732.0,184432.0,16..."
1,12284,20152016,4-2-3-1,Arsenal,2,1,Manchester City,4-4-1-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,R,CR,CL,L,C,C","48940.0,203747.0,53612.0,165229.0,177604.0,156...","150724.0,163419.0,192366.0,190531.0,185103.0,1..."
2,12346,20152016,4-2-3-1,Crystal Palace,1,2,AFC Bournemouth,4-1-4-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,C,R,CR,CL,L,C","170084.0,186392.0,157665.0,159577.0,193475.0,1...","105846.0,190885.0,135883.0,193011.0,169638.0,1..."
3,12141,20152016,4-2-3-1,Everton,0,2,Manchester City,4-2-3-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,CR,CL,RL,C,LR,C","16254.0,180216.0,203574.0,53914.0,209852.0,188...","150724.0,163419.0,139720.0,190531.0,185103.0,1..."
4,12232,20152016,4-2-3-1,Arsenal,1,1,Tottenham Hotspur,4-2-3-1,"C,R,CR,CL,L,CR,CL,RL,C,LR,C","C,R,CR,CL,L,CR,CL,RL,C,LR,C","48940.0,158626.0,53612.0,165229.0,177604.0,146...","167948.0,188377.0,184087.0,172871.0,169595.0,2..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1515,38419,20182019,4-1-4-1,Cardiff City,2,1,Brighton and Hove Albion,4-4-1-1,"C,R,CR,CL,L,C,R,CR,CL,L,C","C,R,CR,CL,L,R,CR,CL,L,C,C","193186.0,188270.0,187033.0,172425.0,198190.0,1...","199005.0,194644.0,192622.0,199915.0,177766.0,2..."
1516,38588,20182019,5-4-1,AFC Bournemouth,0,1,Manchester City,4-3-3,"C,R,CR,C,CL,L,R,CR,CL,L,C","C,R,CR,CL,L,RL,C,LR,RL,C,LR","105846.0,190456.0,233201.0,232057.0,208920.0,1...","210257.0,188377.0,203574.0,192366.0,227813.0,1..."
1517,38485,20182019,5-4-1,Newcastle United,0,0,Fulham,3-4-3,"C,R,CR,C,CL,L,R,CR,CL,L,C","C,RL,C,LR,R,CR,CL,L,RL,C,LR","220407.0,210972.0,210047.0,203487.0,204355.0,1...","206652.0,185349.0,208534.0,198261.0,200778.0,2..."
1518,38657,20182019,3-5-2,Wolverhampton Wanderers,0,0,Brighton and Hove Albion,4-3-3,"C,RL,C,LR,R,CR,C,CL,L,CR,CL","C,R,CR,CL,L,RL,C,LR,RL,C,LR","178005.0,183774.0,202048.0,202750.0,201417.0,2...","199005.0,145047.0,192622.0,199915.0,177766.0,1..."


In [None]:
# preprocess team formation

In [153]:
# Turn the comma-separated line-up columns into lists, then explode them
# together so every match yields one row per line-up slot.
lineup_columns = ['home_sequence', 'away_sequence', 'home_xi_sofifa_ids', 'away_xi_sofifa_ids']

df_matches_exploded = (
    df_matches
    .assign(**{column: df_matches[column].str.split(',') for column in lineup_columns})
    .explode(lineup_columns)
    .reset_index(drop=True)
)
df_matches_exploded[df_matches_exploded['match_id'] == '12313']


Unnamed: 0,match_id,season,home_formation,home_team,home_goals,away_goals,away_team,away_formation,home_sequence,away_sequence,home_xi_sofifa_ids,away_xi_sofifa_ids
0,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,C,C,170084.0,192119.0
1,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,R,R,186392.0,178372.0
2,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,CR,CR,157665.0,204311.0
3,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,CL,CL,159577.0,13732.0
4,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,L,L,193475.0,184432.0
5,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,R,CR,212014.0,162895.0
6,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,CR,CL,181483.0,164477.0
7,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,CL,RL,171972.0,180403.0
8,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,L,C,155355.0,188152.0
9,12313,20152016,4-4-2,Crystal Palace,0,3,Chelsea,4-2-3-1,CR,LR,169586.0,183277.0


In [156]:
# Build a relation table with one row per (match, team, position, player):
# all home line-up rows first, followed by all away line-up rows.
# Each target column is fed by its home source and its away source. The away
# player ids must come from 'away_xi_sofifa_ids' (the previous version reused
# the home ids, so away teams were paired with the home players).
relation_sources = {
    'match_id': ('match_id', 'match_id'),
    'team': ('home_team', 'away_team'),
    'position': ('home_sequence', 'away_sequence'),
    'sofifa_id': ('home_xi_sofifa_ids', 'away_xi_sofifa_ids'),
}
df_match_ratings = pd.DataFrame({
    target: pd.concat(
        [df_matches_exploded[home_column], df_matches_exploded[away_column]],
        ignore_index=True,
    )
    for target, (home_column, away_column) in relation_sources.items()
})
# Sanity check: every exploded row contributes one home and one away entry.
assert len(df_match_ratings) == 2 * len(df_matches_exploded)
df_match_ratings[df_match_ratings['match_id'] == '12313']


Unnamed: 0,match_id,team,position,sofifa_id
0,12313,Crystal Palace,C,170084.0
1,12313,Crystal Palace,R,186392.0
2,12313,Crystal Palace,CR,157665.0
3,12313,Crystal Palace,CL,159577.0
4,12313,Crystal Palace,L,193475.0
5,12313,Crystal Palace,R,212014.0
6,12313,Crystal Palace,CR,181483.0
7,12313,Crystal Palace,CL,171972.0
8,12313,Crystal Palace,L,155355.0
9,12313,Crystal Palace,CR,169586.0
