In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
import os
import glob

In [9]:
# normalize the data ?
# noramlize the rates between 0 and 1

# don't keep the names
# player positions: remove it (remove nation position) and keep club position --> prescription: constraint on the number of players per zone. Limit to Attack / Midfield / Defend / Goalkeeper
# keep overall and potential, maybe replace potential by potential - overall
# keep club joined year, and club contract valid year
# work rate: replace by two columns, in numerical values ? or remove it
# get rid of improvements 
# keep league levels

# remove ids, urls, club / league names, club_loaned_from, nationalities / nations (id, name etc.), player tags and player traits, 

In [5]:
# define function so we can easily apply it to all the datasets
def data_preprocessing(data):
    
    # display all columns of the dataframe
    pd.set_option('display.max_columns', None)

    # remove unnecessary columns , 
    data.drop(['sofifa_id', 'player_url', 'long_name', 'club_position', 'dob', 'club_team_id', 'club_name',
        'league_name', 'club_jersey_number', 'club_loaned_from', 'nationality_id', 'nationality_name', 
        'nation_team_id', 'nation_position', 'nation_jersey_number', 'body_type', 'real_face', 'player_tags', 'player_traits',
        'goalkeeping_speed', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm',
        'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url', 'club_flag_url', 
        'nation_logo_url', 'nation_flag_url'], axis = 1, inplace = True)

    # instead of club position, I keep player_positions and just pick the first element. The reason is that in club_position there are also SUB and RES
    # keep just the first position
    data['player_positions'] = data['player_positions'].apply(lambda x: x.split(',')[0])    

    # translate club position to role, dividing goalkeepers, defenders, midfielders and attackers
    attackers = ['ST', 'LW', 'RW', 'CF', 'LF', 'RF', 'RS', 'LS']
    midfielders = ['CAM', 'CM', 'CDM', 'RCM','LM', 'RM', 'LAM', 'RAM', 'LW', 'RW', 'RDM', 'LCM','LDM']
    defenders = ['CB', 'LB', 'RB', 'LCB', 'RCB', 'LWB', 'RWB']
    goalkeepers = ['GK']

    data['player_positions'] = data['player_positions'].apply(lambda x: 'A' if x in attackers else 
        ('M' if x in midfielders else ('D' if x in defenders else ('G' if x in goalkeepers else x))))

    # apply one hot encoding to player positions
    dummies = pd.get_dummies(data['player_positions'], prefix = 'position')
    data.drop(['player_positions'], axis = 1, inplace = True) # drop original column
    data = pd.concat([data, dummies], axis = 1) # concat the dummy columns


    # only keep year of when club was joined
    data['club_joined'] = [int(data.loc[0,'club_joined'].split('-')[0]) for i in range(len(data['club_joined']))]
    
    # one hot encoding of preferred foot, 0 = left, 1 = right
    data['preferred_foot'] = data['preferred_foot'].apply(lambda x: 0 if x == 'Left' else 1)

    # ordinal encoding of work rate (1 = Low, 2 = Medium, 3 = High), split between attack and defense. 
    data['attack_work_rate'] = data['work_rate'].apply(lambda x: 1 if x.split('/')[0] == 'Low' else (2 if x.split('/')[0] == 'Medium' else 3))
    data['defense_work_rate'] = data['work_rate'].apply(lambda x: 1 if x.split('/')[1] == 'Low' else (2 if x.split('/')[1] == 'Medium' else 3))
    data.drop(['work_rate'], axis = 1, inplace = True)

    # remove nan values
    data = data.dropna()

    # return the dataframe
    return data

In [4]:
# # useful for the prescription part
# data = pd.read_csv('../data/fifa/players_22.csv')

# data = data_preprocessing(data)

# # save dataset to csv
# data.to_csv('../data/fifa/players_22_preprocessed.csv', index = False)

# # remember to keep club name as a feature when you do this

In [254]:
# save dataframes from different years. Each entry of the dataframe has the cleaned
# fifa data for a specific year
total_data = []

for year in range(20, 22+1):
    # read the data
    path = 'data/fifa/players_{}.csv'.format(year)
    data = pd.read_csv(path)

    # preprocess the data
    total_data.append(data_preprocessing(data))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [255]:
total_data[0].head()

Unnamed: 0,short_name,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,club_joined,club_contract_valid_until,preferred_foot,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,position_A,position_D,position_G,position_M,attack_work_rate,defense_work_rate
0,L. Messi,94,94,95500000.0,560000.0,32,170,72,1.0,2004,2021.0,0,4,4,5,195800000.0,87.0,92.0,92.0,96.0,39.0,66.0,88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,6,11,15,14,8,1,0,0,0,2,1
1,Cristiano Ronaldo,93,93,58500000.0,410000.0,34,187,83,1.0,2004,2022.0,1,4,5,5,96500000.0,90.0,93.0,82.0,89.0,35.0,78.0,84,94,89,83,87,89,81,76,77,92,89,91,87,96,71,95,95,85,78,93,63,29,95,82,85,95,28,32,24,7,11,15,14,11,1,0,0,0,3,1
2,Neymar Jr,92,92,105500000.0,290000.0,27,175,68,1.0,2004,2022.0,1,5,5,5,195200000.0,91.0,85.0,87.0,95.0,32.0,58.0,87,87,62,87,87,96,88,87,81,95,94,89,96,92,84,80,61,81,49,84,51,36,87,90,90,94,27,26,29,9,9,15,15,11,1,0,0,0,3,2
3,E. Hazard,91,91,90000000.0,470000.0,28,175,74,1.0,2004,2024.0,1,4,4,4,184500000.0,91.0,83.0,86.0,94.0,35.0,66.0,81,84,61,89,83,95,83,79,83,94,94,88,95,90,94,82,56,84,63,80,54,41,87,89,88,91,34,27,22,11,12,6,8,8,1,0,0,0,3,2
4,K. De Bruyne,91,91,90000000.0,370000.0,28,181,70,1.0,2004,2023.0,1,5,4,4,166500000.0,76.0,86.0,92.0,86.0,61.0,78.0,93,82,55,92,82,86,85,83,91,91,77,76,78,91,76,91,63,89,74,90,76,61,88,94,79,91,68,58,51,15,13,5,10,13,0,0,0,1,3,3


####################################################################################################

Now, cleaning of the transfer data

In [8]:
# read all transfer files
path = "../data/Transfers/"
csv_files = glob.glob(os.path.join(path, "*.csv"))

In [10]:
# create unique dataframe with all the transfers
transfers_list = []

# loop over the list of csv files
for f in csv_files:
      
    # read the csv file
    df = pd.read_csv(f)

    # append the dataframe to the list
    transfers_list.append(df)

transfers = pd.concat(transfers_list).reset_index(drop = True)

In [11]:
transfers.to_csv('../data/Transfers/all_transfers.csv', index = False)

In [251]:
# only keep transfers happened in 2020, 2021, 2022
transfers = transfers.loc[transfers['year'].isin([2020, 2021, 2022])]

# reset index
transfers.reset_index(inplace = True, drop = True)

# just keep transfer movement in, because there should be a match between in and out, but the ones that just have out mean they are not playing
transfers = transfers[transfers["transfer_movement"] == "in"]

# remove columns we are not interested in
transfers.drop(['age', 'position', 'club_involved_name', 'transfer_movement', 'league_name', 'season'], axis = 1, inplace = True)

# one hot encoding of transfer period, 1 = summer, 0 = winter
transfers['transfer_period'] = transfers['transfer_period'].apply(lambda x: 1 if x.lower() == 'summer' else 0)

# check if loan is in fee
transfers['loan'] = transfers['fee'].apply(lambda x: 1 if 'loan' in x.lower() else 0)

# get rid of fee column, and rename fee_cleaned to fee
transfers.drop(['fee'], axis = 1, inplace = True)
transfers.rename(columns = {'fee_cleaned': 'fee'}, inplace = True)

# get rid of loan transfers
transfers = transfers[transfers['loan'] == 0]
transfers.drop(['loan'], axis = 1, inplace = True)

# drop duplicates, meaning player_name that appears twice. We keep the last one since it's the most recent transfer
transfers.drop_duplicates(subset = ['player_name'], keep = 'last', inplace = True)

# reset index
transfers.reset_index(inplace = True, drop = True)

# remove NaN values
transfers.dropna(inplace = True)

In [252]:
transfers.head()

Unnamed: 0,club_name,player_name,transfer_period,fee,year
0,Bayern Munich,Leroy Sané,1,60.0,2020
1,Bayern Munich,Bouna Sarr,1,8.0,2020
2,Bayern Munich,Alexander Nübel,1,0.0,2020
3,Bayern Munich,Eric Maxim Choupo-Moting,1,0.0,2020
5,Borussia Dortmund,Jude Bellingham,1,25.0,2020


In [257]:
transfers.shape
# number is still not that high, because data cleaning eliminated most of them. We can go back in time and use all the fifa data we have (from 2015)

(2851, 5)

# Now we need to join the two datasets based on player, if there is a doubt, we keep the club_name, if still we are not sure, we drop it

####################################################################################################

In [137]:
summer22_df = pd.read_csv("data/2022_transfers/2022_2023_football_summer_transfers_v2.csv")
summer22_df.shape

(2000, 12)

In [121]:
summer22_df.head(20)
# We will predict the fee for the summer 2022 transfer window --> change the release clause if the free is lower than the release clause
# Remove everything else before merging with the ratings dataframe
# get rid of players with loan = True

Unnamed: 0,name,position,age,market_value,country_from,league_from,club_from,country_to,league_to,club_to,fee,loan
0,Erling Haaland,Centre-Forward,21,150.0,Germany,Bundesliga,Borussia Dortmund,England,Premier League,Manchester City,60.0,False
1,Antony,Right Winger,22,35.0,Netherlands,Eredivisie,Ajax Amsterdam,England,Premier League,Manchester United,95.0,False
2,Wesley Fofana,Centre-Back,21,40.0,England,Premier League,Leicester City,England,Premier League,Chelsea FC,80.4,False
3,Aurélien Tchouameni,Defensive Midfield,22,60.0,Monaco,Ligue 1,AS Monaco,Spain,LaLiga,Real Madrid,80.0,False
4,Darwin Núñez,Centre-Forward,23,55.0,Portugal,Liga Portugal,SL Benfica,England,Premier League,Liverpool FC,75.0,False
5,Casemiro,Defensive Midfield,30,40.0,Spain,LaLiga,Real Madrid,England,Premier League,Manchester United,70.65,False
6,Alexander Isak,Centre-Forward,22,30.0,Spain,LaLiga,Real Sociedad,England,Premier League,Newcastle United,70.0,False
7,Matthijs de Ligt,Centre-Back,22,70.0,Italy,Serie A,Juventus FC,Germany,Bundesliga,Bayern Munich,67.0,False
8,Raheem Sterling,Left Winger,27,70.0,England,Premier League,Manchester City,England,Premier League,Chelsea FC,56.2,False
9,Sadio Mané,Left Winger,30,70.0,England,Premier League,Liverpool FC,Germany,Bundesliga,Bayern Munich,32.0,False


In [13]:
# first, transform the name in summer22_df to match the name in ratings_df
# for this, we need to keep only the initial of the firstname, if there is one, and we keep all the lastname
# let's create a new column in summer22_df
summer22_df["short_name"] = summer22_df["name"].str.split(" ").str[0].str[0] + ". " + summer22_df["name"].str.split(" ").str[-1]
summer22_df.head(10)


Unnamed: 0,name,position,age,market_value,country_from,league_from,club_from,country_to,league_to,club_to,fee,loan,short_name
0,Erling Haaland,Centre-Forward,21,150.0,Germany,Bundesliga,Borussia Dortmund,England,Premier League,Manchester City,60.0,False,E. Haaland
1,Antony,Right Winger,22,35.0,Netherlands,Eredivisie,Ajax Amsterdam,England,Premier League,Manchester United,95.0,False,A. Antony
2,Wesley Fofana,Centre-Back,21,40.0,England,Premier League,Leicester City,England,Premier League,Chelsea FC,80.4,False,W. Fofana
3,Aurélien Tchouameni,Defensive Midfield,22,60.0,Monaco,Ligue 1,AS Monaco,Spain,LaLiga,Real Madrid,80.0,False,A. Tchouameni
4,Darwin Núñez,Centre-Forward,23,55.0,Portugal,Liga Portugal,SL Benfica,England,Premier League,Liverpool FC,75.0,False,D. Núñez
5,Casemiro,Defensive Midfield,30,40.0,Spain,LaLiga,Real Madrid,England,Premier League,Manchester United,70.65,False,C. Casemiro
6,Alexander Isak,Centre-Forward,22,30.0,Spain,LaLiga,Real Sociedad,England,Premier League,Newcastle United,70.0,False,A. Isak
7,Matthijs de Ligt,Centre-Back,22,70.0,Italy,Serie A,Juventus FC,Germany,Bundesliga,Bayern Munich,67.0,False,M. Ligt
8,Raheem Sterling,Left Winger,27,70.0,England,Premier League,Manchester City,England,Premier League,Chelsea FC,56.2,False,R. Sterling
9,Sadio Mané,Left Winger,30,70.0,England,Premier League,Liverpool FC,Germany,Bundesliga,Bayern Munich,32.0,False,S. Mané


In [14]:
# merge the two dataframes using the short_name column, and keep only the rows where there is a match in the merge
merged_df = pd.merge(ratings_df, summer22_df, how="inner", left_on="short_name", right_on="short_name")

# check if value_eur in ratings_df is close to market_value in transfers_df for the same player
# add a column with the difference between the two values
merged_df["diff_value"] = merged_df["value_eur"]/1e6 - merged_df["market_value"]

# sort by the difference between the two values
merged_df.sort_values(by="diff_value", ascending=False)

# divide value_eur and release_clause_eur by 1e6 to get the value in millions
merged_df["value_eur"] = merged_df["value_eur"]/1e6
merged_df["release_clause_eur"] = merged_df["release_clause_eur"]/1e6

In [23]:
# merge all datasets from Transfers folder into one dataframe
import glob
import os

path = "data/Transfers"
all_files = glob.glob(os.path.join(path, "*.csv"))

df_from_each_file = (pd.read_csv(f) for f in all_files)
all_transfers_df = pd.concat(df_from_each_file, ignore_index=True)

# Keep only the season 2022/2023 transfers
transfers22_df = all_transfers_df[all_transfers_df["season"] == "2022/2023"]

In [31]:
#display all columns of the dataframe
pd.set_option('display.max_columns', None)
transfers22_df.shape
transfers22_df.head(10)

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,transfer_period,fee_cleaned,league_name,year,season
13493,Bayern Munich,Matthijs de Ligt,22.0,Centre-Back,Juventus,€67.00m,in,summer,67.0,1 Bundesliga,2022,2022/2023
13494,Bayern Munich,Sadio Mané,30.0,Left Winger,Liverpool,€32.00m,in,summer,32.0,1 Bundesliga,2022,2022/2023
13495,Bayern Munich,Mathys Tel,17.0,Centre-Forward,Stade Rennais,€20.00m,in,summer,20.0,1 Bundesliga,2022,2022/2023
13496,Bayern Munich,Ryan Gravenberch,20.0,Central Midfield,Ajax,€18.50m,in,summer,18.5,1 Bundesliga,2022,2022/2023
13497,Bayern Munich,Noussair Mazraoui,24.0,Right-Back,Ajax,free transfer,in,summer,0.0,1 Bundesliga,2022,2022/2023
13498,Bayern Munich,Gabriel Vidovic,18.0,Attacking Midfield,FC Bayern II,-,in,summer,,1 Bundesliga,2022,2022/2023
13499,Bayern Munich,Johannes Schenk,19.0,Goalkeeper,FC Bayern II,-,in,summer,,1 Bundesliga,2022,2022/2023
13500,Bayern Munich,Malik Tillman,21.0,Attacking Midfield,Rangers,"End of loanMay 31, 2023",in,summer,,1 Bundesliga,2022,2022/2023
13501,Bayern Munich,Lars Lukas Mai,22.0,Centre-Back,Werder Bremen,"End of loanJun 30, 2022",in,summer,,1 Bundesliga,2022,2022/2023
13502,Bayern Munich,Adrian Fein,23.0,Defensive Midfield,Dynamo Dresden,"End of loanJun 30, 2022",in,summer,,1 Bundesliga,2022,2022/2023


In [28]:
transfers22_df[transfers22_df.transfer_movement == "out"].shape

(2886, 12)

In [29]:
transfers22_df[transfers22_df.transfer_movement == "in"].shape

(2597, 12)