In [88]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
import os
import glob
from tqdm import tqdm

In [2]:
# normalize the data ?
# noramlize the rates between 0 and 1

# don't keep the names
# player positions: remove it (remove nation position) and keep club position --> prescription: constraint on the number of players per zone. Limit to Attack / Midfield / Defend / Goalkeeper
# keep overall and potential, maybe replace potential by potential - overall
# keep club joined year, and club contract valid year
# work rate: replace by two columns, in numerical values ? or remove it
# get rid of improvements 
# keep league levels

# remove ids, urls, club / league names, club_loaned_from, nationalities / nations (id, name etc.), player tags and player traits, 

In [63]:
# define function so we can easily apply it to all the datasets
def data_preprocessing(data):
    
    # display all columns of the dataframe
    pd.set_option('display.max_columns', None)

    # remove unnecessary columns
    data.drop(['sofifa_id', 'player_url', 'long_name', 'club_position', 'dob', 'club_team_id', 
        #'club_name',
        'league_name', 'club_jersey_number', 'club_loaned_from', 'nationality_id', 'nationality_name', 
        'nation_team_id', 'nation_position', 'nation_jersey_number', 'body_type', 'real_face', 'player_tags', 'player_traits',
        'goalkeeping_speed', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm',
        'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url', 'club_flag_url', 
        'nation_logo_url', 'nation_flag_url'], axis = 1, inplace = True)

    # instead of club position, I keep player_positions and just pick the first element. The reason is that in club_position there are also SUB and RES
    # keep just the first position
    data['player_positions'] = data['player_positions'].apply(lambda x: x.split(',')[0])    

    # translate club position to role, dividing goalkeepers, defenders, midfielders and attackers
    attackers = ['ST', 'LW', 'RW', 'CF', 'LF', 'RF', 'RS', 'LS']
    midfielders = ['CAM', 'CM', 'CDM', 'RCM','LM', 'RM', 'LAM', 'RAM', 'LW', 'RW', 'RDM', 'LCM','LDM']
    defenders = ['CB', 'LB', 'RB', 'LCB', 'RCB', 'LWB', 'RWB']
    goalkeepers = ['GK']

    data['player_positions'] = data['player_positions'].apply(lambda x: 'A' if x in attackers else 
        ('M' if x in midfielders else ('D' if x in defenders else ('G' if x in goalkeepers else x))))

    # apply one hot encoding to player positions
    dummies = pd.get_dummies(data['player_positions'], prefix = 'position')
    data.drop(['player_positions'], axis = 1, inplace = True) # drop original column
    data = pd.concat([data, dummies], axis = 1) # concat the dummy columns


    # only keep year of when club was joined
    data['club_joined'] = [int(data.loc[0,'club_joined'].split('-')[0]) for i in range(len(data['club_joined']))]
    
    # one hot encoding of preferred foot, 0 = left, 1 = right
    data['preferred_foot'] = data['preferred_foot'].apply(lambda x: 0 if x == 'Left' else 1)

    # ordinal encoding of work rate (1 = Low, 2 = Medium, 3 = High), split between attack and defense. 
    data['attack_work_rate'] = data['work_rate'].apply(lambda x: 1 if x.split('/')[0] == 'Low' else (2 if x.split('/')[0] == 'Medium' else 3))
    data['defense_work_rate'] = data['work_rate'].apply(lambda x: 1 if x.split('/')[1] == 'Low' else (2 if x.split('/')[1] == 'Medium' else 3))
    data.drop(['work_rate'], axis = 1, inplace = True)

    # remove nan values
    data = data.dropna()

    # return the dataframe
    return data

In [64]:
# save dataframes from different years. Each entry of the dataframe has the cleaned
# fifa data for a specific year
total_data = []

for year in range(20, 22+1):
    # read the data
    path = 'data/fifa/players_{}.csv'.format(year)
    data = pd.read_csv(path)

    # preprocess the data
    total_data.append(data_preprocessing(data))

# for each year, add the year column to the dataframe
for i in range(len(total_data)):
    total_data[i]['year'] = 2000 + i

  data = pd.read_csv(path)


In [65]:
total_data[0].head()

Unnamed: 0,short_name,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_name,league_level,club_joined,club_contract_valid_until,preferred_foot,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,position_A,position_D,position_G,position_M,attack_work_rate,defense_work_rate,year
0,L. Messi,94,94,95500000.0,560000.0,32,170,72,FC Barcelona,1.0,2004,2021.0,0,4,4,5,195800000.0,87.0,92.0,92.0,96.0,39.0,66.0,88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,6,11,15,14,8,1,0,0,0,2,1,2000
1,Cristiano Ronaldo,93,93,58500000.0,410000.0,34,187,83,Juventus,1.0,2004,2022.0,1,4,5,5,96500000.0,90.0,93.0,82.0,89.0,35.0,78.0,84,94,89,83,87,89,81,76,77,92,89,91,87,96,71,95,95,85,78,93,63,29,95,82,85,95,28,32,24,7,11,15,14,11,1,0,0,0,3,1,2000
2,Neymar Jr,92,92,105500000.0,290000.0,27,175,68,Paris Saint-Germain,1.0,2004,2022.0,1,5,5,5,195200000.0,91.0,85.0,87.0,95.0,32.0,58.0,87,87,62,87,87,96,88,87,81,95,94,89,96,92,84,80,61,81,49,84,51,36,87,90,90,94,27,26,29,9,9,15,15,11,1,0,0,0,3,2,2000
3,E. Hazard,91,91,90000000.0,470000.0,28,175,74,Real Madrid CF,1.0,2004,2024.0,1,4,4,4,184500000.0,91.0,83.0,86.0,94.0,35.0,66.0,81,84,61,89,83,95,83,79,83,94,94,88,95,90,94,82,56,84,63,80,54,41,87,89,88,91,34,27,22,11,12,6,8,8,1,0,0,0,3,2,2000
4,K. De Bruyne,91,91,90000000.0,370000.0,28,181,70,Manchester City,1.0,2004,2023.0,1,5,4,4,166500000.0,76.0,86.0,92.0,86.0,61.0,78.0,93,82,55,92,82,86,85,83,91,91,77,76,78,91,76,91,63,89,74,90,76,61,88,94,79,91,68,58,51,15,13,5,10,13,0,0,0,1,3,3,2000


####################################################################################################

Now, cleaning of the transfer data

In [66]:
# read all transfer files
path = "data/Transfers/"
csv_files = glob.glob(os.path.join(path, "*.csv"))

In [67]:
# create unique dataframe with all the transfers
transfers_list = []

# loop over the list of csv files
for f in csv_files:
      
    # read the csv file
    df = pd.read_csv(f)

    # append the dataframe to the list
    transfers_list.append(df)

transfers = pd.concat(transfers_list).reset_index(drop = True)

In [68]:
# only keep transfers happened in 2020, 2021, 2022
transfers = transfers.loc[transfers['year'].isin([2020, 2021, 2022])]

# reset index
transfers.reset_index(inplace = True, drop = True)

# just keep transfer movement in, because there should be a match between in and out, but the ones that just have out mean they are not playing
transfers = transfers[transfers["transfer_movement"] == "in"]

# remove columns we are not interested in
transfers.drop(['age', 'position', 'club_involved_name', 'transfer_movement', 'league_name', 'season'], axis = 1, inplace = True)

# one hot encoding of transfer period, 1 = summer, 0 = winter
transfers['transfer_period'] = transfers['transfer_period'].apply(lambda x: 1 if x.lower() == 'summer' else 0)

# check if loan is in fee
transfers['loan'] = transfers['fee'].apply(lambda x: 1 if 'loan' in x.lower() else 0)

# get rid of fee column, and rename fee_cleaned to fee
transfers.drop(['fee'], axis = 1, inplace = True)
transfers.rename(columns = {'fee_cleaned': 'fee'}, inplace = True)

# get rid of loan transfers
transfers = transfers[transfers['loan'] == 0]
transfers.drop(['loan'], axis = 1, inplace = True)

# drop duplicates, meaning player_name that appears twice. We keep the last one since it's the most recent transfer
#transfers.drop_duplicates(subset = ['player_name'], keep = 'last', inplace = True)

# reset index
transfers.reset_index(inplace = True, drop = True)

# remove NaN values
transfers.dropna(inplace = True)

In [69]:
transfers.head()

Unnamed: 0,club_name,player_name,transfer_period,fee,year
0,Bayern Munich,Leroy Sané,1,60.0,2020
1,Bayern Munich,Marc Roca,1,9.0,2020
2,Bayern Munich,Bouna Sarr,1,8.0,2020
3,Bayern Munich,Alexander Nübel,1,0.0,2020
4,Bayern Munich,Tanguy Nianzou,1,0.0,2020


In [70]:
transfers.shape
# number is still not that high, because data cleaning eliminated most of them. We can go back in time and use all the fifa data we have (from 2015)

(3133, 5)

# Now we need to join the two datasets based on player, if there is a doubt, we keep the club_name, if still we are not sure, we drop it

## Analysis

In [116]:
transfers_22 = transfers[transfers['year'] == 2022].reset_index(drop = True)
transfers_21 = transfers[transfers['year'] == 2021].reset_index(drop = True)
transfers_20 = transfers[transfers['year'] == 2020].reset_index(drop = True)

ratings_22 = total_data[2].reset_index(drop = True)
ratings_21 = total_data[1].reset_index(drop = True)
ratings_20 = total_data[0].reset_index(drop = True)

In [117]:
# player_name from transfers is not the same as player_name from fifa data, so we need to match them
# we can use a metric to measure the similarity between the two names, and then match the ones that are the most similar
# use fuzzywuzzy to get the similarity between two strings
from fuzzywuzzy import fuzz

# function to get the similarity between two strings
def get_similarity(a, b):
    return fuzz.token_sort_ratio(a, b)
    

# function to get the most similar string from a list of strings
def get_most_similar(string, string_list):
    # get the similarity between the string and each string in the list
    similarity = [get_similarity(string, s) for s in string_list]

    # get the index of the most similar string
    max_index = np.argmax(similarity)

    # return the most similar string
    return string_list[max_index]


def get_short_name(player_name):
    # if there is no firts name, return the last name
    if len(player_name.split()) == 1:
        return player_name
    # else, the first name is the first word, and the last name is all the other words
    else:
        first_name = player_name.split()[0]
        last_name = player_name.split()[1:]
        return first_name[0] + ". " + " ".join(last_name)


In [118]:
def map_transfers(transfers, ratings):
    # let's try to map the club names from the transfers dataframe to the club names from the fifa dataframe
    clubs_names = transfers.club_name.unique()
    fifa_clubs = ratings.club_name.unique()

    # create a dictionary to map the club names
    club_name_dict = {}

    # loop over the list of clubs
    for club in tqdm(clubs_names):
        # get the most similar club name from the fifa dataframe
        most_similar_club = get_most_similar(club, fifa_clubs)

        # add the mapping to the dictionary
        club_name_dict[club] = most_similar_club

    # map the club names
    transfers['club_name_mapped'] = transfers['club_name'].map(club_name_dict)

    # rename club_name as transfers_club_name and club_name_mapped as club_name
    transfers.rename(columns = {'club_name': 'transfers_club_name', 'club_name_mapped': 'club_name'}, inplace = True)


    # let's try to map the player names from the transfers dataframe to the player names from the fifa dataframe
    # First possibility to map them: in the ratings dataframe, the short_name looks like "L. Messi", so we can use the first letter of the first name and the last name to match the players
    # if not, we can use the previous functions to match the players

    # get the short name of the players
    transfers['short_name'] = transfers['player_name'].apply(get_short_name)
    
    return transfers


# second function, only if necessary
def add_name_mapping(transfers, ratings):
     # let's try to map the player names from the transfers dataframe to the player names from the fifa dataframe
    # use the get_most_similar function to get the most similar player name from the fifa dataframe
    # create a dictionary to map the player names
    player_name_dict = {}

    player_names = transfers.player_name.unique()
    fifa_names = ratings.short_name.unique()

    # loop over the list of players
    for player_name in tqdm(player_names):
        # get the most similar player name from the fifa dataframe
        most_similar_player = get_most_similar(player_name, fifa_names)

        # add the mapping to the dictionary
        player_name_dict[player_name] = most_similar_player

    # map the player names
    transfers['player_name_mapped'] = transfers['player_name'].map(player_name_dict)

    return transfers

In [119]:
# let's map the transfers to the fifa data
transfers_22 = map_transfers(transfers_22, ratings_22)
transfers_21 = map_transfers(transfers_21, ratings_21)
transfers_20 = map_transfers(transfers_20, ratings_20)


100%|██████████| 173/173 [00:02<00:00, 69.17it/s]
100%|██████████| 173/173 [00:02<00:00, 71.09it/s]
100%|██████████| 169/169 [00:02<00:00, 76.71it/s]


In [120]:
print(transfers_22.shape)
transfers_22.head()


(1058, 7)


Unnamed: 0,transfers_club_name,player_name,transfer_period,fee,year,club_name,short_name
0,Bayern Munich,Matthijs de Ligt,1,67.0,2022,FC Bayern München,M. de Ligt
1,Bayern Munich,Sadio Mané,1,32.0,2022,FC Bayern München,S. Mané
2,Bayern Munich,Mathys Tel,1,20.0,2022,FC Bayern München,M. Tel
3,Bayern Munich,Ryan Gravenberch,1,18.5,2022,FC Bayern München,R. Gravenberch
4,Bayern Munich,Noussair Mazraoui,1,0.0,2022,FC Bayern München,N. Mazraoui


In [121]:
# merge the transfers with the fifa data on the short_name and club_name_mapped columns
merged_22 = pd.merge(transfers_22, ratings_22, on = ['short_name', 'club_name'], how = 'left')
merged_21 = pd.merge(transfers_21, ratings_21, on = ['short_name', 'club_name'], how = 'left')
merged_20 = pd.merge(transfers_20, ratings_20, on = ['short_name', 'club_name'], how = 'left')