In [3]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
import os
import glob
from tqdm import tqdm

In [4]:
# normalize the data ?
# noramlize the rates between 0 and 1

# don't keep the names
# player positions: remove it (remove nation position) and keep club position --> prescription: constraint on the number of players per zone. Limit to Attack / Midfield / Defend / Goalkeeper
# keep overall and potential, maybe replace potential by potential - overall
# keep club joined year, and club contract valid year
# work rate: replace by two columns, in numerical values ? or remove it
# get rid of improvements 
# keep league levels

# remove ids, urls, club / league names, club_loaned_from, nationalities / nations (id, name etc.), player tags and player traits, 

In [5]:
# define function so we can easily apply it to all the datasets
def data_preprocessing(data):
    
    # display all columns of the dataframe
    pd.set_option('display.max_columns', None)

    # remove unnecessary columns
    data.drop(['sofifa_id', 'player_url', 'long_name', 'club_position', 'dob', 'club_team_id', 
        #'club_name',
        'league_name', 'club_jersey_number', 'club_loaned_from', 'nationality_id', 'nationality_name', 
        'nation_team_id', 'nation_position', 'nation_jersey_number', 'body_type', 'real_face', 'player_tags', 'player_traits',
        'goalkeeping_speed', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm',
        'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url', 'club_flag_url', 
        'nation_logo_url', 'nation_flag_url'], axis = 1, inplace = True)

    # instead of club position, I keep player_positions and just pick the first element. The reason is that in club_position there are also SUB and RES
    # keep just the first position
    data['player_positions'] = data['player_positions'].apply(lambda x: x.split(',')[0])    

    # translate club position to role, dividing goalkeepers, defenders, midfielders and attackers
    attackers = ['ST', 'LW', 'RW', 'CF', 'LF', 'RF', 'RS', 'LS']
    midfielders = ['CAM', 'CM', 'CDM', 'RCM','LM', 'RM', 'LAM', 'RAM', 'LW', 'RW', 'RDM', 'LCM','LDM']
    defenders = ['CB', 'LB', 'RB', 'LCB', 'RCB', 'LWB', 'RWB']
    goalkeepers = ['GK']

    data['player_positions'] = data['player_positions'].apply(lambda x: 'A' if x in attackers else 
        ('M' if x in midfielders else ('D' if x in defenders else ('G' if x in goalkeepers else x))))

    # apply one hot encoding to player positions
    dummies = pd.get_dummies(data['player_positions'], prefix = 'position')
    data.drop(['player_positions'], axis = 1, inplace = True) # drop original column
    data = pd.concat([data, dummies], axis = 1) # concat the dummy columns


    # only keep year of when club was joined
    data['club_joined'] = [int(data.loc[0,'club_joined'].split('-')[0]) for i in range(len(data['club_joined']))]
    
    # one hot encoding of preferred foot, 0 = left, 1 = right
    data['preferred_foot'] = data['preferred_foot'].apply(lambda x: 0 if x == 'Left' else 1)

    # ordinal encoding of work rate (1 = Low, 2 = Medium, 3 = High), split between attack and defense. 
    data['attack_work_rate'] = data['work_rate'].apply(lambda x: 1 if x.split('/')[0] == 'Low' else (2 if x.split('/')[0] == 'Medium' else 3))
    data['defense_work_rate'] = data['work_rate'].apply(lambda x: 1 if x.split('/')[1] == 'Low' else (2 if x.split('/')[1] == 'Medium' else 3))
    data.drop(['work_rate'], axis = 1, inplace = True)

    # remove nan values
    data = data.dropna()

    # return the dataframe
    return data

In [6]:
# save dataframes from different years. Each entry of the dataframe has the cleaned
# fifa data for a specific year
total_data = []

for year in range(20, 22+1):
    # read the data
    path = 'data/fifa/players_{}.csv'.format(year)
    data = pd.read_csv(path)

    # preprocess the data
    total_data.append(data_preprocessing(data))

# for each year, add the year column to the dataframe
for i in range(len(total_data)):
    total_data[i]['year'] = 2000 + i

  data = pd.read_csv(path)


In [7]:
total_data[0].head()

Unnamed: 0,short_name,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_name,league_level,club_joined,club_contract_valid_until,preferred_foot,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,position_A,position_D,position_G,position_M,attack_work_rate,defense_work_rate,year
0,L. Messi,94,94,95500000.0,560000.0,32,170,72,FC Barcelona,1.0,2004,2021.0,0,4,4,5,195800000.0,87.0,92.0,92.0,96.0,39.0,66.0,88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,6,11,15,14,8,1,0,0,0,2,1,2000
1,Cristiano Ronaldo,93,93,58500000.0,410000.0,34,187,83,Juventus,1.0,2004,2022.0,1,4,5,5,96500000.0,90.0,93.0,82.0,89.0,35.0,78.0,84,94,89,83,87,89,81,76,77,92,89,91,87,96,71,95,95,85,78,93,63,29,95,82,85,95,28,32,24,7,11,15,14,11,1,0,0,0,3,1,2000
2,Neymar Jr,92,92,105500000.0,290000.0,27,175,68,Paris Saint-Germain,1.0,2004,2022.0,1,5,5,5,195200000.0,91.0,85.0,87.0,95.0,32.0,58.0,87,87,62,87,87,96,88,87,81,95,94,89,96,92,84,80,61,81,49,84,51,36,87,90,90,94,27,26,29,9,9,15,15,11,1,0,0,0,3,2,2000
3,E. Hazard,91,91,90000000.0,470000.0,28,175,74,Real Madrid CF,1.0,2004,2024.0,1,4,4,4,184500000.0,91.0,83.0,86.0,94.0,35.0,66.0,81,84,61,89,83,95,83,79,83,94,94,88,95,90,94,82,56,84,63,80,54,41,87,89,88,91,34,27,22,11,12,6,8,8,1,0,0,0,3,2,2000
4,K. De Bruyne,91,91,90000000.0,370000.0,28,181,70,Manchester City,1.0,2004,2023.0,1,5,4,4,166500000.0,76.0,86.0,92.0,86.0,61.0,78.0,93,82,55,92,82,86,85,83,91,91,77,76,78,91,76,91,63,89,74,90,76,61,88,94,79,91,68,58,51,15,13,5,10,13,0,0,0,1,3,3,2000


####################################################################################################

Now, cleaning of the transfer data

In [8]:
# read all transfer files
path = "data/Transfers/"
csv_files = glob.glob(os.path.join(path, "*.csv"))

In [9]:
# create unique dataframe with all the transfers
transfers_list = []

# loop over the list of csv files
for f in csv_files:
      
    # read the csv file
    df = pd.read_csv(f)

    # append the dataframe to the list
    transfers_list.append(df)

transfers = pd.concat(transfers_list).reset_index(drop = True)

In [10]:
# only keep transfers happened in 2020, 2021, 2022
transfers = transfers.loc[transfers['year'].isin([2020, 2021, 2022])]

# reset index
transfers.reset_index(inplace = True, drop = True)

# just keep transfer movement in, because there should be a match between in and out, but the ones that just have out mean they are not playing
transfers = transfers[transfers["transfer_movement"] == "in"]

# remove columns we are not interested in
transfers.drop(['age', 'position', 
    #'club_involved_name', 
    'transfer_movement', 'league_name', 'season'], axis = 1, inplace = True)

# one hot encoding of transfer period, 1 = summer, 0 = winter
transfers['transfer_period'] = transfers['transfer_period'].apply(lambda x: 1 if x.lower() == 'summer' else 0)

# check if loan is in fee
transfers['loan'] = transfers['fee'].apply(lambda x: 1 if 'loan' in x.lower() else 0)

# get rid of fee column, and rename fee_cleaned to fee
transfers.drop(['fee'], axis = 1, inplace = True)
transfers.rename(columns = {'fee_cleaned': 'fee'}, inplace = True)

# get rid of loan transfers
transfers = transfers[transfers['loan'] == 0]
transfers.drop(['loan'], axis = 1, inplace = True)

# rename club_name to new_club and club_involved_name to old_club
transfers.rename(columns = {'club_name': 'new_club'}, inplace = True)
transfers.rename(columns = {'club_involved_name': 'old_club'}, inplace = True)

# drop duplicates, meaning player_name that appears twice. We keep the last one since it's the most recent transfer
#transfers.drop_duplicates(subset = ['player_name'], keep = 'last', inplace = True)

# reset index
transfers.reset_index(inplace = True, drop = True)

# remove NaN values
transfers.dropna(inplace = True)

In [11]:
transfers.head()

Unnamed: 0,new_club,player_name,old_club,transfer_period,fee,year
0,Bayern Munich,Leroy Sané,Man City,1,60.0,2020
1,Bayern Munich,Marc Roca,Espanyol,1,9.0,2020
2,Bayern Munich,Bouna Sarr,Marseille,1,8.0,2020
3,Bayern Munich,Alexander Nübel,FC Schalke 04,1,0.0,2020
4,Bayern Munich,Tanguy Nianzou,Paris SG,1,0.0,2020


In [12]:
transfers.shape
# number is still not that high, because data cleaning eliminated most of them. We can go back in time and use all the fifa data we have (from 2015)

(3133, 6)

# Now we need to join the two datasets based on player, if there is a doubt, we keep the club_name, if still we are not sure, we drop it

## Analysis

In [13]:
transfers_22 = transfers[transfers['year'] == 2022].reset_index(drop = True)
transfers_21 = transfers[transfers['year'] == 2021].reset_index(drop = True)
transfers_20 = transfers[transfers['year'] == 2020].reset_index(drop = True)

# drop year column when creating ratings_20, ratings_21, ratings_22
ratings_22 = total_data[2].drop(['year'], axis = 1).reset_index(drop = True)
ratings_21 = total_data[1].drop(['year'], axis = 1).reset_index(drop = True)
ratings_20 = total_data[0].drop(['year'], axis = 1).reset_index(drop = True)

In [14]:
transfers_22.head()

Unnamed: 0,new_club,player_name,old_club,transfer_period,fee,year
0,Bayern Munich,Matthijs de Ligt,Juventus,1,67.0,2022
1,Bayern Munich,Sadio Mané,Liverpool,1,32.0,2022
2,Bayern Munich,Mathys Tel,Stade Rennais,1,20.0,2022
3,Bayern Munich,Ryan Gravenberch,Ajax,1,18.5,2022
4,Bayern Munich,Noussair Mazraoui,Ajax,1,0.0,2022


In [15]:
ratings_22[ratings_22.short_name == "S. Mané"]

Unnamed: 0,short_name,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_name,league_level,club_joined,club_contract_valid_until,preferred_foot,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,position_A,position_D,position_G,position_M,attack_work_rate,defense_work_rate
12,S. Mané,89,89,101000000.0,270000.0,29,175,69,Liverpool,1.0,2021,2023.0,1,4,4,4,186900000.0,91.0,83.0,80.0,89.0,44.0,77.0,78,86,84,84,75,90,77,64,71,88,93,90,93,91,88,83,87,85,72,80,75,35,90,84,71,84,42,42,38,10,10,15,7,14,1,0,0,0,3,2


In [16]:
# player_name from transfers is not the same as player_name from fifa data, so we need to match them
# we can use a metric to measure the similarity between the two names, and then match the ones that are the most similar
# use fuzzywuzzy to get the similarity between two strings
from fuzzywuzzy import fuzz

# function to get the similarity between two strings
def get_similarity(a, b):
    return fuzz.token_sort_ratio(a, b)
    

# function to get the most similar string from a list of strings
def get_most_similar(string, string_list):
    # get the similarity between the string and each string in the list
    similarity = [get_similarity(string, s) for s in string_list]

    # get the index of the most similar string
    max_index = np.argmax(similarity)

    # return the most similar string
    return string_list[max_index]


def get_short_name(player_name):
    # if there is no firts name, return the last name
    if len(player_name.split()) == 1:
        return player_name
    # else, the first name is the first word, and the last name is all the other words
    else:
        first_name = player_name.split()[0]
        last_name = player_name.split()[1:]
        return first_name[0] + ". " + " ".join(last_name)




In [17]:
def map_transfers(transfers, ratings):
    # let's try to map the club names from the transfers dataframe to the club names from the fifa dataframe
    clubs_names = transfers.old_club.unique()
    fifa_clubs = ratings.club_name.unique()

    # create a dictionary to map the club names
    club_name_dict = {}

    # loop over the list of clubs
    for club in tqdm(clubs_names):
        # get the most similar club name from the fifa dataframe
        most_similar_club = get_most_similar(club, fifa_clubs)

        # add the mapping to the dictionary
        club_name_dict[club] = most_similar_club

    # map the club names
    transfers['club_name'] = transfers['old_club'].map(club_name_dict)



    # let's try to map the player names from the transfers dataframe to the player names from the fifa dataframe
    # First possibility to map them: in the ratings dataframe, the short_name looks like "L. Messi", so we can use the first letter of the first name and the last name to match the players
    # if not, we can use the previous functions to match the players

    # get the short name of the players
    transfers['short_name'] = transfers['player_name'].apply(get_short_name)
    
    return transfers


# second function, only if necessary
def add_name_mapping(transfers, ratings):
     # let's try to map the player names from the transfers dataframe to the player names from the fifa dataframe
    # use the get_most_similar function to get the most similar player name from the fifa dataframe
    # create an empty column "player_name_mapped" that we will add to transfers at the end
    player_name_mapped = []

    transfers_info = transfers[['player_name', 'club_name']].values
    ratings_info = ratings[['short_name', 'club_name']].values

    # loop over the list of players
    for player in tqdm(transfers_info):
        # check only the players that play in the same club from ratings
        ratings_players = ratings_info[ratings_info[:, 1] == player[1]]

        # get the most similar player name from the fifa dataframe
        most_similar_player = get_most_similar(player[0], ratings_players[:, 0])

        # add the most similar player name to the list
        player_name_mapped.append(most_similar_player)

    # add the list to the transfers dataframe
    transfers['player_name_mapped'] = player_name_mapped

    return transfers

In [18]:
# let's map the transfers to the fifa data
transfers_22 = map_transfers(transfers_22, ratings_22)
transfers_21 = map_transfers(transfers_21, ratings_21)
transfers_20 = map_transfers(transfers_20, ratings_20)


100%|██████████| 461/461 [00:05<00:00, 79.68it/s]
100%|██████████| 515/515 [00:06<00:00, 83.60it/s]
100%|██████████| 498/498 [00:05<00:00, 86.97it/s]


In [22]:
# function to merge the transfers with the fifa data
def merge(transfers, ratings):
    # keep in memory the initial transfers columns
    transfers_columns = transfers.columns

    # first, try to merge using the short_name and club_name
    merged = pd.merge(transfers, ratings, on = ['short_name', 'club_name'], how = 'left', indicator=True)

    # create a dataframe for the rows where merge==both and one for the rows where merge==left_only
    first_merge_success = merged[merged['_merge'] == 'both'].reset_index(drop = True)
    first_merge_fail = merged[merged['_merge'] == 'left_only'].reset_index(drop = True)

    # drop the _merge column
    first_merge_success.drop(['_merge'], axis = 1, inplace = True)
    first_merge_fail.drop(['_merge'], axis = 1, inplace = True)
    
    # get the transfers data corresponding to the first merge fail
    first_merge_fail_transfers = first_merge_fail[transfers_columns].reset_index(drop = True)

    # use the add_name_mapping function to map the player names in first_merge_fail_transfers
    first_merge_fail_transfers = add_name_mapping(first_merge_fail_transfers, ratings)
    # rename the short name to previous_short_name and the player_name_mapped to short_name
    first_merge_fail_transfers.rename(columns = {'short_name': 'previous_short_name', 'player_name_mapped': 'short_name'}, inplace = True)

    # try a new merge, and only keep the inner merge
    second_merge = pd.merge(first_merge_fail_transfers, ratings, on = ['short_name', 'club_name'], how = 'inner')

    # similarities
    first_merge_success['name_similarity'] = first_merge_success.apply(lambda x: get_similarity(x['player_name'], x['short_name']), axis = 1)
    first_merge_success['club_similarity'] = first_merge_success.apply(lambda x: get_similarity(x['old_club'], x['club_name']), axis = 1)
    second_merge['name_similarity'] = second_merge.apply(lambda x: get_similarity(x['player_name'], x['short_name']), axis = 1)
    second_merge['club_similarity'] = second_merge.apply(lambda x: get_similarity(x['old_club'], x['club_name']), axis = 1)
    
    # drop the columns for which the similarity score between old_club and club_name is less than 75% of the similarity score in the first merge success
    club_similarity_threshold = first_merge_success['club_similarity'].quantile(0.25)
    second_merge = second_merge[second_merge['club_similarity'] >= club_similarity_threshold].reset_index(drop = True)
    
    # drop the columns for which the similarity score between player_name and short_name is less than 75% of the similarity score in the first merge success
    name_similarity_threshold = first_merge_success['name_similarity'].quantile(0.25)
    second_merge = second_merge[second_merge['name_similarity'] >= name_similarity_threshold].reset_index(drop = True)

    # concatenate the two dataframes
    final_merge = pd.concat([first_merge_success, second_merge], axis = 0).reset_index(drop = True)
    
    # put previous_short_name and similarity in the first columns
    final_merge = final_merge[['previous_short_name', 'name_similarity', 'club_similarity'] + final_merge.columns[:-3].tolist()]

    # drop duplicates
    final_merge.drop_duplicates(inplace = True)

    return final_merge

In [23]:
merged_22 = merge(transfers_22, ratings_22)
merged_21 = merge(transfers_21, ratings_21)
merged_20 = merge(transfers_20, ratings_20)

# order by similarity, to check if the merge was successful
merged_22.sort_values(by = 'name_similarity', ascending = True, inplace = True)
merged_21.sort_values(by = 'name_similarity', ascending = True, inplace = True)
merged_20.sort_values(by = 'name_similarity', ascending = True, inplace = True)

# print merged_22 where previous_short_name is not null
#merged_22[merged_22['previous_short_name'].notnull()].head(40)
#merged_21[merged_21['previous_short_name'].notnull()].head(40)
merged_20[merged_20['previous_short_name'].notnull()].head(40)

100%|██████████| 661/661 [00:00<00:00, 1742.76it/s]
100%|██████████| 668/668 [00:00<00:00, 1722.01it/s]
100%|██████████| 635/635 [00:00<00:00, 1765.07it/s]


Unnamed: 0,previous_short_name,name_similarity,club_similarity,new_club,player_name,old_club,transfer_period,fee,year,club_name,short_name,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,club_joined,club_contract_valid_until,preferred_foot,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,position_A,position_D,position_G,position_M,attack_work_rate,defense_work_rate
384,M. Pereira,75,100,West Bromwich Albion,Matheus Pereira,Sporting CP,1,8.25,2020,Sporting CP,Mattheus Oliveira,73.0,78.0,5500000.0,10000.0,24.0,182.0,76.0,1.0,2004.0,2022.0,0.0,4.0,3.0,1.0,11400000.0,64.0,68.0,74.0,74.0,56.0,63.0,69.0,62.0,57.0,78.0,65.0,74.0,74.0,70.0,74.0,80.0,65.0,64.0,67.0,66.0,67.0,78.0,54.0,72.0,64.0,72.0,50.0,57.0,66.0,74.0,67.0,66.0,53.0,62.0,47.0,16.0,8.0,12.0,6.0,7.0,0.0,0.0,0.0,1.0,2.0,2.0
366,I. Rakitic,76,86,Sevilla FC,Ivan Rakitic,Barcelona,1,3.5,2020,FC Barcelona,I. Rakitić,86.0,86.0,38000000.0,240000.0,31.0,184.0,78.0,1.0,2004.0,2021.0,1.0,4.0,3.0,4.0,77900000.0,62.0,81.0,86.0,81.0,74.0,70.0,84.0,78.0,59.0,86.0,84.0,80.0,88.0,82.0,90.0,87.0,64.0,61.0,73.0,80.0,62.0,84.0,34.0,79.0,71.0,85.0,67.0,79.0,77.0,85.0,82.0,80.0,76.0,73.0,72.0,14.0,11.0,12.0,5.0,9.0,0.0,0.0,0.0,1.0,2.0,2.0
380,É. Capoue,76,100,Villarreal CF,Étienne Capoue,Watford,0,2.0,2020,Watford,E. Capoue,80.0,80.0,12000000.0,68000.0,30.0,190.0,87.0,1.0,2004.0,2022.0,1.0,3.0,3.0,2.0,22800000.0,52.0,71.0,74.0,73.0,78.0,77.0,69.0,69.0,72.0,79.0,69.0,76.0,67.0,71.0,78.0,80.0,50.0,53.0,36.0,81.0,61.0,77.0,72.0,77.0,78.0,72.0,76.0,83.0,73.0,71.0,54.0,75.0,76.0,79.0,77.0,12.0,11.0,11.0,14.0,12.0,0.0,0.0,0.0,1.0,2.0,2.0
387,J. Carlos Teixeira,77,80,Feyenoord Rotterdam,João Carlos Teixeira,Vit. Guimarães,1,1.6,2020,Vitória de Guimarães,João Teixeira,72.0,73.0,3700000.0,8000.0,26.0,177.0,69.0,1.0,2004.0,2021.0,1.0,3.0,3.0,1.0,7900000.0,71.0,67.0,70.0,76.0,37.0,57.0,60.0,67.0,46.0,76.0,44.0,76.0,60.0,56.0,72.0,77.0,73.0,69.0,73.0,71.0,74.0,73.0,50.0,64.0,61.0,70.0,42.0,28.0,68.0,72.0,56.0,68.0,44.0,34.0,31.0,14.0,16.0,14.0,15.0,16.0,0.0,0.0,0.0,1.0,3.0,2.0
398,A. Pereira,77,100,Rio Ave FC,André Pereira,FC Porto,1,0.0,2020,FC Porto,Danilo Pereira,83.0,83.0,24000000.0,21000.0,27.0,188.0,83.0,1.0,2004.0,2022.0,1.0,3.0,3.0,3.0,48000000.0,65.0,63.0,72.0,74.0,82.0,85.0,55.0,60.0,77.0,83.0,55.0,76.0,48.0,53.0,81.0,78.0,62.0,68.0,57.0,75.0,60.0,74.0,62.0,82.0,89.0,63.0,83.0,83.0,69.0,73.0,57.0,84.0,83.0,84.0,71.0,9.0,7.0,13.0,14.0,15.0,0.0,0.0,0.0,1.0,2.0,3.0
362,L. Haraslin,78,92,US Sassuolo,Lukas Haraslin,Lechia Gdansk,1,1.7,2020,Lechia Gdańsk,L. Haraslín,73.0,81.0,6000000.0,7000.0,23.0,182.0,71.0,1.0,2004.0,2021.0,0.0,3.0,4.0,1.0,9300000.0,84.0,62.0,67.0,75.0,40.0,66.0,67.0,63.0,59.0,68.0,58.0,73.0,68.0,54.0,66.0,72.0,85.0,84.0,90.0,65.0,83.0,62.0,73.0,89.0,64.0,59.0,38.0,33.0,66.0,68.0,58.0,65.0,33.0,41.0,48.0,13.0,12.0,13.0,10.0,15.0,0.0,0.0,0.0,1.0,3.0,2.0
364,F. Dagerstal,80,82,FK Khimki,Filip Dagerstal,Norrköping,0,0.0,2020,IFK Norrköping,F. Dagerstål,67.0,75.0,1000000.0,3000.0,22.0,189.0,74.0,1.0,2004.0,2020.0,1.0,3.0,2.0,1.0,1500000.0,64.0,43.0,58.0,58.0,64.0,75.0,55.0,37.0,59.0,64.0,34.0,53.0,46.0,38.0,61.0,64.0,60.0,67.0,61.0,65.0,50.0,65.0,80.0,74.0,76.0,40.0,70.0,66.0,36.0,55.0,37.0,70.0,64.0,65.0,64.0,7.0,9.0,8.0,15.0,13.0,0.0,1.0,0.0,0.0,3.0,2.0
389,K. Toko Ekambi,90,87,Olympique Lyon,Karl Toko Ekambi,Villarreal,1,11.5,2020,Villarreal CF,K. Toko-Ekambi,79.0,81.0,15500000.0,40000.0,26.0,185.0,74.0,1.0,2004.0,2023.0,1.0,4.0,3.0,2.0,33700000.0,79.0,78.0,67.0,78.0,27.0,68.0,74.0,81.0,74.0,69.0,72.0,78.0,66.0,63.0,58.0,79.0,79.0,79.0,75.0,76.0,71.0,80.0,68.0,69.0,76.0,72.0,48.0,28.0,81.0,65.0,69.0,77.0,27.0,14.0,18.0,10.0,12.0,12.0,8.0,14.0,1.0,0.0,0.0,0.0,3.0,2.0
376,F. Torres,96,84,Manchester City,Ferran Torres,Valencia,1,33.5,2020,Valencia CF,Ferrán Torres,75.0,85.0,10500000.0,17000.0,19.0,180.0,72.0,1.0,2004.0,2021.0,1.0,4.0,3.0,1.0,24200000.0,78.0,69.0,69.0,76.0,32.0,57.0,76.0,74.0,50.0,70.0,67.0,79.0,58.0,46.0,72.0,76.0,79.0,78.0,74.0,66.0,67.0,71.0,59.0,67.0,61.0,58.0,34.0,34.0,72.0,67.0,62.0,73.0,35.0,24.0,28.0,12.0,9.0,11.0,15.0,12.0,0.0,0.0,0.0,1.0,3.0,2.0
394,C. Falcão,96,88,SC Farense,Claudio Falcão,Desportivo Aves,1,0.0,2020,Desportivo das Aves,Cláudio Falcão,75.0,79.0,8000000.0,10000.0,24.0,185.0,75.0,1.0,2004.0,2022.0,1.0,3.0,3.0,1.0,17000000.0,66.0,65.0,69.0,71.0,72.0,76.0,65.0,58.0,66.0,76.0,58.0,70.0,56.0,53.0,72.0,73.0,67.0,66.0,68.0,74.0,67.0,78.0,83.0,89.0,69.0,66.0,75.0,72.0,67.0,68.0,74.0,72.0,69.0,78.0,64.0,6.0,12.0,10.0,8.0,11.0,0.0,0.0,0.0,1.0,3.0,3.0


In [192]:
# print all shapes
print(merged_22.shape)
print(merged_21.shape)
print(merged_20.shape)

(468, 71)
(493, 71)
(414, 71)
