In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from fuzzywuzzy import fuzz
from tqdm import tqdm

warnings.filterwarnings("ignore")
%matplotlib inline

In [118]:
df = pd.read_csv('../transfers1.4.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ID,to_club_name,to_league_name,player_name,age,position,from_club_involved_name,fee,transfer_movement,fee_cleaned,year,season,grouping_positions_1,grouping_positions_2,normalized_from_club_name,from_league
0,3812,3813,Hertha BSC,1 Bundesliga,Lukas Klünter,22,Right Back,1. FC Köln,£1.80m,in,1.8,2018,2018/2019,RB,B,1. FC Köln,1 Bundesliga
1,4460,4461,Dinamo Moscow,Premier Liga,Konstantin Rausch,27,Left Back,1. FC Köln,£1.35m,in,1.35,2017,2017/2018,LB,B,1. FC Köln,1 Bundesliga
2,4596,4597,Fulham FC,Premier League,Sascha Riether,30,Right Back,1. FC Köln,£1.26m,in,1.26,2013,2013/2014,RB,B,1. FC Köln,1 Bundesliga
3,1818,1819,Tottenham Hotspur,Premier League,Kevin Wimmer,22,Centre Back,1. FC Köln,£5.40m,in,5.4,2015,2015/2016,CB,D,1. FC Köln,1 Bundesliga
4,3017,3018,TSG 1899 Hoffenheim,1 Bundesliga,Kevin Vogt,24,Centre Back,1. FC Köln,£2.70m,in,2.7,2016,2016/2017,CB,D,1. FC Köln,1 Bundesliga


In [121]:
for i in range(len(df)):
    if(df.normalized_from_club_name[i] == "0"):
        df.normalized_from_club_name[i] = df.from_club_involved_name[i]

#### Grouping based on spending power of club

In [123]:
club_spend = df.groupby('to_club_name')['fee_cleaned'].sum()
club_spend = pd.DataFrame(club_spend)
club_spend = club_spend.sort_values(by=['fee_cleaned'], ascending=False)
club_spend = club_spend.reset_index()
club_spend = club_spend.rename(columns = {'to_club_name': 'club_name', 'fee_cleaned' : 'fee_spent'})
club_spend.head(20)

Unnamed: 0,club_name,fee_spent
0,Manchester City,1504.897
1,FC Barcelona,1331.17
2,Real Madrid,1226.93
3,Chelsea FC,1213.76
4,Juventus FC,1180.153
5,Paris Saint-Germain,1141.74
6,Manchester United,1117.3
7,Liverpool FC,937.598
8,Atlético Madrid,905.735
9,Arsenal FC,762.081


In [124]:
club_earned = df.groupby('normalized_from_club_name')['fee_cleaned'].sum()
club_earned = pd.DataFrame(club_earned)
club_earned = club_earned.sort_values(by=['fee_cleaned'], ascending=False)
club_earned = club_earned.reset_index()
club_earned = club_earned.rename(columns = {'normalized_from_club_name': 'club_name', 'fee_cleaned' : 'fee_earned'})
club_earned.head(20)

Unnamed: 0,club_name,fee_earned
0,Atlético Madrid,820.45
1,AS Monaco,802.335
2,SL Benfica,728.47
3,Real Madrid,701.55
4,Juventus FC,690.653
5,FC Barcelona,681.45
6,AS Roma,641.075
7,Liverpool FC,628.58
8,FC Porto,621.64
9,Chelsea FC,614.694


#### net spend dataframe

In [125]:
club_net_spend = pd.merge(club_spend, club_earned, how = 'outer', on = 'club_name')
club_net_spend = club_net_spend.fillna(0)
club_net_spend['net_spend'] = club_net_spend['fee_spent'] - club_net_spend['fee_earned']
club_net_spend = club_net_spend.sort_values(by=['net_spend'], ascending=False)

In [126]:
club_net_spend = club_net_spend.sort_values(by=['fee_spent'], ascending=False)
club_net_spend.fee_spent.describe()

count     999.000000
mean       36.251253
std       142.246165
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max      1504.897000
Name: fee_spent, dtype: float64

In [127]:
club_net_spend.describe()

Unnamed: 0,fee_spent,fee_earned,net_spend
count,999.0,999.0,999.0
mean,36.251253,36.251253,4.552026e-16
std,142.246165,99.008833,76.78126
min,0.0,0.0,-386.951
25%,0.0,0.63,-8.681
50%,0.0,2.93,-2.03
75%,0.0,16.79,-0.324
max,1504.897,820.45,1083.012


#### CLassifying into 4 tiers
 - Tier 1 : Clubs that spent more than 300M Euros
 - Tier 2 : Clubs that spent more than 100M Euros but less than 300M Eruos
 - Tier 3 : Clubs that spent more than 20M Euros but less than 100M Euros
 - Tier 4 : CLubs that spent less than 20M Euros

In [128]:
club_net_spend['Tier'] = 0
for i in range(len(club_net_spend)):
    if(club_net_spend.fee_spent[i] >= 300 ):
        club_net_spend['Tier'][i] = 'Tier_1'
    elif((300 > club_net_spend.fee_spent[i]) and (club_net_spend.fee_spent[i] >= 100)):
        club_net_spend['Tier'][i] = 'Tier_2'
    elif((100 > club_net_spend.fee_spent[i]) and (club_net_spend.fee_spent[i] >= 20)):
        club_net_spend['Tier'][i] = 'Tier_3'
    else:
        club_net_spend['Tier'][i] = 'Tier_4'
        
club_net_spend['Tier'].value_counts()      

Tier_4    855
Tier_3     56
Tier_2     52
Tier_1     36
Name: Tier, dtype: int64

In [129]:
club_net_spend

Unnamed: 0,club_name,fee_spent,fee_earned,net_spend,Tier
0,Manchester City,1504.897,421.885,1083.012,Tier_1
1,FC Barcelona,1331.170,681.450,649.720,Tier_1
2,Real Madrid,1226.930,701.550,525.380,Tier_1
3,Chelsea FC,1213.760,614.694,599.066,Tier_1
4,Juventus FC,1180.153,690.653,489.500,Tier_1
...,...,...,...,...,...
717,Bologna U19,0.000,0.900,-0.900,Tier_4
702,Ajax U21,0.000,0.946,-0.946,Tier_4
701,SV Sandhausen,0.000,0.990,-0.990,Tier_4
700,Pol. Warsaw,0.000,0.990,-0.990,Tier_4


In [144]:
#encoding to_club_tiers
incoming_tiers = club_net_spend.rename(columns ={'club_name':'to_club_name','Tier':'to_club_tier'})
df_1 = pd.merge(df, incoming_tiers, how = 'outer', on = 'to_club_name')
df_1 = df_1.drop(columns = ['fee_spent','fee_earned','net_spend'])
df_1 = df_1.dropna(subset=['age', 'position'])
df_1.to_club_tier.value_counts()

Tier_2    2447
Tier_1    2242
Tier_3    1370
Tier_4     675
Name: to_club_tier, dtype: int64

In [145]:
#encoding from_club_tiers
outgoing_tiers = club_net_spend.rename(columns ={'club_name':'normalized_from_club_name','Tier':'from_club_tier'})
df_2 = pd.merge(df_1, outgoing_tiers, how = 'outer', on = 'normalized_from_club_name')
df_2 = df_2.drop(columns = ['fee_spent','fee_earned','net_spend'])
df_2 = df_2.dropna(subset=['age', 'position'])
df_2.from_club_tier.value_counts()

Tier_4    3107
Tier_1    1508
Tier_2    1231
Tier_3     888
Name: from_club_tier, dtype: int64

In [146]:
df_2

Unnamed: 0.1,Unnamed: 0,ID,to_club_name,to_league_name,player_name,age,position,from_club_involved_name,fee,transfer_movement,fee_cleaned,year,season,grouping_positions_1,grouping_positions_2,normalized_from_club_name,from_league,to_club_tier,from_club_tier
0,3812.0,3813.0,Hertha BSC,1 Bundesliga,Lukas Klünter,22.0,Right Back,1. FC Köln,£1.80m,in,1.800,2018.0,2018/2019,RB,B,1. FC Köln,1 Bundesliga,Tier_3,Tier_3
1,4460.0,4461.0,Dinamo Moscow,Premier Liga,Konstantin Rausch,27.0,Left Back,1. FC Köln,£1.35m,in,1.350,2017.0,2017/2018,LB,B,1. FC Köln,1 Bundesliga,Tier_2,Tier_3
2,4596.0,4597.0,Fulham FC,Premier League,Sascha Riether,30.0,Right Back,1. FC Köln,£1.26m,in,1.260,2013.0,2013/2014,RB,B,1. FC Köln,1 Bundesliga,Tier_2,Tier_3
3,1818.0,1819.0,Tottenham Hotspur,Premier League,Kevin Wimmer,22.0,Centre Back,1. FC Köln,£5.40m,in,5.400,2015.0,2015/2016,CB,D,1. FC Köln,1 Bundesliga,Tier_1,Tier_3
4,3017.0,3018.0,TSG 1899 Hoffenheim,1 Bundesliga,Kevin Vogt,24.0,Centre Back,1. FC Köln,£2.70m,in,2.700,2016.0,2016/2017,CB,D,1. FC Köln,1 Bundesliga,Tier_2,Tier_3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6729,5808.0,5809.0,Mordovia Saransk,Premier Liga,Milan Perendija,27.0,Centre Back,Otelul Galati,£450k,in,0.450,2012.0,2012/2013,CB,D,Otelul Galati,Other,Tier_4,Tier_4
6730,6505.0,6506.0,Académica Coimbra,Liga Nos,Rafa Lopes,22.0,Centre Forward,Penafiel,£135k,in,0.135,2013.0,2013/2014,CF,F,Penafiel,Other,Tier_4,Tier_4
6731,6577.0,6578.0,NAC Breda,Eredivisie,Karol Mets,24.0,Centre Back,Viking FK,£90k,in,0.090,2017.0,2017/2018,CB,D,Viking FK,Other,Tier_4,Tier_4
6732,6682.0,6683.0,Leixões SC,Liga Nos,Cauê,20.0,Defensive Midfield,Santo André,£45k,in,0.045,2009.0,2009/2010,CDM,M,Santo André,Other,Tier_4,Tier_4


In [152]:
df_2.to_csv('../transfers1.5.csv')