In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
def get_transfer_spends(url, num_tables=6):
    """Scrape the footer totals from the leading tables of a transfers page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    num_tables : int, optional
        How many tables, counted from the top of the page, to read.
        Defaults to 6, the value that was previously hard-coded.

    Returns
    -------
    list of str
        For each table, the string form of the first child node of the
        first ``<td>`` inside its ``<tfoot>``, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains fewer than ``num_tables`` tables.
    AttributeError
        If one of the tables has no ``<tfoot>`` or the footer has no ``<td>``.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout keeps one stalled request from blocking the whole scrape.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an error page instead of parsing it as if it were data.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all("table")
    if len(tables) < num_tables:
        raise IndexError(
            "expected at least %d tables at %s, found %d"
            % (num_tables, url, len(tables))
        )
    # str() detaches each value from the parse tree; a bare NavigableString
    # would keep the whole BeautifulSoup document alive.
    return [
        str(table.find('tfoot').find('td').contents[0])
        for table in tables[:num_tables]
    ]

In [3]:
def _team_transfers_url(team_link):
    """Turn a team's relative start-page link into its absolute all-transfers URL."""
    # partition() leaves the link untouched when '/saison' is absent, whereas
    # slicing with find() == -1 would silently drop the last character.
    short_link = team_link.partition('/saison')[0]
    transfer_link = short_link.replace('startseite', 'alletransfers', 1)
    return 'https://www.transfermarkt.com' + transfer_link


def generate_xfer_df(league,league_url,num_teams):
    """Build one row of transfer totals per team for a league.

    Parameters
    ----------
    league : str
        Label stored in the ``League`` column of every row.
    league_url : str
        Address of the league overview page listing the teams.
    num_teams : int
        Number of teams to keep, counted from the top of the league table.

    Returns
    -------
    pandas.DataFrame
        One row per team with integer-labelled columns: column 0 is the team
        name (the ``alt`` text of the crest image) followed by the values
        returned by ``get_transfer_spends`` for that team, plus a ``League``
        column. The index runs from 0 to ``num_teams - 1``.

    Raises
    ------
    requests.HTTPError
        If the league page answers with an error status.
    IndexError
        If fewer than ``num_teams`` teams are found on the page.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout keeps one stalled request from blocking the whole scrape.
    response = requests.get(league_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): the team list is taken from the fourth table on the page;
    # this index is tied to the site's current layout — confirm if it breaks.
    team_body = soup.find_all("table")[3].find_all("tbody")[0]

    # Collect complete rows first and build the frame once, instead of growing
    # a DataFrame row by row (DataFrame.append no longer exists in pandas 2.x).
    team_rows = []
    for cell in team_body.find_all("td", {'class': 'zentriert no-border-rechts'}):
        anchor = cell.find('a')
        team_name = anchor.find('img')['alt']
        spends = get_transfer_spends(_team_transfers_url(anchor['href']))
        # str() stores plain strings rather than parse-tree nodes.
        team_rows.append([team_name] + [str(value) for value in spends])

    if len(team_rows) < num_teams:
        raise IndexError(
            "expected %d teams for %s, found %d"
            % (num_teams, league, len(team_rows))
        )

    transfer_df = pd.DataFrame(team_rows[:num_teams])
    transfer_df['League'] = league
    return transfer_df

In [4]:
# Premier League: scrape transfer totals for its 20 clubs.
epl_df = generate_xfer_df(
    league='Premier League',
    league_url='https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1',
    num_teams=20,
)

In [5]:
# La Liga: scrape transfer totals for its 20 clubs.
la_liga_df = generate_xfer_df(
    league='La Liga',
    league_url='https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1',
    num_teams=20,
)

In [6]:
# Serie A: scrape transfer totals for its 20 clubs.
serie_df = generate_xfer_df(
    league='Serie A',
    league_url='https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1',
    num_teams=20,
)

In [7]:
# Bundesliga: scrape transfer totals for its 18 clubs.
german_df = generate_xfer_df(
    league='Bundesliga',
    league_url='https://www.transfermarkt.com/1-bundesliga/startseite/wettbewerb/L1',
    num_teams=18,
)

In [8]:
# Ligue 1: scrape transfer totals for its 20 clubs.
french_df = generate_xfer_df(
    league='Ligue 1',
    league_url='https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1',
    num_teams=20,
)

In [9]:
# Eredivisie: scrape transfer totals for its 18 clubs.
dutch_df = generate_xfer_df(
    league='Eredivisie',
    league_url='https://www.transfermarkt.com/eredivisie/startseite/wettbewerb/NL1',
    num_teams=18,
)

In [10]:
# Stack the six league frames into one. DataFrame.append was removed in
# pandas 2.0, so pd.concat is used instead. Like the old call, it keeps each
# frame's own row labels, so labels repeat across leagues.
full_df = pd.concat([epl_df, la_liga_df, serie_df, german_df, french_df, dutch_df])

In [11]:
# Give the eight positional columns readable names: the team, the six scraped
# footer totals, and the league label.
# NOTE(review): the Arrival/Departure-per-season order assumes the transfers
# page lists its first six tables in exactly this sequence — confirm against the site.
full_df.columns=['Team','Arrival_19','Departure_19','Arrival_18','Departure_18','Arrival_17','Departure_17','League']

In [12]:
# Overwrite the first row (by position) with hard-coded Manchester City figures.
# NOTE(review): presumably a manual correction for a bad scrape; it assumes
# Manchester City is the first row of the combined frame — confirm, because a
# change in table order would overwrite a different club.
full_df.iloc[0] = ['Manchester City','€168.00m','€69.00m','€78.59m','€54.10m','€317.50m','€91.35m','Premier League']

In [13]:
def _k_amount_to_m(amount):
    """Rewrite a thousands amount such as '€500k' as millions ('€0.5m')."""
    # Only strings with a 'k' after the first character are converted;
    # everything else is passed through unchanged.
    if amount.find('k') > 0:
        # Drop the leading currency sign and the trailing 'k'. float() also
        # accepts decimal thousands, which int() would reject.
        millions = float(amount[1:-1]) / 1000
        return '€' + str(millions) + 'm'
    return amount


def update_k_val(col, df=None):
    """Convert thousand-denominated ('k') amounts in a column to millions.

    Parameters
    ----------
    col : str
        Name of the column holding amount strings.
    df : pandas.DataFrame, optional
        Frame to update in place. Defaults to the notebook-level ``full_df``.

    Returns
    -------
    None
        The column is replaced in place on the target frame.
    """
    target = full_df if df is None else df
    # Assign the whole column at once. The previous chained form
    # ``frame[col].iloc[i] = value`` writes to an intermediate Series and has
    # no effect on the frame under pandas Copy-on-Write.
    target[col] = [_k_amount_to_m(amount) for amount in target[col]]

In [14]:
# Normalise thousand-denominated amounts to millions in every season column.
for season_column in (
    'Arrival_19',
    'Departure_19',
    'Arrival_18',
    'Departure_18',
    'Arrival_17',
    'Departure_17',
):
    update_k_val(season_column)

In [15]:
def update_m_val(col, df=None):
    """Strip the first and last character from every amount that is not '0'.

    With amounts shaped like '€168.00m' this leaves the bare number as text
    ('168.00'); the literal string '0' is kept as it is.

    Parameters
    ----------
    col : str
        Name of the column holding amount strings.
    df : pandas.DataFrame, optional
        Frame to update in place. Defaults to the notebook-level ``full_df``.

    Returns
    -------
    None
        The column is replaced in place on the target frame.
    """
    target = full_df if df is None else df
    # Assign the whole column at once. The previous chained form
    # ``frame[col].iloc[i] = value`` writes to an intermediate Series and has
    # no effect on the frame under pandas Copy-on-Write.
    target[col] = [
        amount if amount == '0' else amount[1:-1]
        for amount in target[col]
    ]

In [16]:
# Strip the currency sign and unit suffix from every season column.
for season_column in (
    'Arrival_19',
    'Departure_19',
    'Arrival_18',
    'Departure_18',
    'Arrival_17',
    'Departure_17',
):
    update_m_val(season_column)

In [17]:
# Preview the first rows of the combined frame.
full_df.head()

Unnamed: 0,Team,Arrival_19,Departure_19,Arrival_18,Departure_18,Arrival_17,Departure_17,League
0,Manchester City,168.0,69.0,78.59,54.1,317.5,91.35,Premier League
1,Liverpool FC,1.9,41.6,182.2,41.1,173.88,194.5,Premier League
2,Tottenham Hotspur,114.0,35.0,0.0,5.35,121.5,103.8,Premier League
3,Chelsea FC,56.0,45.0,145.75,208.8,69.55,260.5,Premier League
4,Manchester United,159.0,69.5,82.7,30.55,198.4,45.5,Premier League


In [18]:
# Per-club totals over the three seasons, summed in the order 19 + 18 + 17
# after converting each text amount to float.
arrival_array = [
    float(season_19) + float(season_18) + float(season_17)
    for season_19, season_18, season_17 in zip(
        full_df['Arrival_19'], full_df['Arrival_18'], full_df['Arrival_17']
    )
]
departure_array = [
    float(season_19) + float(season_18) + float(season_17)
    for season_19, season_18, season_17 in zip(
        full_df['Departure_19'], full_df['Departure_18'], full_df['Departure_17']
    )
]

In [19]:
# Attach the per-club totals to the frame as two new columns.
for total_column, total_values in (
    ('Total_Arrivals', arrival_array),
    ('Total_Departures', departure_array),
):
    full_df[total_column] = total_values

In [20]:
# Net spend per club (total arrivals minus total departures) divided by 3.
avg_spend_array = [
    (float(total_in) - float(total_out)) / 3
    for total_in, total_out in zip(
        full_df['Total_Arrivals'], full_df['Total_Departures']
    )
]

In [21]:
# Store the values from avg_spend_array as the 'Average Transfer Spend' column.
full_df['Average Transfer Spend']=avg_spend_array

In [22]:
# Preview the first rows of the frame, including the derived columns.
full_df.head()

Unnamed: 0,Team,Arrival_19,Departure_19,Arrival_18,Departure_18,Arrival_17,Departure_17,League,Total_Arrivals,Total_Departures,Average Transfer Spend
0,Manchester City,168.0,69.0,78.59,54.1,317.5,91.35,Premier League,564.09,214.45,116.546667
1,Liverpool FC,1.9,41.6,182.2,41.1,173.88,194.5,Premier League,357.98,277.2,26.926667
2,Tottenham Hotspur,114.0,35.0,0.0,5.35,121.5,103.8,Premier League,235.5,144.15,30.45
3,Chelsea FC,56.0,45.0,145.75,208.8,69.55,260.5,Premier League,271.3,514.3,-81.0
4,Manchester United,159.0,69.5,82.7,30.55,198.4,45.5,Premier League,440.1,145.55,98.183333


In [23]:
# Write the consolidated table next to the notebook's parent folder. Forward
# slashes resolve on Windows as well as POSIX, whereas the backslash form is
# treated as a single odd file name outside Windows. The row index is written
# too, as before.
full_df.to_csv('../Consolidated Data/Transfer Spends.csv')