# Import Packages

In [46]:
import ast
import json
import random
import re
import time

import js2xml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

# Scraping setup

## Rotating headers

In [47]:
# Pool of browser User-Agent strings; get_soup sends a randomly chosen one with every request.
user_agent_list = [
   # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer (MSIE / Trident tokens) -- this group was previously mislabelled as Firefox
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

## Delays

In [48]:
# Candidate pause lengths in seconds: six evenly spaced values from 10 to 15.
# get_soup sleeps for one randomly chosen entry before every request.
delays = np.linspace(10,15,6)
# Alternative delay ranges, currently disabled:
#delays = np.linspace(20,40,21)
#delays = np.linspace(3,10,7)
#delays = np.linspace(15,35,21)

## Scraping functions

In [49]:
def get_soup(url, timeout=30):
    '''
    Downloads a web page and parses it with BeautifulSoup.

    A random User-Agent from user_agent_list is sent, and the call sleeps for a
    randomly chosen entry of delays before the request is made.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraping loop forever.

    Returns:
        A (siker, soup) tuple. siker is the success flag: True only when the
        server answered with status code 200, in which case soup is the parsed
        page. On any failure siker is False and soup is the empty string ''.
    '''

    # Choosing a random header from the previously given list
    header = {'User-Agent' : random.choice(user_agent_list)}

    # Delaying the request for a random number of seconds from the range given in delays
    time.sleep(random.choice(delays))

    # Try to reach the page. Failures are reported but never raised, so a long
    # scraping run survives a single bad page. Catching Exception (instead of a
    # bare except) keeps Ctrl-C / kernel interrupt working.
    try:
        r = requests.get(url,
                         headers = header,
                         timeout = timeout)
    except Exception as err:
        print('Request failed for ', url, ': ', err)
        return False, ''

    if r.status_code != 200:
        print('Bad status code! STOPPED')
        return False, ''

    try:
        soup = BeautifulSoup(r.content,'html.parser')
    except Exception as err:
        print('Could not parse ', url, ': ', err)
        return False, ''

    # The function returns a boolean (success), and the html text of the webpage
    return True, soup

## Base URLs

In [50]:
# Site root; prepended to the relative hrefs scraped from club and player links.
BASEURL = 'https://www.transfermarkt.co.uk'

In [53]:
# Competition landing pages of the five leagues to scrape.
# English Premier League
epl = 'https://www.transfermarkt.co.uk/premier-league/startseite/wettbewerb/GB1'
# Spanish La Liga
laliga = 'https://www.transfermarkt.co.uk/laliga/startseite/wettbewerb/ES1'
# Italian Serie A
seriea = 'https://www.transfermarkt.co.uk/serie-a/startseite/wettbewerb/IT1'
# German Bundesliga
bliga = 'https://www.transfermarkt.co.uk/bundesliga/startseite/wettbewerb/L1'
# French Ligue 1
ligue1 = 'https://www.transfermarkt.co.uk/ligue-1/startseite/wettbewerb/FR1'

# Order of this list is the order in which create_teamsdf downloads the leagues.
leagues = [epl, laliga, seriea, bliga, ligue1]

# Scraping

## Getting URLs to squad lists

We want to scrape the information of the players of Europe's top 5 leagues. (The English, Spanish, German, Italian and French football leagues.)  

For this we need a list of URLs for these players.  

For the list of URLs of these players we need a list of URLs for the clubs.

In [11]:
# Single test request: fetch the Premier League landing page and keep the (success flag, parsed page) pair.
siker,soup = get_soup(epl)

In [85]:
def get_league_soups(leagues):
    '''
    Downloads every league page in leagues via get_soup.

    Returns the parsed pages as a list in the same order as leagues. The
    success flag returned by get_soup is discarded, so a failed download shows
    up in the list as whatever get_soup returned for it.
    '''
    return [get_soup(league_url)[1] for league_url in leagues]

def find_teamnames_urls(soups):
    '''
    Collects the club names and absolute club URLs from parsed league pages.

    Parameters:
        soups: iterable of parsed league pages, as produced by get_league_soups.
            Entries that are not parsed pages (for example the empty string
            get_soup returns for a failed download) are skipped.

    Returns:
        A dictionary {'name': [...], 'url': [...]} with one entry per club, in
        page order. Each URL is BASEURL joined with the link's href.
    '''
    teams = {'name' : [], 'url' : []}
    for soup in soups:
        # A failed download is not a parsed page and has no find_all; skipping
        # it keeps the remaining leagues usable instead of raising AttributeError.
        if not hasattr(soup, 'find_all'):
            continue
        box = soup.find_all('td', {'class' : 'hauptlink no-border-links show-for-small show-for-pad'})
        for team in box:
            link = team.find('a')
            if link is None:
                # Cell without a club link: nothing to record.
                continue
            teams['name'].append(link.text)
            teams['url'].append(BASEURL + link['href'])
    return teams

def create_teamsdf(leagues):
    '''
    Builds the club table for the given league pages.

    Returns a DataFrame with the columns name, url and squadlist. The
    squadlist URL is derived from the club URL by replacing its fifth
    '/'-separated segment with 'kader' and appending 'plus/1'.
    '''
    def to_squad_url(team_url):
        # Keep scheme, host and club slug, swap the page segment, keep the rest.
        parts = team_url.split('/')
        return '/'.join(parts[0:4] + ['kader'] + parts[5:] + ['plus/1'])

    league_pages = get_league_soups(leagues)
    teamsdf = pd.DataFrame(find_teamnames_urls(league_pages))
    teamsdf['squadlist'] = [to_squad_url(team_url) for team_url in teamsdf['url']]
    return teamsdf

In [86]:
# Build the club table (name, url, squadlist) for every league in `leagues` and display it.
teamsdf = create_teamsdf(leagues)
teamsdf

Unnamed: 0,name,url,squadlist
0,Man City,https://www.transfermarkt.co.uk/manchester-cit...,https://www.transfermarkt.co.uk/manchester-cit...
1,Liverpool,https://www.transfermarkt.co.uk/fc-liverpool/s...,https://www.transfermarkt.co.uk/fc-liverpool/k...
2,Spurs,https://www.transfermarkt.co.uk/tottenham-hots...,https://www.transfermarkt.co.uk/tottenham-hots...
3,Chelsea,https://www.transfermarkt.co.uk/fc-chelsea/sta...,https://www.transfermarkt.co.uk/fc-chelsea/kad...
4,Man Utd,https://www.transfermarkt.co.uk/manchester-uni...,https://www.transfermarkt.co.uk/manchester-uni...
...,...,...,...
93,SC Amiens,https://www.transfermarkt.co.uk/sc-amiens/star...,https://www.transfermarkt.co.uk/sc-amiens/kade...
94,FC Metz,https://www.transfermarkt.co.uk/fc-metz/starts...,https://www.transfermarkt.co.uk/fc-metz/kader/...
95,Stade Brest 29,https://www.transfermarkt.co.uk/stade-brest-29...,https://www.transfermarkt.co.uk/stade-brest-29...
96,Nîmes Olympique,https://www.transfermarkt.co.uk/olympique-nime...,https://www.transfermarkt.co.uk/olympique-nime...


## Getting the squad data

Now we will collect the current squads of the 98 teams, and some basic information about the players.  

An important part of this task is to collect the players' URLs, which will let us collect the market values and performance statistics on a player level.

In [163]:
def _squad_field(getter):
    '''
    Runs a zero-argument extractor and returns its result, or '' when the
    expected tag, attribute or table cell is missing from the row.
    '''
    try:
        return getter()
    except Exception:
        return ''


def get_all_squads(teamsdf):
    '''
    Downloads the squad page of every club and extracts the basic player data.

    Parameters:
        teamsdf: DataFrame with a 'squadlist' column holding the squad page URLs.

    Returns:
        The same teamsdf object (it is modified in place) with a new
        'squaddata' column. Every cell holds a list with one dictionary per
        table row, with the keys kit, name, genposition, specposition,
        dateofbirth, nationality, height, foot, joined, lastclub, contract and
        playerurl. A value that cannot be read from the row is stored as ''.
        A club whose page could not be downloaded gets an empty list.
    '''
    squads = []
    for squadurl in teamsdf['squadlist']:
        squad = []
        siker, soup = get_soup(squadurl)
        if siker:
            # Rows are collected as all 'odd' rows followed by all 'even' rows.
            players = soup.find_all('tr', {'class' : 'odd'}) + soup.find_all('tr', {'class' : 'even'})
        else:
            # A failed download must not abort the whole run; record an empty
            # squad so that squads stays aligned with the rows of teamsdf.
            print('Could not download squad page: ', squadurl)
            players = []
        for player in players:
            infos = player.find_all('td', {'class' : 'zentriert'})
            # Name and profile URL come from the same link, so look it up once.
            profile_link = player.find('a', {'class' : 'spielprofil_tooltip'})
            playerinfo = {
                'kit' : _squad_field(lambda: player.find('div', {'class' : 'rn_nummer'}).text),
                'name' : _squad_field(lambda: profile_link.text),
                'genposition' : _squad_field(lambda: infos[0]['title']),
                'specposition' : _squad_field(lambda: player.find('table', {'class' : 'inline-table'}).find_all('tr')[1].text),
                'dateofbirth' : _squad_field(lambda: infos[1].text),
                'nationality' : _squad_field(lambda: infos[2].find('img', {'class' : 'flaggenrahmen'})['title']),
                'height' : _squad_field(lambda: infos[3].text),
                'foot' : _squad_field(lambda: infos[4].text),
                'joined' : _squad_field(lambda: infos[5].text),
                'lastclub' : _squad_field(lambda: infos[6].find('img')['alt']),
                'contract' : _squad_field(lambda: infos[7].text),
                'playerurl' : _squad_field(lambda: BASEURL + profile_link['href']),
            }
            squad.append(playerinfo)
        squads.append(squad)
        print(len(squads), ' teams done!')
    teamsdf['squaddata'] = squads
    return teamsdf

In [None]:
# Scrape every club's squad page. get_soup sleeps before each request, so this cell is slow.
# get_all_squads adds the 'squaddata' column to teamsdf itself and returns that same object.
new_df2 = get_all_squads(teamsdf)
new_df2

In [183]:
# Checkpoint the scraped squads to disk (semicolon-separated, UTF-8); a later cell reloads this file.
new_df2.to_csv('teams_dataframe_status0513.csv', sep = ';', encoding = 'utf-8')

In [188]:
# Preview the first rows of the club table including the new squaddata column.
new_df2.head()

Unnamed: 0,name,url,squadlist,squaddata
0,Man City,https://www.transfermarkt.co.uk/manchester-cit...,https://www.transfermarkt.co.uk/manchester-cit...,"[{'kit': '31', 'name': 'Ederson', 'genposition..."
1,Liverpool,https://www.transfermarkt.co.uk/fc-liverpool/s...,https://www.transfermarkt.co.uk/fc-liverpool/k...,"[{'kit': '1', 'name': 'Alisson', 'genposition'..."
2,Spurs,https://www.transfermarkt.co.uk/tottenham-hots...,https://www.transfermarkt.co.uk/tottenham-hots...,"[{'kit': '1', 'name': 'Hugo Lloris', 'genposit..."
3,Chelsea,https://www.transfermarkt.co.uk/fc-chelsea/sta...,https://www.transfermarkt.co.uk/fc-chelsea/kad...,"[{'kit': '1', 'name': 'Kepa', 'genposition': '..."
4,Man Utd,https://www.transfermarkt.co.uk/manchester-uni...,https://www.transfermarkt.co.uk/manchester-uni...,"[{'kit': '1', 'name': 'David de Gea', 'genposi..."


In [187]:
# Expand the squad stored in row 80 into its own table and show its first five players.
pd.DataFrame(new_df2['squaddata'][80]).head(5)

Unnamed: 0,kit,name,genposition,specposition,dateofbirth,nationality,height,foot,joined,lastclub,contract,playerurl
0,40,Benjamin Lecomte,Goalkeeper,Goalkeeper,"Apr 26, 1991 (29)",France,"1,86 m",left,"Jul 15, 2019",HSC Montpellier,30.06.2024,https://www.transfermarkt.co.uk/benjamin-lecom...
1,40,Loïc Badiashile,Goalkeeper,Goalkeeper,"Feb 5, 1998 (22)",France,"1,86 m",right,"Jul 1, 2016",AS Monaco U19,30.06.2021,https://www.transfermarkt.co.uk/loic-badiashil...
2,16,Diego Benaglio,Goalkeeper,Goalkeeper,"Sep 8, 1983 (36)",Switzerland,"1,94 m",right,"Jul 1, 2017",VfL Wolfsburg,30.06.2020,https://www.transfermarkt.co.uk/diego-benaglio...
3,3,Guillermo Maripán,Defender,Centre-Back,"May 6, 1994 (26)",Chile,"1,93 m",right,"Aug 24, 2019",Deportivo Alavés,30.06.2024,https://www.transfermarkt.co.uk/guillermo-mari...
4,5,Jemerson,Defender,Centre-Back,"Aug 24, 1992 (27)",Brazil,"1,84 m",right,"Jan 31, 2016",Clube Atlético Mineiro,30.06.2020,https://www.transfermarkt.co.uk/jemerson/profi...


## Player Data

Now we have the squad lists of the clubs. We will have to collect  
- Performance stats
- International record
- Personal Awards  

for each player.

How many players are there in our dataset?

In [193]:
# Head-count over every scraped squad.
playnum = sum(len(squad) for squad in new_df2['squaddata'])

print('The number of players in the dataset is: ', playnum)
# Estimate assumes one request per player with a 10-15 second pause each.
# NOTE(review): scrape_all_player_data issues three requests per player, so the real run is longer.
print('Scraping this would take ', 10 * playnum / 60, ' - ', 15 * playnum / 60, ' minutes.')

The number of players in the dataset is:  2575
Scraping this would take  429.1666666666667  -  643.75  minutes.


### Market value data

In [51]:
def get_market_values(playerurl):
    '''
    Converts the player profile URL to the player value URL,
    and scrapes all the known market values of the player, and returns as a dictionary

    Parameters:
        playerurl: profile URL of the player; its fifth '/'-separated segment
            is replaced with 'marktwertverlauf' to get the market value page.

    Returns:
        {'date': [...], 'market_value': [...]} with the two lists read from the
        Highcharts script embedded in the page. When the page cannot be
        downloaded or parsed, both values are the empty string ''.
    '''
    failure = {'date' : '', 'market_value' : ''}

    parts = playerurl.split('/')
    url = '/'.join(parts[0:4] + ['marktwertverlauf'] + parts[5:])
    siker, soup = get_soup(url)
    if not siker:
        # Nothing was downloaded, so there is nothing to parse.
        return failure
    try:
        script = soup.find("script", text=re.compile("Highcharts.Chart")).text
        parsed = js2xml.parse(script)
        # The fourth 'data' property of the chart configuration holds the series
        # with the market values; only that one is evaluated.
        series = parsed.xpath("//property[@name='data']")[3]
        values = series.xpath(".//array/object/property[@name='y']/number/@value")
        dates = series.xpath(".//array/object/property[@name='datum_mw']/string/text()")
    except Exception:
        # Best effort: a page without the expected chart yields the failure value.
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return failure
    return {'date' : dates, 'market_value' : values}

In [247]:
# Spot check: scrape the market value history for one player profile URL and show it as a table.
pd.DataFrame(get_market_values('https://www.transfermarkt.co.uk/ansu-fati/profil/spieler/466810'))

Unnamed: 0,date,market_value
0,"Sep 10, 2019",22500000
1,"Dec 20, 2019",36000000
2,"Apr 8, 2020",32400000


### Player Performance Data

In [52]:
def _stat_cell_text(cells, index):
    '''
    Returns the text of cells[index], or '' when the row has no such cell.
    '''
    try:
        return cells[index].text
    except Exception:
        return ''


def get_player_stats(playerurl):
    '''
    Scrapes the detailed performance table of a player.

    Parameters:
        playerurl: profile URL of the player. Its first four '/'-separated
            segments and its seventh segment are reused to build the address
            of the detailed performance page.

    Returns:
        A list with one dictionary per table row, with the keys season,
        competition, in_squad, appearances, pointspermatch, goals, assists,
        owngoals, sub_on, sub_off, yellow, secondyellow, red, penalty,
        minutes_per_goal and minutes_played. A cell that is missing from a row
        is stored as ''. When the URL is too short, or the page cannot be
        downloaded or parsed, the list [''] is returned.
    '''
    # (output key, position among the row's 'zentriert' cells). Position 1 is not read.
    centered_columns = [
        ('in_squad', 2), ('appearances', 3), ('pointspermatch', 4),
        ('goals', 5), ('assists', 6), ('owngoals', 7),
        ('sub_on', 8), ('sub_off', 9), ('yellow', 10),
        ('secondyellow', 11), ('red', 12), ('penalty', 13),
    ]

    parts = playerurl.split('/')
    if len(parts) < 7:
        # An empty or truncated profile URL has no seventh segment; report it
        # with the usual failure value instead of raising IndexError.
        return ['']
    url = '/'.join(parts[0:4] + ['leistungsdatendetails/spieler'] + [parts[6]] + ['saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'])

    siker, soup = get_soup(url)
    if not siker:
        return ['']
    try:
        statrows = soup.find_all('tr', {'class' : 'odd'}) + soup.find_all('tr', {'class' : 'even'})
        playerstats = [] # Every player will have a list of dictionaries for his stats
        for row in statrows:
            centered = row.find_all('td', {'class' : 'zentriert'})
            right_aligned = row.find_all('td', {'class' : 'rechts'})

            playerstatrow = {'season' : _stat_cell_text(centered, 0)}
            try:
                playerstatrow['competition'] = row.find('td', {'class' : 'hauptlink no-border-links'}).text
            except Exception:
                playerstatrow['competition'] = ''
            for key, position in centered_columns:
                playerstatrow[key] = _stat_cell_text(centered, position)
            playerstatrow['minutes_per_goal'] = _stat_cell_text(right_aligned, 0)
            playerstatrow['minutes_played'] = _stat_cell_text(right_aligned, 1)
            playerstats.append(playerstatrow)
    except Exception:
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        playerstats = ['']
    return playerstats

In [331]:
# Spot check: scrape the performance stats for one player profile URL.
iniesta = get_player_stats('https://www.transfermarkt.co.uk/andres-iniesta/profil/spieler/7600')

In [334]:
# Show rows 10-19 of the scraped stats as a table.
pd.DataFrame(iniesta).iloc[10:20, :]

Unnamed: 0,season,competition,in_squad,appearances,pointspermatch,goals,assists,owngoals,sub_on,sub_off,yellow,secondyellow,red,penalty,minutes_per_goal,minutes_played
10,15/16,Supercopa,2,2,0.5,-,-,-,1,-,1,-,-,-,-,129'
11,14/15,Champions League,11,11,2.45,-,5,-,1,7,-,-,-,-,-,786'
12,14/15,LaLiga,31,24,2.29,-,1,-,5,14,3,-,-,-,-,1.590'
13,13/14,Copa del Rey,6,6,2.17,-,1,-,2,2,-,-,-,-,-,321'
14,13/14,Supercopa,2,2,1.0,-,-,-,1,-,-,-,-,-,-,107'
15,12/13,Champions League,10,10,1.4,1,1,-,1,1,1,-,-,-,803',803'
16,12/13,Supercopa,2,2,1.5,-,2,-,-,-,-,-,-,-,-,180'
17,11/12,LaLiga,31,27,2.52,2,10,-,6,10,4,-,-,-,932',1.864'
18,11/12,Club World Cup,2,2,3.0,-,-,-,-,-,-,-,-,-,-,180'
19,11/12,Supercopa,2,2,2.0,1,-,-,-,-,-,-,-,-,180',180'


### Player transfer data

In [53]:
def _transfer_field(getter):
    '''
    Runs a zero-argument extractor and returns its result, or '' when the
    expected cell is missing from the transfer row.
    '''
    try:
        return getter()
    except Exception:
        return ''


def get_transfer_data(url):
    '''
    Scrapes the transfer history shown on a player's profile page.

    Parameters:
        url: profile URL of the player.

    Returns:
        A list with one dictionary per transfer row, with the keys season,
        date, from_club, to_club, value and price. A cell that is missing from
        a row is stored as ''. When the page cannot be downloaded or has no
        transfer history box, the list [''] is returned.
    '''
    siker, soup = get_soup(url)
    if not siker:
        return ['']
    try:
        box = soup.find('div', {'class' : 'box transferhistorie'})
        transfers = box.find_all('tr', {'class' : 'zeile-transfer'})
        transferstatrows = []
        for transfer in transfers:
            # Season/date and from/to club each share one cell class, so every
            # class is looked up once per row and then indexed.
            centered = transfer.find_all('td', {'class' : 'zentriert hide-for-small'})
            clubs = transfer.find_all('td', {'class' : 'hauptlink no-border-links hide-for-small vereinsname'})
            transferstatrows.append({
                'season' : _transfer_field(lambda: centered[0].text),
                'date' : _transfer_field(lambda: centered[1].text),
                'from_club' : _transfer_field(lambda: clubs[0].text.strip()),
                'to_club' : _transfer_field(lambda: clubs[1].text.strip()),
                'value' : _transfer_field(lambda: transfer.find('td', {'class' : 'zelle-mw'}).text.strip()),
                'price' : _transfer_field(lambda: transfer.find('td', {'class' : 'zelle-abloese'}).text.strip()),
            })
    except Exception:
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        transferstatrows = ['']
    return transferstatrows
    

In [357]:
# Spot check: scrape the transfer history for one player profile URL.
ibra = get_transfer_data('https://www.transfermarkt.co.uk/zlatan-ibrahimovic/profil/spieler/3455')

In [358]:
# Show the scraped transfers as a table.
pd.DataFrame(ibra)

Unnamed: 0,season,date,from_club,to_club,value,price
0,19/20,"Jan 2, 2020",LA Galaxy,AC Milan,£3.15m,Free transfer
1,17/18,"Mar 23, 2018",Man Utd,LA Galaxy,£4.50m,Free transfer
2,16/17,"Jul 1, 2016",Paris SG,Man Utd,£13.50m,Free transfer
3,12/13,"Jul 18, 2012",AC Milan,Paris SG,£33.30m,£18.90m
4,11/12,"Jul 1, 2011",FC Barcelona,AC Milan,£31.50m,£21.60m
5,10/11,"Jun 30, 2011",AC Milan,FC Barcelona,£31.50m,End of loan
6,10/11,"Aug 28, 2010",FC Barcelona,AC Milan,£40.50m,Loan fee:£5.40m
7,09/10,"Jul 27, 2009",Inter,FC Barcelona,£40.50m,£62.55m
8,06/07,"Aug 10, 2006",Juventus,Inter,£22.50m,£22.32m
9,04/05,"Aug 31, 2004",Ajax,Juventus,-,£14.40m


### Get all player data

In [54]:
# Reload the squad checkpoint written earlier, using the same separator and encoding it was saved with.
# 'Unnamed: 0' is the saved index column and is dropped again.
df = pd.read_csv('teams_dataframe_status0513.csv', sep = ';', encoding = 'utf-8').drop(columns = 'Unnamed: 0')
df.head()

Unnamed: 0,name,url,squadlist,squaddata
0,Man City,https://www.transfermarkt.co.uk/manchester-cit...,https://www.transfermarkt.co.uk/manchester-cit...,"[{'kit': '31', 'name': 'Ederson', 'genposition..."
1,Liverpool,https://www.transfermarkt.co.uk/fc-liverpool/s...,https://www.transfermarkt.co.uk/fc-liverpool/k...,"[{'kit': '1', 'name': 'Alisson', 'genposition'..."
2,Spurs,https://www.transfermarkt.co.uk/tottenham-hots...,https://www.transfermarkt.co.uk/tottenham-hots...,"[{'kit': '1', 'name': 'Hugo Lloris', 'genposit..."
3,Chelsea,https://www.transfermarkt.co.uk/fc-chelsea/sta...,https://www.transfermarkt.co.uk/fc-chelsea/kad...,"[{'kit': '1', 'name': 'Kepa', 'genposition': '..."
4,Man Utd,https://www.transfermarkt.co.uk/manchester-uni...,https://www.transfermarkt.co.uk/manchester-uni...,"[{'kit': '1', 'name': 'David de Gea', 'genposi..."


In [16]:
# After the CSV round-trip each squad is the text form of a list of dictionaries.
# ast.literal_eval rebuilds it while accepting literals only; eval would execute
# any expression that ended up in the scraped text.
pd.DataFrame(ast.literal_eval(df['squaddata'][0]))

Unnamed: 0,kit,name,genposition,specposition,dateofbirth,nationality,height,foot,joined,lastclub,contract,playerurl
0,31,Ederson,Goalkeeper,Goalkeeper,"Aug 17, 1993 (26)",Brazil,"1,88 m",left,"Jul 1, 2017",SL Benfica,30.06.2025,https://www.transfermarkt.co.uk/ederson/profil...
1,33,Scott Carson,Goalkeeper,Goalkeeper,"Sep 3, 1985 (34)",England,"1,88 m",right,"Aug 8, 2019",Derby County,31.05.2020,https://www.transfermarkt.co.uk/scott-carson/p...
2,5,John Stones,Defender,Centre-Back,"May 28, 1994 (25)",England,"1,88 m",right,"Aug 9, 2016",Everton FC,30.06.2022,https://www.transfermarkt.co.uk/john-stones/pr...
3,50,Eric García,Defender,Centre-Back,"Jan 9, 2001 (19)",Spain,"1,83 m",right,"Jul 1, 2019",Manchester City U23,30.06.2022,https://www.transfermarkt.co.uk/eric-garcia/pr...
4,11,Oleksandr Zinchenko,Defender,Left-Back,"Dec 15, 1996 (23)",Ukraine,"1,75 m",left,"Jul 4, 2016",FK Ufa,30.06.2024,https://www.transfermarkt.co.uk/oleksandr-zinc...
5,27,João Cancelo,Defender,Right-Back,"May 27, 1994 (25)",Portugal,"1,82 m",right,"Aug 7, 2019",Juventus FC,30.06.2025,https://www.transfermarkt.co.uk/joao-cancelo/p...
6,25,Fernandinho,Midfielder,Defensive Midfield,"May 4, 1985 (35)",Brazil,"1,79 m",right,"Jul 1, 2013",Shakhtar Donetsk,30.06.2021,https://www.transfermarkt.co.uk/fernandinho/pr...
7,47,Phil Foden,Midfielder,Central Midfield,"May 28, 2000 (19)",England,"1,71 m",left,"Jul 1, 2017",Manchester City U18,30.06.2024,https://www.transfermarkt.co.uk/phil-foden/pro...
8,21,David Silva,Midfielder,Attacking Midfield,"Jan 8, 1986 (34)",Spain,"1,70 m",left,"Jul 14, 2010",Valencia CF,30.06.2020,https://www.transfermarkt.co.uk/david-silva/pr...
9,19,Leroy Sané,Forward,Left Winger,"Jan 11, 1996 (24)",Germany,"1,84 m",left,"Aug 2, 2016",FC Schalke 04,30.06.2021,https://www.transfermarkt.co.uk/leroy-sane/pro...


Now we will iterate over all the players of all the squads, retrieve the above-mentioned information, and store it in the dataframe.

In [55]:
def scrape_all_player_data(df):
    '''
    Scrapes market values, performance stats and transfers for every player.

    Parameters:
        df: DataFrame with a 'squaddata' column. Each cell is either a list of
            player dictionaries or the text form of such a list (which is what
            a CSV round-trip produces). Every player needs the keys 'name' and
            'playerurl'.

    Returns:
        A copy of df whose 'squaddata' column holds the player lists, with the
        keys marketvaluedata, performancestats and playertransfers added to
        every player. The DataFrame passed in is not modified; when its cells
        are lists, the player dictionaries inside them are updated in place.

    Side effects:
        Writes one JSON file per player to '../Code/Jsons/<name>.json' and
        prints a progress line after every player and every team.
    '''
    enriched_squads = []
    for team_count, squad in enumerate(df['squaddata'], start=1):
        if isinstance(squad, str):
            # literal_eval accepts literals only, so text coming from scraped
            # pages cannot execute code the way eval would allow.
            squad = ast.literal_eval(squad)
        for player in squad:
            player['marketvaluedata'] = get_market_values(player['playerurl'])
            player['performancestats'] = get_player_stats(player['playerurl'])
            player['playertransfers'] = get_transfer_data(player['playerurl'])
            filename = '../Code/Jsons/' + str(player['name']) + '.json'
            # The with-block closes the file even if serialisation fails.
            with open(filename, 'w') as outfile:
                json.dump(player, outfile)
            print('Player done!')
        enriched_squads.append(squad)
        print(team_count, ' teams done!')

    # Store the scraped data in the result. Working on a copy leaves the
    # caller's frame untouched and is safe for slices such as df.head(1).
    result = df.copy()
    result['squaddata'] = pd.Series(enriched_squads, index=result.index, dtype=object)
    return result

In [370]:
# Trial run on the first club only. `new_df` is not defined anywhere in this notebook, so the
# cell failed on a fresh kernel; `df` is the squad checkpoint reloaded from the CSV above.
proba_df = scrape_all_player_data(df.head(1))

Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!
Player done!


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Player done!
Player done!
Player done!
Player done!
Player done!
1  teams done!


In [373]:
# Inspect the squad entry of the first club in the trial result.
proba_df['squaddata'][0]

[{'kit': '31',
  'name': 'Ederson',
  'genposition': 'Goalkeeper',
  'specposition': 'Goalkeeper',
  'dateofbirth': 'Aug 17, 1993 (26)',
  'nationality': 'Brazil',
  'height': '1,88 m',
  'foot': 'left',
  'joined': 'Jul 1, 2017',
  'lastclub': 'SL Benfica',
  'contract': '30.06.2025',
  'playerurl': 'https://www.transfermarkt.co.uk/ederson/profil/spieler/238223',
  'marketvaluedata': {'date': ['Apr 3, 2013',
    'Jul 2, 2013',
    'Jul 25, 2014',
    'Feb 17, 2015',
    'Jul 1, 2015',
    'Feb 24, 2016',
    'Jul 29, 2016',
    'Feb 22, 2017',
    'Jun 26, 2017',
    'Oct 23, 2017',
    'Jan 2, 2018',
    'Mar 23, 2018',
    'May 28, 2018',
    'Oct 17, 2018',
    'Dec 19, 2018',
    'Jun 13, 2019',
    'Dec 10, 2019',
    'Apr 8, 2020'],
   'market_value': ['270000',
    '270000',
    '810000',
    '540000',
    '1080000',
    '1080000',
    '6300000',
    '10800000',
    '19800000',
    '22500000',
    '31500000',
    '40500000',
    '45000000',
    '54000000',
    '54000000',
    '

## Google Search Data

In [63]:
# Not written yet
# Google search results
# https://www.google.com/search?q=lionel+messi