In [1]:
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint


# Set the header so that they think the request is coming from a website
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

In [2]:
year = '2011'

In [3]:
def year_team_links(year):
    page = 'https://www.transfermarkt.co.uk/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=' + year
    tree = requests.get(page, headers = headers)
    soup = BeautifulSoup(tree.content, 'html.parser')

    #Create an empty list to assign the team links to
    teamLinks = []

    #Extract all links with the correct CSS selector
    links = soup.select("a.vereinprofil_tooltip")

    #We need the location that the link is pointing to, so for each link, take the link location. 
    #Additionally, we only need the links in locations 1,3,5,etc. of our list, so loop through those only
    for i in range(1,41,2):
        teamLinks.append(links[i].get("href"))

    #For each location that we have taken, add the website before it - this allows us to call it later
    for i in range(len(teamLinks)):
        teamLinks[i] = "https://www.transfermarkt.co.uk"+teamLinks[i]
        
    return teamLinks

In [4]:
teamLinks

['https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2011',
 'https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2011',
 'https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2011',
 'https://www.transfermarkt.co.uk/manchester-united/startseite/verein/985/saison_id/2011',
 'https://www.transfermarkt.co.uk/tottenham-hotspur/startseite/verein/148/saison_id/2011',
 'https://www.transfermarkt.co.uk/tottenham-hotspur/startseite/verein/148/saison_id/2011',
 'https://www.transfermarkt.co.uk/fc-arsenal/startseite/verein/11/saison_id/2011',
 'https://www.transfermarkt.co.uk/fc-liverpool/startseite/verein/31/saison_id/2011',
 'https://www.transfermarkt.co.uk/fc-liverpool/startseite/verein/31/saison_id/2011',
 'https://www.transfermarkt.co.uk/fc-everton/startseite/verein/29/saison_id/2011',
 'https://www.transfermarkt.co.uk/aston-villa/startseite/verein/405/saison_id/2011',
 'https://www.transfermarkt.co.uk/aston-vill

In [5]:
def get_team_player_links(year_team_links):
    team_player_links = {}

    #Run the scraper through each of our 20 team links
    for team_link in year_team_links:
        # club from link
        team = team_link.split('/')[3]
        
        #Download and process the team page
        page = team_link
        tree = requests.get(page, headers=headers)
        soup = BeautifulSoup(tree.content, 'html.parser')

        #Extract all links
        links = soup.select("a.spielprofil_tooltip")

        #Create an empty list for our player links to go into
        playerLinks = []

        #For each link, extract the location that it is pointing to
        for j in range(len(links)):
            playerLinks.append(links[j].get("href"))

        #The page list the players more than once - let's use list(set(XXX)) to remove the duplicates
        playerLinks = list(set(playerLinks))

        team_player_links[team] = playerLinks

    return team_player_links

In [6]:
team_player_links

{'https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2011': ['/branislav-ivanovic/profil/spieler/36827',
  '/sam-hutchinson/profil/spieler/40617',
  '/raul-meireles/profil/spieler/13168',
  '/frank-lampard/profil/spieler/3163',
  '/ramires/profil/spieler/54170',
  '/juan-mata/profil/spieler/44068',
  '/john-mikel-obi/profil/spieler/30739',
  '/lucas-piazon/profil/spieler/176485',
  '/david-luiz/profil/spieler/46741',
  '/salomon-kalou/profil/spieler/7971',
  '/gary-cahill/profil/spieler/27511',
  '/yossi-benayoun/profil/spieler/7858',
  '/alex/profil/spieler/15420',
  '/ashley-cole/profil/spieler/3182',
  '/petr-cech/profil/spieler/5658',
  '/michael-essien/profil/spieler/5588',
  '/daniel-sturridge/profil/spieler/47082',
  '/didier-drogba/profil/spieler/3924',
  '/bosingwa/profil/spieler/9813',
  '/florent-malouda/profil/spieler/5461',
  '/jamal-blackman/profil/spieler/128898',
  '/josh-mceachran/profil/spieler/128905',
  '/oriol-romeu/profil/spieler/66100',
  

In [13]:
# player_link = team_player_links['https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2011'][0]
# print(player_link)

def get_player_data(team_year_player_link):
    # Grab the page 
    page = "https://www.transfermarkt.co.uk"+team_year_player_link
    tree = requests.get(page, headers = headers)
    soup = BeautifulSoup(tree.content, 'html.parser')

    # Grab the player's data
    table = soup.find('table', attrs={'class':'auflistung'})
    table_rows = table.find_all('tr')

    player_data = {}

    # Extract the player's data
    for tr in table_rows:
        th = tr.find_all('th')
        td = tr.find_all('td')
        header = [th.text for th in th]
        data = [tr.text for tr in td]
        row = header + data
        # TODO: I need to save this data instead of printing it 
        player_data[row[0][:-1].strip()] = row[1].strip()
        
    return player_data


def clean_extract_player_data(player_link, raw_player_data, year):
    # player name from link
    player_name = " ".join(player_link.split('/')[1].split('-'))

    dob = datetime.strptime(player_data['Date of birth'], '%b %d, %Y')
    age = int(year) - dob.year

    height = player_data['Height'].encode('ascii', 'ignore')
    height = height[:-1].decode("utf-8").replace(',', '')

    foot = player_data['Foot']
    citizenship = player_data['Citizenship']
    position = player_data['Position']
    
    return player_name, age, height, foot, citizenship, position
    

pprint(player_data)

/branislav-ivanovic/profil/spieler/36827
{'Age': '36',
 'Citizenship': 'Serbia',
 'Contract expires': '30.06.2021',
 'Current club:': 'West Bromwich Albion',
 'Date of birth': 'Feb 22, 1984',
 'Foot': 'right',
 'Height': '1,85\xa0m',
 'Joined': 'Sep 15, 2020',
 'Name in home country': 'Бранислав Ивановић',
 'Place of birth': 'Sremska Mitrovica',
 'Player agent': 'Star Management Signings Ltd.',
 'Position': 'Defender - Centre-Back'}


In [24]:
# Other data to get: club as at valuedate | Age as at valuedate | Height | Foot | Citizenship

branislav ivanovic
fc-chelsea
185


In [30]:
# Extract the player's transfer value link
for div in soup.findAll('div', {'class': 'dataMarktwert'}):
    a = div.find('a')
    valuelink = a.attrs['href']
    break

# Get the player's transfer value page
valuepage = "https://www.transfermarkt.co.uk"+valuelink

# TODO: the following error occured
# Traceback (most recent call last):
# File "CompleteEPLPlayerData.py", line 92, in <module>
# Grab the player's transfer page
# AttributeError: 'NoneType' object has no attribute 'group'

# Grab the player's transfer page
tree = requests.get(valuepage, headers=headers)
soup = BeautifulSoup(tree.content, 'html.parser')

# Strip the transfer data (from javascript block)
script = str(soup.findAll('script')[-1])
pattern = "'series':\[(.*)\]"
extract = re.search(pattern, script)
found = extract.group(1)

# Do some magic to turn it into values and store it in a table
stripped = eval(found.replace("\'", "\""))
from pprint import pprint
from datetime import datetime
raw_data = eval(found.replace("\'", "\""))
transfer_value_rows = []
for row in raw_data['data']:
    transfer_value_rows.append((row['y'], datetime.strptime(row['datum_mw'], '%b %d, %Y')))

# ensure sorted by ascending datetime
transfer_value_rows.sort(key=lambda x: x[1])

# get the transfer value 
cuttoff_date = datetime(int(year), 10, 1, 0, 0)
transfer_value_before_cutoff = None
datetime_before_cutoff = None
for value, value_datetime in transfer_value_rows:
    if value_datetime.year == int(year):
        if value_datetime <= cuttoff_date:
            transfer_value_before_cutoff = value
            datetime_before_cutoff = value_datetime
        else:
            break

print(transfer_value_before_cutoff)

[(360000, datetime.datetime(2006, 2, 21, 0, 0)),
 (3060000, datetime.datetime(2007, 3, 29, 0, 0)),
 (4950000, datetime.datetime(2007, 8, 23, 0, 0)),
 (10800000, datetime.datetime(2008, 1, 3, 0, 0)),
 (6750000, datetime.datetime(2008, 7, 4, 0, 0)),
 (7650000, datetime.datetime(2009, 6, 29, 0, 0)),
 (14400000, datetime.datetime(2010, 1, 25, 0, 0)),
 (16200000, datetime.datetime(2010, 8, 13, 0, 0)),
 (18900000, datetime.datetime(2011, 2, 1, 0, 0)),
 (20700000, datetime.datetime(2011, 8, 8, 0, 0)),
 (16200000, datetime.datetime(2012, 2, 6, 0, 0)),
 (18900000, datetime.datetime(2012, 6, 24, 0, 0)),
 (21600000, datetime.datetime(2013, 1, 28, 0, 0)),
 (19800000, datetime.datetime(2013, 7, 2, 0, 0)),
 (17100000, datetime.datetime(2014, 1, 15, 0, 0)),
 (15300000, datetime.datetime(2014, 8, 12, 0, 0)),
 (15300000, datetime.datetime(2015, 2, 12, 0, 0)),
 (15300000, datetime.datetime(2015, 7, 1, 0, 0)),
 (9900000, datetime.datetime(2016, 2, 9, 0, 0)),
 (9000000, datetime.datetime(2016, 8, 1, 0, 0)

In [None]:
# Go through each playerLink and grab their data
for i in range(len(playerLinks)):
    # Grab the page 
    page = "https://www.transfermarkt.co.uk"+playerLinks[i]
    tree = requests.get(page, headers = headers)
    soup = BeautifulSoup(tree.content, 'html.parser')

    # Grab the player's data
    table = soup.find('table', attrs={'class':'auflistung'})
    table_rows = table.find_all('tr')

    # Extract the player's data
    for tr in table_rows:
        th = tr.find_all('th')
        td = tr.find_all('td')
        header = [th.text for th in th]
        data = [tr.text for tr in td]
        row = header + data
        # TODO: I need to save this data instead of printing it 
        print(row)
        
    # Extract the player's transfer value link
    for div in soup.findAll('div', {'class': 'dataMarktwert'}):
        a = div.find('a')
        valuelink = a.attrs['href']
        
    # Get the player's transfer value page
    valuepage = "https://www.transfermarkt.co.uk"+valuelink
    
    # TODO: the following error occured
    # Traceback (most recent call last):
    # File "CompleteEPLPlayerData.py", line 92, in <module>
    # Grab the player's transfer page
    # AttributeError: 'NoneType' object has no attribute 'group'

    # Grab the player's transfer page
    tree = requests.get(valuepage, headers=headers)
    soup = BeautifulSoup(tree.content, 'html.parser')

    # Strip the transfer data (from javascript block)
    script = str(soup.findAll('script')[-1])
    pattern = "'series':\[(.*)\]"
    extract = re.search(pattern, script)
    found = extract.group(1)

    # Do some magic to turn it into values and store it in a table
    stripped = eval(found.replace("\'", "\""))
    from pprint import pprint
    from datetime import datetime
    data = eval(found.replace("\'", "\""))
    rows = []
    for row in data['data']:
        rows.append((row['y'], datetime.strptime(row['datum_mw'], '%b %d, %Y')))

    #TODO: Pass this a date and determine the transfer value FOR the date supplied
    # 1st September [ year ]
    valuedate = '01/09/' + year

    # Store / Output all this data in the following format (to a CSV)
    # SEASON [year] | Player Name | Club as at valuedate | Age as at valuedate | Height | Foot | Citizenship | Transfer value as at valuedate |
    