In [76]:
import urllib
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from time import time, sleep

In [77]:
BASE_URL = "https://www.transfermarkt.com"

class League:
	def __init__( self, name, url, scraper):
		self.LeagueName = name
		soup = scraper(url)
		teamsTable = soup.find("table", class_="items")
		teamUrls = teamsTable.find_all("a", class_="vereinprofil_tooltip", id=re.compile("\d+"))[::2]
		teamUrls = [BASE_URL + teamUrl["href"] for teamUrl in teamUrls]
		self.TeamsData = [ Team(teamUrl, self.LeagueName, scraper) for teamUrl in teamUrls]

In [78]:
BASE_URL = "https://www.transfermarkt.com"


class Team:
	def __init__( self, url, name, scraper):
		self.LeagueName = name
		soup = scraper(url)
		#reading player table and filtering for offensive players
		playerTable = soup.find("table", class_="items")
		players = playerTable.find_all("a", class_="spielprofil_tooltip")[::2]
# 		offensivePlayers = filter( Team.isStrikerOrWinger, players)
		PlayersUrls = [BASE_URL + player["href"] for player in players]
		#self.PlayerData = [PlayerProfile( playerUrl, scraper) for playerUrl in offensivePlayersUrls]
		self.PlayersData = []
		for playerUrl in PlayersUrls:
			try:
				NewPlayerProfile = PlayerProfile( playerUrl, scraper)
				NewPlayerProfile.PlayerData["current league"] = self.LeagueName
				self.PlayersData.append( NewPlayerProfile)
			except:
				continue


	@staticmethod
	def isStrikerOrWinger( player):
		position = player.find_next("tr").text.strip().lower()
		return "wing" in position or "centre-forward" in position

In [23]:
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

CURRENT_YEAR = 19
N_SEASON_HISTORY = 5


class PlayerProfile:
	def __init__(self, playerUrl, pageScraper):
		urlPerfPage = playerUrl.replace("profil","leistungsdatendetails")
		soup = pageScraper( playerUrl)

		playerAttributes = {}

		#scraping profile page information
		playerAttributes["name"] = soup.find("div", class_="dataMain").find("h1").text
		StoredAttributes = ["Age:", "Height:", "Nationality:", "Position:", "Foot:", "Current club:"]
		for entry in soup.find("table", class_="auflistung").find_all("th"):
			key = entry.text.strip()
			val = entry.find_next_sibling().text
			if key in StoredAttributes:
				playerAttributes[ key[:-1].lower()] = val.strip()
		#cleaning some entries
		if "height" in playerAttributes:
			#converting height in meters str to cm int 
			meter, centimeters = re.search("(\d),(\d+)",playerAttributes["height"]).groups()
			playerAttributes["height"] = int(meter)*100 + int(centimeters)
		if "nationality" in playerAttributes:
			if "\xa0" in playerAttributes["nationality"]:
				playerAttributes["nationality"] = playerAttributes["nationality"].replace("\xa0", " ")
		if "position" in playerAttributes:
			playerAttributes["position"] = "CF" if "Centre-Forward" in playerAttributes["position"] else "W"
		if "age" in playerAttributes:
			playerAttributes["age"] = int( playerAttributes["age"])

		#scraping performance page information
		soup = pageScraper(urlPerfPage)
		performanceColumns = ("season", "games", "goals", "assists", "minutes")
		performanceRows = pd.DataFrame( {col:[] for col in performanceColumns})
		for row in soup.find("div", class_="responsive-table").find("tbody").find_all("tr"):
			#try:
			rowContents = PlayerProfile.readRow( row)
			#except:
			#	 print( row)
			#else:
			#	 pass
			year = rowContents[0]
			if re.match( "\d{4}", year):
				year = int( year[2:])
				year = "%02d/%02d" %(year-1, year)
				rowContents[0] = year
			if not re.match( "\d{2}\/\d{2}", year):
				raise ValueError("Wrong format for played year")
			if int( year[:2]) < CURRENT_YEAR - N_SEASON_HISTORY:
				break
			#print( rowContents)
			performanceRows = performanceRows.append( {col:val for col, val in zip(performanceColumns, rowContents)}, ignore_index=True)
		performanceDF = performanceRows.groupby("season").sum()
		#converting to serie
		performanceSeries = pd.Series( {"%s %s" %(row, col): performanceDF[col][row] for row in performanceDF.index for col in performanceDF.columns})
		
		self.PlayerData = pd.Series( playerAttributes).append( performanceSeries)
		print( "\t%s done" %self.PlayerData["name"])
		
	def __str__(self):
		return "Performance profile for %s" % self.PlayerData["name"]

	def __repr__(self):
		return "< profile of %s >" % self.PlayerData["name"]

	@staticmethod
	def readRow( row):
		cells = row.find_all( "td")
		cells = list( map( lambda x : x.text.strip(), cells))
		year = cells[0]
		games_played = cells[4]
		goals_scored = cells[6]
		assists = cells[7]
		minutes_played = cells[-1]
		games_played = int(games_played) if games_played != "-" else 0
		goals_scored = int(goals_scored) if goals_scored != "-" else 0
		assists = int(assists) if assists != "-" else 0
		minutes_played = int( minutes_played[:-1].replace(".","")) if minutes_played != "-" else 0
		return [year, games_played, goals_scored, assists, minutes_played]

In [79]:
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

CURRENT_YEAR = 19
N_SEASON_HISTORY = 5


class PlayerProfile:
	def __init__(self, playerUrl, pageScraper):
		soup = pageScraper( playerUrl)
        
		playerAttributes = {}
        
		#scraping profile page information
		playerAttributes["name"] = soup.find("div", class_="dataMain").find("h1").text
		StoredAttributes = ["Age:", "Height:", "Nationality:", "Position:", "Foot:", "Current club:"]
		for entry in soup.find("table", class_="auflistung").find_all("th"):
			key = entry.text.strip()
			val = entry.find_next_sibling().text
			if key in StoredAttributes:
				playerAttributes[ key[:-1].lower()] = val.strip()
		#cleaning some entries
		if "height" in playerAttributes:
			#converting height in meters str to cm int 
			meter, centimeters = re.search("(\d),(\d+)",playerAttributes["height"]).groups()
			playerAttributes["height"] = int(meter)*100 + int(centimeters)
		if "nationality" in playerAttributes:
			if "\xa0" in playerAttributes["nationality"]:
				playerAttributes["nationality"] = playerAttributes["nationality"].replace("\xa0", " ")
		if "position" in playerAttributes:
			playerAttributes["position"] = "CF" if "Centre-Forward" in playerAttributes["position"] else "W"
		if "age" in playerAttributes:
			playerAttributes["age"] = int( playerAttributes["age"])
        
		#scraping transfer page information
		TransferColumns = ("season", "date", "left", "joined", "mv", "fee")
		TransferRows = pd.DataFrame( {col:[] for col in TransferColumns})
		for row in soup.find("div", class_="responsive-table").find("tbody").find_all("tr", class_='zeile-transfer'):
			#try:
			rowContents = PlayerProfile.readRow( row)
			print( rowContents)
			TransferRows = TransferRows.append( {col:val for col, val in zip(TransferColumns, rowContents)}, ignore_index=True)
		TransferDF = TransferRows.groupby("season").sum()
		#converting to serie
		TransferSeries = pd.Series( {"%s %s" %(row, col): TransferDF[col][row] for row in TransferDF.index for col in TransferDF.columns})
		
		self.PlayerData = pd.Series( playerAttributes).append( TransferSeries)
		print( "\t%s done" %self.PlayerData["name"])
		
	def __str__(self):
		return "Transfer profile for %s" % self.PlayerData["name"]

	def __repr__(self):
		return "< profile of %s >" % self.PlayerData["name"]

	@staticmethod
	def readRow( row):
		cells = row.find_all( "td")
		cells = list( map( lambda x : x.text.strip(), cells))
		year = cells[0]
		date = cells[1]
		left = cells[5]
		joined = cells[9]
		mv = cells[10]
		fee = cells[11]
		mv = str(mv) if mv != "-" else 0
		fee = str(fee) if fee != "-" else 0
		return [year, date, left, joined, mv, fee]

In [80]:
N_LEAGUES = 1 #keeping the top N leagues
LEAGUES_URL = "https://www.transfermarkt.com/wettbewerbe/europa/wettbewerbe"
BASE_URL = "https://www.transfermarkt.com"

DELAY_BETWEEN_QUERIES = 0 #min delay in seconds spacing http queries
class PageScraper():
    def __init__(self ):
        self.opener = urllib.request.build_opener()
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        self.lastQuery = -float("inf")
    def readUrl( self, url_):
        presentTime = time() - self.lastQuery - DELAY_BETWEEN_QUERIES
        if presentTime < 0:
            sleep( abs(presentTime))
        inData = self.opener.open(url_)
        content = inData.read()
        self.lastQuery = time()
        return bs(content, "html.parser")
    def __call__( self, url_):
        return self.readUrl( url_)


if __name__ == "__main__":
	scraper = PageScraper()
	soup = scraper( LEAGUES_URL)
	LeagueTables = soup.find("table", class_="items").find("tbody")
	Leagues = LeagueTables.find_all("a", href=re.compile("wettbewerb/[A-Z]{2}1"), title=re.compile("\w"))
	Leagues = Leagues[:N_LEAGUES]
	LeagueUrlDic = { league.text : BASE_URL + league["href"] for league in Leagues}
	LeaguesData = []
	for leagueName, leagueUrl in LeagueUrlDic.items():
		print( "Scraping the %s..." %leagueName)
		LeaguesData.append( League( leagueName, leagueUrl, scraper))

	#flattening all players information to pandas.DataFrame and exporting to csv
	PlayerProfiles = [player.PlayerData for league in LeaguesData for team in league.TeamsData for player in team.PlayersData]
	df = pd.DataFrame( PlayerProfiles)
	df.to_csv("transfer.csv", index=False)

Scraping the Premier League...
['17/18', 'Jul 1, 2017', 'Benfica', 'Man City', '22,00 mil. €', '40,00 mil. €']
['15/16', 'Jul 1, 2015', 'Rio Ave FC', 'Benfica', '1,20 mil. €', '500 thousand €']
['12/13', 'Jul 1, 2012', 'GD Ribeirão', 'Rio Ave FC', 0, 'Available for free transfer']
['11/12', 'Jul 1, 2011', 'Benfica U19', 'GD Ribeirão', 0, 'Available for free transfer']
['10/11', 'Jul 1, 2010', 'Benfica U17', 'Benfica U19', 0, 0]
['09/10', 'Jan 1, 2010', 'São Paulo U17', 'Benfica U17', 0, '?']
	Ederson done
['16/17', 'Aug 25, 2016', 'FC Barcelona', 'Man City', '15,00 mil. €', '18,00 mil. €']
['14/15', 'Jul 1, 2014', 'Real Sociedad', 'FC Barcelona', '4,00 mil. €', '12,00 mil. €']
['06/07', 'Jul 1, 2006', 'Colo Colo', 'Real Sociedad', 0, '1,20 mil. €']
['01/02', 'Jan 1, 2002', 'Colo Colo U19', 'Colo Colo', 0, 0]
	Claudio Bravo done
['19/20', 'Aug 8, 2019', 'Derby', 'Man City', '2,00 mil. €', 'Loan']
['15/16', 'Jul 1, 2015', 'Wigan', 'Derby', '2,00 mil. €', '?']
['13/14', 'Jul 4, 2013', 'Bu

In [34]:
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

url = "https://www.transfermarkt.com/bernd-leno/profil/spieler/72476"
page = requests.get(url, headers=headers)
soup = bs(page.content, 'html.parser')

In [53]:
#scraping transfer page information
TransferColumns = ("season", "date", "left", "joined", "mv", "fee")
TransferRows = pd.DataFrame( {col:[] for col in TransferColumns})
for row in soup.find("div", class_="responsive-table").find("tbody").find_all("tr", class_='zeile-transfer'):
    #try:
    cells = row.find_all( "td")
    cells = list( map( lambda x : x.text.strip(), cells))
    year = cells[0]
    date = cells[1]
    left = cells[5]
    joined = cells[9]
    mv = cells[10]
    fee = cells[11]
    mv = str(mv) if mv != "-" else 0
    fee = str(fee) if fee != "-" else 0
    print(year, date, left, joined, mv, fee)

18/19 Jul 1, 2018 Bay. Leverkusen Arsenal 20,00 mil. € 25,00 mil. €
11/12 Jan 1, 2012 VfB Stuttgart Bay. Leverkusen 5,00 mil. € 10,00 mil. €
11/12 Dec 31, 2011 Bay. Leverkusen VfB Stuttgart 5,00 mil. € End of loan
11/12 Aug 10, 2011 VfB Stuttgart Bay. Leverkusen 600 thousand € Loan fee:500 thousand €
11/12 Jul 1, 2011 Stuttgart II VfB Stuttgart 600 thousand € 0
10/11 Jul 1, 2010 Stuttgart U19 Stuttgart II 150 thousand € 0
09/10 Jul 1, 2009 Stuttgart U17 Stuttgart U19 0 0
07/08 Jul 1, 2007 Stuttgart Yth. Stuttgart U17 0 0
03/04 Jul 1, 2003 Bietigheim Yth. Stuttgart Yth. 0 Available for free transfer


AttributeError: 'NoneType' object has no attribute 'find_all'