In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import concurrent

def getSoup(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. ``requests.get`` has no timeout by default, so a
            stalled connection would otherwise block the caller forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    # The HTTP status code is intentionally not checked: whatever body comes
    # back (including an error page) is parsed, exactly as before.
    html_content = requests.get(url, timeout=timeout).text
    return BeautifulSoup(html_content, "lxml")

In [2]:
# Keyword groups used to recognise table columns from their header text.
# The helpers in this notebook match by substring (`keyword in header`), so a
# short keyword such as 'App' also matches a header like 'Apps'.
NATIONALITY = ['Nat.','Nationality','Nation','Nat']
DISCIPLINE = ['Discipline']
NAME = ['Player','Name']
# NOTE(review): 'Pos' is listed with the age keywords; presumably a position
# column is accepted as a stand-in for age - confirm this is intended.
AGE = ['Age','Date of birth','Pos']
YEAR = ['Year']
APPS = ['App']
GOALS = ['Goal']
TEAM = ['Team']
# MANAGER is not referenced by any of the cells visible in this notebook.
MANAGER = ['Manager']

In [3]:
class TeamURLError(Exception):
    """Raised when no player table can be obtained for a team page.

    Attributes are set in ``__init__``: ``league`` and ``url`` are the values
    passed in, ``message`` is the text also handed to ``Exception``.
    """

    def __init__(self, league, url):
        text = "Couldn't get players from: {}".format(url)
        super().__init__(text)
        self.league = league
        self.url = url
        self.message = text

class PlayerURLError(Exception):
    """Raised when a player's page could not be fetched or parsed.

    ``url`` holds the offending address and ``message`` the text that is also
    passed to ``Exception``.
    """

    def __init__(self, url):
        text = "Couldn't get data from: {}".format(url)
        super().__init__(text)
        self.url = url
        self.message = text

In [4]:
def hasElementContaining(array,criteria):
    """Return True when at least one element of ``array`` contains ``criteria``.

    Containment is tested with the ``in`` operator on each element, so for a
    list of strings this is a substring match.

    Args:
        array: Iterable of containers (strings in this notebook).
        criteria: Value to look for inside each element.

    Returns:
        ``True`` if any element contains ``criteria``, otherwise ``False``
        (including for an empty ``array``).
    """
    # any() stops at the first match instead of building a filtered list.
    return any(criteria in element for element in array)

def hasElementContainingAnyOf(array,criterias):
    """Return True if any of ``criterias`` is contained in some element of ``array``.

    Each criterion is checked with ``hasElementContaining``; evaluation stops
    at the first criterion that matches.
    """
    return any(hasElementContaining(array, wanted) for wanted in criterias)
 
def containsAnyOf(stringToCheck,values):
    """Return True if at least one item of ``values`` occurs in ``stringToCheck``.

    Membership is tested with ``in``; an empty ``values`` gives ``False``.
    """
    return any(candidate in stringToCheck for candidate in values)

def stripWiki(text):
    """Remove every '/wiki/' occurrence from ``text`` and trim whitespace.

    Returns ``None`` when ``text`` is falsy (``None`` or an empty string).
    """
    if not text:
        return None
    withoutPrefix = text.replace('/wiki/', '')
    return withoutPrefix.strip()

def fullWiki(text):
    """Prefix ``text`` with the English Wikipedia host to form an absolute URL."""
    host = "https://en.wikipedia.org"
    return host + text

def getSquadUrl(url, season='2019-20'):
    """Build the Wikipedia URL of a club's season page.

    Args:
        url: Page name of the club (the part after '/wiki/'), despite the
            parameter name.
        season: Season label placed in front of the club name. Defaults to
            '2019-20', the value that used to be hard-coded, so existing
            callers get the same URL as before.

    Returns:
        'https://en.wikipedia.org/wiki/<season>_<url>_season'.
    """
    return 'https://en.wikipedia.org/wiki/' + season + '_' + url + '_season'

In [5]:
def getAllTables(url):
    """Fetch ``url`` and return its cleaned page title and its wikitables.

    Returns:
        A ``(title, tables)`` pair: the ``<title>`` text with '- Wikipedia'
        removed and whitespace trimmed, and every ``<table>`` element whose
        class is ``wikitable``.
    """
    page = getSoup(url)
    wikiTables = page.find_all("table", attrs={"class": "wikitable"})
    rawTitle = page.title.text
    pageTitle = rawTitle.replace('- Wikipedia', '').strip()
    return pageTitle, wikiTables

def getText(cell):
    """Return ``cell.text`` with all newlines removed and outer whitespace trimmed."""
    rawText = cell.text
    singleLine = rawText.replace('\n', '')
    return singleLine.strip()

def getHeaders(table):
    """Return the cleaned text of every ``<th>`` element found inside ``table``."""
    return [getText(headerCell) for headerCell in table.find_all("th")]

def getIndexOf(table,values):
    """Return the position of the first header that contains any of ``values``.

    The position counts every ``<th>`` of the table in document order.
    Returns ``-1`` when no header matches.
    """
    for position, headerText in enumerate(getHeaders(table)):
        if containsAnyOf(headerText, values):
            return position
    return -1

def hasNoHeaders(row):
    """Return True when ``row`` contains no ``<th>`` element."""
    headerCells = row.find_all("th")
    return len(headerCells) < 1


def getRows(table):
    """Return the ``<tr>`` elements of ``table``.

    Rows are taken from the table's ``<tbody>`` when it has one. Markup
    without an explicit ``<tbody>`` leaves ``table.tbody`` as ``None``, which
    used to raise ``AttributeError``; in that case the rows are searched for
    in the table itself.

    Args:
        table: A parsed ``<table>`` element.

    Returns:
        The list of row elements found.
    """
    body = table.tbody
    # Compare with None explicitly rather than relying on truthiness.
    if body is None:
        body = table
    return body.find_all("tr")

def getBodyRows(table):
    """Return the rows of ``table`` that contain no ``<th>`` cell."""
    return [row for row in getRows(table) if hasNoHeaders(row)]

def extractLink(link):
    """Return the trimmed ``href`` of an anchor element, or ``None``.

    Args:
        link: An anchor element (anything offering ``.get('href')``), or a
            falsy value when no anchor was found.

    Returns:
        The ``href`` value without surrounding whitespace. ``None`` is
        returned when ``link`` is falsy or when the anchor carries no
        ``href`` attribute (such anchors used to raise ``KeyError``).
    """
    if not link:
        return None
    href = link.get('href')
    if href is None:
        return None
    return href.strip()

def getTextAndLink(cell):
    """Return ``(text, href)`` for ``cell``.

    ``text`` is the cleaned text of the whole cell; ``href`` comes from the
    first ``<a>`` inside it (``None`` when there is no anchor).
    """
    firstAnchor = cell.find('a')
    href = extractLink(firstAnchor)
    return getText(cell), href

def getAllLinks(cell):
    """Return a ``(text, href)`` tuple for every ``<a>`` element inside ``cell``."""
    return [(getText(anchor), extractLink(anchor)) for anchor in cell.find_all('a')]
    
def getAllCells(row):
    """Return the cleaned text of the row's cells, all ``<th>`` first, then all ``<td>``."""
    headerCells = row.find_all("th")
    dataCells = row.find_all("td")
    return [getText(cell) for cell in headerCells + dataCells]

def getRowContaining(table,text):
    """Return the cell texts of the first row in which some cell contains ``text``.

    Args:
        table: A parsed ``<table>`` element.
        text: Substring to look for in the row's cell texts.

    Returns:
        The list produced by ``getAllCells`` for the first matching row, or
        ``None`` when no row matches.
    """
    for row in getRows(table):
        cells = getAllCells(row)
        if hasElementContaining(cells,text):
            # Reuse the cells already extracted instead of parsing the row again.
            return cells
    return None
    

In [6]:
def isInternationalTable(table):
    """Return True when the table's headers cover the YEAR, APPS and GOALS keywords."""
    headers = getHeaders(table)
    requiredGroups = (YEAR, APPS, GOALS)
    return all(hasElementContainingAnyOf(headers, group) for group in requiredGroups)

def getAppearancesForPlayer(url):
    """Return the appearances total found in a player's international table.

    Args:
        url: Address of the player's page.

    Returns:
        The text of the appearances cell in the first row containing 'Total'
        of the first table recognised by ``isInternationalTable``. The
        integer ``0`` is returned when there is no such table, no such row,
        or the row is too short to hold the appearances column.

    Raises:
        PlayerURLError: If the page cannot be fetched or parsed; the
            underlying exception is chained as ``__cause__``.
    """
    try:
        # getAllTables downloads the page itself, so the separate getSoup(url)
        # call that used to precede it only fetched the same page twice.
        _title,tables = getAllTables(url)
    except Exception as exc:
        raise PlayerURLError(url) from exc
    internationalTables = [table for table in tables if isInternationalTable(table)]
    if internationalTables:
        internationalTable = internationalTables[0]
        totalRow = getRowContaining(internationalTable,'Total')
        appIndex = getIndexOf(internationalTable,APPS) -1 # Country in First Header
        # Guard the lookup so a short or oddly shaped total row yields 0
        # instead of an IndexError or a cell counted from the end.
        if totalRow and 0 <= appIndex < len(totalRow):
            return totalRow[appIndex]
    return 0

def isPlayerTable(table):
    """Decide whether ``table`` looks like a squad list.

    A table qualifies when its headers include a NAME keyword, a NATIONALITY
    or DISCIPLINE keyword and an AGE keyword, and none of the TEAM keywords.
    """
    headers = getHeaders(table)
    hasName = hasElementContainingAnyOf(headers,NAME)
    hasOrigin = hasElementContainingAnyOf(headers,NATIONALITY) or hasElementContainingAnyOf(headers,DISCIPLINE)
    hasAge = hasElementContainingAnyOf(headers,AGE)
    mentionsTeam = hasElementContainingAnyOf(headers,TEAM)
    return hasName and hasOrigin and hasAge and not mentionsTeam

def getDataFromPlayerTable(league,team,playerIndex,countryIndex,rows):
    """Extract one ``(league, team, name, country, url)`` tuple per table row.

    Args:
        league: Value copied into every result tuple.
        team: Value copied into every result tuple.
        playerIndex: Index of the player cell among each row's ``<td>``
            cells; ``-1`` means the column is unknown.
        countryIndex: Index of the nationality cell among the ``<td>`` cells;
            ``-1`` means there is no such column, in which case the country
            is looked for among the links of the player cell.
        rows: Row elements to process.

    Returns:
        The list of tuples collected so far. Processing stops early, with a
        printed error, when ``playerIndex`` is ``-1`` or a row is too short to
        hold the player cell. ``country`` and ``url`` are ``None`` when they
        cannot be determined.
    """
    players = []
    # The index does not change between rows, so check it once up front.
    if playerIndex == -1:
        print('ERROR: No Player Index - Stopping Early')
        return players
    for row in rows:
        cells = row.find_all("td")
        # `<=` rather than `<`: a row with exactly playerIndex cells has no
        # cell at that index and used to raise IndexError.
        if len(cells) <= playerIndex:
            print('ERROR: Player Index Greater than Row Length - Stopping Early')
            return players
        playerCell = cells[playerIndex]
        links = getAllLinks(playerCell)
        (name, wikiPrefix) = getTextAndLink(playerCell)
        country = None
        url = None
        if len(links) > 0:
            # Read the country column only when it really exists in this row;
            # indexing with -1 would silently pick the last cell instead.
            if 0 <= countryIndex < len(cells):
                (_countryText, countryWiki) = getTextAndLink(cells[countryIndex])
                country = stripWiki(countryWiki)
            if wikiPrefix:
                url = fullWiki(wikiPrefix)
            if countryIndex == -1:
                # Without a country column, a link with no text in the player
                # cell is taken as the country and the first link with text
                # as the player page.
                emptyLinks = [link for link in links if link[0] == '']
                if emptyLinks:
                    country = stripWiki(emptyLinks[0][1])
                urlLinks = [link for link in links if link[0] != '']
                if urlLinks and urlLinks[0][1]:
                    url = fullWiki(urlLinks[0][1])
        players.append((league,team,name,country,url))
    return players

def getPlayerDataForTeam(league,url):
    """Scrape the squad listed on a team page.

    The first table accepted by ``isPlayerTable`` supplies the column indexes
    and rows. If it has fewer than 17 body rows and a second player table
    exists, that table's body rows are appended as well.

    Returns:
        The tuples produced by ``getDataFromPlayerTable``, with the page title
        used as the team name.

    Raises:
        TeamURLError: If the page has no player table.
    """
    title,tables = getAllTables(url)
    print('Starting ',title)
    playerTables = [table for table in tables if isPlayerTable(table)]
    if len(playerTables) == 0:
        raise TeamURLError(league,url)
    firstTable = playerTables[0]
    playerIndex = getIndexOf(firstTable,NAME)
    countryIndex = getIndexOf(firstTable,NATIONALITY)
    playerRows = getBodyRows(firstTable)
    needsSecondTable = len(playerRows) < 17 and len(playerTables) > 1
    if needsSecondTable:
        playerRows = playerRows + getBodyRows(playerTables[1])
    return getDataFromPlayerTable(league,title,playerIndex,countryIndex,playerRows)

        
def validateTeamURL(league,url):
    """Check that the page at ``url`` contains at least one player table.

    Returns ``None`` when a player table is found.

    Raises:
        TeamURLError: If no table on the page passes ``isPlayerTable``.
    """
    title,tables = getAllTables(url)
    print('Starting ',title)
    playerTables = [table for table in tables if isPlayerTable(table)]
    if len(playerTables) == 0:
        raise TeamURLError(league,url)
    return None

In [7]:
def getAppearances(url):
    """Return the appearances total found in a player's international table.

    Args:
        url: Address of the player's page.

    Returns:
        The text of the appearances cell in the first row containing 'Total'
        of the first table recognised by ``isInternationalTable``. The
        integer ``0`` is returned when there is no such table, no such row,
        or the row is too short to hold the appearances column.

    Errors raised while fetching or parsing the page propagate unchanged.
    """
    # getAllTables downloads the page itself, so the separate getSoup(url)
    # call that used to precede it only fetched the same page twice.
    _title,tables = getAllTables(url)
    internationalTables = [table for table in tables if isInternationalTable(table)]
    if internationalTables:
        internationalTable = internationalTables[0]
        totalRow = getRowContaining(internationalTable,'Total')
        appIndex = getIndexOf(internationalTable,APPS) -1 # Country in First Header
        # Guard the lookup so a short or oddly shaped total row yields 0
        # instead of an IndexError or a cell counted from the end.
        if totalRow and 0 <= appIndex < len(totalRow):
            return totalRow[appIndex]
    return 0

def getPlayerDetails(league,team,name,country,url):
    """Return ``(league, team, name, country, appearances, url)`` for one player.

    ``appearances`` is looked up from the player's page via ``getAppearances``.
    """
    appearances = getAppearances(url)
    return (league,team,name,country,appearances,url)

In [8]:
def getDetails(playerObject):
    """Unpack a player record and delegate to ``getPlayerDetails``.

    ``playerObject`` is indexed at positions 0-4, read as league, team, name,
    country and url respectively.
    """
    fields = [playerObject[position] for position in range(5)]
    return getPlayerDetails(*fields)

In [9]:
def writeTeamDataToFile(league,teamUrl,writer):
    """Scrape one team and write a row per player to ``writer``.

    Args:
        league: League label passed on to ``getPlayerDataForTeam``.
        teamUrl: Address of the team page.
        writer: Object with a ``writerow`` method (a ``csv.writer`` in this
            notebook).

    Player details are fetched on up to 10 worker threads; rows are written
    from the calling thread in completion order, so their order in the output
    is not fixed. Failures are printed and skipped: a team that cannot be
    scraped writes nothing, a player that fails is left out.
    """
    # `import concurrent` at the top of the file does not load the `futures`
    # submodule; import it explicitly so the lookups below cannot fail with
    # AttributeError when nothing else has imported it.
    import concurrent.futures

    team = None
    try:
        team = getPlayerDataForTeam(league,teamUrl)
    except Exception as exc:
        print(exc)

    if team:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = {executor.submit(getDetails,player) for player in team}
            for future in concurrent.futures.as_completed(futures):
                try:
                    data = future.result()
                    writer.writerow(data)
                except Exception as exc:
                    print(exc)

In [10]:
def isTeamsTable(table):
    """Return True when one of the table's headers contains a TEAM keyword."""
    return hasElementContainingAnyOf(getHeaders(table),TEAM)

def getDataFromTeamTable(table):
    """Return the season-page URL of every linked team in ``table``.

    The team column is located with ``getIndexOf(table, TEAM)`` and applied
    to the ``<td>`` cells of each body row. Rows whose team cell has a link
    produce a URL via ``getSquadUrl``. Rows without a link, or too short to
    hold the team column, are collected as "dodgy" and printed.

    Args:
        table: A parsed ``<table>`` element listing teams.

    Returns:
        List of squad URLs, one per row with a linked team.
    """
    rows = getBodyRows(table)
    teamIndex = getIndexOf(table,TEAM)
    teams = []
    dodgy = []
    for row in rows:
        cells = row.find_all("td")
        # Report rows that cannot be indexed at teamIndex instead of raising
        # IndexError. Negative indexes keep their usual meaning.
        if not -len(cells) <= teamIndex < len(cells):
            dodgy.append(getText(row))
            continue
        (text,prefix) = getTextAndLink(cells[teamIndex])
        if prefix:
            teams.append(getSquadUrl(stripWiki(prefix)))
        else:
            # Record the team's name. The previous code appended `url`, which
            # is unbound on a first unlinked row and stale on later ones.
            dodgy.append(text)
    print('Dodgy Teams Were')
    print(dodgy)
    return teams

def getTeamsForLeague(url):
    """Return ``[league title, squad URL]`` pairs for the teams on a league page.

    Args:
        url: Address of the league page.

    Returns:
        A list of two-element lists: the page title and one squad URL taken
        from the first table accepted by ``isTeamsTable``.

    Raises:
        TeamURLError: If the page has no teams table. The page title is
            reported as the league.
    """
    title,tables = getAllTables(url)
    print(title)
    teamTables = [table for table in tables if isTeamsTable(table)]
    if not teamTables:
        # `league` is not defined in this function; the page title is the
        # league label used for the returned rows, so report that.
        raise TeamURLError(title,url)
    urls = getDataFromTeamTable(teamTables[0])
    return [[title,squadUrl] for squadUrl in urls]
        

In [11]:
def writeTeamURLsToFile(urls,writer):
    """Fetch the teams of every league URL and write them to ``writer``.

    Args:
        urls: Iterable of league page addresses.
        writer: Object with a ``writerow`` method (a ``csv.writer`` in this
            notebook).

    Leagues are processed on up to 10 worker threads; rows are written from
    the calling thread as each league completes. A league that fails is
    reported with 'ERROR: ...' and skipped; a successful one prints 'Done'.
    """
    # `import concurrent` at the top of the file does not load the `futures`
    # submodule; import it explicitly so the lookups below cannot fail with
    # AttributeError when nothing else has imported it.
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(getTeamsForLeague,url) for url in urls}
        for future in concurrent.futures.as_completed(futures):
            try:
                data = future.result()
                for teamUrl in data:
                    writer.writerow(teamUrl)
            except Exception as exc:
                print('ERROR: %s' % (exc))
            else:
                print('Done')

In [12]:
def getTeamsURLsFromLeagueFile(inputFile, outputFile=r'../data/ligue1Teams.csv'):
    """Read league URLs from a CSV file and append their team URLs to another.

    Args:
        inputFile: CSV file whose second column holds league page addresses.
        outputFile: CSV file the ``[league, team URL]`` rows are appended to.
            Defaults to the path that used to be hard-coded.
    """
    with open(inputFile, newline='') as csvfile:
        leagues = list(csv.reader(csvfile))
    # csv.reader yields [] for blank lines; skip rows without a second column
    # instead of failing with IndexError.
    URLs = [row[1] for row in leagues if len(row) > 1]
    print(URLs)
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for files handed to csv.writer.
    with open(outputFile, 'a', newline='') as f:
        writer = csv.writer(f)
        writeTeamURLsToFile(URLs,writer)

# Write File

In [13]:
# getTeamsURLsFromLeagueFile('../data/league1.csv')

In [14]:
def getTeamsFromFile(filePath):
    """Read the CSV file at ``filePath`` and return its rows as a list of lists."""
    with open(filePath, newline='') as source:
        return [row for row in csv.reader(source)]

Doing Some Error Handling

In [15]:
def writeTeamErrorsToFile(league,teamUrl,writer):
    """Validate one team URL and record it in ``writer`` if validation fails.

    Args:
        league: League label of the team.
        teamUrl: Address of the team page.
        writer: Object with a ``writerow`` method.

    Any exception from ``validateTeamURL`` is printed and a
    ``[league, url]`` row is written. The values come from the exception when
    it carries ``league`` / ``url`` attributes (as ``TeamURLError`` does) and
    from the arguments otherwise.
    """
    try:
        validateTeamURL(league,teamUrl)
    except Exception as exc:
        print(exc)
        # Only TeamURLError has .league/.url; other failures (for example a
        # network error) used to raise AttributeError here.
        writer.writerow([getattr(exc,'league',league),getattr(exc,'url',teamUrl)])

def writeErrorsToFile(leagueArray, outputFile=r'../data/errors.csv'):
    """Validate every team in ``leagueArray`` and append failures to a CSV file.

    Args:
        leagueArray: Iterable of rows whose first item is the league and
            whose second item is the team URL.
        outputFile: CSV file the failing ``[league, url]`` rows are appended
            to. Defaults to the path that used to be hard-coded.
    """
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for files handed to csv.writer.
    with open(outputFile, 'a', newline='') as f:
        writer = csv.writer(f)
        for team in leagueArray:
            league = team[0]
            teamUrl = team[1]
            writeTeamErrorsToFile(league,teamUrl,writer)

In [16]:
# Column titles of the player-details CSV file.
headers = ('League','Team','Player','Country','Appearances','URL')
def writePlayerDetailsToFile(leagueArray, outputFile=r'../data/ligue1Data.csv'):
    """Scrape every team in ``leagueArray`` and append player rows to a CSV file.

    Args:
        leagueArray: Iterable of rows whose first item is the league and
            whose second item is the team URL.
        outputFile: CSV file to append to. Defaults to the path that used to
            be hard-coded; pass another path to change the output file.

    The header row is written only when the file is still empty, so repeated
    runs against the same file no longer insert a header in the middle of the
    data.
    """
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for files handed to csv.writer.
    with open(outputFile, 'a', newline='') as f:
        writer = csv.writer(f)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means nothing has been written to it yet.
        if f.tell() == 0:
            writer.writerow(headers)
        for team in leagueArray:
            league = team[0]
            teamUrl = team[1]
            writeTeamDataToFile(league,teamUrl,writer)

In [17]:
# Load the rows of the Brazil input file (league in the first column, team URL
# in the second) and scrape the player details of every listed team.
# NOTE(review): the results are appended to '../data/ligue1Data.csv' although
# the input is brazil.csv - confirm the output file name before running.
teams = getTeamsFromFile('../data/brazil.csv')
writePlayerDetailsToFile(teams)

Starting  2019 Clube de Regatas do Flamengo season
Starting  2019 Santos FC season
Starting  2019 Sociedade Esportiva Palmeiras season
Starting  2019 São Paulo FC season
Starting  2019 Sport Club Corinthians Paulista season
