In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

def getSoup(url, timeout=30):
    """Download ``url`` and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would block
            the whole scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` document built from the response text.
    """
    # The HTTP status code is not inspected, so error pages are parsed like
    # any other page and simply yield whatever markup they contain.
    html_content = requests.get(url, timeout=timeout).text
    return BeautifulSoup(html_content, "lxml")

## Constants

In [2]:
# Keyword lists used to recognise table columns from their header text.
# Matching is by substring (`keyword in header`), so 'App' also matches 'Apps'.
NATIONALITY = ['Nat.','Nationality','Nation']
# Accepted by isPlayerTable as an alternative to a nationality column.
DISCIPLINE = ['Discipline']
NAME = ['Player','Name']
# NOTE(review): 'Pos' is accepted as an age marker — presumably so squad tables
# without a date-of-birth column still qualify; confirm this is intended.
AGE = ['Age','Date of birth','Pos']
YEAR = ['Year']
APPS = ['App']
GOALS = ['Goal']
TEAM = ['Team']
# Not referenced by any function visible in this notebook section.
MANAGER = ['Manager']

## File helpers (CSV input and output)

In [3]:
def getLeagueList(pathToFile):
    """Read the CSV file at ``pathToFile`` and return its rows.

    Returns:
        A list with one entry per CSV row; each entry is the list of string
        fields produced by ``csv.reader``.
    """
    with open(pathToFile, newline='') as handle:
        return [row for row in csv.reader(handle)]

def writeDataToFile(df,fileName):
    """Save ``df`` as CSV (without the index) under the ``../data/`` folder.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        fileName: Name appended to the ``../data/`` prefix.
    """
    destination = '../data/' + fileName
    df.to_csv(destination, index=False)
    # The message reports the name as passed in, not the prefixed path.
    print('Writing to ',fileName)

## String and URL helpers

In [4]:
def hasElementContaining(array,criteria):
    """Return True when at least one element of ``array`` contains ``criteria``.

    Args:
        array: Iterable of strings (or other containers supporting ``in``).
        criteria: Value looked up inside each element with ``in``.

    Returns:
        ``True`` on the first match, ``False`` when nothing matches or the
        iterable is empty.
    """
    # any() stops at the first hit instead of materialising every match.
    return any(criteria in element for element in array)

def hasElementContainingAnyOf(array,criterias):
    """Report whether any entry of ``criterias`` occurs in any element of ``array``.

    Each criterion is tried in order through ``hasElementContaining``; the
    search ends at the first criterion that matches.
    """
    return any(hasElementContaining(array, keyword) for keyword in criterias)

def containsAnyOf(stringToCheck,values):
    """Return True if ``stringToCheck`` contains at least one of ``values``.

    Candidates are tested in order with ``in``; an empty ``values`` gives
    ``False``.
    """
    return any(candidate in stringToCheck for candidate in values)

def stripWiki(text):
    """Remove every ``/wiki/`` occurrence from ``text`` and trim whitespace.

    Returns:
        The cleaned string, or ``None`` when ``text`` is falsy (``None`` or
        the empty string).
    """
    if not text:
        return None
    withoutPrefix = text.replace('/wiki/', '')
    return withoutPrefix.strip()

def fullWiki(text):
    """Prefix ``text`` with the English Wikipedia origin to form a full URL."""
    wikipediaOrigin = "https://en.wikipedia.org"
    return wikipediaOrigin + text

def getSquadUrl(url, season='2019-20'):
    """Build the Wikipedia URL of a club's season article.

    Args:
        url: Club page name as it appears after ``/wiki/`` (for example
            ``Arsenal_F.C.``).
        season: Season label placed in front of the club name. Defaults to
            ``'2019-20'``, the value that used to be hard-coded.

    Returns:
        ``https://en.wikipedia.org/wiki/<season>_<url>_season``.
    """
    return 'https://en.wikipedia.org/wiki/' + season + '_' + url + '_season'

## Table and cell helpers

In [5]:
def getAllTables(url):
    """Fetch ``url`` and return its page title with all ``wikitable`` tables.

    Returns:
        A ``(title, tables)`` tuple. ``title`` is the ``<title>`` text with
        ``'- Wikipedia'`` removed and whitespace trimmed, or ``''`` when the
        response has no ``<title>`` element. ``tables`` holds every
        ``<table class="wikitable">`` found in the page.
    """
    soup = getSoup(url)
    tables = soup.find_all("table", attrs={"class": "wikitable"})
    titleTag = soup.title
    if titleTag is None:
        # Responses without a <title> (e.g. plain-text error bodies) used to
        # raise AttributeError here; report an empty title instead.
        title = ''
    else:
        title = titleTag.text.replace('- Wikipedia', '').strip()
    return title,tables

def getText(cell):
    """Return ``cell.text`` with all newlines removed and outer whitespace trimmed."""
    pieces = cell.text.split('\n')
    return ''.join(pieces).strip()

def getHeaders(table):
    """Return the cleaned text of every ``<th>`` cell found in ``table``."""
    return [getText(headerCell) for headerCell in table.find_all("th")]

def getIndexOf(table,values):
    """Locate the first header of ``table`` that contains one of ``values``.

    Returns:
        The position of that header within ``getHeaders(table)``, or ``-1``
        when no header matches.
    """
    for position, header in enumerate(getHeaders(table)):
        if containsAnyOf(header, values):
            return position
    return -1

def hasNoHeaders(row):
    """Return True when ``row`` contains no ``<th>`` cells."""
    headerCells = row.find_all("th")
    return len(headerCells) == 0


def getRows(table):
    """Return every ``<tr>`` of ``table``.

    Rows are taken from the table's ``<tbody>`` when there is one. When the
    markup has no ``<tbody>`` (``table.tbody`` is ``None``) the rows are
    searched on the table itself instead of raising ``AttributeError``.
    """
    body = table.tbody
    container = table if body is None else body
    return container.find_all("tr")

def getBodyRows(table):
    """Return the rows of ``table`` that contain no ``<th>`` cells."""
    return [row for row in getRows(table) if hasNoHeaders(row)]

def extractLink(link):
    """Return the trimmed ``href`` of ``link``.

    Args:
        link: An anchor element (anything with a mapping-style ``get``), or a
            falsy value such as ``None`` when no anchor was found.

    Returns:
        The ``href`` value with surrounding whitespace removed, or ``None``
        when there is no link or the anchor carries no ``href`` attribute.
    """
    if not link:
        return None
    # Anchors without an href used to raise KeyError via link['href'].
    href = link.get('href')
    if href is None:
        return None
    return href.strip()

def getTextAndLink(cell):
    """Return ``(text, href)`` for ``cell``.

    ``text`` is the cleaned text of the whole cell; ``href`` comes from the
    first ``<a>`` inside it (``None`` when the cell has no anchor).
    """
    firstAnchor = cell.find('a')
    href = extractLink(firstAnchor)
    return getText(cell), href

def getAllLinks(cell):
    """Return a ``(text, href)`` tuple for every ``<a>`` inside ``cell``."""
    return [(getText(anchor), extractLink(anchor)) for anchor in cell.find_all('a')]
    
def getAllCells(row):
    """Return the cleaned text of every cell in ``row``.

    All ``<th>`` cells come first, followed by all ``<td>`` cells, regardless
    of how the two kinds are interleaved in the markup.
    """
    headerCells = row.find_all("th")
    dataCells = row.find_all("td")
    return [getText(cell) for cell in headerCells + dataCells]

def getRowContaining(table,text):
    """Return the cell texts of the first row of ``table`` that mentions ``text``.

    Args:
        table: Table element whose rows are scanned in order.
        text: Substring looked for in each cell's cleaned text.

    Returns:
        The list produced by ``getAllCells`` for the first matching row, or
        ``None`` when no row contains ``text``.
    """
    for row in getRows(table):
        cells = getAllCells(row)
        if hasElementContaining(cells,text):
            # Reuse the cells already extracted instead of parsing the row again.
            return cells
    return None
    


## Get Appearances For Player

In [6]:
def isInternationalTable(table):
    """Return True when ``table`` has year, appearance and goal headers.

    A header counts when it contains one of the keywords in ``YEAR``,
    ``APPS`` and ``GOALS`` respectively; all three groups must be present.
    """
    headers = getHeaders(table)
    groupsPresent = [
        hasElementContainingAnyOf(headers, keywords)
        for keywords in (YEAR, APPS, GOALS)
    ]
    return all(groupsPresent)

def getAppearancesForPlayer(url):
    """Look up a player's total appearances from the page at ``url``.

    The first table accepted by ``isInternationalTable`` is used. Its row
    containing ``'Total'`` is located and the cell in the appearances column
    is returned.

    Returns:
        The text of that cell (a string), or the integer ``0`` when the page
        has no such table, no ``'Total'`` row, or the row is too short.
    """
    # One request per player: the page used to be downloaded twice because an
    # unused getSoup(url) call preceded getAllTables(url).
    title,tables = getAllTables(url)
    internationalTables = [table for table in tables if isInternationalTable(table)]
    if len(internationalTables) > 0:
        internationalTable = internationalTables[0]
        totalRow = getRowContaining(internationalTable,'Total')
        # Shift by one because the first header holds the country (per the
        # original author's note), so the total row has one cell fewer.
        appIndex = getIndexOf(internationalTable,APPS) -1
        # Guard the lookup so a short total row cannot raise IndexError.
        if totalRow and appIndex < len(totalRow):
            return totalRow[appIndex]
    return 0

## Get Players from Team Wiki

In [7]:
def isPlayerTable(table):
    """Decide whether ``table`` looks like a squad list.

    The table needs a name header, a nationality or discipline header and an
    age header (keywords from ``NAME``, ``NATIONALITY``/``DISCIPLINE`` and
    ``AGE``), and must not have any header matching ``TEAM``.
    """
    headers = getHeaders(table)
    hasName = hasElementContainingAnyOf(headers,NAME)
    hasNationality = (
        hasElementContainingAnyOf(headers,NATIONALITY)
        or hasElementContainingAnyOf(headers,DISCIPLINE)
    )
    hasAge = hasElementContainingAnyOf(headers,AGE)
    hasTeam = hasElementContainingAnyOf(headers,TEAM)
    if hasTeam:
        return False
    return hasName and hasNationality and hasAge

def getDataFromPlayerTable(playerIndex,countryIndex,rows):
    """Extract ``(name, country, apps, url)`` for every player row.

    Args:
        playerIndex: Position of the player cell among a row's ``<td>`` cells.
        countryIndex: Position of the nationality cell, or ``-1`` when the
            table has no nationality column. In that case the country is
            taken from a text-less link inside the player cell.
        rows: Table rows to process.

    Returns:
        A list of ``(name, country, apps, url)`` tuples. ``country``, ``apps``
        and ``url`` are ``None`` when the player cell has no usable link.
    """
    players = []
    for row in rows:
        cells = row.find_all("td")
        if not -len(cells) <= playerIndex < len(cells):
            # Rows with too few cells cannot hold a player; indexing them
            # would raise IndexError and abort the whole scrape.
            print('Skipping row without a player cell')
            continue
        playerCell = cells[playerIndex]
        links = getAllLinks(playerCell)
        (name, wikiPrefix) = getTextAndLink(playerCell)
        country = None
        url = None
        apps = None
        if len(links) > 0:
            if countryIndex == -1:
                print('No Country Column for',name)
                # Links with empty text are presumably flag icons; the first
                # link that carries text is taken as the player's own page.
                iconLinks = [link for link in links if link[0] == '']
                namedLinks = [link for link in links if link[0] != '']
                if iconLinks:
                    # Use the icon link's href (element 1 of the tuple). The
                    # earlier code indexed one level too deep and produced a
                    # single character of the href instead.
                    country = stripWiki(iconLinks[0][1])
                playerHref = namedLinks[0][1] if namedLinks else wikiPrefix
            else:
                if countryIndex < len(cells):
                    (_, countryWiki) = getTextAndLink(cells[countryIndex])
                    country = stripWiki(countryWiki)
                playerHref = wikiPrefix
            if playerHref:
                url = fullWiki(playerHref)
                apps = getAppearancesForPlayer(url)
            else:
                print('No links found for',name)
        else:
            print('No links found for',name)
        result = (name,country,apps,url)
        print(result)
        players.append(result)
    return players

In [8]:
def getPlayerDataForTeam(url):
    """Scrape the squad listed on the team season page at ``url``.

    Returns:
        The list built by ``getDataFromPlayerTable`` for the first table that
        passes ``isPlayerTable``, or ``None`` (after printing a message) when
        the page has no such table.
    """
    title,tables = getAllTables(url)
    squadTables = [table for table in tables if isPlayerTable(table)]
    if not squadTables:
        print('Messed Up Getting Players, ',url)
        return None

    firstTable = squadTables[0]
    playerIndex = getIndexOf(firstTable,NAME)
    countryIndex = getIndexOf(firstTable,NATIONALITY)
    squadRows = getBodyRows(firstTable)
    # NOTE(review): 17 looks like a minimum plausible squad size; a shorter
    # first table is topped up with the second player table — confirm.
    if len(squadRows) < 17 and len(squadTables) > 1:
        squadRows = squadRows + getBodyRows(squadTables[1])
    return getDataFromPlayerTable(playerIndex,countryIndex,squadRows)
    

## Get Teams from League Wiki

In [9]:
def isTeamsTable(table):
    """Return True when any header of ``table`` contains a ``TEAM`` keyword."""
    return hasElementContainingAnyOf(getHeaders(table), TEAM)

def getDataFromTeamTable(table):
    """Collect squad data for every team row of a league table.

    For each body row the team cell's first link is turned into a season
    page URL and passed to ``getPlayerDataForTeam``.

    Returns:
        A list of ``(teamName, playerCount, players)`` tuples. Teams whose
        squad could not be read are recorded as ``(teamName, None, None)``
        and reported in the printed 'Dodgy Teams' list.
    """
    rows = getBodyRows(table)
    teamIndex = getIndexOf(table,TEAM)
    teams = []
    dodgy = []
    for row in rows:
        cells = row.find_all("td")
        team = cells[teamIndex]
        (text,prefix) = getTextAndLink(team)
        # Reset per row: without this a team lacking a link either raised
        # UnboundLocalError (first row) or silently inherited the previous
        # team's players and URL.
        url = None
        players = None
        if prefix:
            url = getSquadUrl(stripWiki(prefix))
            players = getPlayerDataForTeam(url)
        else:
            print('Something strange happened with: ',text)
        if players:
            print((text,len(players)))
            teams.append((text,len(players),players))
        else:
            teams.append((text,None,None))
            dodgy.append((text,url))
    print('Dodgy Teams Were')
    print(dodgy)
    return teams

In [10]:
def getTeamsForLeague(url):
    """Scrape all teams of the league page at ``url``.

    Prints the page title, then processes the first table accepted by
    ``isTeamsTable``.

    Returns:
        The list from ``getDataFromTeamTable``, or ``None`` (after printing a
        message) when the page has no team table.
    """
    title,tables = getAllTables(url)
    print(title)
    candidateTables = [table for table in tables if isTeamsTable(table)]
    if not candidateTables:
        print('Messed Up Getting Teams, ',url)
        return None
    return getDataFromTeamTable(candidateTables[0])
        

## Run the scrape for all leagues

In [12]:
def toDf(data):
    """Wrap flat player records in a DataFrame with the notebook's column names."""
    columnNames = ['League','Team Name','playerName','country','apps','url']
    return pd.DataFrame(data, columns=columnNames)

def formatFlat(rawData):
    """Flatten nested league/team/player data into one record per player.

    Args:
        rawData: Iterable of ``(leagueName, leagueUrl, teams)`` tuples where
            ``teams`` is a list of ``(teamName, playerCount, players)`` and
            each player is ``(name, country, apps, url)``.

    Returns:
        A list of ``(leagueName, teamName, playerName, country, apps, url)``
        tuples. Leagues without team data and teams without players (stored
        as ``None`` upstream) contribute no records instead of raising
        ``TypeError``.
    """
    outputData = []
    for league in rawData:
        leagueName = league[0]
        teamData = league[2]
        if not teamData:
            # The league page yielded no team table.
            continue
        for team in teamData:
            teamName = team[0]
            playerList = team[2]
            if not playerList:
                # Team whose squad could not be scraped.
                continue
            for player in playerList:
                playerName = player[0]
                country = player[1]
                apps = player[2]
                url = player[3]
                outputData.append((leagueName,teamName,playerName,country,apps,url))
    return outputData
            

def completedItMate(inputPath,outputPath):
    """Scrape every league listed in ``inputPath`` and write the players to CSV.

    Args:
        inputPath: CSV file whose rows hold the league URL in their second
            column (the first column is not used here).
        outputPath: File name handed to ``writeDataToFile``.
    """
    leagues = getLeagueList(inputPath)
    rawData = []
    for league in leagues:
        if len(league) < 2:
            # csv.reader yields [] for blank lines; skip rows without a URL
            # column instead of failing with IndexError.
            continue
        leagueUrl = league[1]
        # NOTE(review): the league page is downloaded here for its title and
        # again inside getTeamsForLeague.
        title,_ = getAllTables(leagueUrl)
        rawData.append((title,leagueUrl,getTeamsForLeague(leagueUrl)))
    formattedData = formatFlat(rawData)
    df = toDf(formattedData)
    writeDataToFile(df,outputPath)
    
    

In [13]:
# Run the full scrape for the leagues listed in England.csv.
# NOTE(review): writeDataToFile prepends '../data/' to its fileName argument, so
# the output path passed here becomes '../data/../data/BootifulSoupVersion1.csv'.
completedItMate('../data/England.csv','../data/BootifulSoupVersion1.csv')

2019–20 Premier League
('Bernd Leno', 'Germany', '8', 'https://en.wikipedia.org/wiki/Bernd_Leno')
('Héctor Bellerín', 'Spain', '4', 'https://en.wikipedia.org/wiki/H%C3%A9ctor_Beller%C3%ADn')
('Kieran Tierney', 'Scotland', '16', 'https://en.wikipedia.org/wiki/Kieran_Tierney')
('Mohamed Elneny', 'Egypt', '79', 'https://en.wikipedia.org/wiki/Mohamed_Elneny')
('Sokratis Papastathopoulos', 'Greece', '90', 'https://en.wikipedia.org/wiki/Sokratis_Papastathopoulos')
('Henrikh Mkhitaryan', 'Armenia', '88', 'https://en.wikipedia.org/wiki/Henrikh_Mkhitaryan')
('Dani Ceballos', 'Spain', '11', 'https://en.wikipedia.org/wiki/Dani_Ceballos')
('Alexandre Lacazette', 'France', '16', 'https://en.wikipedia.org/wiki/Alexandre_Lacazette')
('Mesut Özil', 'Germany', '92', 'https://en.wikipedia.org/wiki/Mesut_%C3%96zil')
('Lucas Torreira', 'Uruguay', '26', 'https://en.wikipedia.org/wiki/Lucas_Torreira')
('Pierre-Emerick Aubameyang', 'Gabon', '65', 'https://en.wikipedia.org/wiki/Pierre-Emerick_Aubameyang')
('A