In [1]:
import requests
import lxml.html as lh
import pandas as pd
import re 
import csv

In [2]:
def hasNational(text):
    """Report whether page text hints at a national-team (international) record.

    Args:
        text: Plain text of a page.

    Returns:
        True when either marker phrase occurs anywhere in ``text``.
    """
    markers = (
        "Appearances and goals by national team and year",
        "International",
    )
    return any(marker in text for marker in markers)

def isNationalTable(table):
    """Decide whether a table looks like a national-team appearances table.

    Args:
        table: Object exposing ``text_content()``.

    Returns:
        True only if the table text contains all of "Year", "Apps",
        "Goals" and "Total".
    """
    text = table.text_content()
    required = ("Year", "Apps", "Goals", "Total")
    return all(word in text for word in required)

def isTotalRow(row):
    """Decide whether a table row is the three-cell "Total" summary row.

    Args:
        row: Row object supporting ``len()``, indexing and ``text_content()``.

    Returns:
        True when the row has exactly three children, its text contains
        "Total", and the text of its second child is numeric.
    """
    if len(row) != 3:
        return False
    if "Total" not in row.text_content():
        return False
    return row[1].text_content().isnumeric()

def getValue(rows):
    """Return the integer held in the first cell of the first row.

    Args:
        rows: Indexable collection of rows; each row is indexable and its
            cells expose ``text_content()``.

    Returns:
        ``int`` parsed from the text of ``rows[0][0]``.
    """
    first_cell = rows[0][0]
    return int(first_cell.text_content())

def fetchAppearances(doc):
    """Extract the total number of national-team appearances from a parsed page.

    The first table accepted by ``isNationalTable`` is searched for rows
    accepted by ``isTotalRow``.

    Args:
        doc: Parsed document exposing ``xpath()``.

    Returns:
        0 if no table qualifies as a national-team table; the integer in the
        second cell of the total row when exactly one such row is found in
        that table; -1 when the table has zero or several candidate total rows.
    """
    nationalTables = [table for table in doc.xpath('//table') if isNationalTable(table)]
    if not nationalTables:
        return 0
    # './/tr' keeps the search inside this table. The former '//tr' is an
    # absolute XPath: it is evaluated from the document root even when called
    # on a sub-element, so "Total" rows of unrelated tables were counted too
    # and the function fell through to -1.
    rows = nationalTables[0].xpath('.//tr')
    totalRows = [row for row in rows if isTotalRow(row)]
    if len(totalRows) == 1:
        return int(totalRows[0][1].text_content())
    return -1

def getApps(url, timeout=30):
    """Download a player page and return the international appearance count.

    Args:
        url: Full page URL, or the sentinel string "ERROR: No Link".
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the scrape indefinitely.

    Returns:
        0 when ``url`` is the sentinel or the page text does not pass
        ``hasNational``; otherwise whatever ``fetchAppearances`` returns for
        the parsed page.
    """
    if url == "ERROR: No Link":
        return 0
    # Without an explicit timeout requests waits forever on a silent server.
    page = requests.get(url, timeout=timeout)
    doc = lh.fromstring(page.content)
    if not hasNational(doc.text_content()):
        return 0
    return fetchAppearances(doc)

In [3]:
def notAPhoto(x):
    """Tell whether a link entry points somewhere other than an uploaded file.

    Args:
        x: Indexable link entry; only the item at index 2 is inspected.

    Returns:
        True when the substring "upload" does not occur in ``x[2]``.
    """
    target = x[2]
    return not ("upload" in target)

def getWikiLink(cell):
    """Return the first non-photo link of a cell with the '/wiki/' prefix removed.

    Args:
        cell: Object exposing ``iterlinks()``.

    Returns:
        The link string (index 2 of the first entry accepted by
        ``notAPhoto``) with every '/wiki/' occurrence stripped, or the
        sentinel "ERROR: No Link" when no entry qualifies.
    """
    candidates = [link for link in cell.iterlinks() if notAPhoto(link)]
    if candidates:
        return candidates[0][2].replace('/wiki/', '')
    return "ERROR: No Link"

def getWikiString(string):
    """Strip the English-Wikipedia article URL prefix from a string.

    Args:
        string: Text that may contain 'https://en.wikipedia.org/wiki/'.

    Returns:
        ``string`` with every occurrence of that prefix removed.
    """
    prefix = 'https://en.wikipedia.org/wiki/'
    return ''.join(string.split(prefix))

def getWikiDetails(cells):
    """Collect the display name and wiki links from a player/country cell pair.

    Args:
        cells: Indexable sequence; index 0 is the player cell and index 1
            the country cell.

    Returns:
        Tuple ``(display_name, player_link, country_link)`` where the name is
        the player cell's text and both links come from ``getWikiLink``.
    """
    player_cell = cells[0]
    nation_cell = cells[1]
    name = player_cell.text_content()
    player_link = getWikiLink(player_cell)
    nation_link = getWikiLink(nation_cell)
    return name, player_link, nation_link

def getFullWikiLink(cell):
    """Return the first non-photo link of a cell as an absolute Wikipedia URL.

    Args:
        cell: Object exposing ``iterlinks()``.

    Returns:
        The link string (index 2 of the first entry accepted by
        ``notAPhoto``) with '/wiki/' replaced by
        'https://en.wikipedia.org/wiki/', or the sentinel "ERROR: No Link"
        when no entry qualifies.
    """
    candidates = [link for link in cell.iterlinks() if notAPhoto(link)]
    if not candidates:
        return "ERROR: No Link"
    relative = candidates[0][2]
    return relative.replace('/wiki/', 'https://en.wikipedia.org/wiki/')

def getSquadUrl(url):
    """Build the URL of a club's 2019-20 season article from its wiki slug.

    Args:
        url: Club slug as a string (e.g. the value returned by ``getWikiLink``).

    Returns:
        'https://en.wikipedia.org/wiki/2019-20_' + slug + '_season'.
    """
    # NOTE(review): the slug uses an ASCII hyphen in "2019-20", whereas the
    # league URL used elsewhere in this notebook has an en dash (%E2%80%93);
    # confirm that the hyphenated title resolves for every club.
    pieces = ('https://en.wikipedia.org/wiki/2019-20_', url, '_season')
    return ''.join(pieces)

In [4]:
def validPlayerRow(row):
    """Decide whether a table row describes a squad player.

    A row qualifies when it has 12, 13 or 14 children and the stripped text
    of its fourth child (index 3, used as the name cell) is non-empty, is not
    purely numeric, and is not the header label 'Name'.

    Args:
        row: Row object supporting ``len()`` and indexing; children expose
            ``text_content()``.

    Returns:
        True for a player row, False otherwise.
    """
    # Rows of any other width are not player rows; bail out before indexing.
    if len(row) not in (12, 13, 14):
        return False
    name = row[3].text_content().strip()
    is_placeholder = name.isnumeric() or len(name) == 0 or name == 'Name'
    return not is_placeholder

def getPlayers(league,country,teamName,url,timeout=30):
    """Scrape one squad page and return a record for every player row found.

    Every ``<tr>`` accepted by ``validPlayerRow`` yields one tuple. The
    player's name and link are taken from the row's child at index 3, the
    nationality link from the child at index 2, and the appearance count from
    ``getApps`` (one further HTTP request per player).

    Args:
        league: League label copied into each record.
        country: League country label copied into each record.
        teamName: Team label copied into each record.
        url: Squad page URL to download.
        timeout: Seconds passed to ``requests.get`` for the squad page so a
            stalled connection raises instead of hanging the scrape.

    Returns:
        List of tuples
        ``(league, country, teamName, playerName, nationality, apps, wikiUrl)``.
    """
    # Without an explicit timeout requests waits forever on a silent server.
    page = requests.get(url, timeout=timeout)
    print(url)
    doc = lh.fromstring(page.content)
    playerRows = []
    for row in doc.xpath('//tr'):
        if not validPlayerRow(row):
            continue
        nameCell = row[3]
        playerName = nameCell.text_content().strip()
        wikiUrl = getFullWikiLink(nameCell)
        nationality = getWikiLink(row[2])
        apps = getApps(wikiUrl)
        playerRows.append((league,country,teamName,playerName,nationality,apps, wikiUrl))
    print('Found ',len(playerRows),'Players for',teamName)
    return playerRows

In [5]:
def collapseDfs(arrayDf):
    """Stack a sequence of DataFrames vertically into a single DataFrame.

    Uses one ``pd.concat`` call instead of repeated ``DataFrame.append``
    (removed in pandas 2.0, and quadratic when looped). The caller's list is
    no longer modified, and row indexes of the inputs are preserved.

    Args:
        arrayDf: Iterable of DataFrames, in the order they should be stacked.

    Returns:
        The concatenated DataFrame; an empty DataFrame when ``arrayDf`` is
        empty.
    """
    # Copy into a fresh list so the argument is never mutated.
    frames = list(arrayDf)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

def getTeams(url,league,country,timeout=30):
    """Scrape a league page and return player records for every team listed.

    Every ``<tr>`` with exactly four children is treated as a candidate team
    row. Its first child supplies the team name and wiki slug; the squad page
    derived from that slug via ``getSquadUrl`` is then scraped with
    ``getPlayers``. Rows whose name is 'Team', 'Rank' or numeric, or whose
    link lookup returned an "ERROR" sentinel, are skipped.

    Args:
        url: League season page URL.
        league: League label forwarded to ``getPlayers``.
        country: Country label forwarded to ``getPlayers``.
        timeout: Seconds passed to ``requests.get`` for the league page so a
            stalled connection raises instead of hanging the scrape.

    Returns:
        Flat list of the records returned by ``getPlayers`` for all teams,
        in page order.
    """
    outputData = []
    # Without an explicit timeout requests waits forever on a silent server.
    page = requests.get(url, timeout=timeout)
    doc = lh.fromstring(page.content)
    for row in doc.xpath('//tr'):
        if len(row) != 4:
            continue
        cell = row[0]
        teamName = cell.text_content().strip()
        wiki = getWikiLink(cell)
        isHeader = teamName == 'Team' or teamName == 'Rank' or teamName.isnumeric()
        if isHeader or "ERROR" in wiki:
            continue
        # Running total of records so far, printed as a progress indicator.
        print(len(outputData))
        # extend() appends in place instead of rebuilding the list per team.
        outputData.extend(getPlayers(league,country,teamName,getSquadUrl(wiki)))
    return outputData

In [6]:
# Scrape the 2019-20 Premier League season page. This is slow: getTeams
# downloads the league page, then one squad page per team, and getPlayers
# issues one further request per player found.
teams = getTeams('https://en.wikipedia.org/wiki/2019%E2%80%9320_Premier_League','Premier League','England')

0
Found  63 Players for Arsenal
63
Found  34 Players for Aston Villa
97
Found  30 Players for Bournemouth
127
Found  39 Players for Brighton & Hove Albion
166
Found  36 Players for Burnley
202
Found  1 Players for Chelsea
203
Found  27 Players for Crystal Palace
230
Found  28 Players for Everton
258
Found  38 Players for Leicester City
296
Found  1 Players for Liverpool
297
Found  0 Players for Manchester City
297
Found  3 Players for Manchester United
300
Found  32 Players for Newcastle United
332
Found  1 Players for Norwich City
333
Found  30 Players for Sheffield United
363
Found  14 Players for Southampton
377
Found  24 Players for Tottenham Hotspur
401
Found  38 Players for Watford
439
Found  30 Players for West Ham United
469
Found  0 Players for Wolverhampton Wanderers


Teams with suspiciously low player counts: Chelsea, Liverpool, Man City, Man Utd, Norwich City, Southampton

In [7]:
# Total number of player records collected across all teams.
print(len(teams))

469


In [8]:
Dodgy teams:
    

('Premier League', 'England', 'Arsenal', 'Bernd Leno', 'Germany', 8, 'https://en.wikipedia.org/wiki/Bernd_Leno')
