# Libraries

In [None]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import date

# Webscraping

In [None]:
# Webscraping Code
# https://realpython.com/python-web-scraping-practical-introduction/
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape.

    Returns:
        The response body (`resp.content`) when `is_good_response` accepts
        the response, otherwise None. None is also returned, after printing
        the error, when the request raises a `RequestException`.
    """
    try:
        # `closing` releases the streamed connection even on early return
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        print(str(e))
        return None


def is_good_response(resp):
    """
    Return True if the response is a successful HTML response.

    Args:
        resp: Response object exposing `status_code` and a mapping-like
            `headers` attribute.

    Returns:
        True when the status code is 200 and the Content-Type header
        contains "html" (case-insensitive); False otherwise, including when
        the Content-Type header is absent.
    """
    # Look the header up safely: a response without a Content-Type header
    # must be rejected, not crash with a KeyError.
    content_type = resp.headers.get('Content-Type') or ''
    return resp.status_code == 200 and 'html' in content_type.lower()

In [None]:
# Empty result tables; the scraping functions append one row at a time.

# One row per league
leagueDF = pd.DataFrame(
    columns=['League_Name', 'League_Location', 'League_Country', 'League_ID', 'Status'],
)

# One row per league-attendance entry
leagueAttendanceDF = pd.DataFrame(
    columns=['League_ID', 'League_Season', 'Year', 'League_Attendance', 'Date'],
)

# One row per league-cup entry
leagueCupDF = pd.DataFrame(
    columns=['League_ID', 'League_Cup', 'Year', 'Cup_Attendance', 'Date'],
)

# One row per league-challenge entry
leagueChallengeDF = pd.DataFrame(
    columns=['League_ID', 'Challenge_Season', 'Year', 'Challenge_Attendance', 'Date'],
)

# This method will download all of the League Challenge data
def Get_League_Challege(url, ID, page):
    """
    Scrape one page of a league's challenge table and follow 'Next' links.

    Args:
        url: Base URL of the league; "/challenge?&page_=<page>" is appended
            to it to build the address that is actually requested.
        ID: League identifier written into the first column of every row.
        page: Page number to request.

    Side effects:
        Appends one row per table row to the module-level
        `leagueChallengeDF`. When the page has a 'Next' link, the following
        page is scraped (recursively) before this page's rows are saved.

    Raises:
        ValueError: if `simple_get` returned no HTML for the page.
    """
    # Keep the page address separate from `url`: the recursive call below
    # must receive the unmodified base URL, otherwise the suffix piles up.
    page_url = url + "/challenge?&page_=" + str(page)

    # Search URL for HTML
    raw_html = simple_get(page_url)
    if raw_html is None:
        raise ValueError("No HTML returned for " + page_url)
    html = BeautifulSoup(raw_html, 'html.parser')

    # Recursion: a page may carry several 'next' anchors, recurse only once
    next_links = html.find_all('a', attrs={'class': 'next'})
    if any(link.get_text()[:4] == 'Next' for link in next_links):
        Get_League_Challege(url, ID, page + 1)

    # League challenge table (first <tbody> on the page)
    rows = html.find("tbody").find_all("tr")

    # Iterate through each row in the table
    for row in rows:
        cells = row.find_all("td")

        # Get the data
        challenge_season = cells[0].get_text().replace('\n', '').replace('\t', '').strip()
        year = cells[1].get_text()
        challenge_attendance = cells[2].get_text()
        event_date = cells[3].get_text()

        # Save the record, indexed by this frame's own length so that every
        # row gets a fresh label instead of overwriting an earlier one
        leagueChallengeDF.loc[len(leagueChallengeDF)] = [
            ID, challenge_season, year, challenge_attendance, event_date,
        ]

# Collects a league's League Attendance history, one result page at a time
def Get_League_Attendance(url, ID, page):
    """
    Scrape one page of a league's attendance history and follow 'Next' links.

    Args:
        url: Base URL of the league; "/history?&page_=<page>" is appended to
            it to build the address that is requested.
        ID: League identifier written into the first column of every row.
        page: Page number to request.

    Side effects:
        Appends one row per table row to the module-level
        `leagueAttendanceDF`. When the page has a 'Next' link, the following
        page is scraped (recursively) before this page's rows are saved.
    """
    # Download and parse the requested history page
    history_url = url + "/history?&page_=" + str(page)
    soup = BeautifulSoup(simple_get(history_url), 'html.parser')

    # Follow pagination first; several 'next' anchors still mean one recursion
    next_links = soup.find_all('a', attrs={'class': 'next'})
    has_next = any(link.get_text()[:4] == 'Next' for link in next_links)
    if has_next:
        Get_League_Attendance(url, ID, page + 1)

    # The attendance data lives in the body of the first table on the page
    first_table = soup.find_all('table')[0]
    for table_row in first_table.find("tbody").find_all("tr"):
        cells = table_row.find_all("td")

        # Season label: drop embedded newlines/tabs, then trim
        season_text = cells[0].get_text()
        for junk in ('\n', '\t'):
            season_text = season_text.replace(junk, '')
        season_text = season_text.strip()

        year_text = cells[1].get_text()
        attendance_text = cells[2].get_text()
        date_text = cells[3].get_text()

        # Append as the next row of the attendance table
        next_label = len(leagueAttendanceDF)
        leagueAttendanceDF.loc[next_label] = [ID, season_text, year_text, attendance_text, date_text]

# This method will download all of the League Cup data
def Get_League_Cup(url, ID, page):
    """
    Scrape one page of a league's cup table and follow 'Next' links.

    Args:
        url: Base URL of the league; "/cup?&page_=<page>" is appended to it
            to build the address that is actually requested.
        ID: League identifier written into the first column of every row.
        page: Page number to request.

    Side effects:
        Appends one row per table row to the module-level `leagueCupDF`.
        When the page has a 'Next' link, the following page is scraped
        (recursively) before this page's rows are saved.

    Raises:
        ValueError: if `simple_get` returned no HTML for the page.
    """
    # Keep the page address separate from `url`: the recursive call below
    # must receive the unmodified base URL, otherwise the suffix piles up.
    page_url = url + "/cup?&page_=" + str(page)

    # Search URL for HTML
    raw_html = simple_get(page_url)
    if raw_html is None:
        raise ValueError("No HTML returned for " + page_url)
    html = BeautifulSoup(raw_html, 'html.parser')

    # Recursion: continue with the next *cup* page (not the attendance
    # scraper); a page may carry several 'next' anchors, recurse only once
    next_links = html.find_all('a', attrs={'class': 'next'})
    if any(link.get_text()[:4] == 'Next' for link in next_links):
        Get_League_Cup(url, ID, page + 1)

    # League cup table (first <tbody> on the page)
    rows = html.find("tbody").find_all("tr")

    # Iterate through each row in the table
    for row in rows:
        cells = row.find_all("td")

        # Get the data
        cup_season = cells[0].get_text().replace('\n', '').replace('\t', '').strip()
        year = cells[1].get_text()
        cup_attendance = cells[2].get_text()
        event_date = cells[3].get_text()

        # Save the record
        leagueCupDF.loc[len(leagueCupDF)] = [ID, cup_season, year, cup_attendance, event_date]

# This method will download all of the League data by calling the above methods
def Get_League_Data(url, ID, max_retries=3):
    """
    Scrape a league's details plus its attendance, challenge and cup tables.

    Args:
        url: URL of the league page.
        ID: League identifier stored with every saved row.
        max_retries: Maximum number of attempts before the league is skipped.
            Bounding the retries keeps a permanently failing page from
            retrying forever.

    Side effects:
        Appends one row to `leagueDF` and delegates to
        `Get_League_Attendance`, `Get_League_Challege` and `Get_League_Cup`,
        which append to their own tables. If an attempt fails, the error is
        printed and every row added during that attempt is removed again, so
        a retry cannot leave duplicates behind.
    """
    frames = (leagueDF, leagueAttendanceDF, leagueChallengeDF, leagueCupDF)

    for _attempt in range(max_retries):
        # Remember the current sizes so a failed attempt can be rolled back
        row_counts = [len(frame) for frame in frames]
        try:
            # Search URL for HTML
            raw_html = simple_get(url)
            html = BeautifulSoup(raw_html, 'html.parser')

            # The <dd> entries of the first 'verysimpleDL2' list hold the
            # league details; the code relies on entry 0 being the name and
            # entry 6 the country.
            address_html = html.find_all('dl', attrs={'class': 'verysimpleDL2'})[0].find_all("dd")

            # Address = every entry after the name, space separated
            address = " ".join(entry.get_text() for entry in address_html[1:])
            address = address.replace(u'\xa0', u'').strip()

            # Get League's Country
            league_Country = address_html[6].get_text()
            # Get League's Name
            league_Name = address_html[0].get_text()

            # Save Record
            leagueDF.loc[len(leagueDF)] = [league_Name, address, league_Country, ID, 'Active']

            # league attendance table
            Get_League_Attendance(url, ID, 1)

            # league challenge table
            Get_League_Challege(url, ID, 1)

            # league cup table
            Get_League_Cup(url, ID, 1)

            return

        except Exception as e:
            print(str(e))
            # Undo the partial results of this attempt before trying again
            for frame, count in zip(frames, row_counts):
                frame.drop(frame.index[count:], inplace=True)

    print("Giving up on " + str(url) + " after " + str(max_retries) + " attempts")

In [None]:
# Counter to print progress across pages
counter = 0

# This method will search for each page of leagues, then searches for each league's data
def Search_Webpage(url, page):
    """
    Scrape one page of league search results and every league listed on it.

    Args:
        url: Search URL ending right before the page number; `page` is
            appended to it to build the address that is requested.
        page: Page number to request.

    Side effects:
        Increments the module-level `counter`, prints progress and calls
        `Get_League_Data` for each league row. When the page has a 'Next'
        link, the following page is processed (recursively) before the rows
        of this page.

    Raises:
        ValueError: if `simple_get` returned no HTML for the page.
    """
    global counter

    # Create URL
    page_url = url + str(page)

    # Search URL for HTML
    raw_html = simple_get(page_url)
    if raw_html is None:
        raise ValueError("No HTML returned for " + page_url)
    html = BeautifulSoup(raw_html, 'html.parser')

    # Recursion: recurse at most once per page. Recursing for every matching
    # anchor would scrape all later pages again for each extra 'Next' link.
    next_links = html.find_all('a', attrs={'class': 'next'})
    if any(link.get_text()[:4] == 'Next' for link in next_links):
        print('Page ' + str(page))
        Search_Webpage(url, page + 1)

    # Search for table data
    rows = html.find_all('table', attrs={'id': 'table-1'})[0].find("tbody").find_all("tr")

    # Iterate each row in table
    for row in rows:
        cells = row.find_all("td")

        # The first link in the third cell carries the league name and URL
        league_link = cells[2].find_all('a', href=True)[0]

        # get league Name
        league_Name = league_link.get_text()

        # get League ID: sixth '/'-separated segment of the link target
        league_ID = league_link['href'].split('/')[5]

        # Use League ID to get League Data
        League_URL = "https://www.pokemon.com/uk/play-pokemon/pokemon-events/leagues/" + str(league_ID)

        # Print Progress
        counter = counter + 1
        print(str(counter) + " " + league_Name)

        # Get the League's Data
        Get_League_Data(League_URL, league_ID)

In [None]:
# Kick off the scrape at the first page of the league search results
Search_Webpage(
    url="https://www.pokemon.com/uk/play-pokemon/pokemon-events/find-an-event/?city=London&results_pp=100&location_name=&event_type=league&end_date=&event_name=&country=175&sort_order=distance&postal_code=&distance_within=99999&address=&product_type=tcg&start_date=0&state_other=&page_1=",
    page=1,
)

In [None]:
# Write every result table to its own CSV file
for csv_name, frame in (
    ("leagueAttendanceDF.csv", leagueAttendanceDF),
    ("leagueDF.csv", leagueDF),
    ("leagueCupDF.csv", leagueCupDF),
    ("leagueChallengeDF.csv", leagueChallengeDF),
):
    frame.to_csv(csv_name)