In [51]:
import io
import requests
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import urllib.parse
import html5lib

In [25]:
# tournament level

# Tabroom tournament to scrape; every page URL below is derived from this id.
tourn_id = 16917
tourn_id_str = f"?tourn_id={tourn_id}"
url_str = "https://www.tabroom.com/index/tourn/"

# Tournament pages, all addressed with the same tourn_id query string.
home_page = url_str + "index.mhtml" + tourn_id_str
entries = url_str + "fields.mhtml" + tourn_id_str
judges = url_str + "judges.mhtml" + tourn_id_str
pairings = url_str + "postings/index.mhtml" + tourn_id_str
results = url_str + "results/index.mhtml" + tourn_id_str

# Fetch the home page as a connectivity check.  A timeout is passed because
# requests otherwise waits indefinitely on a server that never answers.
URL = home_page
resp = requests.get(URL, timeout=30)

if resp.status_code == 200:
    # Parsed home page; only defined when the request succeeded.
    soup = BeautifulSoup(resp.text, "lxml")
else:
    print("connection failed")



In [64]:
#parsing events from entry

def parse_events():
    """Collect the events listed on the tournament's entries page.

    Downloads the page at the module-level ``entries`` URL and inspects every
    ``<a class="blue full">`` link on it.

    Returns:
        dict: maps each event id (the ``event_id`` query parameter of the
        link, as a string) to the link's text with surrounding whitespace
        removed.  Links without an ``event_id`` parameter are skipped.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(Request(entries)) as html_page:
        soup = BeautifulSoup(html_page, "lxml")

    event_dict = {}

    for link in soup.find_all('a', class_="blue full"):
        # Read the id from the parsed href instead of slicing the tag's
        # serialized HTML, so attribute order and extra query parameters
        # cannot corrupt the value.
        query = urllib.parse.urlparse(link.get('href', '')).query
        event_ids = urllib.parse.parse_qs(query).get('event_id')
        if not event_ids:
            continue

        event_dict[event_ids[0]] = link.get_text().strip()

    return event_dict

In [76]:
# getting participants
def get_participants():
    """Build one table of entries across every event of the tournament.

    For each event returned by ``parse_events()`` the event's entry page is
    downloaded once; the same bytes feed both ``pd.read_html`` (first table
    on the page) and BeautifulSoup (record links).

    The ``Record`` and ``Status`` columns are dropped when present and four
    columns are added:

    * ``Competitive History`` - record URL of the entry ('' when unavailable)
    * ``ID 1`` / ``ID 2``     - ``id1`` / ``id2`` query parameters of that URL
      as strings ('' when unavailable)
    * ``Event ID``            - id of the event the row belongs to

    Returns:
        pandas.DataFrame: all per-event tables concatenated with a fresh
        index, or an empty DataFrame when no events were found.
    """
    event_table_list = []

    for event_id in parse_events():
        event_link = entries + "&event_id=" + event_id

        # Fetch the page a single time and release the connection right away.
        with urlopen(Request(event_link)) as html_page:
            html_bytes = html_page.read()

        soup = BeautifulSoup(html_bytes, "lxml")
        table = pd.read_html(io.BytesIO(html_bytes))[0]

        if "Record" in table:
            record_url_list = []
            user_id1_list = []
            user_id2_list = []

            # Skip the first <tr>, which is the header row.
            # NOTE(review): assumes every remaining <tr> on the page belongs
            # to the first table - confirm against a page with several tables.
            for row in soup.find_all('tr')[1:]:
                link = row.find('a', href=True)
                if link is None:
                    record_url_list.append('')
                    user_id1_list.append('')
                    user_id2_list.append('')
                    continue

                # The parsed href is already unescaped ("&", not "&amp;"),
                # so the stored URL is directly usable.
                href = link['href']
                record_url_list.append("https://tabroom.com" + href)

                params = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
                user_id1_list.append(params.get('id1', [''])[0])
                user_id2_list.append(params.get('id2', [''])[0])

            table = table.drop(columns=["Record"])
        else:
            # No record links for this event: fill with blanks, using a
            # separate list per column rather than three aliases of one list.
            row_count = len(table)
            record_url_list = [''] * row_count
            user_id1_list = [''] * row_count
            user_id2_list = [''] * row_count

        if "Status" in table:
            table = table.drop(columns=['Status'])

        table['Competitive History'] = record_url_list
        table['ID 1'] = user_id1_list
        table['ID 2'] = user_id2_list
        table['Event ID'] = [event_id] * len(table)
        event_table_list.append(table)

    # pd.concat raises on an empty list, so handle "no events" explicitly.
    if not event_table_list:
        return pd.DataFrame()

    return pd.concat(event_table_list, ignore_index=True)

In [54]:
# handling judges
def _paradigm_html(judge_url):
    """Return the paradigm published at ``judge_url`` or "No paradigm".

    The inner markup of every ``<div class="paradigm ltborderbottom">`` on the
    page is returned (only the wrapper div is removed).  A page without such
    a div, or with an empty one, yields "No paradigm".
    """
    with urlopen(Request(judge_url)) as html_page:
        judge_soup = BeautifulSoup(html_page, "lxml")

    blocks = judge_soup.find_all('div', class_="paradigm ltborderbottom")
    paradigm = "\n".join(block.decode_contents().strip() for block in blocks).strip()
    return paradigm if paradigm else "No paradigm"


def parse_judges():
    """Build one table of judges across every judge pool of the tournament.

    The page at the module-level ``judges`` URL lists the pools as
    ``<div class="odd nospace">`` blocks, each linking to a judge list.  Every
    list page is downloaded once and its first table is read with
    ``pd.read_html``; for each row that contains a link, the linked judge page
    is fetched to obtain the paradigm.

    The ``Para``, ``Rounds`` and ``Middle`` columns are dropped (when present)
    and three columns are added:

    * ``Event``    - title of the pool the judge was listed under
    * ``Paradigm`` - paradigm markup, or "No paradigm"
    * ``Judge ID`` - ``judge_person_id`` from the judge link as an int,
      or 0 when the row has no link or the id is missing/non-numeric

    Returns:
        pandas.DataFrame: all pool tables concatenated with a fresh index,
        or an empty DataFrame when no pools were found.
    """
    with urlopen(Request(judges)) as html_page:
        soup = BeautifulSoup(html_page, "lxml")

    # Pool title -> URL of that pool's judge list.
    paradigm_link_dict = {}
    for pool in soup.find_all('div', class_="odd nospace"):
        link = pool.find('a', href=True)
        if link is None:
            continue

        # The title is the text between the empty "halfspacer" span and the
        # next closing </span>.
        # NOTE(review): kept as string slicing because the page structure is
        # not visible here - confirm before switching to tag navigation.
        pool_html = str(pool)
        marker = '"halfspacer"></span>'
        title_start = pool_html.find(marker) + len(marker)
        title_length = pool_html[title_start:].find('</span>')
        event_title = pool_html[title_start:title_start + title_length].strip()

        # The parsed href is already unescaped, unlike the serialized HTML.
        paradigm_link_dict[event_title] = "https://www.tabroom.com" + link['href'].strip()

    table_list = []
    event_list = []
    paradigm_list = []
    judge_person_id_list = []

    for event_title, paradigm_url in paradigm_link_dict.items():
        # Fetch the list page once; the same bytes feed pandas and
        # BeautifulSoup, so table rows and <tr> rows come from one snapshot.
        with urlopen(Request(paradigm_url)) as html_page:
            html_bytes = html_page.read()

        table_list.append(pd.read_html(io.BytesIO(html_bytes))[0])
        pool_soup = BeautifulSoup(html_bytes, "lxml")

        # Skip the first <tr>, which is the header row.
        for row in pool_soup.find_all('tr')[1:]:
            event_list.append(event_title)

            link = row.find('a', href=True)
            if link is None:
                judge_person_id_list.append(0)
                paradigm_list.append("No paradigm")
                continue

            href = link['href']
            params = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
            try:
                judge_person_id = int(params.get('judge_person_id', ['0'])[0])
            except ValueError:
                judge_person_id = 0
            judge_person_id_list.append(judge_person_id)

            paradigm_list.append(_paradigm_html("https://www.tabroom.com" + href))

    # pd.concat raises on an empty list, so handle "no pools" explicitly.
    if not table_list:
        return pd.DataFrame()

    judge_table = pd.concat(table_list, ignore_index=True)
    # errors='ignore' keeps the scrape working when a pool page lacks one of
    # these columns.
    judge_table = judge_table.drop(['Para', 'Rounds', 'Middle'], axis=1, errors='ignore')
    judge_table['Event'] = event_list
    judge_table['Paradigm'] = paradigm_list
    judge_table['Judge ID'] = judge_person_id_list
    return judge_table

In [10]:
# Scrape every judge pool.  parse_judges() requests one extra page for each
# listed judge that has a link, so this cell can take a while to finish.
full_judge_list = parse_judges()


In [77]:
# Optional: uncomment to show the complete judge table without pandas'
# row/column truncation.
# with pd.option_context('display.max_rows', None,
#                        'display.max_columns', None,
#                        'display.precision', 3,
#                        ):
#     display(full_judge_list)
# Scrape the entry lists of every event and render the combined table.
# NOTE(review): `display` is not imported in this notebook; presumably it is
# the IPython built-in provided by the kernel - confirm before running this
# outside Jupyter.
p_table = get_participants()
display(p_table)


Unnamed: 0,School,Location,Entry,Code,Competitive History,ID 1,ID 2,Event ID
0,Archbishop Mitty,CA/US,Maria Korolik,Maria Korolik,,,,141442
1,Archbishop Mitty,CA/US,Rohan Bharadwaj,Rohan Bharadwaj,,,,141442
2,Archbishop Mitty,CA/US,Caitlin Bergevin,Caitlin Bergevin,,,,141442
3,Bellarmine College Prep,CA/US,Nathan Gong,Nathan Gong,,,,141442
4,Bellarmine College Prep,CA/US,Veer Juneja,Veer Juneja,,,,141442
...,...,...,...,...,...,...,...,...
1547,Westmont,CA/US,Hand & Roman,Westmont HR,https://tabroom.com/index/results/team_results...,858019,858020,141456
1548,Westmont,CA/US,Narurkar & Vaddavalli,Westmont NV,https://tabroom.com/index/results/team_results...,721623,721622,141456
1549,Westmont,CA/US,Tucker & Breithaupt,Westmont TB,https://tabroom.com/index/results/team_results...,721627,721625,141456
1550,Westridge,CA/US,Hoffman & Wilson,Westridge WH,https://tabroom.com/index/results/team_results...,659367,842326,141456
