# Collect teams' rankings and match history data

## Teams' ranking points

In [None]:
from selenium.webdriver import ChromeOptions, Chrome
from bs4 import BeautifulSoup

# configure the Chrome session: headless, incognito, tolerant of certificate errors
options = ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito', '--headless'):
    options.add_argument(flag)

In [None]:
# start a browser with the configured options and load the team listing page
teams_page = 'https://www.opendota.com/teams'
driver = Chrome(chrome_options=options)
driver.get(teams_page)

# turn whatever page the driver currently shows into a BeautifulSoup tree
def parse(driver):
    """Return the driver's current page source parsed with the lxml parser."""
    return BeautifulSoup(driver.page_source, 'lxml')

# Parse the listing, then shut the browser down: the page source has already
# been captured, and the scraping loop further down opens its own sessions,
# so leaving this one open would only leak a Chrome process.
try:
    soup = parse(driver)
finally:
    driver.quit()

In [None]:
from pandas import DataFrame
from datetime import date

names = []  # team names
links = []  # team links
ranks = []  # team rankings

# walk every table row after the header and collect link, name and rank
for row in soup.find_all('tr')[1:]:
    anchor = row.find('a', href=True)
    links.append(anchor['href'])
    names.append(anchor.get_text())
    rank_text = row.find('div', class_='iARqUo').get_text()
    # the last character of the rank text is dropped before converting to int
    ranks.append(int(rank_text[:-1]))

# one row per team, columns in the order name, link, rank
frame = DataFrame({'name': names, 'link': links, 'rank': ranks})

## Match history

In [None]:
from datetime import datetime, timedelta

# reference timestamp captured once; get_frame subtracts each match's
# relative age ("N days ago") from it to obtain a calendar date
now = datetime.now()

# build the full matches-page URL from a team's site-relative link
def get_url(link):
    """Return the OpenDota matches URL for the given relative team link."""
    return ''.join(('https://www.opendota.com', link, '/matches'))

# convert a relative age label into a whole number of days (None = too old)
def _days_ago(text):
    """Return how many days ago a relative age such as '3 days ago' refers to.

    Ages given in seconds, minutes or hours count as 0 days. A singular
    day label without a number (e.g. 'a day ago') counts as 1. Any text
    that mentions none of these units (e.g. 'a month ago') yields None.
    """
    import re  # local import: the notebook does not import re at top level

    unit = re.search(r'\b(second|minute|hour|day)s?\b', text)
    if unit is None:
        return None
    if unit.group(1) != 'day':
        return 0
    number = re.search(r'(\d+)\s+days?\b', text)
    return int(number.group(1)) if number else 1


# get DataFrame of matches history from BeautifulSoup object
# also indicate if found a match from more than a month ago (False if found)
def get_frame(soup):
    """Turn one page of a team's match table into a DataFrame.

    Every table row after the header becomes a record with the columns
    'time' (date), 'lost' (bool), 'opponent' (name) and 'o_rank'
    (opponent's rank). Scanning stops at the first row whose age is not
    expressed in days or a smaller unit.

    Side effect: an opponent that is not yet in the global ``frame`` is
    looked up on its own team page (in a temporary browser that is closed
    afterwards) and added to ``frame``.

    Returns:
        (DataFrame, bool): the records collected from this page, and a flag
        that is False when a too-old row was hit and True when every row
        was consumed.
    """
    from pandas import concat  # local import so the function does not rely on a later cell

    global frame
    columns = ['time', 'lost', 'opponent', 'o_rank']
    rows = []  # collected as dicts; building the frame once avoids per-row copies
    for tr in soup.find_all('tr')[1:]:
        match = {}
        age_text = tr.find('div', class_='jehLtr').get_text()

        # check if match is more than a month ago
        days = _days_ago(age_text)
        if days is None:
            return DataFrame(rows, columns=columns), False

        # time
        match['time'] = (now - timedelta(days=days)).date()

        # result (True if lost)
        match['lost'] = (tr.find_all('td')[2].find('span').get_text()[0] == 'L')

        # opponent
        a = tr.find_all('a', href=True)[1]
        opponent = a.get_text()
        match['opponent'] = opponent

        # opponent's rank: exact-name lookup, so the membership test and the
        # row selection always agree (a substring/regex test could report a
        # hit that the exact selection then fails to find)
        known = frame['name'] == opponent
        if known.any():
            rank = frame[known]['rank'].tolist()[0]
        else:
            href = a['href']
            driver_opponent = Chrome(chrome_options=options)
            try:
                driver_opponent.get(get_url(href))
                soup_team = parse(driver_opponent)
            finally:
                # always release the temporary browser, even if loading fails
                driver_opponent.quit()
            # NOTE(review): this rank is kept as the page's text, while ranks
            # taken from the team listing are stored as int -- confirm whether
            # it should be converted
            rank = soup_team.find_all('div', class_='iLrWwU')[2].find_all('span')[1].get_text()
            new_team = DataFrame([{'name': opponent, 'link': href, 'rank': rank}])
            frame = concat([frame, new_team], ignore_index=True)
        match['o_rank'] = rank

        rows.append(match)
    return DataFrame(rows, columns=columns), True

In [None]:
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions
from time import sleep
from pandas import concat

# Scrape the recent match history of every listed team into its own CSV file.
for link in links:
    name = frame[frame['link'] == link]['name'].tolist()[0]
    print('getting ' + name)
    driver = Chrome(chrome_options=options)
    try:
        driver.get(get_url(link))
        frame_init, threshold = get_frame(parse(driver))
        sleep(1)
        last_page = frame_init
        # keep paging while every row seen so far is recent enough
        while threshold:
            buttons = driver.find_elements_by_css_selector('.sc-fAjcbJ.gzZqVc')
            if not buttons:
                # no pagination control found: nothing more to load
                break
            buttons[-1].click()
            frame_his, threshold = get_frame(parse(driver))
            if frame_his.empty or frame_his.equals(last_page):
                # the click produced no new rows (empty page, or the same page
                # again); stop instead of looping forever and duplicating data
                break
            frame_init = concat([frame_init, frame_his])
            last_page = frame_his
    finally:
        # one browser per team: always close it so Chrome processes do not pile up
        driver.quit()
    frame_init.to_csv('../data/teams/' + name.replace('.', '') + '.csv', index=False)

In [None]:
# write the ranking table to a CSV file named after today's date
ranks_path = '../data/ranks/' + date.today().isoformat() + '.csv'
frame.to_csv(ranks_path, index=False)