In [371]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

import pandas as pd
import numpy as np
import re

### Get tournament url's that fit our criteria (online/offline)
TODO: Convert the table collecting process to my get_table method

In [372]:
url = "./chess_results.html"
with open(url) as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [373]:
tour_df_list = []
# using soup to select by css selectors
# https://stackoverflow.com/questions/24801548/how-to-use-css-selectors-to-retrieve-specific-links-lying-in-some-class-using-be
table = soup.select('table.CRs2 tr')

# ignore first row
for row in table[1:]:
    
    # Get values in row
    vals = row.select('td')
    
    # Get url
    url_idx = 0
    url = vals[url_idx].a['href']
    tour_id = int(re.findall("tnr(\d+).aspx", url)[0])
    
    # Get location
    loc_idx = 8
    location = vals[loc_idx].text.strip()
    
    # Get medium (online or offline)
    medium = None
    if '.com' in location.lower() or 'online' in location.lower():
        medium = 'online'
    elif location != '':
        medium = 'offline'
        
    # Get Time Control
    time_control_idx = len(vals)-5
    time_control = vals[time_control_idx].text.strip()
     
    tour_data = {
        'tournament_id': tour_id,
        'url': url,
        'location': location,
        'medium': medium,
        'time_control': time_control
    }
    tour_df_list.append(tour_data)

    

In [374]:
tour_df = pd.DataFrame(tour_df_list)

In [375]:
tour_df.head()

Unnamed: 0,tournament_id,url,location,medium,time_control
0,686540,https://chess-results.com/tnr686540.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
1,669871,https://chess-results.com/tnr669871.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
2,686538,https://chess-results.com/tnr686538.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
3,686539,https://chess-results.com/tnr686539.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
4,670407,https://chess-results.com/tnr670407.aspx?lan=1,Cala Gonone - Dorgali (NU),offline,90'/40 moves + 30' + 30'' bonus


In [376]:
tour_df[tour_df['medium']=='offline'].shape

(215, 5)

In [377]:
good_tour_df = tour_df[tour_df['medium'].isin(['offline','online'])]

In [378]:
good_tour_df.to_csv('./data/tournaments.csv', index=False)

### Get tournament results

In [None]:
def get_soup(url):
    # Create the soup
    req = Request(url)
    html_page = urlopen(req)
    soup = BeautifulSoup(html_page, 'html.parser')   
    
    return soup

In [None]:
def get_table(url):
    soup = get_soup(url)
    
    # Select the games
    df_list = []
    rows = soup.select('table tr[class^="CR"]')
    
    # Get the column names
    cols = None
    for r in rows:
        c = r.select('td[class^="CR"]')
        if len(c) > 1:
            cols = [c[i].text.strip() for i in range(len(c))]
            break
    #print(cols)
    
    # If we can find column names, then move on to appending the rows
    if cols:
        for r in rows:
            # make sure number of cols is big enough (and cut if required)
            elems = r.select('td')
            if len(elems) >= len(cols):
                vals = [elems[i].text.strip() for i in range(len(cols))]
                df_list.append(vals)
        df = pd.DataFrame(df_list[1:], columns=cols)
        return df
        
    else:
        return None
        

In [None]:
# testing 
test_url = 'https://chess-results.com/tnr679621.aspx?lan=1'
test_df = get_table(test_url)
test_df.head()

### Get players

In [380]:
tour_df = pd.read_csv('./data/tournaments.csv')
tour_df.head()

Unnamed: 0,tournament_id,url,location,medium,time_control
0,686540,https://chess-results.com/tnr686540.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
1,669871,https://chess-results.com/tnr669871.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
2,686538,https://chess-results.com/tnr686538.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
3,686539,https://chess-results.com/tnr686539.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
4,670407,https://chess-results.com/tnr670407.aspx?lan=1,Cala Gonone - Dorgali (NU),offline,90'/40 moves + 30' + 30'' bonus


In [381]:
tour_ids = tour_df['tournament_id']

# testing
#tour_ids = tour_ids[:1]

In [382]:
tour_url = 'https://chess-results.com/tnr{tid}.aspx?lan=1'

In [251]:
req_player_cols = ['Name','FideID','Rtg']

In [255]:
# collect players
players_df_list = []
for tour_id in tour_ids:
    print('tour_id: {0}'.format(tour_id))
    players_df = get_table(tour_url.format(tid=tour_id))
    if players_df is not None and set(req_player_cols).issubset(set(players_df.columns)):
        players_df_list.append(players_df[req_player_cols])

In [256]:
players_path = './data/players.csv'

# IN THE FUTURE, this will be reading the existing df
old_players_df = pd.DataFrame([['dummy','dummy','dummy']], columns=req_player_cols)

new_players_df = pd.concat([old_players_df] + players_df_list)
new_players_df.head()


Unnamed: 0,Name,FideID,Rtg
0,dummy,dummy,dummy
0,Margadgua Erdenebayar,4904052,1667
1,Alipbek Arailym,13724800,1630
2,Von Beckh Frieda,16233751,1615
3,Amulya Guruprasad,25683896,1537


In [260]:
new_players_df.to_csv(players_path, index=False)

### Get games

In [421]:
players_path = './data/players.csv'
players = pd.read_csv(players_path)
players.set_index('Name', inplace=True)
players.head()

Unnamed: 0_level_0,FideID,Rtg
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
dummy,dummy,dummy
Margadgua Erdenebayar,4904052,1667
Alipbek Arailym,13724800,1630
Von Beckh Frieda,16233751,1615
Amulya Guruprasad,25683896,1537


In [422]:
tour_df = pd.read_csv('./data/tournaments.csv')
tour_df.set_index('tournament_id', inplace=True)

tour_df.head()

Unnamed: 0_level_0,url,location,medium,time_control
tournament_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
686540,https://chess-results.com/tnr686540.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
669871,https://chess-results.com/tnr669871.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
686538,https://chess-results.com/tnr686538.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
686539,https://chess-results.com/tnr686539.aspx?lan=1,"Mellieha, Malta",offline,90min+30sec/move
670407,https://chess-results.com/tnr670407.aspx?lan=1,Cala Gonone - Dorgali (NU),offline,90'/40 moves + 30' + 30'' bonus


In [429]:
tour_ids = list(tour_df.index)

# testing
tour_ids = tour_ids[:2]

In [430]:
tour_ids

[686540, 669871]

In [None]:
round_url = 'https://chess-results.com/tnr{tid}.aspx?lan=1&art=2&rd={rid}'

In [None]:
def parse_games_df(games_df):
    # make left and right distinction
    name_idxs = np.where(np.array(games_df.columns) == 'Name')[0].tolist()
    rtg_idxs = np.where(np.array(games_df.columns) == 'Rtg')[0].tolist()

    games_df.columns.values[name_idxs[0]] = "left_name"
    games_df.columns.values[name_idxs[1]] = "right_name"
    games_df.columns.values[rtg_idxs[0]] = "left_rtg"
    games_df.columns.values[rtg_idxs[1]] = "right_rtg"

    games_df = games_df.join(players['FideID'], on='left_name')
    games_df.rename(columns={'FideID': 'left_fide_id'}, inplace=True)
    games_df = games_df.join(players['FideID'], on='right_name')
    games_df.rename(columns={'FideID': 'right_fide_id'}, inplace=True)

    games_df['tournament_id'] = [tour_id for i in range(games_df.shape[0])]

    games_df = games_df.astype({
        'left_rtg': 'int',
        'right_rtg': 'int',
        'left_fide_id': 'int',
        'right_fide_id': 'int',
        'tournament_id': 'int',
    })

    # TODO: who's black and who's white?

    # Upset_score
    upset_scores = []
    for index, row in games_df.iterrows():
        #print(row)
        score=0
        result = row['Result']
        left_rtg = row['left_rtg']
        right_rtg = row['right_rtg']
        if (result=='1 - 0' and left_rtg<right_rtg) or (result=='0 - 1' and right_rtg<left_rtg):
            score = abs(right_rtg - left_rtg)
        elif result=='½ - ½':
            score = abs(right_rtg - left_rtg) * 0.5
        upset_scores.append(score)
    games_df['upset_score'] = upset_scores

    # print(games_df.head())
    req_game_cols = ['upset_score', 'tournament_id', 'left_fide_id', 'left_rtg', 'right_fide_id', 'right_rtg']
    if set(req_game_cols).issubset(set(games_df.columns)):
        return games_df[req_game_cols]
    else:
        return None

In [None]:
games_df_list = []
for tour_id in tour_ids:
    print('tournament_id: {0}'.format(tour_id))
    
    round_id = 1
    try:
        games_df = get_table(round_url.format(tid=tour_id, rid=round_id))
    except:
        continue

    while games_df is not None:
        print('round: {0}'.format(round_id))
        
        try:
            games_df_list.append(parse_games_df(games_df))
        except:
            pass
        
        try:
            games_df = get_table(round_url.format(tid=tour_id, rid=round_id))
        except:
            pass
    
        round_id += 1

In [None]:
all_games_df = pd.concat(games_df_list)

In [None]:
# Join with tournament info
all_games_df = all_games_df.join(tour_df[['location', 'medium', 'time_control']], on='tournament_id')

In [None]:
all_games_df.to_csv('./data/games.csv')