In [97]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

import pandas as pd
import numpy as np
import re

### Get tournament url's that fit our criteria (online/offline)
TODO: Convert the table collecting process to my get_table method

In [98]:
url = "./chess_results.html"
with open(url) as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [99]:
tour_df_list = []

In [100]:

# using soup to select by css selectors
# https://stackoverflow.com/questions/24801548/how-to-use-css-selectors-to-retrieve-specific-links-lying-in-some-class-using-be
table = soup.select('table.CRs2 tr')

# ignore first row
for row in table[1:]:
    
    # Get values in row
    vals = row.select('td')
    
    # Get url
    url_idx = 0
    url = vals[url_idx].a['href']
    tour_id = int(re.findall("tnr(\d+).aspx", url)[0])
    
    # Get location
    loc_idx = 8
    location = vals[loc_idx].text.strip()
    
    # Get medium (online or offline)
    medium = None
    if '.com' in location.lower() or 'online' in location.lower():
        medium = 'online'
    elif location != '':
        medium = 'offline'
        
    # TODO: Get country
    
    tour_data = {
        'tournament_id': tour_id,
        'url': url,
        'location': location,
        'medium': medium
    }
    tour_df_list.append(tour_data)

    

In [101]:
tour_df = pd.DataFrame(tour_df_list)

In [102]:
tour_df.head()

Unnamed: 0,tournament_id,url,location,medium
0,686540,https://chess-results.com/tnr686540.aspx?lan=1,"Mellieha, Malta",offline
1,669871,https://chess-results.com/tnr669871.aspx?lan=1,"Mellieha, Malta",offline
2,686538,https://chess-results.com/tnr686538.aspx?lan=1,"Mellieha, Malta",offline
3,686539,https://chess-results.com/tnr686539.aspx?lan=1,"Mellieha, Malta",offline
4,670407,https://chess-results.com/tnr670407.aspx?lan=1,Cala Gonone - Dorgali (NU),offline


In [152]:
tour_df[tour_df['medium']=='offline'].shape

(215, 4)

In [103]:
good_tour_df = tour_df[tour_df['medium'].isin(['offline','online'])]

In [105]:
good_tour_df.to_csv('./data/tournaments.csv', index=False)

In [27]:
test_url = f_links[0]

In [30]:
# https://pythonprogramminglanguage.com/get-links-from-webpage/
req = Request(test_url)
html_page = urlopen(req)

In [31]:
soup = BeautifulSoup(test_url, 'html.parser')

In [2]:
# Get attribute data ...

### Get tournament results

In [239]:
def get_soup(url):
    # Create the soup
    req = Request(url)
    html_page = urlopen(req)
    soup = BeautifulSoup(html_page, 'html.parser')   
    
    return soup

In [240]:
def get_table(url):
    soup = get_soup(url)
    
    # Select the games
    df_list = []
    rows = soup.select('table tr[class^="CR"]')
    
    # Get the column names
    cols = None
    for r in rows:
        c = r.select('td[class^="CR"]')
        if len(c) > 1:
            cols = [c[i].text.strip() for i in range(len(c))]
            break
    #print(cols)
    
    # If we can find column names, then move on to appending the rows
    if cols:
        for r in rows:
            # make sure number of cols is big enough (and cut if required)
            elems = r.select('td')
            if len(elems) >= len(cols):
                vals = [elems[i].text.strip() for i in range(len(cols))]
                df_list.append(vals)
        df = pd.DataFrame(df_list[1:], columns=cols)
        return df
        
    else:
        return None
        

In [241]:
# testing 
test_url = 'https://chess-results.com/tnr679621.aspx?lan=1'
test_df = get_table(test_url)
test_df.head()

Unnamed: 0,No.,Unnamed: 2,Name,ID,FideID,Rtg,sex,Typ,Club/City
0,1,IM,Rathanvel V S,87082021,25002112,2429,,,TN
1,2,IM,Nitin S.,70632021,5018277,2390,,,TN
2,3,GM,Laxman R.R.,869792021,5005361,2375,,,ICF
3,4,IM,Ajay Karthikeyan,67822021,35011685,2341,,,TN
4,5,IM,Harikrishnan.A.Ra,7202021,5081483,2309,,,TN


### Get players

In [242]:
tour_df = pd.read_csv('./data/tournaments.csv')
tour_df.head()

Unnamed: 0,tournament_id,url,location,medium
0,686540,https://chess-results.com/tnr686540.aspx?lan=1,"Mellieha, Malta",offline
1,669871,https://chess-results.com/tnr669871.aspx?lan=1,"Mellieha, Malta",offline
2,686538,https://chess-results.com/tnr686538.aspx?lan=1,"Mellieha, Malta",offline
3,686539,https://chess-results.com/tnr686539.aspx?lan=1,"Mellieha, Malta",offline
4,670407,https://chess-results.com/tnr670407.aspx?lan=1,Cala Gonone - Dorgali (NU),offline


In [249]:
tour_ids = tour_df['tournament_id']

# testing
#tour_ids = tour_ids[:1]

In [250]:
tour_url = 'https://chess-results.com/tnr{tid}.aspx?lan=1'

In [251]:
req_player_cols = ['Name','FideID','Rtg']

In [255]:
# collect players
players_df_list = []
for tour_id in tour_ids:
    print('tour_id: {0}'.format(tour_id))
    players_df = get_table(tour_url.format(tid=tour_id))
    if players_df is not None and set(req_player_cols).issubset(set(players_df.columns)):
        players_df_list.append(players_df[req_player_cols])

In [256]:
players_path = './data/players.csv'

# IN THE FUTURE, this will be reading the existing df
old_players_df = pd.DataFrame([['dummy','dummy','dummy']], columns=req_player_cols)

new_players_df = pd.concat([old_players_df] + players_df_list)
new_players_df.head()


Unnamed: 0,Name,FideID,Rtg
0,dummy,dummy,dummy
0,Margadgua Erdenebayar,4904052,1667
1,Alipbek Arailym,13724800,1630
2,Von Beckh Frieda,16233751,1615
3,Amulya Guruprasad,25683896,1537


In [260]:
new_players_df.to_csv(players_path, index=False)

### Get games

In [303]:
players_path = './data/players.csv'
players = pd.read_csv(players_path)
players.set_index('Name', inplace=True)
players.head()

Unnamed: 0_level_0,FideID,Rtg
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
dummy,dummy,dummy
Margadgua Erdenebayar,4904052,1667
Alipbek Arailym,13724800,1630
Von Beckh Frieda,16233751,1615
Amulya Guruprasad,25683896,1537


In [304]:
tour_df = pd.read_csv('./data/tournaments.csv')
tour_df.head()

Unnamed: 0,tournament_id,url,location,medium
0,686540,https://chess-results.com/tnr686540.aspx?lan=1,"Mellieha, Malta",offline
1,669871,https://chess-results.com/tnr669871.aspx?lan=1,"Mellieha, Malta",offline
2,686538,https://chess-results.com/tnr686538.aspx?lan=1,"Mellieha, Malta",offline
3,686539,https://chess-results.com/tnr686539.aspx?lan=1,"Mellieha, Malta",offline
4,670407,https://chess-results.com/tnr670407.aspx?lan=1,Cala Gonone - Dorgali (NU),offline


In [305]:
tour_ids = tour_df['tournament_id']

# testing
tour_ids = tour_ids[:1]

In [321]:
round_url = 'https://chess-results.com/tnr{tid}.aspx?lan=1&art=2&rd={rid}'

In [335]:
games_df_list = []
for tour_id in tour_ids:
    print('tournament_id: {0}'.format(tour_id))
    
    round_id = 1
    games_df = get_table(round_url.format(tid=tour_id, rid=round_id))
    while games_df is not None:
        print('round: {0}'.format(round_id))
        
        # make left and right distinction
        name_idxs = np.where(np.array(games_df.columns) == 'Name')[0].tolist()
        rtg_idxs = np.where(np.array(games_df.columns) == 'Rtg')[0].tolist()
        if len(name_idxs)==2 and len(rtg_idxs)==2:
            games_df.columns.values[name_idxs[0]] = "left_name"
            games_df.columns.values[name_idxs[1]] = "right_name"
            games_df.columns.values[rtg_idxs[0]] = "left_rtg"
            games_df.columns.values[rtg_idxs[1]] = "right_rtg"
            
            games_df = games_df.join(players['FideID'], on='left_name')
            games_df.rename(columns={'FideID': 'left_fide_id'}, inplace=True)
            games_df = games_df.join(players['FideID'], on='right_name')
            games_df.rename(columns={'FideID': 'right_fide_id'}, inplace=True)

            # print(games_df.head())
            req_game_cols = ['left_fide_id', 'left_rtg', 'right_fide_id', 'right_rtg']
            if set(req_game_cols).issubset(set(games_df.columns)):
                games_df_list.append(games_df[req_game_cols])
            
            # Continue formatting the games table
            # e.g. who's black and who's white?
        
        games_df = get_table(round_url.format(tid=tour_id, rid=round_id))
    
        round_id += 1

tournament_id: 686540
round: 1
round: 2
round: 3
round: 4
round: 5
round: 6
round: 7
round: 8
round: 9
round: 10


In [336]:
all_games_df = pd.concat(games_df_list)

In [337]:
all_games_df.to_csv('./data/games.csv')