In [97]:
import os
import re
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

### Get tournament URLs that fit our criteria (online/offline)

In [98]:
# Parse the locally saved HTML page that holds the tournament listing.
# NOTE: despite its name, `url` is a local file path here.
url = "./chess_results.html"
# Open in binary mode so BeautifulSoup detects the document's own encoding
# instead of depending on the platform's default text encoding.
with open(url, 'rb') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [99]:
# Accumulator for the per-tournament records (one dict per table row).
tour_df_list = list()

In [100]:

# Build one record (id, url, location, medium) per row of the listing table.
# using soup to select by css selectors
# https://stackoverflow.com/questions/24801548/how-to-use-css-selectors-to-retrieve-specific-links-lying-in-some-class-using-be
table = soup.select('table.CRs2 tr')

# Positions of the cells read from each row
url_idx = 0
loc_idx = 8

# Raw string with the dot escaped, so only a literal "tnr<digits>.aspx" matches
tour_id_pattern = re.compile(r"tnr(\d+)\.aspx")

# Start from an empty list so re-running this cell does not append duplicates
tour_df_list = []

# ignore first row
for row in table[1:]:

    # Get values in row
    vals = row.select('td')

    # Skip rows that are too short or that carry no link in the url cell
    if len(vals) <= max(url_idx, loc_idx) or vals[url_idx].a is None:
        continue

    # Get url and the numeric tournament id embedded in it
    tour_url = vals[url_idx].a.get('href', '')
    id_match = tour_id_pattern.search(tour_url)
    if id_match is None:
        continue
    tour_id = int(id_match.group(1))

    # Get location (stripped, so a whitespace-only cell counts as empty)
    location = vals[loc_idx].text.strip()

    # Get medium (online or offline); stays None when the location is empty
    location_lower = location.lower()
    if '.com' in location_lower or 'online' in location_lower:
        medium = 'online'
    elif location:
        medium = 'offline'
    else:
        medium = None

    # TODO: Get country

    tour_df_list.append({
        'tournament_id': tour_id,
        'url': tour_url,
        'location': location,
        'medium': medium,
    })

    

In [101]:
# Assemble the scraped records into a DataFrame. The columns are listed explicitly
# so the frame has the expected columns even when no rows were scraped.
tour_df = pd.DataFrame(
    tour_df_list,
    columns=['tournament_id', 'url', 'location', 'medium'],
)

In [102]:
# Preview the first five scraped tournaments.
tour_df.head(n=5)

Unnamed: 0,tournament_id,url,location,medium
0,686540,https://chess-results.com/tnr686540.aspx?lan=1,"Mellieha, Malta",offline
1,669871,https://chess-results.com/tnr669871.aspx?lan=1,"Mellieha, Malta",offline
2,686538,https://chess-results.com/tnr686538.aspx?lan=1,"Mellieha, Malta",offline
3,686539,https://chess-results.com/tnr686539.aspx?lan=1,"Mellieha, Malta",offline
4,670407,https://chess-results.com/tnr670407.aspx?lan=1,Cala Gonone - Dorgali (NU),offline


In [103]:
# Keep only the tournaments whose medium was classified as offline or online.
has_known_medium = tour_df['medium'].isin(['offline', 'online'])
good_tour_df = tour_df.loc[has_known_medium]

In [105]:
# Save the filtered tournaments. The output folder is created first so the cell
# also works on a fresh checkout where ./data does not exist yet.
os.makedirs('./data', exist_ok=True)
good_tour_df.to_csv('./data/tournaments.csv', index=False)

In [27]:
# Use the first filtered tournament as a sample page to experiment with.
# (`f_links` is not defined anywhere in this notebook, so referencing it raised
# a NameError on a fresh kernel.)
test_url = good_tour_df['url'].iloc[0]

In [30]:
# https://pythonprogramminglanguage.com/get-links-from-webpage/
# Download the sample page. The `with` block closes the HTTP response once the
# body has been read, and the timeout keeps a stalled connection from hanging
# the kernel. `html_page` holds the raw response bytes.
req = Request(test_url)
with urlopen(req, timeout=30) as response:
    html_page = response.read()

In [31]:
# Parse the downloaded page content. Passing `test_url` here would make
# BeautifulSoup treat the URL string itself as the markup.
soup = BeautifulSoup(html_page, 'html.parser')

In [2]:
# Get attribute data ...

### Get tournament results

In [133]:
# Reload the saved tournament list and pull out the id column.
tournaments_csv = './data/tournaments.csv'
tour_df = pd.read_csv(tournaments_csv)
tour_ids = tour_df.loc[:, 'tournament_id']

In [134]:
# URL template for a tournament's results page; `{id}` is replaced with the
# tournament id via str.format(id=...).
tour_results_url = (
    'https://chess-results.com/tnr{id}.aspx?lan=1&art=2&rd=1'
)

In [None]:
# Download each tournament's results page, extract its results table and save it
# as ./data/tournaments/<tournament id>.csv.

# Make sure the output folder exists before the first file is written
os.makedirs('./data/tournaments', exist_ok=True)

for tour_id in tour_ids:
    print(tour_id)

    # Create the soup. The `with` block closes the HTTP response once parsed and
    # the timeout keeps one stalled request from blocking the whole run.
    req = Request(tour_results_url.format(id=tour_id))
    try:
        with urlopen(req, timeout=30) as html_page:
            soup = BeautifulSoup(html_page, 'html.parser')
    except OSError as err:
        # URLError, HTTPError and timeouts are all OSError subclasses: report the
        # failure and carry on with the remaining tournaments.
        print('  skipped {0}: {1}'.format(tour_id, err))
        continue

    # Select the games and collect each row's cells once.
    # (The `text=True` keyword previously passed to select() is not a CSS-select
    # filter and had no effect, so it is dropped.)
    games = soup.select('table tr[class^="CR"]')
    row_cells = [g.select('td[class^="CR"]') for g in games]

    # Get the column names: the first row with more than one cell is the header
    header_idx = next(
        (i for i, cells in enumerate(row_cells) if len(cells) > 1),
        None,
    )
    if header_idx is None:
        print('  no results table found for {0}'.format(tour_id))
        continue
    cols = [cell.text for cell in row_cells[header_idx]]

    # Keep the rows after the header that have at least as many cells as there
    # are columns. Shorter rows are titles,
    # e.g. https://chess-results.com/tnr683767.aspx?lan=1&art=2&rd=1
    df_list = [
        [cell.text for cell in cells[:len(cols)]]
        for cells in row_cells[header_idx + 1:]
        if len(cells) >= len(cols)
    ]

    # Create the DataFrame
    df = pd.DataFrame(df_list, columns=cols)
    df.to_csv('./data/tournaments/{id}.csv'.format(id=tour_id), index=False)

686540
669871
686538
686539
670407
670410
683767
665934
680077
680641
680633
680631
680636
680637
680639
680944
670153
670152
670151
670150
670149
670148
657090
673431
673429
675024
655571
655566
655565
655570
655569
655568
675755
675757
675756
674835
674836
672631
672624
672623
672221
672633
672625
641973
641963
672645
672627
672222
672220
661581
636731
661394
661395
659173
659176
659169
659168
643549
654763
656389
657561
658303
653095
641925
641917
643548
643262
643259
647382
647384
647383
647380
647379
647376
647375
647373
647372
647371
648627
647385
648637
650678
627802
636608
643627
643626
621180
635362
635600
635359
635364
635351
635354
635363
636369
632638
632511
632509
632507
632498
635726
635717
635708
635704
635719
635713
633943
633941
633940
633939
633938
633937
633936
633933
633932
633931
633930
633929
633586
633590
633588
633594
633593
633595
633592
633596
633597
633591
633587
633589
619868
610286
606677
605240
605233
600854
600855
600852
600853
600928
591762
591821
598842