In [4]:
# Environment bootstrap: install Selenium and a chromedriver binary via shell commands.
# Install the Selenium Python package with pip.
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
# Install the chromium-chromedriver package, then copy its driver binary into /usr/bin.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
# Prepends the chromedriver path to Python's module search path.
# NOTE(review): sys.path governs Python imports, not executable lookup, so this line
# presumably does not help Selenium locate the driver (the copy into /usr/bin likely
# does) -- confirm before relying on it or removing it.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
# Shared Chrome options reused by every webdriver.Chrome(...) call below.
# Flags passed to Chrome: headless mode, sandbox disabled, /dev/shm usage disabled.
chrome_options = webdriver.ChromeOptions()
for _flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  chrome_options.add_argument(_flag)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu

In [5]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

## Part 1

Goal: given
- FIDE ID (corresponding to a player)
- date1
- date2
- threshold

output
- Number of tournaments played with end date between date1 and date2 (inclusive)
- Number of games played during those tournaments
- Number of wins, losses, draws as black
- Number of wins, losses, draws as white
- PENDING: Number of wins, losses, draws against higher rated player (defined as having a rating of >= player rating + threshold). 
- PENDING: Number of wins, losses, draws against equal or lower rated player (defined as having a rating of < player rating + threshold).

**Note:** when calculating the last two bullet points, games involving a player with a zero rating are excluded: a player appearing in a tournament for the first time has no rating, and the recorded 0 is not an accurate representation of their rating. Also, only international (FIDE) ratings are considered.

Remarks:
1. Number of wins, losses, draws might not sum to total number of games, since a minority of games do not have results

In [86]:
def get_chess_result_data(fide_id_input, date1_input, date2_input, threshold):
  """Summarise a player's results on chess-results.com within a date range.

  Submits the chess-results.com player search form for the given FIDE ID and
  date range, then downloads every per-player tournament page linked from the
  result (links starting with 'tnr' that contain '&snr=') and tallies the
  results found on those pages.

  Args:
    fide_id_input: FIDE ID typed into the search form (ex: 25829092).
    date1_input: Start date typed into the search form (ex: 20160101).
    date2_input: End date typed into the search form (ex: 20220101).
    threshold: An opponent is counted as "higher rated" when their rating is
      >= the player's rating + threshold (ex: 150).

  Returns:
    A 14-tuple of counts, in this order:
    (n_tournaments, n_games,
     n_wins_white, n_losses_white, n_draws_white,
     n_wins_black, n_losses_black, n_draws_black,
     n_wins_higher, n_losses_higher, n_draws_higher,
     n_wins_lower, n_losses_lower, n_draws_lower).
    Wins/losses/draws may not sum to n_games because only the result strings
    '1', '0' and '½' are counted. Opponents with a rating of 0 are left out of
    the higher/lower tallies.

  A tournament page that fails to download or parse is reported and the loop
  moves on; whatever was tallied for that page before the failure is kept.
  """
  ### ex: fide_id_input = 25829092; date1_input = 20160101; date2_input = 20220101, threshold = 150

  # The driver path is no longer passed positionally: recent Selenium releases
  # reject it, while older ones already default to the 'chromedriver' executable.
  wd = webdriver.Chrome(options=chrome_options)
  try:
    wd.get("https://chess-results.com/SpielerSuche.aspx?lan=1")

    fide_id = wd.find_element("name", "ctl00$P1$txt_fideID")
    fide_id.click()
    fide_id.send_keys(fide_id_input)

    date1 = wd.find_element("name", "ctl00$P1$txt_von_tag")
    date1.click()
    date1.send_keys(date1_input)

    date2 = wd.find_element("name", "ctl00$P1$txt_bis_tag")
    date2.click()
    date2.send_keys(date2_input)

    button = wd.find_element("name", "ctl00$P1$cb_suchen")
    button.click()

    html_from_page = wd.page_source
  finally:
    # Always shut the browser down, otherwise every call leaks a Chrome process.
    wd.quit()

  soup = BeautifulSoup(html_from_page)

  tournament_links = [x['href'] for x in soup.find_all('a', href=True)]
  tournament_links = [link for link in tournament_links if link.startswith('tnr') and '&snr=' in link]

  n_tournaments = 0
  n_games = 0
  n_wins_white = 0
  n_losses_white = 0
  n_draws_white = 0
  n_wins_black = 0
  n_losses_black = 0
  n_draws_black = 0
  n_wins_higher = 0
  n_draws_higher = 0
  n_losses_higher = 0
  n_wins_lower = 0
  n_draws_lower = 0
  n_losses_lower = 0

  for link in tqdm(tournament_links):
    cur_link = "https://chess-results.com/" + link
    try:
      html_page = requests.get(cur_link)
      soup = BeautifulSoup(html_page.text)
      black = soup.find_all("div", {"class":"FarbesT"})
      white = soup.find_all("div", {"class":"FarbewT"})
      if len(black) + len(white) == 0:
        # No colour markers on this page: nothing to count.
        continue

      n_tournaments += 1
      n_games += len(white) + len(black)
      # The result is read from the first <td> following each colour marker.
      white_result = [x.findNext('td').text for x in white]
      black_result = [x.findNext('td').text for x in black]
      n_wins_white += white_result.count('1')
      n_losses_white += white_result.count('0')
      n_draws_white += white_result.count('½')
      n_wins_black += black_result.count('1')
      n_losses_black += black_result.count('0')
      n_draws_black += black_result.count('½')

      rating = 0
      rating_international = 0

      if len(soup.findAll(text='Rating')) > 0:
        rating = int(soup.find(text='Rating').findNext('td').text)

      if len(soup.findAll(text='Rating international')) > 0:
        rating_international = int(soup.find(text='Rating international').findNext('td').text)

      if rating + rating_international <= 0:
        # Player has no rating on this page: skip the higher/lower breakdown.
        continue

      col_names = [x.text for x in soup.find("tr", {"class":"CRg1b"}).findAll("td", {"class":"CRr"})]
      # Reset on every page so a missing rating column can never silently reuse
      # the column index found on a previous tournament.
      idx = None
      if 'Rtg' in col_names:
        idx = col_names.index('Rtg')
      elif 'RtgI' in col_names:
        idx = col_names.index('RtgI')
      if idx is None:
        continue

      rating_player = rating if rating > 0 else rating_international

      crg1s = soup.findAll("tr", {"class":"CRg1"})
      crg2s = soup.findAll("tr", {"class":"CRg2"})

      for crg in crg1s + crg2s:
        try:
          rating_opponent = int(crg.findAll("td", {"class":"CRr"})[idx].text)
        except (IndexError, ValueError):
          # Row without a parseable opponent rating: skip just this row
          # instead of abandoning the rest of the tournament.
          continue
        if rating_opponent <= 0:
          continue
        crg_str = str(crg)
        if 'FarbesT' not in crg_str and 'FarbewT' not in crg_str:
          continue
        score = crg.findNext('div').findNext('td').text
        if rating_opponent >= rating_player + threshold:
          if score == '1':
            n_wins_higher += 1
          elif score == '0':
            n_losses_higher += 1
          elif score == '½':
            n_draws_higher += 1
        else:
          if score == '1':
            n_wins_lower += 1
          elif score == '0':
            n_losses_lower += 1
          elif score == '½':
            n_draws_lower += 1
    except Exception as exc:
      # Best effort: report the problem and carry on with the next tournament.
      # (A bare except would also swallow KeyboardInterrupt.)
      tqdm.write("Could not fully process " + cur_link + ": " + repr(exc))
      continue

  return (n_tournaments, n_games, n_wins_white, n_losses_white,
          n_draws_white, n_wins_black, n_losses_black, n_draws_black,
          n_wins_higher, n_losses_higher, n_draws_higher,
          n_wins_lower, n_losses_lower, n_draws_lower)

In [87]:
get_chess_result_data(2093596, 20180101, 20220101, 150)

100%|██████████| 41/41 [00:27<00:00,  1.48it/s]


(40, 380, 120, 25, 43, 95, 47, 50, 19, 13, 11, 196, 59, 82)

In [88]:
get_chess_result_data(25829092, 20180101, 20220101, 150)

100%|██████████| 9/9 [00:06<00:00,  1.40it/s]


(8, 52, 14, 11, 2, 16, 8, 1, 8, 14, 0, 18, 5, 2)

# Part 2

Goal: given 
- two FIDE IDs (representing 2 players: player 1 and player 2)
- date1
- date2

output 
- How many times they played each other in a tournament in which the end date is between date1 and date2 (inclusive)
- How many times player 1 wins, draws or loses against player 2

Remarks:
1. Number of wins, losses, draws might not sum to total number of games, since a minority of games do not have results

In [122]:
def _fetch_player_search_soup(fide_id_input, date1_input, date2_input):
  """Submit the chess-results.com player search form and return the parsed result page."""
  # The driver path is no longer passed positionally: recent Selenium releases
  # reject it, while older ones already default to the 'chromedriver' executable.
  wd = webdriver.Chrome(options=chrome_options)
  try:
    wd.get("https://chess-results.com/SpielerSuche.aspx?lan=1")

    form_values = (
      ("ctl00$P1$txt_fideID", fide_id_input),
      ("ctl00$P1$txt_von_tag", date1_input),
      ("ctl00$P1$txt_bis_tag", date2_input),
    )
    for field_name, value in form_values:
      field = wd.find_element("name", field_name)
      field.click()
      field.send_keys(value)

    wd.find_element("name", "ctl00$P1$cb_suchen").click()

    return BeautifulSoup(wd.page_source)
  finally:
    # Always shut the browser down, otherwise every call leaks a Chrome process.
    wd.quit()


def get_h2h_record(fide_id_1_input, fide_id_2_input, date1_input, date2_input):
  """Count the games between two players listed on chess-results.com.

  Runs the player search for both FIDE IDs over the same date range, keeps the
  tournaments that appear in both result pages, and scans player 1's page of
  each common tournament for rows that mention player 2's name.

  Args:
    fide_id_1_input: FIDE ID of player 1 (ex: 1503014).
    fide_id_2_input: FIDE ID of player 2 (ex: 2093596).
    date1_input: Start date typed into the search form (ex: 20100101).
    date2_input: End date typed into the search form (ex: 20220101).

  Returns:
    (n_peo, n_wins, n_losses, n_draws). n_peo counts every row on player 1's
    tournament pages that contains player 2's name, whether or not the row
    carries a result; the other three count the result strings '1', '0' and
    '½' read from player 1's page.

  A common tournament that cannot be downloaded or parsed is reported and
  skipped.
  """
  ### ex: fide_id_1_input = 1503014; fide_id_2_input = 2093596; date1_input = 20100101, date2_input = 20220101
  ### variable name === magnus = player 1; hans = player 2 (for simplicity)
  soup_magnus = _fetch_player_search_soup(fide_id_1_input, date1_input, date2_input)

  print('Player 1 html loaded')

  tournament_links = [x['href'] for x in soup_magnus.find_all('a', href=True)]
  # Links without '&snr=' identify the tournament; links with it point at the player's own page.
  tournament_links_magnus_all = [link for link in tournament_links if link.startswith('tnr') and '&snr=' not in link]
  tournament_links_magnus = [link for link in tournament_links if link.startswith('tnr') and '&snr=' in link]

  soup_hans = _fetch_player_search_soup(fide_id_2_input, date1_input, date2_input)

  print('Player 2 html loaded')

  tournament_links = [x['href'] for x in soup_hans.find_all('a', href=True)]
  tournament_links_hans_all = {link for link in tournament_links if link.startswith('tnr') and '&snr=' not in link}

  # Keep player 1's ordering; the set only speeds up the membership test.
  common_tournaments = [x for x in tournament_links_magnus_all if x in tournament_links_hans_all]

  print('Common tournaments: ', common_tournaments)

  n_peo = 0
  n_wins = 0
  n_losses = 0
  n_draws = 0

  for common_tournament in common_tournaments:
    try:
      # The attribute value must be quoted: it contains '.', '?' and '=',
      # which are not valid in an unquoted CSS attribute value.
      hans_anchors = soup_hans.select('a[href*="' + common_tournament + '"]')
      magnus_tournaments = [x for x in tournament_links_magnus if x.startswith(common_tournament)]
      if not hans_anchors or not magnus_tournaments:
        continue

      hans_name = hans_anchors[0].text
      if not hans_name.strip():
        # An empty name would match every row below and inflate the counts.
        continue

      cur_link = "https://chess-results.com/" + magnus_tournaments[0]
      html_page = requests.get(cur_link)
      soup = BeautifulSoup(html_page.text)

      crg1s = soup.findAll("tr", {"class":"CRg1"})
      crg2s = soup.findAll("tr", {"class":"CRg2"})

      for crg in crg1s + crg2s:
        crg_str = str(crg)
        if hans_name not in crg_str:
          continue
        n_peo += 1
        if 'FarbesT' in crg_str or 'FarbewT' in crg_str:
          score = crg.findNext('div').findNext('td').text
          if score == '1':
            n_wins += 1
          elif score == '0':
            n_losses += 1
          elif score == '½':
            n_draws += 1
    except Exception as exc:
      # Best effort: report the problem and carry on with the next tournament.
      # (A bare except would also swallow KeyboardInterrupt.)
      print('Could not process ' + common_tournament + ': ' + repr(exc))
      continue

  return (n_peo, n_wins, n_losses, n_draws)

In [123]:
get_h2h_record(1503014, 2093596, 20100101, 20221030)

Player 1 html loaded
Player 2 html loaded
Common tournaments:  ['tnr670809.aspx?lan=1', 'tnr685311.aspx?lan=1', 'tnr600854.aspx?lan=1', 'tnr600852.aspx?lan=1']


(1, 0, 0, 0)

As we know, Magnus and Hans have played each other numerous times before; this database simply does not contain all of their games.

The 1 comes from Magnus's withdrawal from the Sinquefield Cup

FIDE website also outputs no games between these 2 players: https://ratings.fide.com/view_games.phtml?event=&id=1503014&opp=2093596

Let's try Magnus vs Hikaru Nakamura

In [124]:
get_h2h_record(1503014, 2016192, 20100101, 20221030)

Player 1 html loaded
Player 2 html loaded
Common tournaments:  ['tnr125752.aspx?lan=1', 'tnr125754.aspx?lan=1', 'tnr125755.aspx?lan=1', 'tnr295251.aspx?lan=1', 'tnr371468.aspx?lan=1', 'tnr437799.aspx?lan=1', 'tnr437793.aspx?lan=1', 'tnr499129.aspx?lan=1', 'tnr499127.aspx?lan=1', 'tnr463282.aspx?lan=1', 'tnr36795.aspx?lan=1', 'tnr140380.aspx?lan=1', 'tnr232875.aspx?lan=1', 'tnr317162.aspx?lan=1', 'tnr51861.aspx?lan=1', 'tnr544530.aspx?lan=1', 'tnr548204.aspx?lan=1', 'tnr541412.aspx?lan=1', 'tnr541719.aspx?lan=1', 'tnr303618.aspx?lan=1', 'tnr529081.aspx?lan=1', 'tnr448344.aspx?lan=1', 'tnr71817.aspx?lan=1', 'tnr478041.aspx?lan=1', 'tnr255851.aspx?lan=1', 'tnr269184.aspx?lan=1', 'tnr255559.aspx?lan=1', 'tnr138146.aspx?lan=1', 'tnr137973.aspx?lan=1', 'tnr226000.aspx?lan=1', 'tnr225993.aspx?lan=1', 'tnr399597.aspx?lan=1', 'tnr399595.aspx?lan=1', 'tnr51144.aspx?lan=1', 'tnr527567.aspx?lan=1', 'tnr558150.aspx?lan=1', 'tnr561139.aspx?lan=1', 'tnr562342.aspx?lan=1', 'tnr552132.aspx?lan=1', 'tnr

(47, 15, 0, 32)

The FIDE website lists 21 games. We obtain more because we count all games, even those without a PGN: https://ratings.fide.com/view_games.phtml?event=&id=1503014&opp=2016192