# Scrape Chess Data

This notebook scrapes webpages from an online database of chess moves.

The dataset of chess moves will be exported to a file, `moves_raw.csv`.

In [None]:
import csv
import re
from time import time
from urllib.request import Request, urlopen

import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import clear_output

# Link to the "main page": a webpage that links to individual chess games
# and to tournaments (which in turn link to more games).
# For example, this may be the profile page of a player.
SourcePage= "https://www.chessgames.com/player/hikaru_nakamura.html"

# the request is sent with an explicit browser-like User-Agent header
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(SourcePage,headers=hdr)

# parse inside the `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it
# NOTE(review): no parser is named, so bs4 picks one of the installed parsers;
# consider pinning one to get the same result on every machine
with urlopen(req) as page:
    soup = BeautifulSoup(page)

In [None]:
# defining some helper functions

def extract_tournamentlinks(text):
    '''
    Collects the tournament URLs mentioned in the text of a chessgames.com page.
    (A tournament page lists multiple games.)

    Every match of the tournament path pattern in text is prefixed with
    "https://www.chessgames.com". The result keeps the order of appearance
    and repeated matches are kept as repeated entries.
    '''
    site_root = "https://www.chessgames.com"
    tournament_path = re.compile(r'/perl/chess.pl\?tid=\d+')

    tournamentlinks = []
    for hit in tournament_path.finditer(text):
        # hit.group() is the relative path exactly as it appears in the text
        tournamentlinks.append(site_root + hit.group())

    return tournamentlinks

def extract_gamelinks(text):
    '''
    Extracts links to games. A game page contains the string of all the moves.
    This will only work for chessgames.com

    Parameters:
        text: either the text of a page, or a URL starting with "https://".
              When a URL is given, that page is downloaded first and its
              text is searched instead.

    Returns:
        A list of absolute game URLs, in order of appearance. Repeated
        matches are kept as repeated entries.
    '''

    if text.startswith("https://"):
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = Request(text,headers=hdr)
        # the `with` block closes the HTTP response once it has been parsed,
        # so connections do not pile up when many pages are fetched in a row
        with urlopen(req) as page:
            soup = BeautifulSoup(page)
        text = str(soup)

    identifier = r'/perl/chessgame\?gid=\d+'
    matches = re.findall(identifier, text)
    gamelinks = ["https://www.chessgames.com" + match for match in matches]

    return gamelinks

def moves_from_game(game_link):
    '''
    Given a link to the game, it opens the webpage and returns the moves of the
    game as a string as it is shown on the webpage.

    The moves are taken from the page text that lies between the last "]"
    before the '<script src="/olga/js/chess.js"></script>' tag and the last
    single quote inside that span. The first two characters of that text are
    skipped.

    Parameters:
        game_link: URL of the game page to download.

    Returns:
        The extracted moves string, or an empty string when the page does not
        contain the script tag that marks the end of the moves.
    '''

    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(game_link,headers=hdr)
    # the `with` block closes the HTTP response once it has been parsed
    with urlopen(req) as page:
        soup = BeautifulSoup(page)
    text = str(soup)

    x = text.find('<script src="/olga/js/chess.js"></script>')
    if x == -1:
        # find() signals "not found" with -1; using it as a slice bound would
        # return unrelated page content instead of moves
        return ""

    # when no "]" precedes the marker, rfind gives -1 and the span starts at 0
    closest_bracket = text.rfind("]", 0, x)
    extracted_text = text[closest_bracket+1:x]

    # cut everything from the last single quote on; when there is none, keep
    # the text whole instead of letting -1 chop off its last character
    tournament_end = extracted_text.rfind("\'")
    if tournament_end != -1:
        extracted_text = extracted_text[:tournament_end]

    # skip the two characters that precede the moves
    moves = extracted_text[2:]

    return moves


def stringlist2csv(filepath,stringlist):
    '''
    Writes every string of stringlist to filepath as its own one-column csv row.
    The file is opened in write mode, so existing content is replaced.
    '''

    # newline='' leaves the line endings to the csv module
    with open(filepath, mode='w', newline='') as out_file:
        single_column_rows = ([entry] for entry in stringlist)
        csv.writer(out_file).writerows(single_column_rows)

def df2csv(filepath, df):
    '''
    Saves the dataframe df as a csv file at filepath.
    The row index is left out of the file.
    '''
    include_index = False
    df.to_csv(path_or_buf=filepath, index=include_index)


In [None]:
# this is the text contained in the main page
SourceText = str(soup)

# links to games found directly on the main page
gamelinks = extract_gamelinks(SourceText)

# links to tournaments found on the main page
tournamentlinks = extract_tournamentlinks(SourceText)

# add the games of every tournament; dict.fromkeys drops repeated tournament
# links (keeping their order) so each tournament page is downloaded only once
for link in dict.fromkeys(tournamentlinks):
    gamelinks += extract_gamelinks(link)

# drop repeated game links (keeping their order) so the progress and time
# estimate below are measured against the games that will actually be saved
gamelinks = list(dict.fromkeys(gamelinks))

# initialise DataFrame to store the moves and their link
moves_df = pd.DataFrame({'game_url': [], 'moves': []})

# urls already present in moves_df; used to skip games that were saved before
saved_links = set(moves_df['game_url'])

# reference points for the time estimate: the number of rows present before
# the loop starts and the moment the loop starts
starting_point = len(moves_df)
st = time()

# store the moves from every game found from the main page
for link in gamelinks:

    # only save the moves if it hasn't already been saved
    if link in saved_links:
        continue

    # append one complete row (columns are in the order game_url, moves)
    moves_df.loc[len(moves_df)] = [link, moves_from_game(link)]
    saved_links.add(link)

    # the csv is rewritten after every game, so the file on disk always holds
    # everything scraped so far
    df2csv(filepath='moves_raw.csv',df=moves_df)

    pc_complete = len(moves_df)/len(gamelinks)*100
    clear_output(wait=True)
    print(f"Progress: {pc_complete:.2f}%")

    # average seconds per game scraped in this run, scaled to the games left
    t = time()-st
    et = t/(len(moves_df)-starting_point)
    eta = et * (len(gamelinks) - len(moves_df)) / 60
    print(f"Estimated time remaining: {eta:.3f} minutes")