Title: Finding Chess Cheaters with Python! - Data Science Uncut Livestream

Source: Rob Mulla YouTube Channel

Author (Original Tutorial): Rob Mulla

URL: https://www.youtube.com/watch?v=91mC3qs_kw4

Date of Implementation: 2024-01-08

Description:
    Using data science to determine chess cheating.

# Download Grandmaster PGN Games

In [20]:
from bs4 import BeautifulSoup
import urllib.request
from pathlib import Path
from tqdm.notebook  import tqdm

script_dir = Path.cwd()

site = 'https://www.pgnmentor.com/files.html'
base_url = 'http://www.pgnmentor.com/'

# Get raw html
import requests
r = requests.get(site)
data = r.text
soup = BeautifulSoup(data)

# 'a' tags represent link elements
# href is an attribute from 'a' tag
links = [a.get('href') for a in soup.find_all('a', href=True)]

In [16]:
# Extract zip files only
zips = [z for z in links if str(z).endswith('.zip')]
zips = [*set(zips)] # get rid of duplicates
# Filter by players (openings not of interest)
player_zips = [p for p in zips if 'players' in p]
print(player_zips[:5])

['players/Panno.zip', 'players/Horwitz.zip', 'players/Serper.zip', 'players/Winawer.zip', 'players/Huebner.zip']


In [22]:
base_url = 'http://www.pgnmentor.com/'
output_dir = script_dir / "zips"
output_dir.mkdir(parents=True, exist_ok=True)

with tqdm(total=len(player_zips), desc="Downloading Zips") as pbar:
    for p in player_zips:
        out = p.split('/')[-1] # Extract file name (e.g. Magnus.zip)
        destination = output_dir / out
        urllib.request.urlretrieve(f"{base_url}{p}", destination)
        pbar.update(1)

Downloading Zips:   0%|          | 0/249 [00:00<?, ?it/s]

# Unzip files

In [29]:
import shutil
from glob import glob

unpack_dir = script_dir / "pgns"
unpack_dir.mkdir(parents=True, exist_ok=True)
zip_files = list(output_dir.glob("*.zip"))

with tqdm(total=len(player_zips), desc="Unpacking Zips") as pbar:
    for zf in zip_files:
        shutil.unpack_archive(zf, unpack_dir)
        pbar.update(1)

Unpacking Zips:   0%|          | 0/249 [00:00<?, ?it/s]