Required libraries.

In [None]:
$ pip install bs4
$ pip install requests

In [1]:
from bs4 import BeautifulSoup 
import requests
import csv

Scraping functions: generate BeautifulSoup object and analyze medal table.

In [2]:
def wiki_medal_table(year, event='Summer'):
    '''Returns BeautifulSoup object of Wikipedia medal table page 
    for given Olympic event ('Summer' or 'Winter') and year.

    Raises requests.HTTPError if Wikipedia answers with an error status
    and requests.Timeout if no response arrives within 30 seconds.
    '''
    
    print(f'Getting Wikipedia page for {year} {event} Olympics...')
    url = f'https://en.wikipedia.org/wiki/{year}_{event}_Olympics_medal_table'
    # Without a timeout, requests can wait forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than parse an error page as if it were the medal table.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')

def first_gold_medals(year, record, event='Summer'):
    '''Writes countries that scored their first Olympic gold during
    the specified year and event ('Summer' or 'Winter') to the 
    dictionary record.

    record maps country code -> year and is updated in place; countries
    already present are left untouched, so call this in chronological
    order. Nothing is returned.

    Raises ValueError if the page has no table whose first CSS class is
    'wikitable'.
    '''
    
    # Assumes the first table whose leading class is 'wikitable' is the main
    # medal table. .get() is used so tables without a class attribute are
    # skipped instead of raising KeyError.
    medals = None
    for table in wiki_medal_table(year, event).find_all('table'):
        classes = table.get('class') or []
        if classes and classes[0] == 'wikitable':
            medals = table
            break
    if medals is None:
        raise ValueError(f'No wikitable found on the {year} {event} medal table page')
    
    print(f'Analyzing medal table for {year}...')
    for row in medals.find_all('tr'):
        spans = row.find_all('span')
        if not spans:
            # Rows without a <span> carry no country code.
            continue
        code = spans[0].string
        if code is None:
            # First span has no single text node to read a code from.
            continue
        country = code.strip('()')
        if country in record:
            continue
        cols = row.find_all('td')
        # The gold column is located by counting back from the end of the row,
        # so at least four data cells are required.
        if len(cols) < 4:
            continue
        # Strip whitespace before comparing: a cell reading '0\n' must not be
        # mistaken for a nonzero gold count.
        if cols[-4].get_text(strip=True) != '0':
            record[country] = year

Generating Summer Olympics records. It is simpler to hardcode the handful of missing years (1916, 1940, 1944) than to write additional code to handle missing Wikipedia pages. Medal reallocations are counted.

In [3]:
firsts = {}

# Walk every four-year slot from 1896 through 2020, leaving out the three
# years that are hardcoded as having no medal table page to scrape.
for year in (y for y in range(1896, 2024, 4) if y not in (1916, 1940, 1944)):
    first_gold_medals(year, firsts)
print(firsts)

Getting Wikipedia page for 1896 Summer Olympics...
Analyzing medal table for 1896...
Getting Wikipedia page for 1900 Summer Olympics...
Analyzing medal table for 1900...
Getting Wikipedia page for 1904 Summer Olympics...
Analyzing medal table for 1904...
Getting Wikipedia page for 1908 Summer Olympics...
Analyzing medal table for 1908...
Getting Wikipedia page for 1912 Summer Olympics...
Analyzing medal table for 1912...
Getting Wikipedia page for 1920 Summer Olympics...
Analyzing medal table for 1920...
Getting Wikipedia page for 1924 Summer Olympics...
Analyzing medal table for 1924...
Getting Wikipedia page for 1928 Summer Olympics...
Analyzing medal table for 1928...
Getting Wikipedia page for 1932 Summer Olympics...
Analyzing medal table for 1932...
Getting Wikipedia page for 1936 Summer Olympics...
Analyzing medal table for 1936...
Getting Wikipedia page for 1948 Summer Olympics...
Analyzing medal table for 1948...
Getting Wikipedia page for 1952 Summer Olympics...
Analyzing meda

Writing to CSV using IOC codes.

In [24]:
with open('first_golds.csv', 'w', newline='') as out_file:
    csv_out = csv.writer(out_file)
    # Header row first, then one (country, year) row per dictionary entry.
    csv_out.writerows([['Country', 'Year'], *firsts.items()])

Generate IOC to ISO country codes conversion dictionary then convert IOC CSV.

In [16]:
url = 'https://simple.wikipedia.org/wiki/Comparison_of_IOC,_FIFA,_and_ISO_3166_country_codes'
# Time out instead of hanging, and stop on an HTTP error instead of parsing
# an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html5lib')

# The first table on the page is taken to be the comparison table.
table = soup.find('table')
codes = {}

for row in table.find_all('tr'):
    cols = row.find_all('td')
    # Column 2 is read as the IOC code and column 4 as the ISO code; header
    # rows (no <td>) and rows too short to hold both are skipped.
    if len(cols) < 5:
        continue
    ioc_text = cols[2].string
    iso_text = cols[4].string
    if not ioc_text or iso_text is None:
        continue
    # Strip both cells so stray whitespace neither hides a match nor ends up
    # in a dictionary key.
    ioc_code = ioc_text.strip()
    iso_code = iso_text.strip()
    # Only populate if IOC differs from ISO code.
    if ioc_code and ioc_code != iso_code:
        codes[ioc_code] = iso_code
        
print(codes)

{'ALG': 'DZA', 'ASA': 'ASM', 'ANG': 'AGO', 'ANT': 'ATG', 'ARU': 'ABW', 'BAH': 'BHS', 'BRN': 'BHR', 'BAN': 'BGD', 'BAR': 'BRB', 'BIZ': 'BLZ', 'BER': 'BMU', 'BHU': 'BTN', 'AHO': 'BES', 'BOT': 'BWA', 'IVB': 'VGB', 'BRU': 'BRN', 'BUL': 'BGR', 'BUR': 'BFA', 'CAM': 'KHM', 'CAY': 'CYM', 'CHA': 'TCD', 'CHI': 'CHL', 'CGO': 'COG', 'CRC': 'CRI', 'CRO': 'HRV', 'DEN': 'DNK', 'ESA': 'SLV', 'GEQ': 'GNQ', 'FIJ': 'FJI', 'GAM': 'GMB', 'GER': 'DEU', 'GRE': 'GRC', 'GRN': 'GRD', 'GUA': 'GTM', 'GUI': 'GIN', 'GBS': 'GNB', 'HAI': 'HTI', 'HON': 'HND', 'INA': 'IDN', 'IRI': 'IRN', 'KUW': 'KWT', 'LAT': 'LVA', 'LIB': 'LBN', 'LES': 'LSO', 'LBA': 'LBY', 'MAD': 'MDG', 'MAW': 'MWI', 'MAS': 'MYS', 'MTN': 'MRT', 'MRI': 'MUS', 'MON': 'MCO', 'MGL': 'MNG', 'MYA': 'MMR', 'NEP': 'NPL', 'NED': 'NLD', 'NCA': 'NIC', 'NIG': 'NER', 'NGR': 'NGA', 'OMA': 'OMN', 'PLE': 'PSE', 'PAR': 'PRY', 'PHI': 'PHL', 'POR': 'PRT', 'PUR': 'PRI', 'SKN': 'KNA', 'VIN': 'VCT', 'SAM': 'WSM', 'KSA': 'SAU', 'SEY': 'SYC', 'SLO': 'SVN', 'SOL': 'SLB', 'RSA'

In [25]:
with open('first_golds.csv', 'r') as src, open('first_golds_iso.csv', 'w', newline='') as dst:
    iso_out = csv.writer(dst)

    # Replace the first column with its ISO code when the lookup table has an
    # entry for it; any other value (including the header) passes through.
    for fields in csv.reader(src):
        first = fields[0]
        iso_out.writerow([codes.get(first, first), fields[1]])

Winter Olympics medal analysis. The missing WW2 years (1940, 1944) are hardcoded, as is the 1994 switch to a schedule that alternates with the Summer Olympics every two years.

In [11]:
winter = {}

# Two four-year sequences back to back: 1924-1992, then 1994-2018.
# 1940 and 1944 are skipped; they only occur in the first sequence.
for year in [*range(1924, 1996, 4), *range(1994, 2022, 4)]:
    if year in (1940, 1944):
        continue
    first_gold_medals(year, winter, event='Winter')

print(winter)

Getting Wikipedia page for 1924 Winter Olympics...
Analyzing medal table for 1924...
Getting Wikipedia page for 1928 Winter Olympics...
Analyzing medal table for 1928...
Getting Wikipedia page for 1932 Winter Olympics...
Analyzing medal table for 1932...
Getting Wikipedia page for 1936 Winter Olympics...
Analyzing medal table for 1936...
Getting Wikipedia page for 1948 Winter Olympics...
Analyzing medal table for 1948...
Getting Wikipedia page for 1952 Winter Olympics...
Analyzing medal table for 1952...
Getting Wikipedia page for 1956 Winter Olympics...
Analyzing medal table for 1956...
Getting Wikipedia page for 1960 Winter Olympics...
Analyzing medal table for 1960...
Getting Wikipedia page for 1964 Winter Olympics...
Analyzing medal table for 1964...
Getting Wikipedia page for 1968 Winter Olympics...
Analyzing medal table for 1968...
Getting Wikipedia page for 1972 Winter Olympics...
Analyzing medal table for 1972...
Getting Wikipedia page for 1976 Winter Olympics...
Analyzing meda

CSV writing and IOC to ISO country code conversion (using previously generated table).

In [17]:
# Step 1: dump the winter records keyed by IOC code.
with open('first_winter_golds.csv', 'w', newline='') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerows([['Country', 'Year'], *winter.items()])

# Step 2: re-read that file and write a copy whose first column uses the ISO
# code wherever the lookup table has one; other values pass through as-is.
with open('first_winter_golds.csv', 'r') as src, open('first_winter_golds_iso.csv', 'w', newline='') as dst:
    iso_out = csv.writer(dst)

    for fields in csv.reader(src):
        first = fields[0]
        iso_out.writerow([codes.get(first, first), fields[1]])

Testing and debugging: 

[X] Erroneous results in 1912 with NED. `time.sleep` did not help. ~~Try identifying user agent.~~ Not a timeout or lag problem: the loop was catching the last row (the medal totals) because of wrong indentation.

[X] Erroneous results in 1936 with TUR, not recording first. Wrong table found, due to new "Part of a series" sidebar. Need way to identify medal table: Second table (or (tables)[1]) for 1936 onward.

[X] Missing rank 21 and 24 rows for 1928, medal first ties. Getting wrong table due to setting last wikitable, need to break (assumes first wikitable is main medal table).

[X] Updating stopped at 2016 medal table. Proper table still scraped, 'span' tag not being found any more.

In [14]:
# Debugging aid: select the 2016 medal table the same way the scraper does
# and show which <span> tags each of its rows contains.
page_2016 = wiki_medal_table(2016)
medals = ''
for candidate in page_2016.find_all('table'):
    if candidate['class'][0] != 'wikitable':
        continue
    medals = candidate
    break

print(f'Analyzing medal table for 2016...')
for table_row in medals.find_all('tr'):
    print(table_row.find_all('span'))

Getting Wikipedia page for 2016 Summer Olympics...
Analyzing medal table for 2016...
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
