# Web scraping historical copies of Oddschecker's outright World Cup winner page using Archive.org

This code loops through four pre-tournament dates where the Oddschecker outright World Cup winner page was archived, grabs the odds data from the page and outputs it in the same way as the scheduled runs conducted during the tournament.

In [1]:
from bs4 import BeautifulSoup
import numpy
import pandas
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait

In [61]:
# Browser session used by the scraping cell below: Chrome, headless, with a
# fixed desktop user-agent string sent instead of the browser default.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36'

chrome_options = Options()
for argument in (f'user-agent={user_agent}', '--headless'):
    chrome_options.add_argument(argument)

# The chromedriver binary is expected at this fixed path.
chromedriver_service = Service('/usr/local/bin/chromedriver.exe')
driver = webdriver.Chrome(service=chromedriver_service, options=chrome_options)

# Helpers bound to the session: an explicit wait with a timeout of 20
# (seconds, per Selenium's WebDriverWait) and an action-chain builder.
wait = WebDriverWait(driver, 20)
action = ActionChains(driver)

In [63]:
# Archived snapshots to scrape. Each entry is built from the 14-digit
# snapshot stamp (YYYYMMDDHHMMSS) that appears in the archive URL; the
# "dateTime" label is the same instant rendered as DD/MM/YYYY  HH:MM:SS
# (two spaces between date and time).
webpageArray = [
    {
        "url": f"https://web.archive.org/web/{stamp}/https://www.oddschecker.com/football/world-cup/winner",
        "dateTime": datetime.strptime(stamp, "%Y%m%d%H%M%S").strftime("%d/%m/%Y  %H:%M:%S"),
    }
    for stamp in (
        "20220714091113",
        "20220916165807",
        "20221112001610",
        "20221118194846",
    )
]

# Countries whose odds rows are looked up on each page, in output order.
listOfCountries = [
    'Brazil', 'Argentina', 'France', 'Spain',
    'England', 'Germany', 'Netherlands', 'Portugal',
    'Belgium', 'Denmark', 'Uruguay', 'Croatia',
    'Serbia', 'Switzerland', 'Senegal', 'Mexico',
    'USA', 'Poland', 'Ecuador', 'Wales',
    'Morocco', 'Japan', 'Ghana', 'Canada',
    'Cameroon', 'Iran', 'South Korea', 'Australia',
    'Qatar', 'Tunisia', 'Saudi Arabia', 'Costa Rica',
]

In [65]:
def _mean_implied_probability(countryRow):
    """Return the mean of 1/odds over the odds cells of one country's row.

    countryRow is the row element found for a country, or None when the page
    has no row for it. Odds are read from the "data-odig" attribute of the
    row's direct <td class="bc"> children. Cells whose attribute is missing,
    non-numeric or not positive are skipped. Returns 0.0 when the row is
    absent or no cell yields a usable value, so a single bad row cannot turn
    the snapshot total into NaN.
    """
    if countryRow is None:
        return 0.0

    impliedProbs = []
    for cell in countryRow.find_all("td", class_="bc", recursive=False):
        try:
            odds = float(cell.get("data-odig", ""))
        except ValueError:
            continue
        if odds > 0:
            impliedProbs.append(1 / odds)

    if not impliedProbs:
        return 0.0
    return numpy.mean(impliedProbs)


# For every archived snapshot: load the page, compute each country's mean
# implied probability, rescale so the listed countries sum to 1, and append
# the rows (no header) to historicalOutputs.csv.
for snapshot in webpageArray:

    driver.get(snapshot["url"])
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(driver.page_source, "html.parser")

    oddsData = []
    totalProb = 0

    for country in listOfCountries:

        countryRow = soup.find(class_="diff-row evTabRow bc", attrs={"data-bname": country})

        meanProb = _mean_implied_probability(countryRow)
        totalProb += meanProb

        oddsData.append({"country": country, "prob": meanProb, "currDateTime": snapshot["dateTime"]})

    if totalProb == 0:
        # Nothing usable was found on this page (e.g. it failed to load);
        # dividing by zero would append NaN rows to the CSV, so skip it.
        print(f'No odds found for snapshot {snapshot["dateTime"]} ({snapshot["url"]}); skipping.')
        continue

    # Rescale so the probabilities of the listed countries sum to 1.
    for row in oddsData:
        row["prob"] = row["prob"] / totalProb

    pandas.DataFrame(oddsData).to_csv("historicalOutputs.csv", sep=',', encoding='utf-8', index=False, mode='a', header=False)
