# Web Scraping and Observing Tournament Deck Lists

MTG Top 8 is one of the premier websites for looking at deck information and tournament results.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from contextlib import closing

import time

## Retrieve Page Destination

Before getting decklist information, I first need to navigate the MTG Top 8 webpages to reach the pages from which I will collect decklists. Once I can navigate to and collect the decklists, I will be able to retrieve the information I want for analysis.

### Grab URL for Homepage

In [2]:
# Site root. Scraped href values are appended directly to this string (no
# separator is added), so the trailing slash becomes part of every built URL.
URL_MTGTOP8 = "https://www.mtgtop8.com/"

In [3]:
# Format page for Modern (query parameter f=MO).
URL_MTGTOP8_MODERN = "https://www.mtgtop8.com/format?f=MO"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the format page.
page = requests.get(URL_MTGTOP8_MODERN, timeout=30)
page.raise_for_status()

soup_modern = BeautifulSoup(page.content, "html.parser")

### Grab Table Column Containing Decklists and Retrieve Meta Decks

In [4]:
# Start from the first <table> on the page
deck_table_modern = soup_modern.find('table')

# We only need the leftmost column, so "find()" is used instead of "find_all()"
deck_table_modern = deck_table_modern.find("td")

# Keep only the direct children of that cell and drop the first three, which
# are not archetype/deck rows. find_all(recursive=False) is the current name
# for the deprecated findChildren() alias and returns the same elements.
deck_table_modern = deck_table_modern.find_all(recursive=False)[3:]

### Grab each Meta Archetype, and Separate Decks

In [5]:
deck_type_list = []
archetype_name = None

# Rows come in document order: an archetype header row ("meta_arch") followed
# by the deck-type rows ("hover_tr") that belong to it.
for row in deck_table_modern:
    row_kind = row["class"][0]

    if row_kind == "meta_arch":
        # Header row: remember the archetype (first word of its text) so the
        # deck rows that follow can be tagged with it.
        archetype_name = row.contents[0].split(" ")[0]
        continue

    if row_kind != "hover_tr":
        raise ValueError("Unwanted Row in Elements")

    # Deck row: the first child of the first "S14" div is the link whose text
    # is the deck type name and whose href leads to the deck type page.
    deck_link = row.find_all("div", "S14")[0].contents[0]
    deck_type_list.append(
        {
            "name": deck_link.contents[0],
            "archetype": archetype_name,
            "url": URL_MTGTOP8 + deck_link["href"],
        }
    )

### Grab URL to Each Deck on Each Deck Type's Webpage

In [6]:
def return_deck_urls_multiple_pages(deck_type_url, driver, max_pages=100):
    """Collect deck URLs from a paginated deck-type page using Selenium.

    Loads ``deck_type_url`` in ``driver``, then repeatedly parses the rendered
    page and clicks the "Next" control until that control is no longer
    clickable or ``max_pages`` pages have been read.

    Args:
        deck_type_url: URL of the deck-type page to open.
        driver: Selenium WebDriver used to load the page and click "Next".
        max_pages: Upper bound on the number of pages read (default 100).

    Returns:
        A list of deck URLs, each built as ``URL_MTGTOP8 + href``.
    """
    deck_urls = []

    # Navigate to the page and give it time to render
    driver.get(deck_type_url)
    time.sleep(10)

    page_count = 1
    while True:
        # Name the parser explicitly, as the rest of the notebook does, so the
        # parse tree does not depend on which optional parsers are installed.
        soup_page = BeautifulSoup(driver.page_source, "html.parser")
        page_base = soup_page.find('table').find_all('td')[1].find_all("form")

        # The first form holds the deck rows; the slice drops its first two
        # rows and its last row, which are not deck entries.
        deck_urls.extend(
            URL_MTGTOP8 + deck_list_item.find('a', href=True)["href"]
            for deck_list_item in page_base[0].find_all('tr')[2:-1]
        )

        # The "Next" button only has an "onclick" attribute while there are
        # more pages; a missing button is treated the same as a disabled one.
        next_button = (
            page_base[1].find('table').find('table').find('tr').find_all('td')[-2].find('div')
        )
        has_next_page = next_button is not None and next_button.has_attr("onclick")
        if page_count >= max_pages or not has_next_page:
            break

        # The "Next" button is clickable: move on and let the new page load
        driver.find_element(By.XPATH, "//div[contains(text(), 'Next')]").click()
        page_count += 1
        time.sleep(2)

    return deck_urls

In [7]:
check_test = []

# Open one web browser for Selenium and reuse it for every deck type that
# spreads its decks over multiple pages.
driver = webdriver.Chrome()
try:
    for deck_type in deck_type_list:

        time.sleep(10)  # Pause before requesting the next deck type page

        # Pull the static page; fail fast on a stalled connection or an HTTP error
        deck_type_url = deck_type['url']
        page = requests.get(deck_type_url, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        # The number of forms decides whether Selenium is needed: more than
        # one form means the decks are split over multiple pages.
        page_base_path = soup.find('table').find_all('td')[1].find_all("form")
        if len(page_base_path) > 1:
            # Navigate with Selenium (there are multiple pages)
            url_list = list(
                return_deck_urls_multiple_pages(deck_type_url, driver, max_pages=10)
            )
        elif len(page_base_path) == 1:
            # Parse with BeautifulSoup (there is only one page)
            url_list = [
                URL_MTGTOP8 + str(deck_list_item.find('a', href=True)["href"])
                for deck_list_item in page_base_path[0].find_all('tr')[2:-1]
            ]
        else:
            raise ValueError("There are no decks on page")

        # Attach the collected deck URLs to this deck type's record
        deck_type['deck_list_urls'] = url_list
        print(f"Finished Deck List for {deck_type['name']} in archetype {deck_type['archetype']}. There are {len(url_list)} decks.")
finally:
    # quit() ends the whole browser session; close() would only close the
    # current window and can leave the driver process running.
    driver.quit()


Finished Deck List for 4/5c Aggro in archetype AGGRO. There are 99 decks.
Finished Deck List for Cascade Crash in archetype AGGRO. There are 65 decks.
Finished Deck List for UR Aggro in archetype AGGRO. There are 60 decks.
Finished Deck List for Rakdos Aggro in archetype AGGRO. There are 40 decks.
Finished Deck List for Hardened Scales in archetype AGGRO. There are 18 decks.
Finished Deck List for Jund in archetype AGGRO. There are 15 decks.
Finished Deck List for Merfolk in archetype AGGRO. There are 14 decks.
Finished Deck List for The Underworld Cookbook in archetype AGGRO. There are 14 decks.
Finished Deck List for Mono Black Aggro in archetype AGGRO. There are 11 decks.
Finished Deck List for Death's Shadow in archetype AGGRO. There are 11 decks.
Finished Deck List for Martyr Life in archetype AGGRO. There are 10 decks.
Finished Deck List for Elementals in archetype AGGRO. There are 5 decks.
Finished Deck List for Red Deck Wins in archetype AGGRO. There are 5 decks.
Finished Deck 

### Retrieve Mainboard and Sideboard for Each Retrieved Deck

Each tournament/event legal deck contains two lists of cards: the mainboard and the sideboard. In the mainboard are the cards that are played in the first game of each 3 game set between players. This list does not change between matches and is dictated before the event starts. This list usually contains 60 cards (the minimum legal number of cards) to reduce variance and promote consistency.

The sideboard consists of 15 cards that can be swapped in for cards in the mainboard, or simply added to the deck, between games in a match. The deck played must still have at least 60 cards, but these cards help against different matchups, and are usually used to prevent other decks from achieving their goal or to help your deck achieve its goal.

In [8]:
def retrieve_deck_list(soup):
    """Extract the mainboard and sideboard card counts from a deck page.

    Args:
        soup: Parsed deck page (a BeautifulSoup document).

    Returns:
        A ``(deck_mainboard, deck_sideboard)`` tuple of dicts, each mapping a
        card name to the count text that precedes it on the page.

    Raises:
        ValueError: If a card line appears before any section title
            ("Unknown Board State"), or if an element's first class is neither
            "O14" nor "deck_line" ("Unknown Class").
    """
    # Positional walk down to the container of the card divisions. The indexes
    # mirror the page layout this scraper was written against.
    container = soup.find('div').find('div')
    container = container.find_all('div', recursive=False)[8]
    container = container.find_all('div', recursive=False)[1]
    container = container.find_all('div', recursive=False)[2]
    card_elements = container.find_all('div', recursive=False)

    deck_mainboard = {}
    deck_sideboard = {}
    boards = {"MAINBOARD": deck_mainboard, "SIDEBOARD": deck_sideboard}

    # No section title seen yet. The current board carries over from one
    # division to the next until another title changes it.
    board = None

    for division in card_elements:
        for element in division:
            element_class = element.get("class")[0]
            # Determine Mainboard or Sideboard based on section title
            if element_class == "O14":
                if "SIDEBOARD" in element.contents[0]:
                    board = "SIDEBOARD"
                else:
                    board = "MAINBOARD"
            # Adds the card and card count to the appropriate board
            elif element_class == "deck_line":
                if board not in boards:
                    raise ValueError("Unknown Board State")
                # str() detaches the text from the parse tree, so the returned
                # dicts do not keep the whole parsed page alive.
                number_of = str(element.contents[0])
                card_name = str(element.find('span').contents[0])
                boards[board][card_name] = number_of
            else:
                raise ValueError("Unknown Class")

    return deck_mainboard, deck_sideboard

In [9]:
for deck_type in deck_type_list:
    deck_list_list = []
    deck_list_urls = deck_type['deck_list_urls']

    for deck_url in deck_list_urls:
        # Fetch the deck page; fail fast on a stalled connection or an HTTP
        # error instead of hanging or parsing an error page.
        page = requests.get(deck_url, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        # Retrieve deck information, split into mainboard and sideboard
        mainboard, sideboard = retrieve_deck_list(soup)
        # One {'mainboard': ..., 'sideboard': ...} record per deck, in URL order
        deck_list_list.append({"mainboard": mainboard, "sideboard": sideboard})
        time.sleep(5)  # Pause between deck page requests
    # Store the decks on the deck type record under "deck_list"
    deck_type['deck_list'] = deck_list_list

### Reformat Dictionaries To Pandas Tables

In [10]:
# Inspect which fields the first deck-type record carries at this point.
deck_type_list[0].keys()

dict_keys(['name', 'archetype', 'url', 'deck_list_urls', 'deck_list'])

In [11]:
mainboard = []
sideboard = []

# Build one flat record per deck and per board: the deck type's name and
# archetype, the deck's URL, then one entry per card (card name -> count).
for deck_type in deck_type_list:
    # Fields shared by every deck of this deck type
    base_record = {'name': deck_type['name'], 'archetype': deck_type['archetype']}

    # deck_list_urls and deck_list are parallel lists (same order), so pair
    # each deck with its URL instead of indexing both by position.
    for deck_url, deck in zip(deck_type['deck_list_urls'], deck_type['deck_list']):
        mainboard.append({**base_record, 'url': deck_url, **deck['mainboard']})
        sideboard.append({**base_record, 'url': deck_url, **deck['sideboard']})

In [12]:
# Turn each list of per-deck records into a DataFrame: one row per deck, one
# column per key found in the records.
mainboard_df, sideboard_df = (
    pd.json_normalize(records) for records in (mainboard, sideboard)
)

### Save Dataframes to CSV

In [13]:
save_dir_path = os.path.join(os.getcwd(), "data", "tournament_deck_lists")

# to_csv() does not create missing directories, so make sure the target
# folder exists before writing (no error if it is already there).
os.makedirs(save_dir_path, exist_ok=True)

mainboard_df.to_csv(os.path.join(save_dir_path, "Mainboard.csv"))
sideboard_df.to_csv(os.path.join(save_dir_path, "Sideboard.csv"))