In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import requests
import numpy as np
from tqdm import tqdm
import json
import os
import shutil
import pandas as pd

In [None]:
# Format codes sent as the `f` query parameter of mtgtop8's /format page.
# NOTE(review): presumably Standard, Modern, Legacy, Vintage, Pioneer and
# Commander (EDH) — confirm against the site.
formats = ['ST', 'MO', 'LE', 'VI', 'PI', 'EDH']

# Tools for Scraping

## fetchMetaId

In [None]:
def fetchMetaId(fmt):
    """Return the metagame ids that mtgtop8 lists for one format.

    Downloads ``http://mtgtop8.com/format?f=<fmt>`` and reads the ``<option>``
    entries of the ``<select name="meta">`` drop-down.

    Parameters
    ----------
    fmt : str
        Format code placed in the ``f`` query parameter (e.g. ``'ST'``).

    Returns
    -------
    dict
        Maps each meta id (the 2-3 digit string captured from the option's
        ``value``) to the option's visible label. Options whose ``value`` is
        missing or does not look like ``format?f=..&meta=..`` are skipped.

    Raises
    ------
    AttributeError
        If the page contains no ``<select name="meta">`` element.
    """
    # The context manager closes the HTTP response once the page is parsed.
    with urlopen('http://mtgtop8.com/format?f={0}'.format(fmt)) as html:
        bs = BeautifulSoup(html, "html.parser")

    meta_options = bs.find('select', {"name": "meta"}).find_all('option')
    meta_dict = {}
    for o in meta_options:
        # `.get` treats an <option> without a value attribute like any other
        # non-matching option instead of raising KeyError.
        match = re.search(r"format\?f=(\w{2,3})&meta=([0-9]{2,3})", o.get('value', ''))
        if match:
            meta_dict[match.group(2)] = o.get_text()

    return meta_dict

In [None]:
# Smoke test: list the metagame ids available for the 'ST' format.
fetchMetaId('ST')

## fetchFormatEvents

In [None]:
def fetchFormatEvents(fmt, mtid):
    """Collect every event listed for one format / metagame pair.

    Pages through ``http://mtgtop8.com/format?f=<fmt>&meta=<mtid>`` by POSTing
    an increasing ``cp`` (page) value, and stops at the first page whose event
    table has no ``tr[height="30"]`` rows.

    Parameters
    ----------
    fmt : str
        Format code placed in the ``f`` query parameter.
    mtid : str
        Metagame id placed in the ``meta`` query parameter.

    Returns
    -------
    list of dict
        One dict per event row with the keys ``title``, ``_id``, ``stars``,
        ``bigstars``, ``date``, ``format`` and ``meta_id``.

    Raises
    ------
    requests.exceptions.RequestException
        On a network failure, or when the server does not answer within the
        timeout.
    IndexError
        If a page lacks the expected second table, or a row lacks its link,
        its ``e=`` id or its right-aligned date cell.
    """
    url = 'http://mtgtop8.com/format?f={0}&meta={1}'.format(fmt, mtid)
    events = []

    page = 1
    while True:
        # Without a timeout a stalled connection would block the scrape forever.
        html = requests.post(
            url,
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            data={'cp': page},
            timeout=60,
        ).text
        bs = BeautifulSoup(html, "html.parser")

        table = bs.select('div div table tr td[width="40%"] > table')[1]
        rows = table.select('tr[height="30"]')
        if not rows:
            # An empty table means we have walked past the last page.
            break

        for row in rows:
            # `a[href]` expresses "anchor that has an href" in CSS; select()
            # does not filter on extra keyword arguments.
            link = row.select('td a[href]')[0]

            events.append({
                'title': link.get_text(),
                '_id': re.findall(r"e\=(\d*)", link['href'])[0],
                # Suffix match (`$=`) so both counters work whether the page
                # writes the image src with or without a leading slash.
                'stars': len(row.select('td[width="15%"] img[src$="graph/star.png"]')),
                'bigstars': len(row.select('td[width="15%"] img[src$="graph/bigstar.png"]')),
                'date': row.select('td[align="right"]')[0].get_text(),
                'format': fmt,
                'meta_id': mtid
            })
        page += 1

    return events

In [None]:
# Smoke test: show the first five events of format 'ST', meta id '96'.
fetchFormatEvents('ST','96')[:5]

## fetchEventInfo

In [None]:
def fetchEventInfo(evtid):
    """Return the deck summaries shown on one event page.

    Downloads ``http://mtgtop8.com/event?e=<evtid>`` and reads one entry per
    deck box found in the ``td[width="25%"]`` column.

    Only the deck list is parsed. The event header (player count, date,
    title) is not part of the return value, so it is not read here and a page
    without that header text still yields its decks.

    Parameters
    ----------
    evtid : str or int
        Event id placed in the ``e`` query parameter.

    Returns
    -------
    list of dict
        One dict per deck with the keys ``_id`` (digits after ``&d=`` in the
        deck link), ``result``, ``title`` and ``player``.

    Raises
    ------
    IndexError
        If a deck box lacks its link, its ``&d=`` id or its centred result cell.
    """
    # The context manager closes the HTTP response once the page is parsed.
    with urlopen('http://mtgtop8.com/event?e={0}'.format(evtid)) as html:
        bs = BeautifulSoup(html, "html.parser")

    top_8_decks = []
    for o in bs.select('table td[width="25%"] > div > div:not([align="center"])'):
        anchor = o.select('div div a')[0]
        deck = {
            '_id': re.findall(r"\&d\=(\d*)", anchor['href'])[0],
            'result': o.select('div div[align=center]')[0].get_text(),
            'title': anchor.get_text(),
            # NOTE(review): 'player' is read from the same anchor as 'title',
            # so both fields always hold the same text. Presumably the player
            # name lives in a different element; confirm against the page
            # markup before relying on this column.
            'player': anchor.get_text()
        }
        top_8_decks.append(deck)

    return top_8_decks

In [None]:
# Smoke test: deck summaries for event 22539.
fetchEventInfo('22539')

## fetchDeck

In [None]:
def _parseCardRows(table):
    """Return ``(card, count)`` tuples for every card row inside ``table``."""
    cards = []
    for o in table.select('tr td div span'):
        # The parent's text reads "<count> <card name...>"; split on whitespace.
        pkg = o.parent.get_text().split()
        cards.append((' '.join(pkg[1:]), pkg[0]))
    return cards


def fetchDeck(evtid, plyid):
    """Download one deck list from an event page.

    Fetches ``http://mtgtop8.com/event?e=<evtid>&d=<plyid>``. Among the
    triple-nested tables on the page, the last one is read as the sideboard
    and all the others as the main deck.

    Parameters
    ----------
    evtid : str or int
        Event id placed in the ``e`` query parameter.
    plyid : str or int
        Deck id placed in the ``d`` query parameter.

    Returns
    -------
    dict
        Keys ``player`` and ``result`` (text of the highlighted ``.chosen_tr``
        row) plus ``main_deck`` and ``sideboard``, each a list of
        ``(card, count)`` tuples. ``count`` is kept as the text token found on
        the page, not converted to a number.

    Raises
    ------
    IndexError
        If the page has no nested tables, a card row has no text, or the
        highlighted row is missing.
    """
    # The context manager closes the HTTP response once the page is parsed.
    with urlopen('http://mtgtop8.com/event?e={0}&d={1}'.format(evtid, plyid)) as html:
        bs = BeautifulSoup(html, "html.parser")

    tables = bs.select('table table table')

    main_deck = []
    for t in tables[0:-1]:
        main_deck.extend(_parseCardRows(t))

    sideboard = _parseCardRows(tables[-1])

    deck = {
        'player': bs.select('table .chosen_tr [align=right] .topic')[0].get_text(),
        'result': bs.select('table .chosen_tr [align=center]')[0].get_text(),
        'main_deck': main_deck,
        'sideboard': sideboard
    }

    return deck

In [None]:
# Smoke test: full deck list for deck 390867 of event 25681.
fetchDeck('25681','390867')
# Disabled alternative: serialise the same result to a JSON string.
#json.dumps(fetchDeck('25681','390867'))

# Main Scraping

In [None]:
import pandas as pd

In [None]:
# Flatten the per-format {meta_id: meta_name} dictionaries into three
# parallel lists, one entry per (format, metagame) pair.
fmts, _ids, names = [], [], []

for fmt in formats:
    dict_aux = fetchMetaId(fmt)
    _ids.extend(dict_aux.keys())
    names.extend(dict_aux.values())
    # Repeat the format code once per metagame so the lists stay aligned.
    fmts.extend(fmt for _ in dict_aux)

all_meta_info = dict(meta_id=_ids, meta_name=names, meta_format=fmts)

In [None]:
# One row per (format, metagame) pair; columns are the keys of all_meta_info.
df_meta = pd.DataFrame.from_dict(all_meta_info)

In [None]:
# Preview the first rows of the metagame table.
df_meta.head()

In [None]:
# Persist the metagame table. to_csv writes the index by default, so the
# file gets a leading unnamed column.
df_meta.to_csv('df_meta.csv')

In [None]:
# Accumulator for the event dicts of every (format, metagame) pair.
# Re-run this cell to reset it before re-running the scrape loop below.
all_events = []

In [None]:
meta_ids = all_meta_info['meta_id']
meta_formats = all_meta_info['meta_format']

# Scrape the event list of every (format, metagame) pair. A failure on one
# pair is reported and skipped so the remaining pairs are still fetched.
for i in tqdm(range(len(meta_formats))):
    try:
        all_events.extend(fetchFormatEvents(meta_formats[i], meta_ids[i]))
    except Exception as exc:
        # `Exception` rather than a bare except, so a kernel interrupt
        # (KeyboardInterrupt) still stops the loop; the cause is printed too.
        print('error en: ', meta_formats[i], meta_ids[i], repr(exc))

In [None]:
# Checkpoint the raw event dicts. A non-empty list of dicts becomes an
# object array, which np.save stores via pickle, so reading it back needs
# np.load(..., allow_pickle=True).
np.save('all_events.npy', np.array(all_events))

In [None]:
# Pivot the list of event dicts into one list per column, prefixing every
# key with 'event_'.
dict_events = {
    'event_' + field: []
    for field in ('title', '_id', 'stars', 'bigstars', 'date', 'format', 'meta_id')
}
for e in all_events:
    for k in e:
        dict_events['event_'+k].append(e[k])

In [None]:
# One row per scraped event; columns are the 'event_*' keys of dict_events.
df_events = pd.DataFrame.from_dict(dict_events)

In [None]:
# Preview the first rows of the event table.
df_events.head()

In [None]:
# Persist the event table. The index is written too, so reading this file
# back yields an extra 'Unnamed: 0' column (dropped further down).
df_events.to_csv('df_events.csv')

In [None]:
# Root folder under which one sub-folder per event id is created.
main_dir = './events'

In [None]:
# Download, for every event, its deck summary table and one JSON per deck:
#   <main_dir>/<event id>/players_info.csv
#   <main_dir>/<event id>/players_decks/player_<deck id>_deck.json
errors_id = []

# os.mkdir below is not recursive, so make sure the root folder exists first.
os.makedirs(main_dir, exist_ok=True)

for id_event in tqdm(df_events['event__id']):
    path_event = os.path.join(main_dir, str(id_event))
    if os.path.exists(path_event):
        # An existing folder marks the event as already scraped.
        continue

    created = False
    try:
        os.mkdir(path_event)
        created = True

        df_players = pd.DataFrame(
            fetchEventInfo(id_event),
            columns=['_id', 'result', 'title', 'player'],
        ).add_prefix('player_')
        df_players.to_csv(os.path.join(path_event, 'players_info.csv'))

        path_players_decks = os.path.join(path_event, 'players_decks')
        os.mkdir(path_players_decks)
        for id_player in df_players['player__id']:
            # Download before opening the file so a failed request cannot
            # leave an empty JSON file behind.
            deck = fetchDeck(id_event, id_player)
            path_player_deck = os.path.join(path_players_decks, 'player_{0}_deck.json'.format(id_player))
            with open(path_player_deck, 'w') as f:
                json.dump(deck, f)
    except Exception as exc:
        # `Exception` rather than a bare except, so a kernel interrupt still
        # stops the loop. Remove the half-written folder: otherwise the
        # existence check above would skip this event on every later run.
        if created:
            shutil.rmtree(path_event, ignore_errors=True)
        errors_id.append(id_event)
        print('error en: ', id_event, repr(exc))

In [None]:
# Save the ids of the events whose download raised an error.
np.save('vector_errores.npy',np.array(errors_id))

In [None]:
# Names of the event folders under main_dir that contain no entries at all.
empy_dirs = [
    name
    for name in os.listdir(main_dir)
    if len(os.listdir(os.path.join(main_dir, name))) == 0
]
len(empy_dirs)

In [None]:
# Save the names of the empty event folders before they are deleted.
np.save('empy_dirs.npy',np.array(empy_dirs))

In [None]:
# Delete the empty event folders. Destructive: rmtree removes each folder
# listed in empy_dirs from disk.
for fol in empy_dirs:
    shutil.rmtree(os.path.join(main_dir, fol))

In [None]:
# Row labels of df_all_events to discard; filled in by a later cell.
indices_errores = []

# Event ids that have a folder on disk (each folder name is parsed as an int).
actual_events = [int(folder) for folder in os.listdir(main_dir)]

# Reload the saved event table, keeping the first row of each event id.
df_all_events = (
    pd.read_csv("df_events.csv")
    .drop_duplicates(subset="event__id")
)

In [None]:
# Preview the de-duplicated event table read back from disk.
df_all_events.head()

In [None]:
# Collect the labels of the rows whose event has no folder on disk.
# A set makes each membership test O(1) instead of scanning the whole list
# of folder ids once per row.
scraped_ids = set(actual_events)
for ind, raw_id in tqdm(df_all_events['event__id'].items(), total=len(df_all_events)):
    if int(raw_id) not in scraped_ids:
        indices_errores.append(ind)

In [None]:
# Remove the events that have no folder on disk. errors='ignore' keeps the
# cell re-runnable: labels already removed by a previous run are skipped
# instead of raising KeyError.
df_all_events = df_all_events.drop(indices_errores, errors='ignore')

In [None]:
# Drop the index column that came back from the CSV as 'Unnamed: 0'.
# errors='ignore' keeps the cell re-runnable once the column is gone.
df_all_events = df_all_events.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Save the filtered event table; index=False so no unnamed index column is
# written this time.
df_all_events.to_csv('df_events_v2.csv', index=False)