In [None]:
#Selenium had to be used to collect the rating data because the website is dynamically rendered: you
#have to manually select the rating you want from a menu, and the corresponding table is then rendered
#in place, with no redirection.

#To get each athlete's personal data, you must access a specific URL for each one of them,
#so the Selenium web driver is no longer needed and Scrapy is used instead because of its efficiency.

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import pandas as pd
import numpy as np
import time

In [None]:
# Load the base rating table; the first CSV column is used as the DataFrame index.
# NOTE(review): presumably 'rating.csv' is the output of the earlier Selenium step - confirm.
df_base = pd.read_csv('rating.csv', index_col=0)

In [None]:
# Column holding each athlete's detail-page URL; the spider uses it as its start URLs.
urls = df_base['URL detalhes']

In [None]:
class Spider(scrapy.Spider):
    """Crawl every athlete detail page and emit one record per athlete.

    The pages to visit come from the module-level ``urls`` collection. Each
    successfully parsed page yields a dict with the athlete's personal data and
    the list of championships found in the events grid; the feed-export
    settings below write those dicts to ``rating_detalhado.csv``.
    """

    name = "scrap"
    # Schedule only real URL strings. A missing value (pandas NaN) is a float,
    # and building a Request from it raises TypeError, which stops Scrapy from
    # consuming the remaining start URLs.
    start_urls = [url for url in urls if isinstance(url, str)]
    count = 1  # 1-based position of the response currently being parsed
    total = len(start_urls)
    _started_at = None  # wall-clock time of the first parsed response

    custom_settings = {
        # Native bool/int/float values: Scrapy coerces the string forms to the
        # same values, but a plain string such as 'False' is truthy in Python.
        'LOG_ENABLED': False,
        # NOTE(review): FEED_FORMAT/FEED_URI are deprecated in recent Scrapy
        # releases in favour of FEEDS - check the installed version before migrating.
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'rating_detalhado.csv',

        'USER_AGENT': 'Bárbara Gomes (barbaragomes@ufmg.br)',
        'ROBOTSTXT_OBEY': True,

        'CONCURRENT_REQUESTS_PER_DOMAIN': 32,
        'CONCURRENT_REQUESTS': 32,
        'DOWNLOAD_DELAY': 0.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,

        'AUTOTHROTTLE_ENABLED': False,
        'HTTPCACHE_ENABLED': True,
    }

    def parse(self, response):
        """Extract one athlete record from a detail page.

        Yields a single dict with the keys 'Nome', 'Estado', 'Clube', 'Idade',
        'Categoria', 'Rating', 'Pontos', 'Colocação', 'URL detalhes' and
        'Eventos' (a list of championship dicts).

        Yields nothing when the page has no state-flag image or no category
        label of at least two characters, so that URL stays absent from the
        exported file and can be detected as missing afterwards.
        """
        self._report_progress()

        flag_src = response.css('#imgBandeiraUF ::attr(src)').get()
        category_text = response.css('#lblCategoria::text').get()
        if flag_src is None or category_text is None or len(category_text) < 2:
            # These fields are sliced/indexed below; without them no record
            # can be built for this page.
            self.logger.warning(
                'Skipping %s: state flag or category label is missing',
                response.request.url,
            )
            return

        championships = []
        for cell in response.css('#grideventos_DXMainTable td'):
            event = self._event_record(cell.css('span::text').getall())
            if event is not None:
                championships.append(event)

        yield {
            'Nome': response.css('#lblNome::text').get(),
            # Two characters that precede the last four of the image path
            # (presumably the state code before an extension such as '.png').
            'Estado': flag_src[-6:-4],
            'Clube': response.css('#lblClube::text').get(),
            'Idade': response.css('#lblIdade::text').get(),
            # Category is the last character of the label, rating the second one.
            'Categoria': category_text[-1],
            'Rating': category_text[1],
            'Pontos': response.css('#lblPontos::text').get(),
            'Colocação': response.css('#lblColocacao::text').get(),
            'URL detalhes': response.request.url,
            'Eventos': championships,
        }

    @staticmethod
    def _event_record(cells):
        """Map the span texts of one grid cell to a championship dict.

        Returns None for cells with fewer than four texts (empty or otherwise
        not an event row). With exactly four texts the 'Ganho' value is taken
        to be the missing one and is stored as the string 'NaN'.
        """
        if len(cells) < 4:
            return None
        if len(cells) == 4:
            # 'Ganho' is absent, so the fourth text is actually 'Final'.
            date, championship, initial, final = cells
            gain = 'NaN'
        else:
            date, championship, initial, gain, final = cells[:5]
        return {
            'Data': date,
            'Campeonato': championship,
            'Inicial': initial,
            'Ganho': gain,
            'Final': final,
        }

    def _report_progress(self):
        """Print a single-line progress/ETA indicator and advance the counter."""
        now = time.time()
        if self._started_at is None:
            # Timing starts with the first response, so the spider does not
            # depend on a global variable defined in another cell.
            self._started_at = now
        elapsed = now - self._started_at
        minutes_left = elapsed * (self.total - self.count) / (self.count * 60)
        gap = ' ' * 15  # fixed spacing between the fields of the status line
        print(f'\rParsing {self.count}/{self.total}{gap}'
              f'Elapsed minutes = {elapsed/60}{gap}'
              f'Minutes left = {minutes_left}', end='')
        self.count += 1

In [None]:
# Start from an empty export file. Scrapy's local-file feed storage appends to
# an existing file, so a second run would otherwise duplicate every row (and
# the CSV header) in the output. Assumes FEED_URI is a plain local path.
with open(Spider.custom_settings['FEED_URI'], 'w'):
    pass

process = CrawlerProcess(get_project_settings())
process.crawl(Spider)
start_time = time.time()  # wall-clock reference taken right before the crawl starts
# Blocks until the crawl finishes. The underlying Twisted reactor cannot be
# restarted, so running this cell again requires a fresh kernel.
process.start()

In [None]:
df = pd.read_csv('rating_detalhado.csv')
# Count the distinct, non-null base URLs that have no scraped record. A
# symmetric difference (concat + drop_duplicates(keep=False)) would also count
# scraped URLs absent from the base table and would drop base URLs that are
# duplicated, so membership is tested in one direction only.
expected_urls = df_base['URL detalhes'].dropna().drop_duplicates()
errorCount = int((~expected_urls.isin(df['URL detalhes'])).sum())
print(f"{errorCount} records couldn't be scraped")

In [None]:
# Base-table URLs with no matching row in the scraped output, nulls excluded.
not_scraped = ~df_base['URL detalhes'].isin(df['URL detalhes'])
missing = df_base.loc[not_scraped, 'URL detalhes'].dropna().values
missing