In [4]:
import time
import numpy as np
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Data Extractor from QConcursos

In [17]:
def open_browser():
    """This function opens a Chrome browser using the main User Data, so it can retain login info.
    
    Returns
    -------
    WebDriver
        Selenium WebDriver object
    """
    options = webdriver.ChromeOptions()
    options.add_argument(r"--user-data-dir=C:\Users\alexa\AppData\Local\Google\Chrome\User Data\\")
    driver = webdriver.Chrome(executable_path='./resources/chromedriver.exe', options=options)
    return driver

def transform_query_for_url(query):
    return '%20'.join(query.split())



def get_data_question(web_question):
    """This function gets the data of a specific question from QConcursos
    
    Parameters
    ----------
    web_question: WebElement
        The data collected with Selenium about the question
    
    Returns
    -------
    dict
        The data about the question translated into a Python dict
    """
    question = dict()
    # Info: the data about where the question come from - its year, organization that formulated it ('banca'), the place for which the place is applying for,
    # and the name of the exam.
    info = web_question.find_element(By.CLASS_NAME, 'q-question-info')
    question['info'] = info.get_attribute('innerText')
    # Enunciado: the question, including its contextualization, before the answer options.
    enunciado = web_question.find_element(By.CLASS_NAME, 'q-question-enunciation')
    question['enunciado'] = enunciado.get_attribute('innerText')
    # Options: Between 2 and 5 answer options to the question.
    options = web_question.find_element(By.CLASS_NAME, 'q-question-options')
    question['options'] = options.get_attribute('innerText')
    return question

def read_30_pages(initial, driver, query, end=30, discipline_id=None):
    """This function reads thirty QConcursos webpages in sequence. This is needed because there is a block in the CloudFare Server if we read more than that
    sequencially.
    
    Parameters
    ----------
    initial: int
        The page in which we start reading the data
    driver: WebDriver
        The driver that will manage the browser
    end: int, optional
        The page in which we end our reading
    
    Returns
    -------
    list
        List of dicts with the questions, usign the get_data_question(question) function
    """
    questions = []

    internal_parameters = []
    if query:
        internal_parameters.append(f'q={transform_query_for_url(query)}')
    if discipline_id:
        internal_parameters.append(f'discipline_ids%5B%5D={discipline_id}')
    
    # End + 1 is needed so 30 pages are read, instead of 29
    for page in range(initial, end):
        # ECA:
        # link = f'https://www.qconcursos.com/questoes-de-concursos/questoes?discipline_ids%5B%5D=233&page={page}'
        # Lei Maria da Penha:
        url = f'https://www.qconcursos.com/questoes-de-concursos/questoes?{"&".join(internal_parameters)}&page={page}'
        # SINASE
        driver.get(link)
        try:
            WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, 'js-question-item')))
            for question in driver.find_elements(By.CLASS_NAME, 'js-question-item'):
                questions.append(get_data_question(question))
        except:
            time.sleep(10)
    return questions

# Extractor
def get_number_pages(driver, query, discipline_id=None):
    """This function will search for a given query, and see how many pages of questions it has.
    
    Parameters
    ----------
    driver: WebDriver
        The driver that will access the browser
        
    Returns
    -------
    int
        How many pages with the given question the site has."""
    internal_parameters = []
    if query:
        internal_parameters.append(f'q={transform_query_for_url(query)}')
    if discipline_id:
        internal_parameters.append(f'discipline_ids%5B%5D={discipline_id}')
    url = f'https://www.qconcursos.com/questoes-de-concursos/questoes?{"&".join(internal_parameters)}&page=1'
    driver.get(url)
    print(url)
    # The following code finds the text, which usually is 'Foram encontradas x questões'
    qtd_questions = driver.find_element(By.CLASS_NAME, 'q-page-results-title').get_attribute('innerText')
    # The following code gets only the number, removes the '.' and transforms into int
    qtd_questions = int(qtd_questions.split()[2].replace('.', ''))
    # Each page has at most 5 questions, so we can find the number of pages with a simple division
    qtd_pages = np.ceil(qtd_questions / 5)
    # The limit of the site is 1000 pages; 1001 forth are not displayed
    if qtd_pages > 1000:
        qtd_pages = 1000
    return int(qtd_pages)


In [None]:
https://www.qconcursos.com/questoes-de-concursos/questoes?discipline_ids%5B%5D=579

## Usage of the extractor

In [9]:
temas = ['estatuto deficiente', 'política saúde deficiente']

In [15]:
for tema in temas:
    print('Analisando', tema)
    questions = []
    driver = open_browser()
    qtd_pages = get_number_pages(driver, tema)
    driver.close()
    step=30

    for i in range(1, qtd_pages, step):
        driver = open_browser()
        questions.extend(read_30_pages(i, driver, tema, i+30))
        driver.close()
        if i + step > qtd_pages:
            step = qtd_pages - i
        else:
            step = 30

    # Only after extracting
    df_questions = pd.DataFrame()
    for i in range(len(questions)):
        df_questions = pd.concat([df_questions, pd.DataFrame(questions[i], index=[i])], axis=0)
    df_questions['ano'] = df_questions['info'].map(lambda x: x.split(' Banca')[0].split('Ano: ')[1])
    df_questions['banca'] = df_questions['info'].map(lambda x: x.split('Banca: ')[1].split()[0])
    df_questions['orgao'] = df_questions['info'].map(lambda x: x.split('Órgão: ')[1].split('Prova')[0])
    df_questions['prova'] = df_questions['info'].map(lambda x: x.split(':')[-1])
    now = datetime.now()
    df_questions.to_csv(f'data/{now.year}{now.month}{now.day} - {now.hour}-{now.minute}.csv', sep=';')

Analisando estatuto deficiente


  driver = webdriver.Chrome(executable_path='./resources/chromedriver.exe', options=options)


https://www.qconcursos.com/questoes-de-concursos/questoes?q=estatuto%20deficiente&page=1


SessionNotCreatedException: Message: session not created
from disconnected: received Inspector.detached event
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00508893+48451]
	(No symbol) [0x0049B8A1]
	(No symbol) [0x003A5058]
	(No symbol) [0x0039924D]
	(No symbol) [0x00398DDD]
	(No symbol) [0x00398175]
	(No symbol) [0x0039810C]
	(No symbol) [0x00396D46]
	(No symbol) [0x003975CA]
	(No symbol) [0x003A8067]
	(No symbol) [0x003A9631]
	(No symbol) [0x003A96D0]
	(No symbol) [0x003F1A58]
	(No symbol) [0x003F047C]
	(No symbol) [0x003EA0B6]
	(No symbol) [0x003C7E08]
	(No symbol) [0x003C8F2D]
	GetHandleVerifier [0x00768E3A+2540266]
	GetHandleVerifier [0x007A8959+2801161]
	GetHandleVerifier [0x007A295C+2776588]
	GetHandleVerifier [0x00592280+612144]
	(No symbol) [0x004A4F6C]
	(No symbol) [0x004A11D8]
	(No symbol) [0x004A12BB]
	(No symbol) [0x00494857]
	BaseThreadInitThunk [0x75C37D59+25]
	RtlInitializeExceptionChain [0x772DB74B+107]
	RtlClearBits [0x772DB6CF+191]


In [18]:
tema = temas[0]
print('Analisando', tema)
questions = []
driver = open_browser()
qtd_pages = get_number_pages(driver, None, 579)
driver.close()
step=30

for i in range(1, qtd_pages, step):
    driver = open_browser()
    questions.extend(read_30_pages(i, driver, None, i+30, 579))
    driver.close()
    if i + step > qtd_pages:
        step = qtd_pages - i
    else:
        step = 30

# Only after extracting
df_questions = pd.DataFrame()
for i in range(len(questions)):
    df_questions = pd.concat([df_questions, pd.DataFrame(questions[i], index=[i])], axis=0)
df_questions['ano'] = df_questions['info'].map(lambda x: x.split(' Banca')[0].split('Ano: ')[1])
df_questions['banca'] = df_questions['info'].map(lambda x: x.split('Banca: ')[1].split()[0])
df_questions['orgao'] = df_questions['info'].map(lambda x: x.split('Órgão: ')[1].split('Prova')[0])
df_questions['prova'] = df_questions['info'].map(lambda x: x.split(':')[-1])
now = datetime.now()
df_questions.to_csv(f'data/{now.year}{now.month}{now.day} - {now.hour}-{now.minute}.csv', sep=';')

Analisando estatuto deficiente


  driver = webdriver.Chrome(executable_path='./resources/chromedriver.exe', options=options)


https://www.qconcursos.com/questoes-de-concursos/questoes?discipline_ids%5B%5D=579&page=1


NameError: name 'link' is not defined

## Formating and saving the data