# Collecting the undergraduate monographs of UFAL-Penedo Information Systems course graduates

## 1. Importing the required libraries

In [None]:
# Importing the required libraries.
import re, traceback, csv, pandas as pd, time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from unidecode import unidecode
from playwright.async_api import async_playwright, expect
from twisted.internet.error import TCPTimedOutError, TimeoutError
import playwright._impl._errors as errors

## 2. Defining the Spider class

In [None]:
class SpiderUfalSI:
    """Crawl a paginated publication listing and collect each publication's details.

    The spider drives a headless Chromium instance through Playwright's async
    API: it walks the listing pages starting at ``url``, gathers the link of
    every publication card, then opens each link and parses its detail rows
    with BeautifulSoup.
    """

    def __init__(self, url):
        """Store the listing URL and initialise the (lazily created) browser handles.

        Args:
            url: Address of the first listing page; relative publication links
                are resolved against it.
        """
        self.__url_base = url
        self.__max_attempts = 2
        self.__user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/122.0.0.0 Safari/537.36 OPR/108.0.0.0")
        self.__playwright = None
        self.__browser = None
        self.__page = None

    @staticmethod
    def __clean(text):
        """Collapse every run of whitespace in ``text`` to one space and trim the ends."""
        return re.sub(r"\s+", " ", text).strip()

    async def __close(self):
        """Close the browser and stop Playwright, best effort, and reset the handles.

        The handles are set back to ``None`` so that the next ``__get_html``
        call starts a fresh driver instead of reusing a stopped one. Errors
        raised while shutting down are reported but not propagated, so cleanup
        never hides the result (or the original error) of a crawl.
        """
        browser, playwright = self.__browser, self.__playwright
        self.__page = None
        self.__browser = None
        self.__playwright = None
        if browser is not None and browser.is_connected():
            try:
                await browser.close()
            except Exception as e:
                print(f"[ERROR-DEBUG] {e}: closing the browser")
        if playwright is not None:
            try:
                await playwright.stop()
            except Exception as e:
                print(f"[ERROR-DEBUG] {e}: stopping Playwright")

    async def __get_html(self, url, css_selector=None, to_close=True):
        """Open ``url`` in the shared page and return the rendered HTML.

        Playwright, the browser and the page are created on first use and
        re-created when the previous ones are gone.

        Args:
            url: Address to navigate to.
            css_selector: When given, wait until this selector matches before
                reading the page content.
            to_close: When true, shut the browser and Playwright down after
                the HTML has been read.

        Returns:
            The page content as a string.
        """
        if self.__playwright is None:
            self.__playwright = await async_playwright().start()
        if self.__browser is None or not self.__browser.is_connected():
            self.__browser = await self.__playwright.chromium.launch(headless=True, args=["--start-maximized"])
        if self.__page is None or self.__page.is_closed():
            self.__page = await self.__browser.new_page(user_agent=self.__user_agent)
        await self.__page.goto(url)
        if css_selector is not None:
            await self.__page.wait_for_selector(css_selector)
        html = await self.__page.content()
        if to_close:
            await self.__close()
        return html

    async def __collect_links(self, css_sel):
        """Return the ``href`` of every publication card on the current listing page.

        Args:
            css_sel: Selector of the container that holds the cards; the method
                waits for it before reading the links.
        """
        await self.__page.locator(css_sel).wait_for()
        anchors = self.__page.locator(f"{css_sel} > div[class='card-wrapper'] > a")
        return [await anchor.get_attribute("href") for anchor in await anchors.all()]

    async def __collect_data(self, url):
        """Open a publication page and return its detail rows as a dict.

        Each ``div.row`` directly inside a ``list-group-item`` contributes one
        entry: the text of its first ``<strong>`` is the key and the text of
        its first ``<div>`` is the value, both whitespace-normalised.

        Args:
            url: Absolute address of the publication page.

        Returns:
            A ``{label: value}`` dict; a repeated label keeps its last value.
        """
        await self.__page.goto(url)
        html = await self.__page.content()
        soup = BeautifulSoup(html, "html.parser")
        record = {}
        for row in soup.css.select("div[class='list-group-item'] > div.row"):
            label = row.css.select_one("strong")
            value = row.css.select_one("div")
            # A row without a label or a value carries no usable field; skip it
            # rather than let one malformed row abort the whole crawl.
            if label is None or value is None:
                continue
            record[self.__clean(label.get_text())] = self.__clean(value.get_text())
        return record

    async def extract_data(self, num_attempt=0):
        """Crawl every listing page and return one record per publication.

        On any error the failure is printed and the whole crawl is restarted
        while ``num_attempt <= self.__max_attempts``; with the default limit
        of 2 that allows up to four runs in total.
        NOTE(review): if "max attempts" was meant as the total number of runs,
        this comparison is off by one - confirm the intent before changing it.

        The browser and Playwright are always shut down at the end of a run,
        whether it succeeded or failed.

        Args:
            num_attempt: Number of retries already performed (callers normally
                leave the default).

        Returns:
            A list of ``{label: value}`` dicts, or ``None`` when every attempt
            failed.
        """
        try:
            links = []
            css_sel = "section#content > div:nth-child(2) > div > div:nth-child(2)"
            await self.__get_html(self.__url_base, css_sel, False)
            while True:
                links.extend(await self.__collect_links(css_sel))
                button = self.__page.locator("ul.pagination > li:nth-child(3) > a")
                if not await button.is_visible():
                    break
                await button.click()
                # Pause for the next page without blocking the event loop:
                # time.sleep() would stall Playwright's own async machinery.
                await self.__page.wait_for_timeout(2000)
            records = []
            for link in links:
                records.append(await self.__collect_data(urljoin(self.__url_base, link)))
            return records
        except Exception as e:
            # Boundary handler: Playwright, network and parsing errors all end
            # up here so that the crawl can be retried from scratch.
            print(f"[ERROR-DEBUG] {e}: {self.__url_base}")
            print("".join(traceback.format_tb(e.__traceback__)))
        finally:
            await self.__close()
        # Only reached when the attempt above failed.
        if num_attempt <= self.__max_attempts:
            num_attempt += 1
            print(f"Number of attempting in 'extract_data': {num_attempt}")
            # Propagate the retry's result; without `return` a successful
            # retry would still hand `None` back to the caller.
            return await self.extract_data(num_attempt)
        return None

## 3. Collecting the data

In [None]:
# Target listing page: the repository's publication list, filtered by
# `curso_id__id=27` and starting at page 1.
url = (
    "https://ud10.arapiraca.ufal.br/repositorio/publicacoes/"
    "?curso_id__id=27&page_num=1"
)

In [None]:
# Getting webdriver.
spider = SpiderUfalSI(url)

# Collecting the data.
records = await spider.extract_data()

## 4. Saving the data

In [None]:
# Mapping from the Portuguese field labels found in the records to the English
# column names used in the exported file.
cols = {
    "Autor(a)": "authors",
    "Ano de publicação": "publication_year",
    "Data da defesa": "defense_date",
    "Curso/Outros": "course",
    "Número de folhas": "num_pages",
    "Tipo": "type_document",
    "Local": "defense_local",
    "Resumo": "abstract_ptbr",
    "Abstract": "abstract_en",
    "Orientador(a)": "advisor",
    "Coorientador(a)": "co-advisor",
    "Banca Examinadora": "examining_board",
    "Palavras-chave": "keywords",
    "Áreas do Conhecimento/Localização": "subject_areas",
    "Categorias CNPQ": "CNPq_categories",
    "Anexos": "attachments",
    "Visualizações": "views",
    "Observações": "observations",
}

# Build the table, apply the English column names and write it as a
# comma-separated file with every field quoted and no index column.
df = pd.DataFrame(records).rename(columns=cols)
df.to_csv("data_ufal.csv", index=False, sep=",", quoting=csv.QUOTE_ALL)