# Web Scraping of RI Ufal - Arapiraca Campus

## 1. Importing the required libraries

In [None]:
# Importing the required libraries.
import re, traceback, csv, pandas as pd, time, numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from unidecode import unidecode
from playwright.async_api import async_playwright, expect
from twisted.internet.error import TCPTimedOutError, TimeoutError
import playwright._impl._errors as errors

## 2. Defining and executing the Spider class

In [None]:
class SpiderUfalSI:
    """Spider that collects publication records from one repository listing.

    The spider opens the listing URL in a headless Chromium browser driven by
    Playwright, walks through the pagination collecting the link of every
    publication card, then visits each publication page and parses its
    metadata with BeautifulSoup.
    """

    def __init__(self, url):
        """Store the listing URL and the scraping settings.

        No browser is started here; it is launched lazily on first use.
        """
        self.__url_base = url
        self.__max_attempts = 2
        self.__user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/122.0.0.0 Safari/537.36 OPR/108.0.0.0")
        self.__playwright = None
        self.__browser = None
        self.__page = None

    @staticmethod
    def __squash(text):
        """Collapse every run of whitespace in *text* to one space and trim it."""
        return re.sub(r"\s+", " ", text).strip()

    async def __shutdown(self):
        """Close the browser, stop the Playwright driver and forget both."""
        browser, playwright = self.__browser, self.__playwright
        # Reset the handles first: a stopped driver must never be reused, so
        # the next __get_html call has to start a fresh one.
        self.__playwright = self.__browser = self.__page = None
        try:
            if browser is not None and browser.is_connected():
                await browser.close()
        finally:
            if playwright is not None:
                await playwright.stop()

    async def __release(self):
        """Best-effort shutdown: report cleanup failures instead of raising."""
        try:
            await self.__shutdown()
        except Exception as e:
            print(f"[ERROR-DEBUG] {e}: failed to release the browser")

    async def __get_html(self, url, css_selector=None, to_close=True):
        """Navigate to *url* and return the rendered HTML.

        Args:
            url: Address to open.
            css_selector: When given, wait until an element matching it exists
                before reading the page content.
            to_close: When true, close the browser and stop Playwright after
                the content has been read.

        Returns:
            The page content as a string.
        """
        if self.__playwright is None:
            self.__playwright = await async_playwright().start()
        if self.__browser is None or not self.__browser.is_connected():
            self.__browser = await self.__playwright.chromium.launch(headless=True, args=["--start-maximized"])
        if self.__page is None or self.__page.is_closed():
            self.__page = await self.__browser.new_page(user_agent=self.__user_agent)
        await self.__page.goto(url)
        if css_selector is not None:
            await self.__page.wait_for_selector(css_selector)
        html = await self.__page.content()
        if to_close:
            await self.__shutdown()
        return html

    async def __collect_links(self, css_sel):
        """Return the ``href`` of every card link inside the *css_sel* container."""
        await self.__page.locator(css_sel).wait_for()
        css_sel = f"{css_sel} > div[class='card-wrapper'] > a"
        anchors = await self.__page.locator(css_sel).all()
        return [await anchor.get_attribute("href") for anchor in anchors]

    async def __collect_data(self, url):
        """Open one publication page and parse it into a record.

        Returns:
            A dict with one entry per labelled row of the page (label text ->
            value text), plus ``"title"`` and ``"Anexos"`` (absolute URL of
            the attachment, or ``None`` when the page has no such link).
        """
        await self.__page.goto(url)
        soup = BeautifulSoup(await self.__page.content(), "html.parser")
        record = {}
        for row in soup.select("div[class='list-group-item'] > div.row"):
            label, value = row.select_one("strong"), row.select_one("div")
            # A row without a label or a value carries no field; skipping it
            # keeps one odd row from aborting the whole crawl.
            if label is None or value is None:
                continue
            record[self.__squash(label.get_text())] = self.__squash(value.get_text())
        title = soup.select_one("h3#pub-titulo")
        record["title"] = self.__squash(title.get_text()) if title is not None else None
        attachment = soup.select_one("div[class='list-group-item'] > div.row > div:nth-child(2) > a")
        if attachment is not None and attachment.get("href"):
            record["Anexos"] = urljoin(url, attachment["href"])
        else:
            record["Anexos"] = None
        return record

    async def __crawl(self):
        """Run one full pass: paginate the listing, then visit every publication."""
        css_sel = "section#content > div:nth-child(2) > div > div:nth-child(2)"
        links = []
        await self.__get_html(self.__url_base, css_sel, False)
        while True:
            links.extend(await self.__collect_links(css_sel))
            button = self.__page.locator("ul.pagination > li:nth-child(3) > a")
            if not await button.is_visible():
                break
            await button.click()
            # Give the next page time to render without blocking the event
            # loop (time.sleep would stall every other coroutine).
            await self.__page.wait_for_timeout(2000)
        records = []
        # Cards without an href are dropped; None would also break sorting.
        for link in sorted({href for href in links if href}):
            records.append(await self.__collect_data(urljoin(self.__url_base, link)))
        return records

    async def extract_data(self, num_attempt=0):
        """Collect every publication record reachable from the listing URL.

        A failed pass is logged and retried while ``num_attempt`` has not
        exceeded the configured maximum. The browser is released after every
        pass, successful or not.

        Args:
            num_attempt: Number of retries already consumed.

        Returns:
            A list of record dicts. The list is empty when every attempt
            failed, so callers can always extend/iterate the result.
        """
        while True:
            try:
                return await self.__crawl()
            except Exception as e:
                print(f"[ERROR-DEBUG] {e}: {self.__url_base}")
                print("".join(traceback.format_tb(e.__traceback__)))
            finally:
                await self.__release()
            if num_attempt > self.__max_attempts:
                return []
            num_attempt += 1
            print(f"Number of attempting in 'extract_data': {num_attempt}")

In [None]:
# Listing pages to crawl: the first page of the publications of each course id.
_LISTING_URL = "https://ud10.arapiraca.ufal.br/repositorio/publicacoes/?curso_id__id={}&page_num=1"
urls = [_LISTING_URL.format(course_id) for course_id in (27, 49, 8)]

In [None]:
# Creating the data repository.
data = list()
for url in urls:
    # Getting webdriver.
    spider = SpiderUfalSI(url)

    # Collecting the data.
    results = await spider.extract_data()

    # Merging the data.
    data.extend(results)

In [None]:
# Sanity check: number of records collected across all listing URLs.
len(data)

## 3. Preprocessing the data

In [None]:
# Function to clean any text.
def clean_text(text):
    """Normalize the whitespace and hyphenation of a scraped text.

    Byte-order marks and soft hyphens are removed, runs of whitespace are
    collapsed to a single space, the ends are trimmed and every "- " is
    joined to "-".

    Args:
        text: The raw string, or None.

    Returns:
        The cleaned string, or None when the input is None or fewer than two
        characters remain after cleaning.
    """
    if text is None:
        return None
    # Drop the invisible characters first: removing them after trimming could
    # leave stray spaces at the ends or doubled spaces in the middle.
    text = text.replace("\ufeff", "").replace("\xad", "")
    text = re.sub(r"\s+", " ", text).strip()
    text = text.replace("- ", "-")
    return text if len(text) > 1 else None

In [None]:
# Creating the dataframe: one row per scraped record, one column per field label.
df = pd.DataFrame(data)

In [None]:
# Listing the columns, non-null counts and dtypes of the raw dataset.
df.info()

In [None]:
# Handling the nullable values: NaN becomes None, which is what the
# normalization steps below test for (`x is not None`).
df.replace({np.nan: None}, inplace=True)

In [None]:
# Renaming the columns: field labels shown on the publication pages are
# mapped to the column names used by the rest of the notebook.
column_names = {
    "Tipo": "thesis_type",
    "Autor(a)": "authors",
    "Orientador(a)": "main_advisor",
    "Resumo": "pt_abstract",
    "Abstract": "en_abstract",
    "Palavras-chave": "auth_keywords",
    "Ano de publicação": "publication_year",
    "Data da defesa": "defense_date",
    "Curso/Outros": "course",
    "Número de folhas": "num_pages",
    "Local": "defense_local",
    "Banca Examinadora": "committee",
    "Coorientador(a)": "second_advisor",
    "Categorias CNPQ": "cnpq_area",
    "Áreas do Conhecimento/Localização": "publisher",
    "Observações": "observations",
    "Visualizações": "num_views",
    "Anexos": "document_url",
}
df.rename(columns=column_names, inplace=True)

In [None]:
# Lookup table of person names to fix: each key is a name variant and each
# value is the spelling it is replaced with when the authors, advisors and
# committee members are normalized further below.
# Keys are matched exactly, so every variant needs its own entry.
# "\xad" inside some keys is the soft hyphen character (U+00AD).
names_replace = {
    "Alejandro Cesar Frery Orgambide": "Alejandro César Frery Orgambide",
    "Alessandro Fabricio Garcia": "Alessandro Fabrício Garcia",
    "Allan Medeiros de Martins": "Allan de Medeiros Martins",
    "Alvaro Alvares de Carvalho César Sobrinho": "Álvaro Alvares de Carvalho César Sobrinho",
    "Alvaro Alvares de Carvalho Cesar Sobrinho": "Álvaro Alvares de Carvalho César Sobrinho",
    "Álvaro Alvares de Carvalho Cesar Sobrinho": "Álvaro Alvares de Carvalho César Sobrinho",
    "Álvaro Álvares de Carvalho César Sobrinho": "Álvaro Alvares de Carvalho César Sobrinho",
    "André Lage Freitas": "André Lages Freitas",
    "André Luiz Lins Aquino": "André Luiz Lins de Aquino",
    "Andre Luiz Lins de Aquino": "André Luiz Lins de Aquino",
    "Andre Luiz Lins de Aquino Aquino": "André Luiz Lins de Aquino",
    "André Magno Costa de Araujo": "André Magno Costa de Araújo",
    "Antonio Alfredo Ferreira Loureiro": "Antônio Alfredo Ferreira Loureiro",
    "António Fernando de Sousa Bezerra": "Antônio Fernando de Sousa Bezerra",
    "Antonio Marcus Nogueira Lima": "Antônio Marcus Nogueira de Lima",
    "Arturo Hernandez Dominguez": "Arturo Hernández-Domínguez",
    "Arturo Hernandez-Dominguez": "Arturo Hernández-Domínguez",
    "Arturo Hernández Domí\xadnguez": "Arturo Hernández-Domínguez",
    "Arturo Hernández Domínguez": "Arturo Hernández-Domínguez",
    "Arturo Hernández-Dominguéz": "Arturo Hernández-Domínguez",
    "Arturo Hernández-Domí\xadnguez": "Arturo Hernández-Domínguez",
    "Aydano Pomponet Machado": "Aydano Pamponet Machado",
    "Baldoino Fonseca dos Santos Neto" : "Baldoíno Fonseca dos Santos Neto",
    "Carlisson Borges Tenório": "Cárlisson Borges Tenório Galdino",
    "Cecí\xadlia Mary Fischer Rubira": "Cecília Mary Fischer Rubira",
    "Cleide Jane de Sa Araujo Costa": "Cleide Jane de Sá Araujo Costa",
    "Cleide Jane de Sá Araújo Costa": "Cleide Jane de Sá Araujo Costa",
    "Credine Silva de Menezes": "Crediné Silva de Menezes",
    "Dalgoberto Miguilino Pinho Junior": "Dalgoberto Miquilino Pinho Júnior",
    "Dalgoberto Miguilino Pinho Júnior": "Dalgoberto Miquilino Pinho Júnior",
    "David Bibiano Brito": "Davi Bibiano Brito",
    "Davy de Medeiros Baia": "Davy de Medeiros Baía",
    "Denys Felipe Souza Rocha": "Denys Fellipe Souza Rocha",
    "Diego Carvalho Nascimento": "Diego Carvalho do Nascimento",
    "Diego Dermeval Medeiros da Cunha Matos": "Diego Dermeval de Medeiros da Cunha Matos",
    "Diego Dermeval Medeiros da Cunha": "Diego Dermeval de Medeiros da Cunha Matos",
    "Diego Dermeval da Cunha Matos": "Diego Dermeval de Medeiros da Cunha Matos",
    "Erick de Andrade Barbosa": "Erick de Andrade Barboza",
    "Elvys Soares Alves": "Elvys Alves Soares",
    "Fabio Jose Coutinho da Silva": "Fábio José Coutinho da Silva",
    "Fabio José Coutinho da Silva": "Fábio José Coutinho da Silva",
    "Fábio Paraguaçu": "Fábio Paraguaçu Duarte da Costa",
    "Fabio Paraguaçu Duarte da Costa": "Fábio Paraguaçu Duarte da Costa",
    "Fernando Antonio Dantas Gomes Pinto": "Fernando Antônio Dantas Gomes Pinto",
    "Giseldo da Silva Neo": "Giseldo da Silva Néo",
    "Glauber Arthur nascimento da Silva": "Glauber Arthur Nascimento da Silva",
    "Guilherme Ataí\xadde Dias": "Guilherme Ataíde Dias",
    "Heitor Judiss Savino.": "Heitor Judiss Savino",
    "Ibsen Mateus Bittencourt": "Ibsen Mateus Bittencourt Santana Pinto",
    "Icaro Bezerra Queiroz de Araújo": "Ícaro Bezerra Queiroz de Araújo",
    "Icaro Bezerra Queiroz de Araujo": "Ícaro Bezerra Queiroz de Araújo",
    "Ig Ibert Bittencourt Santanta Pinto": "Ig Ibert Bittencourt Santana Pinto",
    "Ig Ibert Bitterncourt Santana Pinto": "Ig Ibert Bittencourt Santana Pinto",
    "Ibert Bittencourt Santana Pinto": "Ig Ibert Bittencourt Santana Pinto",
    "Ig Bert Bittencourt Santana Pinto": "Ig Ibert Bittencourt Santana Pinto",
    "Ig Ibert Bittencourt": "Ig Ibert Bittencourt Santana Pinto",
    "Ig Ibert Bittencourt Santana": "Ig Ibert Bittencourt Santana Pinto",
    "Italo Carlo Lopes Silva": "Ítalo Carlo Lopes Silva",
    "Jario Jose dos Santos Junior": "Jário José dos Santos Júnior",
    "Jáiro José dos Santos Júnior": "Jário José dos Santos Júnior",
    "Jário Santos": "Jário José dos Santos Júnior",
    "Jobson de Araujo Nascimento": "Jobson de Araújo Nascimento",
    "Jonathas Patrick Hermenegildo de. Azevedo": "Jonathas Patrick Hermenegildo de Azevedo",
    "Joseana Macedo Fechine": "Joseana Macêdo Fechine Régis de Araújo",
    "Josias jordão Andrade Alves": "Josias Jordão Andrade Alves",
    "Joao Marcelo de Almeida Gusmao Lyra": "João Marcelo de Almeida Gusmão Lyra",
    "Joao Marcos Travassos Romano": "João Marcos Travassos Romano",
    "Keila Barbosa Costa": "Keila Barbosa Costa dos Santos",
    "Leandro aparecido Villas": "Leandro Aparecido Villas",
    "Leandro de Melo Sales": "Leandro Melo de Sales",
    "Leopoldo Motta Texeira": "Leopoldo Motta Teixeira",
    "Leonardo de Melo Medeiros": "Leonardo Melo de Medeiros",
    "Leonardo Pereira Viana": "Leonardo Viana Pereira",
    "Lucas Benevides Viana Amorim": "Lucas Benevides Viana de Amorim",
    "Luis Cláudius Coradine": "Luís Cláudius Coradine",
    "Luiz Cláudio Ferreira da Silva Júnior": "Luiz Claúdio Ferreira da Silva Júnior",
    "Luiz Marcos Garcia Goncalves": "Luiz Marcos Garcia Gonçalves",
    "Marcelo Costa de Oliveira": "Marcelo Costa Oliveira",
    "Manoel Álvaro de Lins Freitas Neto": "Manoel Alvaro de Freitas Lins Neto",
    "Maria Alayde Mendonçaa da Silva": "Maria Alayde Mendonça da Silva",
    "Maria Cristina Tenório C. Escarpini": "Maria Cristina Tenório Cavalcante Escarpini",
    "Maria Cristina Tenório Cabral Cavalcante": "Maria Cristina Tenório Cavalcante Escarpini",
    "Maria Cristina Tenório Cabral Cavalcante Escarpini": "Maria Cristina Tenório Cavalcante Escarpini",
    "Marí\xada Del Rosario Girardi Gutiérrez": "María Del Rosario Girardi Gutiérrez",
    "Maurí\xadcio Marengoni": "Maurício Marengoni",
    "Nuno Manuel dos Santos Antunes": "Nuno Manoel dos Santos Antunes",
    "Olival de Gusmão Freitas Junior": "Olival de Gusmão Freitas Júnior",
    "Olival de Gusmão Freitas Jr": "Olival de Gusmão Freitas Júnior",
    "Orivaldo Vieira Santana Jr": "Orivaldo Vieira de Santana Júnior",
    "Orivaldo Vieira Santana Júnior": "Orivaldo Vieira de Santana Júnior",
    "Osvaldo Anibal Rosso": "Osvaldo Aníbal Rosso",
    "Otávio José Costa de. Albuquerque Júnior": "Otávio José Costa de Albuquerque Júnior",
    "Patricia Leone Espinheira Ospina": "Patrícia Leone Espinheira Ospina",
    "Patrik Henrique da Silva Brito": "Patrick Henrique da Silva Brito",
    "Petrucio Antonio Medeiros Barros": "Petrúcio Antônio Medeiros Barros",
    "Petrucio Antônio Medeiros Barros": "Petrúcio Antônio Medeiros Barros",
    "Petrúcio Antonio Medeiros Barros": "Petrúcio Antônio Medeiros Barros",
    "Rafael Amorim da Silva": "Rafael de Amorim Silva",
    "Rafael Amorim Silva": "Rafael de Amorim Silva",
    "Ranilson Oscar Araujo Paiva": "Ranilson Oscar Araújo Paiva",
    "Renato Ambrósio Jr": "Renato Ambrósio Júnior",
    "Renato Ambrósio Jr.": "Renato Ambrósio Júnior",
    "Rodrigo de Barros": "Rodrigo de Barros Paes",
    "Romulo Nunes de Oliveira": "Rômulo Nunes de Oliveira",
    "Seiji Isotoni": "Seiji Isotani",
    "Thales Vieira": "Thales Miranda de Almeida Vieira",
    "Thales Miranda Vieira": "Thales Miranda de Almeida Vieira",
    "Thomas Lewiner": "Thomas Maurice Lewiner"
}

In [None]:
# Normalizing the features "publication_year", and "defense_date".
df["publication_year"] = df["publication_year"].astype(np.int16)

# Hand-written defense dates (day/month/year), keyed by dataframe row label.
indexes = {
    120: "01/10/2016",
    157: "10/05/2012",
    158: "12/12/2011",
    159: "01/12/2012",
    161: "05/09/2012",
    162: "10/05/2012",
    163: "10/11/2011",
    333: "10/11/2012",
    335: "01/10/2012",
    336: "05/02/2012",
    337: "01/09/2012",
    339: "20/01/2011",
    340: "06/06/2011",
    341: "03/03/2011",
}
patched_rows = list(indexes)
df.loc[patched_rows, "defense_date"] = pd.Series(indexes)

# Every date, scraped or patched, is parsed with the same day/month/year format.
df["defense_date"] = pd.to_datetime(df["defense_date"], format="%d/%m/%Y")

In [None]:
# Creating the features "campus", and "department", as well as normalizing the column "course".
# Both new columns are derived from the raw course label, so they must be
# computed before that label is normalized at the end of this cell.
penedo_course = "Sistemas da Informação (U.E. Penedo)"
ead_course = "Sistemas de Informação (EaD)"

df["campus"] = df["course"].apply(
    lambda course: "U.E. Penedo" if course == penedo_course else "Campus Arapiraca")

# Courses without an entry here fall back to the default department.
departments = {
    penedo_course: "Núcleo de Sistemas de Informação",
    ead_course: "Instituto de Computação",
}
df["department"] = df["course"].apply(
    lambda course: departments.get(course, "Departamento de Ciências da Computação"))

# Assign the result back instead of calling replace(inplace=True) on
# `df.course`: that chained in-place call acts on a temporary object and does
# not update the dataframe when pandas Copy-on-Write is active.
df["course"] = df["course"].replace({
    "Sistemas de Informação (EaD)": "Sistemas de Informação",
    "Sistemas da Informação (U.E. Penedo)": "Sistemas de Informação",
    "Ciência da Computação": "Ciências da Computação",
})

In [None]:
# Normalizing the features "num_pages", and "num_views".
# Row 49 gets its page count set by hand before the conversion.
df.loc[49, "num_pages"] = 76
# A missing page count is encoded as -1.
df["num_pages"] = df["num_pages"].apply(
    lambda pages: -1 if pages is None else np.int16(pages))
df["num_views"] = df["num_views"].astype(np.int16)

In [None]:
# Normalizing the features "cnpq_area", "thesis_type", and "publisher".
# Every row receives the same value in each of these columns;
# "affil_accronym" is created here.
constant_columns = {
    "cnpq_area": "Ciência da Computação",
    "thesis_type": "Trabalho de Conclusão de Curso",
    "publisher": "Universidade Federal de Alagoas",
    "affil_accronym": "UFAL",
}
for column, value in constant_columns.items():
    df.loc[:, column] = value

In [None]:
# Normalizing the feature "authors".
def _normalize_authors(raw):
    """Turn the raw authors text into a tuple of canonical author names.

    Markup such as "<...>" is removed, the text is split on "." into one
    fragment per author, "Surname, Given" fragments are reordered to
    "Given Surname", and known variants are replaced through `names_replace`.

    Args:
        raw: The scraped authors string, or None.

    Returns:
        A tuple of names, or None when `raw` is None.
    """
    if raw is None:
        return None
    raw = re.sub(r"\<[^\<\>]+\>", "", raw).strip()
    authors = []
    for fragment in raw.split("."):
        # Fragments of at most one character are leftovers of the split.
        if len(fragment.strip()) <= 1:
            continue
        parts = fragment.split(",")
        if len(parts) > 1:
            # Built without nesting the same quote inside the f-string, which
            # is a syntax error before Python 3.12.
            name = f"{parts[-1].strip()} {parts[0].strip()}"
        else:
            name = fragment
        name = name.strip()
        authors.append(names_replace.get(name, name))
    return tuple(authors)


df["authors"] = df["authors"].apply(_normalize_authors)

In [None]:
# Normalizing the feature "main_advisor", "second_advisor", and "committee".
_people_columns = ["main_advisor", "second_advisor", "committee"]

# Academic titles to drop. `Dra?` covers both "Dr." and "Dra."; the previous
# pattern `Dr\..` swallowed the character after "Dr." and never matched
# "Dra.", which left a bogus "Dra" fragment behind.
_title_pattern = re.compile(r"\b(?:Me|Dra?|Ma|Esp)\.")


def _split_people(raw):
    """Return the people named in `raw` as a tuple of "Given Surname" strings.

    Titles are removed, the text is split on "." into one fragment per
    person, and "Surname, Given" fragments are reordered. None stays None.
    """
    if raw is None:
        return None
    people = []
    for fragment in _title_pattern.sub("", raw).strip().split("."):
        # Fragments of at most one character are leftovers of the split.
        if len(fragment.strip()) <= 1:
            continue
        parts = fragment.split(",")
        if len(parts) > 1:
            # Built without nesting the same quote inside the f-string, which
            # is a syntax error before Python 3.12.
            people.append(f"{parts[-1].strip()} {parts[0].strip()}")
        else:
            people.append(fragment.strip())
    return tuple(people)


def _canonical_people(people):
    """Replace known name variants in a tuple of names; None stays None."""
    if people is None:
        return None
    return tuple(names_replace[p] if p in names_replace else p.strip()
                 for p in people)


df.loc[:, _people_columns] = \
df.loc[:, _people_columns].apply(
    lambda column: column.apply(_split_people))

# When "main_advisor" lists several people, the first one stays as the main
# advisor and the remaining ones become the second advisor(s), overwriting
# whatever "second_advisor" held for that row.
filter_data = [len(a) > 1 if a is not None else False
               for a in df.main_advisor]
# Nothing to split when no row matches; skip the assignment in that case.
if any(filter_data):
    df.loc[filter_data, ["main_advisor", "second_advisor"]] = \
    df.loc[filter_data, "main_advisor"].apply(
        lambda x: pd.Series(
            index=["main_advisor", "second_advisor"],
            data=[tuple([x[0]]), tuple(x[1:])]))

df.loc[:, _people_columns] = \
df.loc[:, _people_columns].apply(
    lambda column: column.apply(_canonical_people))

In [None]:
# Removing the unnecessary columns.
unused_columns = ["defense_local", "num_pages", "observations"]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Normalizing the column "auth_keywords".
def _split_keywords(raw):
    """Split the raw keywords text on "." into a tuple of cleaned keywords."""
    cleaned = (clean_text(piece) for piece in raw.split("."))
    # clean_text returns None for pieces that carry no keyword.
    return tuple(keyword.strip() for keyword in cleaned if keyword is not None)


has_keywords = df["auth_keywords"].notnull()
df.loc[has_keywords, "auth_keywords"] = \
    df.loc[has_keywords, "auth_keywords"].apply(_split_keywords)

In [None]:
# Normalizing the columns "title", "pt_abstract", and "en_abstract".
# Only the non-null cells of each column go through clean_text.
for text_column in ("title", "pt_abstract", "en_abstract"):
    filled = df[text_column].notnull()
    df.loc[filled, text_column] = df.loc[filled, text_column].apply(clean_text)

In [None]:
# Handling the nullable values again: any NaN left after the preprocessing
# steps above becomes None.
df.replace({np.nan: None}, inplace=True)

In [None]:
# Checking the result: first rows of the preprocessed dataframe.
df.head()

In [None]:
# Showing the columns, non-null counts and dtypes of the preprocessed data.
df.info()

## 4. Saving the data

In [None]:
# Saving the data into a CSV file: no index column, every field quoted.
output_path = "../data/ufal_arapiraca_campus_thesis.csv"
df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)