# Scraping de dados para a matrícula da UTFPR

### Carrega as bibliotecas & funções

In [1]:
import shutil
import re
from datetime import date, datetime
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
def find_table(soup: 'BeautifulSoup', pattern: str) -> 'str | None':
    """Return the HTML of the table that follows the first matching ``<h3>``.

    Every ``<h3>`` element of ``soup`` is searched for a text node that
    matches ``pattern`` in full, ignoring case. For the first heading that
    matches, the next ``<table>`` element in the document is returned.

    Args:
        soup: Parsed document. Only ``find_all`` is called on it, and
            ``find`` / ``find_next`` on the nodes it yields.
        pattern: Regular expression the heading text must match entirely.
            It is placed between ``^`` and ``$`` without escaping, so regex
            syntax such as character classes may be used.

    Returns:
        The table markup as a string, or ``None`` when no heading matches or
        the matching heading is not followed by any table.
    """
    # Compile once up front instead of once per heading.
    heading_re = re.compile(f'^{pattern}$', re.I)
    for h3 in soup.find_all('h3'):
        heading_text = h3.find(string=heading_re)
        if heading_text:
            table = heading_text.find_next('table')
            # str(None) would yield the truthy string 'None', which callers
            # would then try to parse as HTML; report "not found" instead.
            return None if table is None else str(table)
    return None

def process_table(html_table: str, campus_id: pd.DataFrame, enrollment_phase_id: int, year: int, semester: int) -> 'pd.DataFrame | None':
    """Convert one scraped schedule table into enrollment-schedule rows.

    The table is read positionally: column 0 is the campus name, columns 1-2
    are combined into the start timestamp and columns 3-4 into the end
    timestamp, both parsed with the format ``'%d/%m/%Y %Hh'``. Rows that
    contain any missing value are discarded.

    Args:
        html_table: HTML markup of a single ``<table>``; a falsy value means
            there is nothing to process.
        campus_id: Campus lookup table. It must provide ``CAMPUS_NAME`` and
            ``CAMPUS_ID`` columns.
        enrollment_phase_id: Value written to ``ENROLLMENT_PHASE_ID``.
        year: Value written to ``YEAR``.
        semester: Value written to ``SEMESTER``.

    Returns:
        ``None`` when ``html_table`` is falsy; otherwise a DataFrame with the
        columns ``CAMPUS_ID``, ``ENROLLMENT_PHASE_ID``, ``YEAR``, ``SEMESTER``,
        ``START_TIMESTAMP`` and ``END_TIMESTAMP``. The frame is empty when no
        row survives the missing-value filter.

    Raises:
        ValueError: If a date/hour pair does not match ``'%d/%m/%Y %Hh'``.
    """
    from io import StringIO  # local import: only needed to feed read_html

    if not html_table:
        return None

    output_columns = ['CAMPUS_ID', 'ENROLLMENT_PHASE_ID', 'YEAR', 'SEMESTER', 'START_TIMESTAMP', 'END_TIMESTAMP']

    def to_timestamp(value) -> str:
        # Render as 'YYYY-MM-DD HH:MM:SS', the str() form of a datetime.
        return str(datetime.strptime(str(value).strip(), '%d/%m/%Y %Hh'))

    # Passing literal HTML to read_html is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    table = pd.read_html(StringIO(html_table))[0].dropna()
    if table.empty:
        # Nothing usable: return an empty frame with the expected schema
        # rather than failing on the first-row lookup below.
        return pd.DataFrame(columns=output_columns)

    processed = pd.DataFrame()
    processed['CAMPUS'] = table.iloc[:, 0].apply(lambda x: unidecode(x).upper())
    processed['START_TIMESTAMP'] = (table.iloc[:, 1] + ' ' + table.iloc[:, 2]).apply(to_timestamp)
    processed['END_TIMESTAMP'] = (table.iloc[:, 3] + ' ' + table.iloc[:, 4]).apply(to_timestamp)
    processed['ENROLLMENT_PHASE_ID'] = enrollment_phase_id
    processed['YEAR'] = year
    processed['SEMESTER'] = semester

    # Positional access: dropna() keeps the original labels, so label 0 is
    # missing whenever the first scraped row was incomplete.
    if processed['CAMPUS'].iloc[0] == 'TODOS OS CAMPI':
        # One shared schedule: repeat the rows once per campus and attach the
        # campus columns side by side. Both frames get a fresh 0..n-1 index
        # so that the column-wise concat lines up row by row.
        repeated = pd.concat([processed] * len(campus_id.index), ignore_index=True, axis=0)
        processed = pd.concat([campus_id.reset_index(drop=True), repeated], axis=1)
    else:
        # Per-campus schedule: look up each campus ID by its normalized name.
        processed = processed.join(campus_id.set_index('CAMPUS_NAME'), on='CAMPUS')
    return processed[output_columns]

### Define pasta com arquivos CSV para carregamento/atualização

In [3]:
# Folder holding the CSV files; the CAMPUS and ENROLLMENT_PHASE lookup
# tables are read from here.
data_folder = 'data_csv'

### Carrega tabelas com códigos de campus & fase da matrícula

In [4]:
# Campus lookup table, stored as a semicolon-separated CSV.
# process_table() uses its CAMPUS_NAME and CAMPUS_ID columns.
campus_id = pd.read_csv(
    f'{data_folder}/CAMPUS.csv',
    sep=';',
)
campus_id

Unnamed: 0,CAMPUS_ID,CAMPUS_NAME,STUDENT_PORTAL_LINK
0,1,CURITIBA,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
1,2,CORNELIO PROCOPIO,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
2,3,CAMPO MOURAO,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
3,4,MEDIANEIRA,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
4,5,PATO BRANCO,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
5,6,PONTA GROSSA,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
6,7,DOIS VIZINHOS,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
7,8,LONDRINA,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
8,9,TOLEDO,https://sistemas2.utfpr.edu.br/dpls/sistema/al...
9,10,APUCARANA,https://sistemas2.utfpr.edu.br/dpls/sistema/al...


In [5]:
# Enrollment-phase lookup table, stored as a semicolon-separated CSV.
# The cells below map ENROLLMENT_PHASE_NAME to ENROLLMENT_PHASE_ID with it.
enrollment_phase_id = pd.read_csv(
    f'{data_folder}/ENROLLMENT_PHASE.csv',
    sep=';',
)
enrollment_phase_id

Unnamed: 0,ENROLLMENT_PHASE_ID,ENROLLMENT_PHASE_NAME,INFO
0,1,REQUERIMENTO,ATO DO ESTUDANTE
1,2,ANALISE DE TURMAS,ATO INTERNO
2,3,AJUSTE,ATO DO ESTUDANTE
3,4,INCLUSAO,ATO DO ESTUDANTE
4,5,INTERCAMPUS,ATO DO ESTUDANTE


### Define URL para scraping, faz uma requisição HTTP e carrega o resultado na memória

In [6]:
# Page that is fetched and parsed for the enrollment schedule.
url = 'https://portal.utfpr.edu.br/secretaria/matricula/cronograma-de-matricula'
# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it
# were the schedule.
r.raise_for_status()

soup = BeautifulSoup(r.text, 'html.parser')

### Verifica Ano/Semestre da matrícula

In [7]:
# The text after the last '-' in the page description is read as
# "<year>/<semester>" and both parts are converted to integers.
title = soup.find('div', class_='documentDescription description').get_text()
year_semester = title.rsplit('-', 1)[-1].strip()
parts = year_semester.split('/')
year = int(parts[0])
semester = int(parts[1])
print(f'Year: {year}\nSemester: {semester}')

Year: 2023
Semester: 1


### Requerimento

In [8]:
# Numeric ID of the 'REQUERIMENTO' phase. Stored as phase_id so that the
# builtin id() is not shadowed for the rest of the notebook.
phase_id = int(
    enrollment_phase_id.loc[
        enrollment_phase_id['ENROLLMENT_PHASE_NAME'] == 'REQUERIMENTO',
        'ENROLLMENT_PHASE_ID',
    ].iloc[0]
)

# Locate the table under the matching heading and convert it to schedule rows.
requerimento = process_table(
    find_table(soup, 'REQUERIMENTO'),
    campus_id=campus_id,
    enrollment_phase_id=phase_id,
    year=year,
    semester=semester,
)
requerimento

Unnamed: 0,CAMPUS_ID,ENROLLMENT_PHASE_ID,YEAR,SEMESTER,START_TIMESTAMP,END_TIMESTAMP
0,10,1,2023,1,2023-02-13 09:00:00,2023-02-14 18:00:00
1,3,1,2023,1,2023-02-10 09:00:00,2023-02-12 18:00:00
2,2,1,2023,1,2023-02-13 09:00:00,2023-02-14 18:00:00
3,1,1,2023,1,2023-02-16 09:00:00,2023-02-17 18:00:00
4,7,1,2023,1,2023-02-09 09:00:00,2023-02-12 18:00:00
5,11,1,2023,1,2023-02-13 09:00:00,2023-02-14 18:00:00
6,12,1,2023,1,2023-02-13 09:00:00,2023-02-14 18:00:00
7,8,1,2023,1,2023-02-10 09:00:00,2023-02-13 18:00:00
8,4,1,2023,1,2023-02-15 09:00:00,2023-02-17 18:00:00
9,5,1,2023,1,2023-02-10 09:00:00,2023-02-13 18:00:00


### Ajuste/Confirmação

In [9]:
# Numeric ID of the 'AJUSTE' phase. Stored as phase_id so that the builtin
# id() is not shadowed for the rest of the notebook.
phase_id = int(
    enrollment_phase_id.loc[
        enrollment_phase_id['ENROLLMENT_PHASE_NAME'] == 'AJUSTE',
        'ENROLLMENT_PHASE_ID',
    ].iloc[0]
)

# The heading pattern accepts '/' or a space as separator, with or without
# the accented characters.
ajuste = process_table(
    find_table(soup, 'ajuste[/ ]confirma[cç][aã]o'),
    campus_id=campus_id,
    enrollment_phase_id=phase_id,
    year=year,
    semester=semester,
)
ajuste

Unnamed: 0,CAMPUS_ID,ENROLLMENT_PHASE_ID,YEAR,SEMESTER,START_TIMESTAMP,END_TIMESTAMP
0,10,3,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
1,3,3,2023,1,2023-02-16 09:00:00,2023-02-16 18:00:00
2,2,3,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
3,1,3,2023,1,2023-02-27 10:00:00,2023-02-27 18:00:00
4,7,3,2023,1,2023-02-16 09:00:00,2023-02-16 18:00:00
5,11,3,2023,1,2023-02-16 09:00:00,2023-02-16 18:00:00
6,12,3,2023,1,2023-02-16 09:00:00,2023-02-16 18:00:00
7,8,3,2023,1,2023-02-16 09:00:00,2023-02-16 18:00:00
8,4,3,2023,1,2023-02-24 09:00:00,2023-02-24 18:00:00
9,5,3,2023,1,2023-02-16 09:00:00,2023-02-17 18:00:00


### Inclusão

In [10]:
# Numeric ID of the 'INCLUSAO' phase. Stored as phase_id so that the builtin
# id() is not shadowed for the rest of the notebook.
phase_id = int(
    enrollment_phase_id.loc[
        enrollment_phase_id['ENROLLMENT_PHASE_NAME'] == 'INCLUSAO',
        'ENROLLMENT_PHASE_ID',
    ].iloc[0]
)

# The heading pattern matches the name with or without the tilde.
inclusao = process_table(
    find_table(soup, 'inclus[aã]o'),
    campus_id=campus_id,
    enrollment_phase_id=phase_id,
    year=year,
    semester=semester,
)
inclusao

Unnamed: 0,CAMPUS_ID,ENROLLMENT_PHASE_ID,YEAR,SEMESTER,START_TIMESTAMP,END_TIMESTAMP
0,10,4,2023,1,2023-02-23 09:00:00,2023-02-26 18:00:00
1,3,4,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
3,2,4,2023,1,2023-02-23 09:00:00,2023-02-23 18:00:00
4,1,4,2023,1,2023-02-28 09:00:00,2023-02-28 18:00:00
5,7,4,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
6,11,4,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
7,12,4,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
8,8,4,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
9,4,4,2023,1,2023-02-25 09:00:00,2023-02-27 18:00:00
10,5,4,2023,1,2023-02-23 09:00:00,2023-02-24 18:00:00


### Intercampus

In [11]:
# Numeric ID of the 'INTERCAMPUS' phase. Stored as phase_id so that the
# builtin id() is not shadowed for the rest of the notebook.
phase_id = int(
    enrollment_phase_id.loc[
        enrollment_phase_id['ENROLLMENT_PHASE_NAME'] == 'INTERCAMPUS',
        'ENROLLMENT_PHASE_ID',
    ].iloc[0]
)

# Locate the table under the matching heading and convert it to schedule rows.
intercampus = process_table(
    find_table(soup, 'intercampus'),
    campus_id=campus_id,
    enrollment_phase_id=phase_id,
    year=year,
    semester=semester,
)
intercampus

Unnamed: 0,CAMPUS_ID,ENROLLMENT_PHASE_ID,YEAR,SEMESTER,START_TIMESTAMP,END_TIMESTAMP
0,1,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
1,2,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
2,3,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
3,4,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
4,5,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
5,6,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
6,7,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
7,8,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
8,9,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
9,10,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00


### Faz backup do arquivo antigo e atualiza dados

In [12]:
# Stack the four phase tables, then order the rows by every column from
# left to right (the column order of `requerimento`).
phase_tables = [requerimento, ajuste, inclusao, intercampus]
combined = pd.concat(phase_tables)
enrollment_schedule = combined.sort_values(by=list(requerimento.columns))
enrollment_schedule

Unnamed: 0,CAMPUS_ID,ENROLLMENT_PHASE_ID,YEAR,SEMESTER,START_TIMESTAMP,END_TIMESTAMP
3,1,1,2023,1,2023-02-16 09:00:00,2023-02-17 18:00:00
3,1,3,2023,1,2023-02-27 10:00:00,2023-02-27 18:00:00
4,1,4,2023,1,2023-02-28 09:00:00,2023-02-28 18:00:00
0,1,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
2,2,1,2023,1,2023-02-13 09:00:00,2023-02-14 18:00:00
2,2,3,2023,1,2023-02-17 09:00:00,2023-02-17 18:00:00
3,2,4,2023,1,2023-02-23 09:00:00,2023-02-23 18:00:00
1,2,5,2023,1,2023-03-01 09:00:00,2023-03-01 18:00:00
1,3,1,2023,1,2023-02-10 09:00:00,2023-02-12 18:00:00
1,3,3,2023,1,2023-02-16 09:00:00,2023-02-16 18:00:00


In [13]:
import os

# Copy the current schedule file to <data_folder>/old/, stamped with
# today's date, before it is overwritten.
today = date.today().strftime('%Y%m%d')
backup_dir = f'{data_folder}/old'
# copyfile does not create missing directories.
os.makedirs(backup_dir, exist_ok=True)
# NOTE(review): the backup name only carries the date, so a second run on the
# same day replaces that day's backup -- confirm this is acceptable.
shutil.copyfile(f'{data_folder}/ENROLLMENT_SCHEDULE.csv', f'{backup_dir}/ENROLLMENT_SCHEDULE_{today}.csv')

'data_csv/old/ENROLLMENT_SCHEDULE_20230501.csv'

In [14]:
# Write the refreshed schedule into the configured data folder, using the
# same ';' separator as the lookup CSVs and omitting the DataFrame index.
enrollment_schedule.to_csv(f'{data_folder}/ENROLLMENT_SCHEDULE.csv', sep=';', index=False)