In [1]:
from datetime import datetime
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
# Page on the UTFPR portal that carries the enrolment schedule tables.
url = 'https://portal.utfpr.edu.br/secretaria/matricula/cronograma-de-matricula'
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of letting the
# later cells parse an error page as if it were the schedule.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [3]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [4]:
# The page description is split on its last hyphen; the trailing piece is
# then read as "<year>/<semester>".
description_tag = soup.find('div', class_='documentDescription description')
title = description_tag.get_text()
year_semester = title.rsplit('-', 1)[-1].strip()
parts = year_semester.split('/')
year, semester = parts[0], parts[1]
print(f'Year: {year}\nSemester: {semester}')

Year: 2022
Semester: 1


In [5]:
# Collect every in-page anchor link. Besides the phase names, the result
# also contains the repeated "Voltar" links pointing to "#topo".
phases = soup.find_all('a', attrs={'class': 'anchor-link'})
phases

[<a class="anchor-link" href="#requerimento" target="_self" title="">Requerimento de Matrícula</a>,
 <a class="anchor-link" href="#analiseturma" target="_self" title="">Análise de Turmas</a>,
 <a class="anchor-link" href="#ajuste" target="_self" title="">Ajuste / Confirmação de Matrícula</a>,
 <a class="anchor-link" href="#inclusao" target="_self" title="">Inclusão</a>,
 <a class="anchor-link" href="#intercampus" target="_self" title="">Intercampus</a>,
 <a class="anchor-link" href="#topo" target="_self" title="">Voltar</a>,
 <a class="anchor-link" href="#topo" target="_self" title="">Voltar</a>,
 <a class="anchor-link" href="#topo" target="_self" title="">Voltar</a>,
 <a class="anchor-link" href="#topo" target="_self" title="">Voltar</a>,
 <a class="anchor-link" href="#topo" target="_self" title="">Voltar</a>]

In [6]:
tables_html = soup.find_all('table')
# read_html accepts a file-like object; recent pandas versions deprecate
# passing the raw HTML text directly, so the markup is wrapped in StringIO.
tables = pd.read_html(StringIO(str(tables_html)))
tables[0]

Unnamed: 0,Campus,Início,Início.1,Término,Término.1
0,Apucarana,14/02/2022,9h,15/02/2022,18h
1,Campo Mourão,14/02/2022,9h,15/02/2022,18h
2,Cornélio Procópio,14/02/2022,9h,15/02/2022,18h
3,Curitiba,16/02/2022,9h,17/02/2022,18h
4,Dois Vizinhos,11/02/2022,9h,13/02/2022,18h
5,Francisco Beltrão,16/02/2022,9h,18/02/2022,18h
6,Guarapuava,14/02/2022,9h,16/02/2022,18h
7,Londrina,11/02/2022,9h,14/02/2022,18h
8,Medianeira,11/02/2022,9h,14/02/2022,18h
9,Pato Branco,12/02/2022,9h,15/02/2022,18h


In [7]:
# Layout of a joined "<date> <hour>" pair, e.g. "14/02/2022 9h".
_DATE_HOUR_FORMAT = '%d/%m/%Y %Hh'


def _parse_date_hour(dates, hours):
    """Join a date column with an hour column and parse the pairs as timestamps."""
    # Each piece is stripped separately so stray padding around either cell
    # cannot break the single-space layout the format expects.
    combined = dates.astype(str).str.strip() + ' ' + hours.astype(str).str.strip()
    stamps = pd.to_datetime(combined, format=_DATE_HOUR_FORMAT)
    # A missing cell must fail loudly instead of silently becoming NaT.
    if stamps.isna().any():
        raise ValueError('schedule table has a missing date or hour cell')
    return stamps


def process_table(table):
    """Normalise one schedule table into campus / start / end columns.

    Columns are read by position, not by label: column 0 is the campus
    name, columns 1 and 2 are the start date and hour, columns 3 and 4 are
    the end date and hour. Dates are expected as ``dd/mm/yyyy`` and hours
    as ``<H>h`` (for example ``14/02/2022`` and ``9h``).

    Parameters
    ----------
    table : pandas.DataFrame
        A table with at least five columns in the order described above.

    Returns
    -------
    pandas.DataFrame
        Columns ``campus`` (transliterated with ``unidecode`` and
        upper-cased), ``start_timestamp`` and ``end_timestamp``, indexed
        like ``table``.

    Raises
    ------
    ValueError
        If a date/hour pair is missing or does not match the expected layout.
    """
    return pd.DataFrame({
        'campus': table.iloc[:, 0].map(lambda name: unidecode(name).upper()),
        'start_timestamp': _parse_date_hour(table.iloc[:, 1], table.iloc[:, 2]),
        'end_timestamp': _parse_date_hour(table.iloc[:, 3], table.iloc[:, 4]),
    })

In [8]:
# One processed frame per phase table. Position 1 is skipped on purpose;
# presumably it is the "Análise de Turmas" table -- TODO confirm its layout.
requerimento, ajuste, inclusao, intercampus = (
    process_table(tables[position]) for position in (0, 2, 3, 4)
)

In [9]:
# Display the processed frame built from tables[0] to check the result.
requerimento

Unnamed: 0,campus,start_timestamp,end_timestamp
0,APUCARANA,2022-02-14 09:00:00,2022-02-15 18:00:00
1,CAMPO MOURAO,2022-02-14 09:00:00,2022-02-15 18:00:00
2,CORNELIO PROCOPIO,2022-02-14 09:00:00,2022-02-15 18:00:00
3,CURITIBA,2022-02-16 09:00:00,2022-02-17 18:00:00
4,DOIS VIZINHOS,2022-02-11 09:00:00,2022-02-13 18:00:00
5,FRANCISCO BELTRAO,2022-02-16 09:00:00,2022-02-18 18:00:00
6,GUARAPUAVA,2022-02-14 09:00:00,2022-02-16 18:00:00
7,LONDRINA,2022-02-11 09:00:00,2022-02-14 18:00:00
8,MEDIANEIRA,2022-02-11 09:00:00,2022-02-14 18:00:00
9,PATO BRANCO,2022-02-12 09:00:00,2022-02-15 18:00:00
