### Extraindo tabelas do calendário acadêmico

In [7]:
import fitz
import regex as re
import pandas as pd


def read_pdf(pdf_path: str) -> str:
    """
    Read the text content of a PDF file.

    Args:
        pdf_path (str): path to the PDF file.

    Returns:
        str: the text of every page, in page order, with consecutive pages
            joined by a newline character.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises; the original left the document open.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

# Relative path of the calendar PDF that the rest of the notebook parses.
pdf_path = "../calendario_universitario.pdf"
# Text of the whole PDF, as returned by read_pdf.
full_content = read_pdf(pdf_path)

In [None]:
full_content

In [11]:
def extract_from_text(text: str, start_point: str) -> str:
    """
    Return the part of a text that begins at the first occurrence of a marker.

    The marker is searched for literally, so characters that are special in
    regular expressions (parentheses, ``+``, ``.`` and so on) are matched
    as written.

    Args:
        text (str): text to extract from.
        start_point (str): marker that delimits where the excerpt starts;
            the marker itself is part of the returned excerpt.

    Returns:
        str: the excerpt from the marker to the end of the text, with
            surrounding whitespace stripped, or the string
            "Texto não encontrado" when the marker does not occur.
    """
    start = text.find(start_point)
    if start == -1:
        return "Texto não encontrado"
    return text[start:].strip()

In [39]:
# Lower-case Portuguese month names mapped to their calendar number (1-12).
months_to_number = dict(
    zip(
        (
            "janeiro",
            "fevereiro",
            "março",
            "abril",
            "maio",
            "junho",
            "julho",
            "agosto",
            "setembro",
            "outubro",
            "novembro",
            "dezembro",
        ),
        range(1, 13),
    )
)

# Reverse lookup: calendar number -> lower-case month name.
number_to_month = {number: name for name, number in months_to_number.items()}

In [None]:
# Keep only the text from the first "JANEIRO/2024" marker onwards.
content = extract_from_text(full_content, "JANEIRO/2024")
content

In [None]:
# Break the excerpt into a list of lines, the input form extract_tables takes.
content = content.split('\n')
content

In [None]:
def extract_tables(content: list[str]) -> pd.DataFrame:
    """
    Build a DataFrame with the dated entries found in the calendar tables.

    Lines are consumed in order. A ``MONTH/YEAR`` header line (for example
    ``JANEIRO/2024``) sets the current month and year. Under a header, a
    line holding a single day (``05``), a range inside the month
    (``10 a 12`` or ``03 e 04``) or a range ending in another month
    (``28 a 02/02``) opens a new entry, and every following line is added to
    that entry's description. A line shaped like ``MONTH.../MONTH...``
    clears the current month and year, so all lines up to the next valid
    header are skipped.

    Args:
        content (list[str]): lines of the text that holds the tables.

    Returns:
        DataFrame: one row per entry with the columns ``text``,
            ``days_init``, ``days_term``, ``months_init``, ``months_term``
            and ``years``. Days and years are kept as the strings found in
            the text; month names are capitalized.

    Raises:
        KeyError: if a range names an end month number that is not a key of
            ``number_to_month``.
    """
    columns = [
        'text', 'days_init', 'days_term', 'months_init', 'months_term', 'years',
    ]

    # Patterns are compiled once, outside the loop.
    month_year_re = re.compile(
        r'^\b(?:JAN|FEV|MARÇ|ABR|MAI|JUN|JUL|AGO|SET|OUT|NOV|DEZ)[A-Z]*/\d{4}\b$'
    )
    bad_month_year_re = re.compile(
        r'^\b(?:JAN|FEV|MARÇ|ABR|MAI|JUN|JUL|AGO|SET|OUT|NOV|DEZ)[A-Z].*/(?:JAN|FEV|MARÇ|ABR|MAI|JUN|JUL|AGO|SET|OUT|NOV|DEZ)[A-Z].*'
    )
    # One pattern covers the three date shapes: "DD", "DD a DD" / "DD e DD"
    # and "DD a DD/MM". The separator class is [ae]; the former [a,e] also
    # accepted a comma, which then crashed when splitting the two days.
    date_re = re.compile(r'^(\d{2})(?: [ae] (\d{2})(?:/(\d{2}))?)?$')

    month, year = None, None
    rows = []

    for line in content:
        # Blank lines and lines holding only 'A' are ignored
        # (presumably PDF extraction noise - TODO confirm).
        if line in ('', 'A'):
            continue

        header = month_year_re.match(line)
        if header:
            month, year = header.group(0).split('/')
            month = month.capitalize()
            continue

        if bad_month_year_re.match(line):
            month, year = None, None
            continue

        # Nothing is collected until a valid MONTH/YEAR header is active.
        if not (month and year):
            continue

        date = date_re.match(line)
        if date:
            day_init, day_term, month_term_number = date.groups()
            rows.append({
                'text': [],
                'days_init': day_init,
                # A single day is both the start and the end of the entry.
                'days_term': day_term or day_init,
                'months_init': month,
                'months_term': (
                    number_to_month[int(month_term_number)].capitalize()
                    if month_term_number
                    else month
                ),
                'years': year,
            })
        elif rows:
            rows[-1]['text'].append(line)
        # else: a text line before the first date has no entry to belong to
        # and is dropped (this used to raise IndexError).

    for row in rows:
        row['text'] = ' '.join(row['text'])

    return pd.DataFrame(rows, columns=columns)
                
# Parse the list of lines into a DataFrame of dated entries.
tables_df = extract_tables(content)
tables_df

In [18]:
# Save the extracted table to calendar_data.csv; index=False leaves the row index out of the file.
tables_df.to_csv('calendar_data.csv', index=False)

In [None]:
'asdasd'[:-1]

In [55]:
import pandas as pd
# Reload the table from calendar_data.csv.
# NOTE(review): read_csv infers column types again, so the day and year values
# presumably come back as integers and empty texts as NaN - confirm.
tables_df = pd.read_csv('calendar_data.csv')

def create_date(df_row : pd.DataFrame) -> str:
    """
    Build the Portuguese date phrase for one calendar entry.

    Args:
        df_row: a single row exposing ``days_init``, ``days_term``,
            ``months_init``, ``months_term`` and ``years`` as attributes.
            NOTE(review): the annotation says DataFrame, but the attribute
            access here operates on one row - confirm and adjust.

    Returns:
        str: ``'em D de M de Y.'`` when the entry starts and ends on the
            same day of the same month; ``'em D1/D2 de M de Y.'`` for
            different days of the same month; otherwise
            ``'em D1/N1/Y à D2/N2/Y.'``, where N1 and N2 are the month
            numbers looked up in ``months_to_number``.
    """
    first_day, last_day = df_row.days_init, df_row.days_term
    first_month, last_month = df_row.months_init, df_row.months_term
    year = df_row.years

    if first_month == last_month:
        if first_day == last_day:
            return f'em {first_day} de {first_month} de {year}.'
        return f'em {first_day}/{last_day} de {first_month} de {year}.'

    # The entry spans two months: switch to a fully numeric form.
    first_month_number = months_to_number[first_month.lower()]
    last_month_number = months_to_number[last_month.lower()]
    return f'em {first_day}/{first_month_number}/{year} à {last_day}/{last_month_number}/{year}.'

def remove_dot(text: str) -> str:
    """
    Return the text without its final period, if it ends with one.

    Only a single trailing '.' is removed. An empty string is returned
    unchanged (indexing the last character used to raise IndexError).

    Args:
        text (str): text to clean.

    Returns:
        str: the text with one trailing period removed, or the text as given.
    """
    return text[:-1] if text.endswith('.') else text

def reconstruct_text(df: pd.DataFrame) -> list[str]:
    """
    Turn each table row into a short sentence that is simpler for the model to use.

    Each sentence is the row's text without its final period, a comma, the
    date phrase produced by create_date, and one trailing space.

    Args:
        df (DataFrame): table data extracted from the academic calendar; each
            row must provide the ``text`` column and the columns read by
            create_date.

    Returns:
        list[str]: one sentence per row, in row order; an empty list when the
            DataFrame has no rows.
    """
    # Iterating the rows directly returns [] for an empty DataFrame, where
    # DataFrame.apply(axis=1) hands back a DataFrame that has no to_list().
    return [
        f'{remove_dot(row.text)}, {create_date(row)} '
        for _, row in df.iterrows()
    ]


# One simplified sentence per row of the table.
good_texts = reconstruct_text(tables_df)
# Single text with one sentence per line.
good_text = '\n'.join(good_texts)

In [None]:
print(good_text)

In [57]:
# Save the simplified text. The encoding is given explicitly because the text
# holds accented Portuguese characters and the default encoding of open()
# depends on the platform.
with open("cleaned_calendar.txt", "w", encoding="utf-8") as text_file:
    text_file.write(good_text)