In [1]:
import pandas as pd
import configparser

In [2]:
# Read the data locations from config.ini, resolved against the current working directory.
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# did parse, so check that list here rather than failing later with a bare
# KeyError on the 'Paths' section.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found (or could not be read) in the working directory")

# Semicolon-delimited raw export referenced by the 'raw_data_path' key.
# NOTE(review): the visible cells below re-read both files from their paths and do
# not use raw_df / df_2016; they are kept in case other cells rely on them.
raw_data_path = config['Paths']['raw_data_path']
raw_df = pd.read_csv(raw_data_path, delimiter=';')

# Second semicolon-delimited raw export, referenced by the 'raw_data_2016' key.
raw_data_2016 = config['Paths']['raw_data_2016']
df_2016 = pd.read_csv(raw_data_2016, delimiter=";")

In [3]:
# Portuguese month abbreviations used in the column headers -> month number.
# Dates are built from these numbers instead of being parsed with '%B', whose
# month names follow the active locale.
_PT_MONTH_NUMBERS = {
    'Jan': 1, 'Fev': 2, 'Mar': 3, 'Abr': 4, 'Mai': 5, 'Jun': 6,
    'Jul': 7, 'Ago': 8, 'Set': 9, 'Out': 10, 'Nov': 11, 'Dez': 12,
}


def _parse_month_label(label):
    """Turn a '<month>/<year>' column label into the Timestamp of that month's first day."""
    # Exactly two '/'-separated parts are required; anything else raises ValueError.
    month_token, year_token = (part.strip() for part in str(label).split('/'))
    month_number = _PT_MONTH_NUMBERS.get(month_token)
    if month_number is None:
        # Not one of the Portuguese abbreviations: keep accepting full month
        # names through strptime, as the previous implementation did.
        return pd.to_datetime(f"{month_token}/{year_token}", format='%B/%Y')
    return pd.Timestamp(year=int(year_token), month=month_number, day=1)


def clean_and_convert_to_time_series(raw_data_path):
    """Load a wide, semicolon-delimited table and reshape it into a long time series.

    The file must contain a 'SubGrup proced' column; every other column (apart
    from an optional 'Total' column, which is discarded) is treated as a
    '<month>/<year>' label such as 'Fev/2019'.

    Parameters
    ----------
    raw_data_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per (sub-group, month) pair, indexed by 'Date' (first day of
        the month) with the columns 'SubGrup proced' and 'Value'.

    Raises
    ------
    KeyError
        If the 'SubGrup proced' column is missing.
    ValueError
        If a column label cannot be interpreted as '<month>/<year>'.
    """
    raw_df = pd.read_csv(raw_data_path, delimiter=';')

    # Remove the aggregate column and the aggregate row. errors='ignore' lets
    # files that have no 'Total' column through instead of raising KeyError.
    cleaned_df = raw_df.drop(columns=['Total'], errors='ignore')
    cleaned_df = cleaned_df[cleaned_df['SubGrup proced'] != 'Total']

    # Wide -> long: each month column becomes one row per sub-group.
    ts_df = pd.melt(
        cleaned_df,
        id_vars=['SubGrup proced'],
        var_name='Date',
        value_name='Value',
    )

    # Parse each distinct label once, then map it over the melted rows.
    parsed_labels = {
        label: _parse_month_label(label) for label in ts_df['Date'].unique()
    }
    ts_df['Date'] = ts_df['Date'].map(parsed_labels)

    return ts_df.set_index('Date')


In [4]:
# Build one long-format series per raw file configured under [Paths].
# NOTE(review): the names suggest 2020-era and 2016-era extracts; confirm against config.ini.
ts_2020 = clean_and_convert_to_time_series(
    raw_data_path=raw_data_path,
)
ts_2016 = clean_and_convert_to_time_series(
    raw_data_path=raw_data_2016,
)


In [5]:
# Stack the two series row-wise, 2016 file first. No de-duplication is done here.
ts_df = pd.concat(objs=[ts_2016, ts_2020], axis=0)


Unnamed: 0_level_0,SubGrup proced,Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,0202 Diagnostico em laboratorio clinico,38838
2020-01-01,0202 Diagnostico em laboratorio clinico,43209
2022-02-01,0202 Diagnostico em laboratorio clinico,32816
2019-12-01,0202 Diagnostico em laboratorio clinico,31301
2019-11-01,0202 Diagnostico em laboratorio clinico,36642
...,...,...
2019-06-01,0301 Consultas / Atendimentos / Acompanhamentos,18611
2019-05-01,0301 Consultas / Atendimentos / Acompanhamentos,19733
2019-04-01,0301 Consultas / Atendimentos / Acompanhamentos,20919
2019-02-01,0301 Consultas / Atendimentos / Acompanhamentos,16204


In [8]:
# Order rows by sub-group, then chronologically. 'Date' is the index name here,
# which sort_values accepts alongside column labels. Rebinding instead of using
# inplace=True avoids mutating the frame an earlier cell displayed.
ts_df = ts_df.sort_values(by=['SubGrup proced', 'Date'])

In [9]:
# Save the combined time series dataset to a CSV file
output_csv_path = config['Paths']['output_csv_path']
# index=True keeps the 'Date' index as the first CSV column.
ts_df.to_csv(output_csv_path, sep=',', index=True)

# Report the destination; the f-string previously had no placeholder, so the
# path shown in the recorded output was never printed on a re-run.
print(f"The combined time series dataset has been saved to: {output_csv_path}")


The combined time series dataset has been saved to: C:\Users\brvn\Documents\github\projects\HC\tcc_brvn\data\processed\processed-time-series.csv
