# IMPORTS

In [1]:
import pandas as pd
import numpy as np
import os
from decimal import Decimal
import pickle
from pathlib import Path
from datetime import timedelta
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

# SETUP

In [2]:
# Execute the project's directory helper script in this namespace.
# Later cells read PROJECT_DIRS, which is not defined in this notebook, so it
# presumably comes from that script -- TODO confirm.
dir_tree_util_path = os.path.join("utils", "dir_tree.py")
# Read inside a `with` block so the file handle is closed deterministically.
with open(dir_tree_util_path) as dir_tree_file:
    exec(dir_tree_file.read())

# INPUTS

In [3]:
# Paths
path_input = PROJECT_DIRS["DADOS_VEQ_ANTT_DIR"]
path_output = PROJECT_DIRS["DADOS_DERIVADOS_DIR"]

In [4]:
periodo = list(range(2010, 2025))

# Criando um dataframe consolidado com os dados de tráfego

In [5]:
# arquivos dos dados
arquivos = os.listdir(path_input)

In [6]:
# Read one CSV per year and stack them into a single frame.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows every time.
frames_por_ano = []
for ano in periodo:
    # A file belongs to a year when its name, minus the 4-character
    # extension, ends with that year.
    candidatos = [a for a in arquivos if a[:-4].endswith(str(ano))]
    if not candidatos:
        raise FileNotFoundError(
            f"No input file whose name ends with {ano} was found in {path_input}"
        )
    arquivo = candidatos[0]
    df_ano = pd.read_csv(os.path.join(path_input, arquivo), sep=';', encoding='cp1252', low_memory=False)
    frames_por_ano.append(df_ano)

# The per-file row labels are kept (no ignore_index), as in the original loop.
df_trafego = pd.concat(frames_por_ano) if frames_por_ano else pd.DataFrame()

# EDA Inicial

In [7]:
df_trafego.head()

Unnamed: 0,concessionaria,mes_ano,sentido,praca,categoria,tipo_de_veiculo,volume_total,multiplicador_de_tarifa,volume_veiculo_equivalente,tipo_de_cobranca
0,RODOVIA DO AÇO,01-01-2010,Decrescente,"Praça 01 BR-393/RJ km 125,00",Categoria 1,Passeio,44146,1,44146,
1,RODOVIA DO AÇO,01-01-2010,Crescente,"Praça 01 BR-393/RJ km 125,00",Categoria 1,Passeio,35771,1,35771,
2,RODOVIA DO AÇO,01-02-2010,Decrescente,"Praça 01 BR-393/RJ km 125,00",Categoria 1,Passeio,33455,1,33455,
3,RODOVIA DO AÇO,01-02-2010,Crescente,"Praça 01 BR-393/RJ km 125,00",Categoria 1,Passeio,27109,1,27109,
4,RODOVIA DO AÇO,01-03-2010,Decrescente,"Praça 01 BR-393/RJ km 125,00",Categoria 1,Passeio,31677,1,31677,


In [8]:
# mes_ano na verdade é uma data
df_trafego['mes_ano'].sample(9, random_state=3)

1057786    25/01/2024
2065656    17/03/2024
1831664    17/04/2024
34616      01/06/2021
242257     11/03/2024
764703     15/04/2024
13788      01/11/2019
86407      01/12/2021
41039      30/01/2024
Name: mes_ano, dtype: object

In [9]:
df_trafego.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000353 entries, 0 to 2329761
Data columns (total 10 columns):
 #   Column                      Dtype 
---  ------                      ----- 
 0   concessionaria              object
 1   mes_ano                     object
 2   sentido                     object
 3   praca                       object
 4   categoria                   object
 5   tipo_de_veiculo             object
 6   volume_total                object
 7   multiplicador_de_tarifa     object
 8   volume_veiculo_equivalente  object
 9   tipo_de_cobranca            object
dtypes: object(10)
memory usage: 251.8+ MB


In [10]:
# Print the number of missing values per column, one "name: count" line each.
na_por_coluna = df_trafego.isna().sum()
for nome_coluna, qtd_na in na_por_coluna.items():
    print(f"{nome_coluna}: {qtd_na}")

concessionaria: 0
mes_ano: 0
sentido: 0
praca: 0
categoria: 0
tipo_de_veiculo: 0
volume_total: 344
multiplicador_de_tarifa: 0
volume_veiculo_equivalente: 0
tipo_de_cobranca: 74780


In [11]:
df_trafego[df_trafego['volume_total'].isna()].sample(9)

Unnamed: 0,concessionaria,mes_ano,sentido,praca,categoria,tipo_de_veiculo,volume_total,multiplicador_de_tarifa,volume_veiculo_equivalente,tipo_de_cobranca
126172,ECOSUL,01/12/2023,Decrescente,"Praça 04 BR-392/RS km 52,30",Categoria 9,Moto,,50,0,N/I
125302,ECOSUL,01/04/2023,Crescente,"Praça 01 BR-116/RS km 430,79",Categoria 9,Moto,,50,0,N/I
135993,RIOSP,01/08/2023,Crescente,"Praça 08 BR-116/SP km 205,00",Categoria Esp. 07,Comercial,,700,0,N/I
130554,ECOVIAS DO CERRADO,01/09/2023,Decrescente,P4 - SANTA VITÓRIA,Veículo Comercial Acima 10 eixos,Comercial,,1000,0,N/I
126032,ECOSUL,01/11/2023,Decrescente,"Praça 02 BR-116/RS km 510,76",Categoria 9,Moto,,50,0,N/I
125882,ECOSUL,01/09/2023,Crescente,"Praça 05 BR-392/RS km 111,47",Categoria 9,Moto,,50,0,N/I
136000,RIOSP,01/08/2023,Crescente,"Praça 08 BR-116/SP km 205,00",Categoria 5,Passeio,,200,0,N/I
125442,ECOSUL,01/05/2023,Crescente,"Praça 03 BR-116/RS km 541,20",Categoria 9,Moto,,50,0,N/I
135766,RIOSP,01/07/2023,Crescente,"Praça 08 BR-116/SP km 205,00",Categoria 5,Passeio,,200,0,N/I


In [12]:
df_trafego[df_trafego['volume_total'].isna()]['volume_veiculo_equivalente'].unique()

array(['0,00'], dtype=object)

In [13]:
df_trafego['mes_ano'].str[:2].unique()

array(['01', '14', '28', '31', '30', '02', '03', '04', '05', '06', '07',
       '08', '09', '10', '11', '12', '13', '15', '16', '17', '18', '19',
       '20', '21', '22', '23', '24', '25', '26', '27', '29'], dtype=object)

In [14]:
df_trafego['mes_ano'].str[3:5].unique()

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12'], dtype=object)

In [15]:
df_trafego['tipo_de_cobranca'].unique()

array([nan, 'N/I', 'Manual', 'Automática', 'Mista'], dtype=object)

In [16]:
df_trafego['volume_total'].isna().any()

np.True_

In [17]:
df_trafego['volume_total'].isna().sum()

np.int64(344)

In [18]:
df_trafego['multiplicador_de_tarifa'].unique()

array(['1', '2', '1,5', '3', '4', '5', '6', '0,5', '7', '8', '9', '10',
       1.0, 2.0, 1.5, 3.0, 4.0, 5.0, 6.0, 0.5, 7.0, 8.0, 9.0, 10.0,
       '2,00', '3,00', '4,00', '5,00', '6,00', '0,50', '1,00', '1,50',
       '7,00', '8,00', '9,00', '10,00', '0,00', '11,00', '13,00', '15,00',
       '12,00', '14,00', '18,00', '20,00', '16,00', '17,00', '19,00'],
      dtype=object)

# Tratamento dos dados

In [19]:
# Year, month and day columns, cut from the raw date string by character
# position (day = first two chars, month = chars 3-4, year = last four), so
# both the "dd-mm-yyyy" and "dd/mm/yyyy" spellings seen above are handled.
posicoes_na_data = {
    'year': slice(-4, None),
    'month': slice(3, 5),
    'day': slice(None, 2),
}
for nome_parte, posicao in posicoes_na_data.items():
    df_trafego[nome_parte] = df_trafego['mes_ano'].str[posicao].astype('int')

In [20]:
# Rebuild the date as a real datetime instead of a string. The component
# columns are named year/month/day in English because pd.to_datetime requires
# those names when assembling a date from a DataFrame.
# NOTE: the granularity of this column is inconsistent -- some rows carry a
# full date, others are a single record consolidating the whole month.
# The conversion is done once and reused for both derived columns.
datas = pd.to_datetime(df_trafego[['year', 'month', 'day']])
df_trafego['data'] = datas.dt.date
# Overwrite the raw string column with the monthly period it falls in.
df_trafego['mes_ano'] = datas.dt.to_period('M')

In [21]:
# OBS: para algumas concessionarias ao menos, os dados a nível de data não são confiáveis

df_trafego.query("concessionaria == 'CRO' and mes_ano == @pd.to_datetime('2015-09').to_period('M')")['data'].unique()


# Os dados com início com 2010-01-01 estão ok.

array([datetime.date(2015, 9, 1)], dtype=object)

In [22]:
# Fill the missing values where applicable:
df_trafego['volume_total'] = df_trafego['volume_total'].fillna(0) # every row with a missing volume_total has an equivalent volume of '0,00' (checked in the EDA above)

In [23]:
# Convert the volume columns to numeric dtypes. The cells are a mix of strings
# and already-numeric values, so each helper only touches string cells.
def _virgula_para_ponto(valor):
    """Replace the decimal comma with a dot in string cells; pass others through."""
    return valor.replace(',', '.') if type(valor) == str else valor


def _parte_inteira(valor):
    """Keep only the text before the decimal point in string cells; pass others through."""
    return valor.split('.')[0] if type(valor) == str else valor


df_trafego['volume_total'] = (
    df_trafego['volume_total']
    .apply(_virgula_para_ponto)
    .apply(_parte_inteira)
    .astype('int')
)

# Kept as float: fractional tariff multipliers exist, so the equivalent volume
# can be fractional too.
df_trafego['volume_veiculo_equivalente'] = (
    df_trafego['volume_veiculo_equivalente']
    .apply(_virgula_para_ponto)
    .astype('float')
)

In [24]:
# convertendo categoria para string (do contrário não salva para parquet):
df_trafego['categoria'] = df_trafego['categoria'].astype('string')

In [25]:
# Convert the tariff multiplier to Decimal (to keep exact precision).
# The column mixes strings with a decimal comma ('1,5', '2,00') and floats
# (1.5, 2.0). Every cell goes through its string form: Decimal(float) would
# capture the float's binary expansion instead of the value as written.
def _para_decimal(valor):
    """Return `valor` as a Decimal built from its text, with ',' read as the decimal point."""
    return Decimal(str(valor).replace(',', '.'))


df_trafego['multiplicador_de_tarifa'] = df_trafego['multiplicador_de_tarifa'].apply(_para_decimal)

In [26]:
# O tipo de tráfego ora está em maiúscula, ora em minúscula.
# Colocando tudo p/maiúscula
df_trafego['tipo_de_veiculo'] = df_trafego['tipo_de_veiculo'].str.upper()

In [27]:
# algumas concessionárias estão ora em maiúsculas ora em minúsculas
# colocando tudo para maiúscula
df_trafego['concessionaria'] = df_trafego['concessionaria'].str.upper()

In [28]:
df_trafego.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000353 entries, 0 to 2329761
Data columns (total 14 columns):
 #   Column                      Dtype    
---  ------                      -----    
 0   concessionaria              object   
 1   mes_ano                     period[M]
 2   sentido                     object   
 3   praca                       object   
 4   categoria                   string   
 5   tipo_de_veiculo             object   
 6   volume_total                int64    
 7   multiplicador_de_tarifa     object   
 8   volume_veiculo_equivalente  float64  
 9   tipo_de_cobranca            object   
 10  year                        int64    
 11  month                       int64    
 12  day                         int64    
 13  data                        object   
dtypes: float64(1), int64(4), object(7), period[M](1), string(1)
memory usage: 343.4+ MB


In [29]:
df_trafego['year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021, 2022, 2023, 2024])

In [30]:
df_trafego.query("year == 2024")['month'].unique()

array([1, 2, 3, 4, 5, 6])

## Criando coluna desambiguando categorias comercial x passeio

In [31]:
df_trafego['tipo_de_veiculo'].unique()

array(['PASSEIO', 'COMERCIAL', 'MOTO', 'VEÍCULO PEQUENO'], dtype=object)

In [32]:
# Collapse the vehicle types into two classes: "COMERCIAL" stays as is and
# every other value becomes "PASSEIO".
tipo_de_veiculo = df_trafego['tipo_de_veiculo']
df_trafego["TIPO_TRAFEGO"] = tipo_de_veiculo.where(tipo_de_veiculo == "COMERCIAL", "PASSEIO")

In [33]:
df_trafego["TIPO_TRAFEGO"].unique()

array(['PASSEIO', 'COMERCIAL'], dtype=object)

## Criando colunas com tráfego em VEQs para comercial e passeio

In [34]:
# Rename the equivalent-vehicle volume to VEQS_TOTAL, then create one column
# per traffic class holding that volume on rows of the class and 0 elsewhere.
df_trafego = df_trafego.rename(columns={'volume_veiculo_equivalente': 'VEQS_TOTAL'})
for tipo_trafego in ('COMERCIAL', 'PASSEIO'):
    pertence_ao_tipo = df_trafego['TIPO_TRAFEGO'] == tipo_trafego
    df_trafego[f'VEQS_{tipo_trafego}'] = np.where(pertence_ao_tipo, df_trafego['VEQS_TOTAL'], 0)

# Criando dataframe com o período inicial e final dos dados por concessionária

In [35]:
# First and last date present in the data for each concessionaire.
df_periodos = (
    df_trafego
    .groupby('concessionaria')['data']
    .agg(data_inicial='min', data_final='max')
    .reset_index()
)
# Round-trip through datetime so both columns end up as plain date objects.
for coluna_data in ('data_inicial', 'data_final'):
    df_periodos[coluna_data] = pd.to_datetime(df_periodos[coluna_data]).dt.date

In [36]:
# Use the concessionaire name as the row label and remove it from the columns.
df_periodos = df_periodos.set_index('concessionaria')

In [37]:
df_periodos = df_periodos.sort_values(by='data_inicial', ascending=False)

# Criando dataframes agrupando VEQs por ANO/concessionaria/tipo de trafego

In [38]:
cols_veqs = ['VEQS_COMERCIAL','VEQS_PASSEIO']

In [39]:
# One table per VEQ column: rows are years, columns are concessionaires,
# cells are the summed VEQs.
dict_veqs_anual = {
    coluna_veq: df_trafego.pivot_table(
        index='concessionaria',
        columns='year',
        values=coluna_veq,
        aggfunc='sum',
    ).T
    for coluna_veq in cols_veqs
}

In [40]:
def filter_incomplete_years(df_trafego, df_periodos):
    """Blank out the first and/or last year of each column when it is partial.

    Parameters
    ----------
    df_trafego : DataFrame whose index holds years and whose columns are
        concessionaire names.
    df_periodos : DataFrame indexed by concessionaire name with the columns
        'data_inicial' and 'data_final' (objects exposing .day/.month/.year).

    Returns
    -------
    A copy of `df_trafego` where, per column, the start year is set to missing
    if the data does not begin on 1 January, and the end year is set to
    missing if the data does not end on 31 December. The input is not modified.
    """
    resultado = df_trafego.copy()
    for conc in resultado.columns:
        inicio = df_periodos.loc[conc, 'data_inicial']
        fim = df_periodos.loc[conc, 'data_final']

        anos_incompletos = set()
        if (inicio.month, inicio.day) != (1, 1):
            anos_incompletos.add(inicio.year)
        if (fim.month, fim.day) != (12, 31):
            anos_incompletos.add(fim.year)

        for ano in resultado.index:
            if ano in anos_incompletos:
                resultado.loc[ano, conc] = None
    return resultado

In [41]:
# retirando os anos incompletos
def filter_incomplete_years(df_trafego, df_periodos):
    """Return a copy of a yearly table with partial boundary years blanked.

    `df_trafego` has years in the index and one column per concessionaire.
    `df_periodos` is looked up by concessionaire name and must provide
    'data_inicial' and 'data_final'. For each column, the year of
    'data_inicial' is set to missing unless that date is 1 January, and the
    year of 'data_final' is set to missing unless that date is 31 December.
    The input frame is left untouched.
    """
    anual = df_trafego.copy()
    for conc in anual:
        primeira_data = df_periodos.loc[conc, 'data_inicial']
        ultima_data = df_periodos.loc[conc, 'data_final']

        comeca_em_1_jan = primeira_data.day == 1 and primeira_data.month == 1
        termina_em_31_dez = ultima_data.day == 31 and ultima_data.month == 12

        anos_a_anular = set()
        if not comeca_em_1_jan:
            anos_a_anular.add(primeira_data.year)
        if not termina_em_31_dez:
            anos_a_anular.add(ultima_data.year)

        for ano in [a for a in anual.index if a in anos_a_anular]:
            anual.loc[ano, conc] = None
    return anual

In [42]:
for veq, df in dict_veqs_anual.items():
    dict_veqs_anual[veq] = filter_incomplete_years(df, df_periodos)

# Criando dataframes agrupando VEQs por MÊS/concessionaria/tipo de trafego

In [43]:
# One table per VEQ column: rows are months, columns are concessionaires,
# cells are the summed VEQs.
dict_veqs_mensal = {}
for col in cols_veqs:
    veqs_por_mes = df_trafego.pivot_table(
        index='concessionaria',
        columns='mes_ano',
        values=col,
        aggfunc='sum',
    ).T
    # Round-trip the index through timestamps back to monthly periods.
    veqs_por_mes.index = veqs_por_mes.index.to_timestamp().to_period('M')
    dict_veqs_mensal[col] = veqs_por_mes

In [44]:
dict_veqs_mensal['VEQS_COMERCIAL']

concessionaria,AUTOPISTA FERNÃO DIAS,AUTOPISTA FLUMINENSE,AUTOPISTA LITORAL SUL,AUTOPISTA PLANALTO SUL,AUTOPISTA REGIS BITTENCOURT,CONCEBRA,CONCEPA,CONCER,CRO,CRT,ECO050,ECO101 CONCESSIONARIA DE RODOVIAS S/A,ECOPONTE,ECORIOMINAS,ECOSUL,ECOVIAS DO ARAGUAIA,ECOVIAS DO CERRADO,HOLDING DO SISTEMA RODOVIARIO RIO - SAO PAULO S.A.,MSVIA,NOVADUTRA,RIOSP,RODOVIA DO AÇO,TRANSBRASILIANA,VIA 040,VIA BAHIA,VIA BRASIL,VIA COSTEIRA,VIA SUL
mes_ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2010-01,6190339.0,1815015.0,5183361.0,1464018.0,8261978.0,,1156902.0,990659.0,,653385.0,,,,,902674.0,,,,,7602748.0,,986812.0,1088796.0,,,,,
2010-02,5924839.0,1708351.0,5020757.0,1481465.0,7872436.0,,1063578.0,905443.0,,612653.0,,,,,850322.0,,,,,7065494.0,,929227.0,1060479.0,,,,,
2010-03,7003056.0,2018155.0,5864833.0,1701691.0,9965511.0,,1221469.0,1114422.0,,726098.0,,,,,1239520.0,,,,,9025403.0,,1165079.0,1274481.0,,,,,
2010-04,6875269.0,1752751.0,5433517.0,1627705.0,9190180.0,,1115475.0,1007970.0,,625073.0,,,,,1676946.0,,,,,9694048.0,,1100444.0,1198545.0,,,,,
2010-05,7518015.0,2005884.0,5796304.0,1746326.0,9827356.0,,1155528.0,1084941.0,,705851.0,,,,,1416282.0,,,,,10581017.0,,1176777.0,1334414.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02,,1987813.0,8614519.0,2222301.0,6893.0,5072773.0,,889976.0,6110893.0,,2771769.0,2814580.0,281351.0,3611826.0,1397204.0,2542428.0,2077433.0,5201774.0,3677050.0,,,856805.0,375430.0,3859726.0,4325280.0,,3240.0,4693961.0
2024-03,,805183.0,9150873.0,2339823.0,,5408234.0,,935950.0,6690406.0,,3556270.0,3447955.0,341273.0,3890033.0,1709606.0,3254560.0,2224151.0,7247821.0,3530682.0,,,1034652.0,389816.0,4354306.0,5887783.0,,2493832.0,319314.0
2024-04,,21256.0,8755028.0,2436719.0,,5388341.0,,976630.0,7745369.0,,3798837.0,3551304.0,357818.0,3837949.0,2320854.0,3486596.0,2352373.0,7520755.0,808079.0,,,1165673.0,99752.0,4535580.0,5857438.0,,41399.0,482961.0
2024-05,,,,,,5575640.0,,1000784.0,8497497.0,,3700019.0,3599997.0,366801.0,3832806.0,1471602.0,3498661.0,2449695.0,7838099.0,2360.0,,,120607.0,,605275.0,6061462.0,,,221793.0


In [45]:
def filter_incomplete_months(df_trafego, df_periodos):
    """Blank out the first and/or last month of each column when it is partial.

    Parameters
    ----------
    df_trafego : DataFrame indexed by monthly periods, one column per
        concessionaire.
    df_periodos : DataFrame indexed by concessionaire name with the columns
        'data_inicial' and 'data_final' (dates or timestamps).

    Returns
    -------
    A copy of `df_trafego` where, per column, the starting month is set to
    missing if the data does not begin on day 1, and the ending month is set
    to missing if the data does not end on the last day of its month. The
    input is not modified.

    NOTE(review): a series whose last record is a month-level row stamped on
    day 1 is also treated as ending mid-month -- confirm this is intended.
    """
    df = df_trafego.copy()

    for conc in df:
        # Normalise to Timestamp. The stored values may be datetime.date
        # objects, and pandas 2.x treats a date-vs-Timestamp `!=` as always
        # unequal, which would flag every final month as incomplete.
        data_inicial = pd.Timestamp(df_periodos.loc[conc, 'data_inicial'])
        data_final = pd.Timestamp(df_periodos.loc[conc, 'data_final'])

        # A set, so a month that is both the first and the last appears once.
        months_drop = set()

        # Starting month is incomplete unless the data begins on the 1st.
        if data_inicial.day != 1:
            months_drop.add(data_inicial.to_period('M'))

        # Ending month is incomplete unless the data reaches the month's last day.
        if not data_final.is_month_end:
            months_drop.add(data_final.to_period('M'))

        # Blank the flagged months for this concessionaire only.
        for month in df.index:
            if month in months_drop:
                df.loc[month, conc] = None
    return df

In [46]:
for veq, df in dict_veqs_mensal.items():
    dict_veqs_mensal[veq] = filter_incomplete_months(df, df_periodos)

In [47]:
# removendo o primeiro mês das que não iniciam em jan/10 (por algum erro dos dados, está incompleto)
def remove_first_month_post_2010(df_mensal, mes_nao_remover):
    """Blank the first valid month of every column that starts after `mes_nao_remover`.

    Parameters
    ----------
    df_mensal : DataFrame indexed by monthly periods, one column per
        concessionaire. It is modified in place.
    mes_nao_remover : value accepted by pd.to_datetime (e.g. '2010-01'). A
        column whose first valid month equals this month is left untouched.

    Returns
    -------
    The same `df_mensal` object, after modification.

    Columns with no valid value at all are skipped. Note that every call
    removes one more month from the affected columns, so the function is not
    idempotent.
    """
    # Loop-invariant: convert the protected month once.
    mes_preservado = pd.to_datetime(mes_nao_remover).to_period('M')
    for col in df_mensal:
        first_non_nan = df_mensal[col].first_valid_index()
        # first_valid_index() returns None for an all-missing column; there is
        # nothing to remove, and .loc[None, col] must not be attempted.
        if first_non_nan is None:
            continue
        if first_non_nan != mes_preservado:
            df_mensal.loc[first_non_nan, col] = None
    return df_mensal

In [48]:
for veq, df in dict_veqs_mensal.items():
    dict_veqs_mensal[veq] = remove_first_month_post_2010(df, '2010-01')

In [49]:
for veq, df in dict_veqs_mensal.items():
    print(df.index)

PeriodIndex(['2010-01', '2010-02', '2010-03', '2010-04', '2010-05', '2010-06',
             '2010-07', '2010-08', '2010-09', '2010-10',
             ...
             '2023-09', '2023-10', '2023-11', '2023-12', '2024-01', '2024-02',
             '2024-03', '2024-04', '2024-05', '2024-06'],
            dtype='period[M]', name='mes_ano', length=174)
PeriodIndex(['2010-01', '2010-02', '2010-03', '2010-04', '2010-05', '2010-06',
             '2010-07', '2010-08', '2010-09', '2010-10',
             ...
             '2023-09', '2023-10', '2023-11', '2023-12', '2024-01', '2024-02',
             '2024-03', '2024-04', '2024-05', '2024-06'],
            dtype='period[M]', name='mes_ano', length=174)


# Criando dataframes agrupando VEQs por TRIMESTRE/concessionaria/tipo de trafego

In [50]:
def group_by_tri(df_monthly):
    """Aggregate a monthly table (PeriodIndex rows) into quarterly sums.

    Steps, applied to a copy of `df_monthly`:
    1. For each column, every month of a quarter that has at least one missing
       value is set to missing.
    2. Rows belonging to quarters with fewer than three rows in the index are
       removed.
    3. The remaining rows are summed per quarter.

    Returns a DataFrame indexed by quarterly periods (index name 'trimestre').
    Because the final sum skips missing values, a quarter blanked in step 1
    comes out as 0 rather than as missing.
    """
    mensal = df_monthly.copy()
    mensal.index = mensal.index.to_timestamp()
    trimestre = mensal.index.to_period('Q')

    # Quarter x column table: True where the quarter has any missing month.
    falta_no_trimestre = mensal.isna().groupby(trimestre).any()
    for col in mensal.columns:
        incompletos = falta_no_trimestre.index[falta_no_trimestre[col].to_numpy()]
        mensal.loc[trimestre.isin(incompletos), col] = None

    # Drop quarters that are not fully covered by the index itself.
    meses_por_trimestre = mensal.groupby(trimestre).size()
    trimestres_curtos = meses_por_trimestre[meses_por_trimestre < 3].index
    mensal = mensal[~trimestre.isin(trimestres_curtos)]

    trimestral = mensal.groupby(mensal.index.to_period('Q')).sum()
    trimestral.index.name = 'trimestre'
    return trimestral

In [51]:
def group_by_tri(df_monthly):
    """Aggregate a monthly table (PeriodIndex rows) into quarterly sums.

    Works on a copy of `df_monthly`. For each column, every month of a quarter
    that has at least one missing value is set to 0, so that quarter sums to 0
    for that column. All rows are then summed per quarter; a quarter with
    fewer than three rows in the index is still summed.

    Returns a DataFrame indexed by quarterly periods (index name 'trimestre').
    """
    mensal = df_monthly.copy()
    mensal.index = mensal.index.to_timestamp()
    trimestre = mensal.index.to_period('Q')

    # Quarter x column table: True where the quarter has any missing month.
    falta_no_trimestre = mensal.isna().groupby(trimestre).any()
    for col in mensal.columns:
        incompletos = falta_no_trimestre.index[falta_no_trimestre[col].to_numpy()]
        mensal.loc[trimestre.isin(incompletos), col] = 0

    trimestral = mensal.groupby(trimestre).sum()
    trimestral.index.name = 'trimestre'
    return trimestral


In [52]:
dict_veqs_tri = {}
for veq, df in dict_veqs_mensal.items():
    dict_veqs_tri[veq] = group_by_tri(df)

In [53]:
for veq, df in dict_veqs_tri.items():
    print(df.index)

PeriodIndex(['2010Q1', '2010Q2', '2010Q3', '2010Q4', '2011Q1', '2011Q2',
             '2011Q3', '2011Q4', '2012Q1', '2012Q2', '2012Q3', '2012Q4',
             '2013Q1', '2013Q2', '2013Q3', '2013Q4', '2014Q1', '2014Q2',
             '2014Q3', '2014Q4', '2015Q1', '2015Q2', '2015Q3', '2015Q4',
             '2016Q1', '2016Q2', '2016Q3', '2016Q4', '2017Q1', '2017Q2',
             '2017Q3', '2017Q4', '2018Q1', '2018Q2', '2018Q3', '2018Q4',
             '2019Q1', '2019Q2', '2019Q3', '2019Q4', '2020Q1', '2020Q2',
             '2020Q3', '2020Q4', '2021Q1', '2021Q2', '2021Q3', '2021Q4',
             '2022Q1', '2022Q2', '2022Q3', '2022Q4', '2023Q1', '2023Q2',
             '2023Q3', '2023Q4', '2024Q1', '2024Q2'],
            dtype='period[Q-DEC]', name='trimestre')
PeriodIndex(['2010Q1', '2010Q2', '2010Q3', '2010Q4', '2011Q1', '2011Q2',
             '2011Q3', '2011Q4', '2012Q1', '2012Q2', '2012Q3', '2012Q4',
             '2013Q1', '2013Q2', '2013Q3', '2013Q4', '2014Q1', '2014Q2',
             '201

# Salvando p/parquet

In [54]:
# dataframe total
# df_trafego.to_parquet(os.path.join(path_dados_derivados,'df_trafego.parquet'))
df_trafego.to_parquet(path_output /'df_trafego.parquet')

In [55]:
# data inicial e final
df_periodos.to_parquet(path_output /'df_periodos.parquet')

In [56]:
# dicionário com os dados de tráfego anuais:
for veq in dict_veqs_anual:
    dict_veqs_anual[veq].to_parquet(path_output / f'df_{veq}_anual.parquet')

In [57]:
# dicionário com os dados de tráfego mensais:
for veq in dict_veqs_mensal:
    dict_veqs_mensal[veq].to_parquet(path_output / f'df_{veq}_mensal.parquet')

In [58]:
# dicionário com os dados de tráfego trimestrais:
for veq in dict_veqs_tri:
    dict_veqs_tri[veq].to_parquet(path_output / f'df_{veq}_trimestral.parquet')

# Salvando p/Excel

In [67]:
# dados de tráfego anual
with pd.ExcelWriter(path_output / 'dados_trafego_ANTT_anual.xlsx') as writer:
    # df_periodos.to_excel(writer, sheet_name="periodo_concessionarias")
    for veq in dict_veqs_anual:
        dict_veqs_anual[veq].to_excel(writer, sheet_name=f"{veq}")

In [70]:
# Monthly traffic data: one sheet per VEQ table, with the period index
# written out as 'YYYY-MM-DD' text labels.
with pd.ExcelWriter(path_output / 'dados_trafego_ANTT_mensal.xlsx') as writer:
    for veq, df in dict_veqs_mensal.items():
        df_export = df.copy()
        # Relabel the rows directly; assigning a plain list also drops the
        # index name, as the transpose/relabel/transpose sequence did.
        df_export.index = [mes.strftime('%Y-%m-%d') for mes in df_export.index]
        df_export.to_excel(writer, sheet_name=f"{veq}")

In [68]:
# dados de tráfego mensal
# with pd.ExcelWriter(path_output / 'dados_trafego_ANTT_mensal.xlsx') as writer:
#     # df_periodos.to_excel(writer, sheet_name="periodo_concessionarias")
#     for veq in dict_veqs_mensal:
#         dict_veqs_mensal[veq].to_excel(writer, sheet_name=f"{veq}")

In [71]:
# Quarterly traffic data: one sheet per VEQ table, with the period index
# written out as 'YYYY-MM-DD' text labels.
with pd.ExcelWriter(path_output / 'dados_trafego_ANTT_trimestral.xlsx') as writer:
    for veq, df in dict_veqs_tri.items():
        df_export = df.copy()
        # Relabel the rows directly; assigning a plain list also drops the
        # index name, as the transpose/relabel/transpose sequence did.
        df_export.index = [tri.strftime('%Y-%m-%d') for tri in df_export.index]
        df_export.to_excel(writer, sheet_name=f"{veq}")

In [69]:
# # dados de tráfego trimestrais
# with pd.ExcelWriter(path_output / 'dados_trafego_ANTT_trimestral.xlsx') as writer:
#     # df_periodos.to_excel(writer, sheet_name="periodo_concessionarias")
#     for veq in dict_veqs_tri:
#         dict_veqs_tri[veq].to_excel(writer, sheet_name=f"{veq}")