In [8]:
import polars as pl
import pandas as pd 
import os
import re
import glob
from urllib.request import urlopen, urlretrieve
from zipfile import ZipFile
from rarfile import RarFile
from bs4 import BeautifulSoup
import requests

In [87]:
# Source spreadsheet for this run, plus its bare file name (the name encodes
# the reference month and year, which are parsed out of it later on).
file = '../files/2022/frota_por_uf_e_tipo_de_veículo_2-2022.xls'
filename = os.path.basename(file)

In [101]:
# Load the raw spreadsheet as-is; the row that holds the real column names is
# located afterwards with guess_header and promoted with change_df_header.
df = pd.read_excel(file)


In [104]:

def guess_header(df: pd.DataFrame, max_header_guess: int = 4) -> int:
    """Guess which of the first rows of ``df`` holds the column names.

    A row is considered a header candidate when every cell in it is a ``str``.
    Only the first ``max_header_guess`` rows are inspected, and never more
    rows than the frame actually has.

    Args:
        df: Frame whose leading rows may contain titles or blank lines before
            the real header.
        max_header_guess: Maximum number of leading rows to inspect.

    Returns:
        The positional index of the first all-string row, or ``0`` when no
        inspected row qualifies (including when ``df`` has no rows).
    """
    # Clamp to the frame length so a short frame falls through to the
    # default instead of raising IndexError on df.iloc[...].
    rows_to_check = min(max_header_guess, len(df))
    for candidate in range(rows_to_check):
        # Heuristic: a row made up entirely of strings looks like a header.
        if all(isinstance(cell, str) for cell in df.iloc[candidate]):
            return candidate
    # Nothing matched within the limit: assume the first row, as usual.
    return 0

def change_df_header(df: pd.DataFrame, header_row: int) -> pd.DataFrame:
    """Promote row ``header_row`` of ``df`` to be the column names.

    Rows up to and including ``header_row`` are dropped, the remaining rows
    are re-indexed from zero, and each column is renamed to the value found
    in the header row for that column. ``df`` itself is left untouched.
    """
    header_values = df.iloc[header_row]
    body = df[header_row + 1:].reset_index(drop=True)
    # The header Series is indexed by the old column labels, so it acts as an
    # old-label -> new-label mapping for rename.
    return body.rename(columns=header_values)

def get_year_month_from_filename(filename: str) -> tuple[int, int]:
    """Extract the reference month and year from a spreadsheet file name.

    The name is expected to end in ``<text>_<month>-<year>.xls`` or
    ``.xlsx``, with a 1-2 digit month and a 4 digit year.

    Args:
        filename: File name (not necessarily a full path) to parse.

    Returns:
        ``(month, year)`` as integers. NOTE: despite the function name, the
        month comes first; the order is kept because existing callers unpack
        it as ``month, year``.

    Raises:
        ValueError: If ``filename`` does not follow the expected pattern.
    """
    match = re.search(r"(\w+)_(\d{1,2})-(\d{4})\.(xls|xlsx)$", filename)

    if match is None:
        raise ValueError("No match found")

    # Regex groups are strings; convert so the result matches the annotation.
    month = int(match.group(2))
    year = int(match.group(3))
    return month, year

In [57]:
# Brazilian federative units (27 entries): two-letter abbreviation -> full name.
# The full names (the dict values) are what the cleaning step matches against
# to keep only the per-UF rows of the sheet.
DICT_UFS = {
    "AC": "Acre",
    "AL": "Alagoas",
    "AP": "Amapá",
    "AM": "Amazonas",
    "BA": "Bahia",
    "CE": "Ceará",
    "DF": "Distrito Federal",
    "ES": "Espírito Santo",
    "GO": "Goiás",
    "MA": "Maranhão",
    "MT": "Mato Grosso",
    "MS": "Mato Grosso do Sul",
    "MG": "Minas Gerais",
    "PA": "Pará",
    "PB": "Paraíba",
    "PR": "Paraná",
    "PE": "Pernambuco",
    "PI": "Piauí",
    "RJ": "Rio de Janeiro",
    "RN": "Rio Grande do Norte",
    "RS": "Rio Grande do Sul",
    "RO": "Rondônia",
    "RR": "Roraima",
    "SC": "Santa Catarina",
    "SP": "São Paulo",
    "SE": "Sergipe",
    "TO": "Tocantins",
}

In [52]:
# Locate the row that holds the real column names and promote it to the header.
new_df = change_df_header(df, header_row=guess_header(df))

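Tratamento ad hoc necessário: a primeira coluna é renomeada para `sigla_uf` e apenas as linhas correspondentes às UFs são mantidas.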


In [120]:
# Give the first column a stable name so it is easy to reference below.
# NOTE(review): the column is matched against DICT_UFS *values* (full names),
# so it appears to hold names rather than abbreviations - confirm the naming.
new_df.rename(columns={new_df.columns[0]: "sigla_uf"}, inplace=True)
# Strip surrounding whitespace so the labels compare equal to the names in DICT_UFS.
new_df["sigla_uf"] = new_df["sigla_uf"].str.strip()
# Keep only the rows whose label is a known UF name, dropping everything else.
clean_df = new_df.loc[new_df["sigla_uf"].isin(DICT_UFS.values())].reset_index(drop=True)
# The reference period is encoded in the file name.
month, year = get_year_month_from_filename(filename)

In [126]:
# Move to a lazy polars frame and tag every row with the reference year ('ano')
# and month ('mes') parsed from the file name.
clean_pl_df = (
    pl.from_pandas(clean_df)
    .lazy()
    .with_columns(
        pl.lit(year, dtype=pl.Int64).alias('ano'),
        pl.lit(month, dtype=pl.Int64).alias('mes'),
    )
)

In [127]:
# Reshape to long format: one row per (ano, mes, sigla_uf, tipo_veiculo),
# with the former column values stored under 'quantidade'.
long_pl_df = clean_pl_df.melt(
    id_vars=['ano', 'mes', 'sigla_uf'],
    variable_name='tipo_veiculo',
    value_name='quantidade',
)

In [129]:
# Execute the lazy query; as the last expression in the cell, the resulting
# frame is what gets displayed.
long_pl_df.collect()

ano,mes,sigla_uf,tipo_veiculo,quantidade
i64,i64,str,str,i64
2022,2,"""Acre""","""TOTAL""",321989
2022,2,"""Amapá""","""TOTAL""",225303
2022,2,"""Amazonas""","""TOTAL""",1020046
2022,2,"""Pará""","""TOTAL""",2367668
2022,2,"""Rondônia""","""TOTAL""",1118903
2022,2,"""Roraima""","""TOTAL""",252315
2022,2,"""Tocantins""","""TOTAL""",796159
2022,2,"""Alagoas""","""TOTAL""",988002
2022,2,"""Bahia""","""TOTAL""",4720073
2022,2,"""Ceará""","""TOTAL""",3529429
