In [None]:
# Colab-only setup: mount Google Drive so that every /content/gdrive/... path
# read or written by the cells of this notebook resolves.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
import os
from io import StringIO
from zipfile import ZipFile

import ftfy
import pandas as pd

In [None]:
!pip install ftfy

In [None]:
# Display settings for this notebook: never truncate the column list of a
# printed frame, and render floats with two decimal places.
pd.options.display.max_columns = None
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# Municipality lookup table used by the PE, MG and PR cells of this notebook.
#
# Step 1: read the PE municipality file and keep only its identifier columns
# (all as text). Judging by the column names, it pairs the IBGE municipality
# id with the code the TCE uses for the same municipality.
id_tce = (
    pd.read_csv(
        "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/PE/municipios.csv",
        encoding="latin-1",
        dtype=str,
    )
    .rename(
        columns={
            "CODIGOIBGE": "id_municipio",
            "CODIGO": "id_municipio_tce",
            "UNIDADEFEDERATIVA": "sigla_uf",
        }
    )
    .drop(columns=["MUNICIPIO", "CODIGOSAGRES"])
)

# Step 2: attach that code to the general municipality table. The left join
# keeps every row of municipio.csv; rows with no match in `id_tce` get NaN in
# `id_municipio_tce`.
municipio = pd.read_csv(
    "/content/gdrive/MyDrive/ComprasPublicas_Brasil/auxiliary_files/municipio.csv",
    encoding="latin-1",
    dtype=str,
).merge(id_tce, how="left", on=["id_municipio", "sigla_uf"])

# One `sigla_uf` entry per municipality row (not de-duplicated).
ufs = municipio["sigla_uf"].tolist()

In [None]:
# Common output layout: every per-state frame in this notebook is reindexed to
# exactly these columns, in this order, so the frames can be stacked at the
# end. Columns a state does not provide are therefore filled with NaN.
ordem = [
    # where / when
    "id_municipio", "ano", "sigla_uf",
    # organ
    "orgao", "nome_orgao",
    # managing unit ("unidade gestora")
    "id_unidade_gestora", "nome_unidade_gestora",
    # government sphere
    "esfera",
]

# PE


In [None]:
# PE: build the managing-unit table from the "unidadesjurisdicionadas" export
# and translate the TCE municipality code into the IBGE id via `municipio`.
with open(
    "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/PE/unidadesjurisdicionadas.csv",
    "r",
    encoding="utf-8",
) as f:
    text = f.read()

# Repair the raw text with ftfy before parsing (presumably the export
# contains mis-encoded characters -- confirm against the source file).
fixed_text = ftfy.fix_text(text)

# Parse every column as text, like all other reads in this notebook. Without
# dtype=str pandas infers numeric types for the code columns, which drops
# leading zeros and leaves the merge key below with a different dtype than the
# string-typed `municipio["id_municipio_tce"]`.
id_tce = pd.read_csv(StringIO(fixed_text), dtype=str)

# Columns of the export that are not part of the output layout.
tce_drop = [
    "CODIGOTCE",
    "PODER",
    "UNIDADEFEDERATIVA",
    "NATUREZA",
    "TIPOPESSOAJURIDICA",
    "MUNICIPIO",
    "SIGLA",
    "SITUACAO",
    "CNPJ",
]

rename = {
    "CODIGOMUNICIPIO": "id_municipio_tce",
    "ID_UNIDADE_GESTORA": "id_unidade_gestora",
    "ESFERA": "esfera",
    "ORGAO": "nome_unidade_gestora",
}

id_tce = id_tce.drop(columns=tce_drop).rename(columns=rename)

# Left join on the TCE municipality code to obtain id_municipio (IBGE);
# units whose code has no match keep NaN in the municipality columns.
pe = id_tce.merge(municipio, how="left", on="id_municipio_tce")

pe = (
    pe.drop(columns=["nome", "id_municipio_6", "id_municipio_tce"])
    # one row per (municipality, managing unit); the first occurrence is kept
    .drop_duplicates(subset=["id_municipio", "id_unidade_gestora"])
    # columns of `ordem` that PE does not provide become NaN
    .reindex(columns=ordem)
)

# MG


In [None]:
# MG: for every year and municipality, read the two "orgao" CSVs stored in the
# yearly zip archive, join them and stack everything into a single frame.
municipio_mg = municipio.query('sigla_uf=="MG"')
municipios_mg = municipio_mg["id_municipio"].tolist()

anos_mg = [str(ano) for ano in range(2014, 2022)]  # "2014" .. "2021"

pasta = "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/MG/"

# Columns discarded after the join. The _x/_y names are the suffixes pandas
# adds to `num_versao_arq`, which exists in both input files.
mg_drop = [
    "nom_uf",
    "dsc_regiaoplanejamento",
    "cod_subunidade",
    "dsc_subunidade",
    "num_anoexercicio",
    "num_versao_arq_x",
    "num_versao_arq_y",
    "nom_municipio",
    "cod_orgao",
    "cod_uf",
    "cod_unidade",
    "tipo_orgao",
    "num_mes_referencia",
]

mg_rename = {
    "cod_municipio": "id_municipio",
    "sgl_uf": "sigla_uf",
    "dsc_unidade": "nome_unidade_gestora",
    "nom_orgao": "nome_orgao",
    "seq_orgao": "orgao",
    "seq_unidade": "id_unidade_gestora",
    "num_ano_referencia": "ano",
}

all_df_mg = []
for a in anos_mg:
    # The archive depends only on the year, so it is opened once per year
    # instead of once per (year, municipality) pair.
    zip_path = os.path.join(pasta, "{}/orgao_{}.zip".format(a, a))
    with ZipFile(zip_path) as z:
        for m in municipios_mg:
            # A member missing from the archive raises KeyError here, exactly
            # as it would with one archive open per municipality.
            with z.open("orgao/{}/{}.{}.orgao.orgao.csv".format(m, a, m)) as f:
                mg1 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)

            with z.open(
                "orgao/{}/{}.{}.orgao.orgaoUnidade.csv".format(m, a, m)
            ) as f:
                mg2 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)

            # Organs (mg1) left-joined with their units (mg2).
            mg_parcial = (
                mg1.merge(mg2, how="left", on="seq_orgao")
                .drop(columns=mg_drop)
                .rename(columns=mg_rename)
            )
            all_df_mg.append(mg_parcial)

mg = (
    pd.concat(all_df_mg, ignore_index=True, sort=True)
    .drop_duplicates()
    .reindex(columns=ordem)
)

# PR


In [None]:
# PR: read one "Licitacao" CSV per (year, municipality), keep only the
# entity columns and translate the 6-digit municipality code to `id_municipio`.

# Municipalities left out of the PR file list.
# NOTE(review): presumably no Licitacao file exists for these codes -- confirm.
pr_excluidos = [
    "411915",
    "411370",
    "411535",
    "411710",
    "412627",
    "410140",
    "410350",
]

municipio_pr = municipio.query('sigla_uf=="PR"')
municipio_pr = municipio_pr[~municipio_pr["id_municipio_6"].isin(pr_excluidos)]
municipios_pr = municipio_pr["id_municipio_6"].tolist()

anos_pr = [str(ano) for ano in range(2013, 2022)]  # "2013" .. "2021"

pasta_pr = "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/PR"

# Build the paths directly; no exec() is needed to format a string.
# Resulting layout: <pasta_pr>/<year>/Licitacao/<mun>/<year>_<mun>_Licitacao.csv
all_files_pr = [
    os.path.join(
        pasta_pr, a, "Licitacao", m, "{}_{}_Licitacao.csv".format(a, m)
    )
    for a in anos_pr
    for m in municipios_pr
]

# Add an `arquivo` column recording the source file of each row.
all_df_pr = [
    pd.read_csv(caminho, sep=",", encoding="utf-8", dtype=str).assign(
        arquivo=os.path.basename(caminho)
    )
    for caminho in all_files_pr
]

# Stack the frames, aligning them on shared column names.
pr = pd.concat(all_df_pr, ignore_index=True, sort=True)

# Bid-level columns that are not needed for the organ / managing-unit table.
pr_drop = [
    "DataReferencia",
    "arquivo",
    "dsClausulaProrrogacao",
    "dsRegimeExecucaoLicitacao",
    "dtAbertura",
    "nrEditalOrigem",
    "nrLicitacao",
    "nranoEditalOrigem",
    "ultimoEnvioSIMAMNesteExercicio",
    "dtEnvio",
    "nmMunicipio",
    "vlLicitacao",
    "dsAvaliacaoLicitacao",
    "dsModalidadeLicitacao",
    "dsNaturezaLicitacao",
    "dsClassificacaoObjetoLicitacao",
    "idLicitacao",
    "dsObjeto",
    "dsTipoSituacaoLicitacao",
    "dtEdital",
    "dtOcorrencia",
    "nrAnoLicitacao",
]

pr_rename = {
    "cdIBGE": "id_municipio",
    "idPessoa": "id_unidade_gestora",
    "nmEntidade": "nome_unidade_gestora",
}

pr = pr.drop(columns=pr_drop).rename(columns=pr_rename)

# Match the code found in the files against `id_municipio_6`. Both frames
# have an `id_municipio` column, so pandas suffixes them: _x is the code from
# the PR files, _y is the `id_municipio` value from the lookup table.
pr = pr.merge(
    municipio,
    how="left",
    left_on="id_municipio",
    right_on="id_municipio_6",
)

pr = (
    pr.drop(
        columns=["id_municipio_x", "id_municipio_6", "nome", "id_municipio_tce"]
    )
    .rename(columns={"id_municipio_y": "id_municipio"})
    .drop_duplicates()
    .reindex(columns=ordem)
)

# SP


In [None]:
# SP: the auxiliary file only needs its organ columns renamed before being
# de-duplicated and put into the common layout.
sp_rename = {"ds_orgao": "nome_orgao", "codigo_orgao": "orgao"}

sp = (
    pd.read_csv(
        "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/SP/aux_municipio_sp.csv",
        sep=",",
        encoding="utf-8",
        dtype=str,
    )
    .rename(columns=sp_rename)
    .drop_duplicates()
    .reindex(columns=ordem)
)

# RS


In [None]:
# RS: rename the columns of the audited-organs file, tag every row with the
# state abbreviation, de-duplicate and put into the common layout.
rs_rename = {
    "NOME_ORGAO": "nome_orgao",
    "CD_ORGAO": "orgao",
    "ESFERA": "esfera",
    "CD_MUNICIPIO_IBGE": "id_municipio",
}

rs = (
    pd.read_csv(
        "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/RS/orgaos_auditados_rs.csv",
        sep=",",
        encoding="utf-8",
        dtype=str,
    )
    .rename(columns=rs_rename)
    # constant column: every row gets "RS"
    .assign(sigla_uf="RS")
    .drop_duplicates()
    .reindex(columns=ordem)
)

# PB


In [None]:
# PB: only the managing-unit name column needs renaming; the frame is then
# de-duplicated and put into the common layout.
pb_rename = {"de_ugestora": "nome_unidade_gestora"}

pb = (
    pd.read_csv(
        "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/PB/aux_municipio_pb.csv",
        sep=",",
        encoding="utf-8",
        dtype=str,
    )
    .rename(columns=pb_rename)
    .drop_duplicates()
    .reindex(columns=ordem)
)

# CE


In [None]:
# CE: organ dictionary joined with the state's municipality file to obtain
# the municipality id (`geoibgeId`, renamed to `id_municipio`).

# Only the two columns needed for the join are loaded.
id_mun = pd.read_csv(
    "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/CE/municipios.csv",
    sep=";",
    dtype=str,
    encoding="latin-1",
    usecols=["geoibgeId", "codigo_municipio"],
)

ce_rename = {
    "geoibgeId": "id_municipio",
    "codigo_orgao": "orgao",
    "codigo_tipo_unidade": "id_unidade_gestora",
}

ce = (
    pd.read_csv(
        "/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/CE/Dicionário/orgao_ce.csv",
        sep=",",
        encoding="utf-8",
        dtype=str,
    )
    # left join: organs whose municipality code has no match keep NaN
    .merge(id_mun, how="left", on="codigo_municipio")
    .rename(columns=ce_rename)
    # de-duplication happens before `ano` exists, so the year is not part of
    # the key and only the first row per key survives
    .drop_duplicates(subset=["id_municipio", "orgao", "id_unidade_gestora"])
    .assign(
        sigla_uf="CE",
        # first four characters of `exercicio_orcamento`
        ano=lambda quadro: quadro["exercicio_orcamento"].str[:4],
    )
    .reindex(columns=ordem)
)

# Join


In [None]:
# Stack the per-state tables (all already in the `ordem` layout) and
# normalise text case: names in upper case, `esfera` in lower case.
# NOTE(review): `sp` is built earlier in this notebook but is not part of
# this list -- confirm whether leaving SP out is intentional.
orgao_ug = pd.concat([pe, ce, pb, mg, pr, rs])

for coluna_nome in ("nome_orgao", "nome_unidade_gestora"):
    orgao_ug[coluna_nome] = orgao_ug[coluna_nome].str.upper()

orgao_ug["esfera"] = orgao_ug["esfera"].str.lower()

In [None]:
# Write the combined table to the project's output folder: no index column,
# missing values as empty strings, floats (if any) with two decimals.
ORGAO_UG_CSV = "/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/dicionarios/orgao_ug_auxiliar.csv"

orgao_ug.to_csv(ORGAO_UG_CSV, index=False, na_rep="", float_format="%.2f")