**Preamble**

- This code cleans the raw public procurement data obtained from the State Audit Courts (TCEs) of the following states: CE, MG, PR and RS.
- The final output of this code is the tender item table (_licitacao_item_), available at [basedosdados](https://basedosdados.org/dataset/d3874769-bcbd-4ece-a38a-157ba1021514?table=14c5d05b-9830-4710-b7ac-7e0ca1bf9d8b).
- Made by: Nathalia Sales


In [None]:
import glob
import os
from zipfile import ZipFile

import numpy as np
import pandas as pd
from google.colab import drive

In [None]:
# Mount Google Drive so the project folder is reachable from the notebook

drive.mount("/content/gdrive")

# Display options: show every column and print floats with two decimals

pd.options.display.max_columns = None
pd.set_option("display.float_format", "{:.2f}".format)

# Project root used to build the input and output paths

path = "/content/gdrive/MyDrive/ComprasPublicas_Brasil"

# Auxiliary tables (every column is read as a string)

municipio = pd.read_csv(
    os.path.join(path, "auxiliary_files/municipio.csv"),
    dtype=str,
    encoding="utf-8",
)

# Code table read from input/PE/municipios.csv; its three columns are renamed
# to id_municipio, id_municipio_tce and sigla_uf
id_tce = pd.read_csv(
    os.path.join(path, "input/PE/municipios.csv"),
    dtype=str,
    encoding="latin-1",
    usecols=["CODIGOIBGE", "CODIGO", "UNIDADEFEDERATIVA"],
).rename(
    columns={
        "CODIGOIBGE": "id_municipio",
        "CODIGO": "id_municipio_tce",
        "UNIDADEFEDERATIVA": "sigla_uf",
    }
)

# Attach id_municipio_tce to the municipality table (rows without a match
# are kept, with a missing id_municipio_tce)
municipio = municipio.merge(
    id_tce, how="left", on=["id_municipio", "sigla_uf"]
)

# MG
ug_id = pd.read_csv(os.path.join(path, "auxiliary_files/ug_id_mg.csv"), dtype=str)

# RS
orgao_municipio = pd.read_csv(
    os.path.join(path, "input/RS/orgaos_auditados_rs.csv"),
    dtype=str,
    encoding="utf-8",
    usecols=["CD_MUNICIPIO_IBGE", "CD_ORGAO"],
)

# Distinct UFs, in order of first appearance in the municipality table
ufs = municipio["sigla_uf"].drop_duplicates().tolist()

# Column order of the final item table; every state's frame is reindexed to it

ordem = (
    # partition and location
    ["ano", "sigla_uf", "id_municipio", "orgao", "id_unidade_gestora"]
    # purchase and item identifiers
    + ["id_licitacao_bd", "id_licitacao", "id_dispensa", "id_item_bd", "id_item"]
    # item description
    + ["descricao", "numero", "numero_lote", "unidade_medida"]
    # quantities and values
    + ["quantidade_cotada", "valor_unitario_cotacao"]
    + ["quantidade", "valor_unitario", "valor_total"]
    + ["quantidade_proposta", "valor_proposta", "valor_vencedor"]
    # supplier
    + ["nome_vencedor", "documento"]
)

## CE


In [None]:
# CE

# Locate the yearly CSV files (items and bidders)

all_files_ce_itens = glob.glob(
    os.path.join(path, "input/CE/Licitações/itens_licitacoes_*.csv")
)
all_files_ce_licitantes = glob.glob(
    os.path.join(path, "input/CE/Licitações/licitantes_*.csv")
)

# Items: read every file as strings and tag each row with its file name

all_df_ce = [
    pd.read_csv(f, sep=";", dtype=str, encoding="latin-1").assign(
        arquivo=f.split("/")[-1]
    )
    for f in all_files_ce_itens
]

# Stack the yearly files into one frame

ce1 = pd.concat(all_df_ce, ignore_index=True, sort=True)

# The year is the four characters that follow "itens_licitacoes_" in the file name

ce1["ano"] = ce1["arquivo"].str.slice(17, 21)

# Drop unused columns, then rename the remaining ones to the output schema

ce1_drop = ["codigo_tipo_negociante", "arquivo"]

ce1_rename = {
    "numero_licitacao": "id_licitacao",
    "descricao_item_licitacao": "descricao",
    "descricao_unidade_item_licitacao": "unidade_medida",
    "valor_vencedor_item_licitacao": "valor_total",
    "valor_unitario_item_licitacao ": "valor_unitario",
    "numero_quantidade_item_licitacao": "quantidade",
    "numero_documento_negociante": "documento",
    "numero_sequencial_item_licitacao": "numero",
}

ce1 = ce1.drop(columns=ce1_drop).rename(columns=ce1_rename)

# Translate codigo_municipio into id_municipio (the geoibgeId column)

id_mun = pd.read_csv(
    os.path.join(path, "input/CE/municipios.csv"),
    sep=";",
    dtype=str,
    encoding="latin-1",
    usecols=["geoibgeId", "codigo_municipio"],
).rename(columns={"geoibgeId": "id_municipio"})

ce1 = ce1.merge(id_mun, how="left", on="codigo_municipio")

# Document number as text, without surrounding whitespace
ce1["documento"] = ce1["documento"].astype(str).str.strip()
# Suppliers: read every bidder file as strings and tag each row with its file name

all_df_ce = [
    pd.read_csv(f, sep=";", dtype=str, encoding="latin-1").assign(
        arquivo=f.split("/")[-1]
    )
    for f in all_files_ce_licitantes
]

# Stack the yearly files into one frame

ce2 = pd.concat(all_df_ce, ignore_index=True, sort=True)

# The year is the four characters that follow "licitantes_" in the file name

ce2["ano"] = ce2["arquivo"].str.slice(11, 15)

# Drop unused columns, then rename the remaining ones to the output schema

ce2_drop = [
    "fone_negociante",
    "codigo_tipo_negociante",
    "endereco_negociante",
    "cep_negociante",
    "nome_municipio_negociante",
    "arquivo",
    "codigo_uf ",
]

ce2_rename = {
    "numero_licitacao": "id_licitacao",
    "nome_negociante": "nome_vencedor",
    "numero_documento_negociante": "documento",
}

ce2 = ce2.drop(columns=ce2_drop).rename(columns=ce2_rename)

# Document number as text, without surrounding whitespace

ce2["documento"] = ce2["documento"].astype(str).str.strip()

# Bring the supplier name from the suppliers frame (ce2) into the items frame (ce1)

chaves_fornecedor = [
    "codigo_municipio",
    "ano",
    "id_licitacao",
    "documento",
    "data_realizacao_licitacao",
]

ce = ce1.merge(ce2, how="left", on=chaves_fornecedor)

# Keep only the first 10 characters of the date

ce["data_realizacao_licitacao"] = ce["data_realizacao_licitacao"].str.slice(
    stop=10
)

# State acronym

ce["sigla_uf"] = "CE"

# Purchase identifier: tender id + municipality id + two-digit year + UF

ce["id_licitacao_bd"] = (
    ce["id_licitacao"]
    + ce["id_municipio"]
    + ce["ano"].str.slice(2, 4)
    + ce["sigla_uf"]
)

# Some id_licitacao_bd have two rows - load the tender (licitacao) table to verify

ce3 = pd.read_csv(
    os.path.join(path, "output/licitacao_ce.csv"),
    dtype=str,
    encoding="utf-8",
    usecols=["id_municipio", "ano", "id_licitacao", "id_licitacao_bd"],
).rename(columns={"id_licitacao_bd": "id_licitacao_bd_2"})

# Compare against the tender table: when a row matches but the two ids
# disagree, the id built here is replaced by missing

ce = ce.merge(
    ce3,
    how="left",
    on=["id_municipio", "ano", "id_licitacao"],
    indicator=True,
)

id_diverge = (ce["_merge"] == "both") & (
    ce["id_licitacao_bd"] != ce["id_licitacao_bd_2"]
)
ce["id_licitacao_bd"] = np.where(id_diverge, np.nan, ce["id_licitacao_bd"])

# The date is no longer needed
ce = ce.drop(columns="data_realizacao_licitacao")

# Item identifier: item number + municipality id + tender id + two-digit year
ce["id_item"] = (
    ce["numero"].astype(str)
    + ce["id_municipio"]
    + ce["id_licitacao"]
    + ce["ano"].str.slice(2, 4)
)

# 0.70% duplicates in id_item
# 0.11% duplicates - when id_licitacao_bd not missing

# Item identifier that is unique across states
ce["id_item_bd"] = ce["id_item"] + ce["sigla_uf"]

# Remove rows that are identical in every column (few cases)
ce.drop_duplicates(inplace=True)

# The same item may legitimately repeat when supplied by different suppliers.
# An id is blanked when its rows disagree on the description, or agree on it
# but disagree on quantity, or agree on both but disagree on total value.
# Each pass compares the current key set with the set extended by one column,
# and runs on the frame as left by the previous pass.

chaves_id = ["id_item_bd"]
for coluna_extra in ["descricao", "quantidade", "valor_total"]:
    chaves_estendidas = chaves_id + [coluna_extra]
    repetido = ce.duplicated(chaves_id, keep=False)
    repetido_igual = ce.duplicated(chaves_estendidas, keep=False)
    ce["id_item_bd"] = np.where(
        repetido & ~repetido_igual, np.nan, ce["id_item_bd"]
    )
    chaves_id = chaves_estendidas

# Format

# Year and item number as integers
ce["ano"] = ce["ano"].astype(int)
ce["numero"] = ce["numero"].astype(int)

# Quantity: values that cannot be parsed (or are missing) become 0, the column
# is cast to int, and every 0 is then written as an empty string.
# NOTE(review): this also blanks genuine zero quantities and truncates any
# fractional quantity - confirm that is intended.
ce["quantidade"] = (
    pd.to_numeric(ce["quantidade"], errors="coerce").fillna(0).astype(int)
)
ce["quantidade"] = ce["quantidade"].replace(0, "")

floats = ["valor_unitario", "valor_total"]
ce[floats] = ce[floats].astype(float)

# The document column went through astype(str) when the item files were
# loaded, which turns a missing value into the literal text "nan". Restore it
# to missing so that "nan" is not exported as a document number.
ce["documento"] = ce["documento"].replace("nan", np.nan)

# Length of each document before any correction; both rules below are decided
# on this original length, so they cannot interfere with each other.
doc_len = ce["documento"].str.len()

# 15-character documents: drop the first character
ce["documento"] = np.where(
    doc_len == 15, ce["documento"].str[1:], ce["documento"]
)

# 13-character documents: pad with a zero on the left to reach 14 characters.
# LUIZA DA SILVA LIMA - ME is left untouched because, when adding the zero to
# the left, it returns the cnpj of another company, according to google.
ce["documento"] = np.where(
    (doc_len == 13) & (ce["nome_vencedor"] != "LUIZA DA SILVA LIMA - ME"),
    ce["documento"].str.zfill(14),
    ce["documento"],
)

# Same supplier spelled in two ways: keep a single spelling
ce["nome_vencedor"] = ce["nome_vencedor"].replace(
    "JOSÉ LUS DE LIMA - ME", "JOSÉ LUIS DE LIMA - ME"
)

# Remove rows that became identical in every column after the document fixes
ce.drop_duplicates(inplace=True)

# 0.20% duplicates in id_item - all of them when id_licitacao_bd is missing

# Keep only the output columns, in the output order
ce = ce.reindex(columns=ordem)

# Save
ce.to_csv(
    os.path.join(path, "output/licitacao_item_ce.csv"),
    index=False,
    na_rep="",
    float_format="%.2f",
)

## MG


In [None]:
# Municipalities of MG to loop over

municipio_mg = municipio[municipio["sigla_uf"] == "MG"]
municipios_mg = list(municipio_mg["id_municipio"])

# Split the unidade-gestora table by modalidade: codes "8" and "10" go to
# ug_id2, everything else (including a missing modalidade) goes to ug_id1
modalidade_8_ou_10 = ug_id["modalidade"].isin(["8", "10"])
ug_id1 = ug_id[~modalidade_8_ou_10]
ug_id2 = ug_id[modalidade_8_ou_10]

# Column renames and columns to drop, one pair per MG source file.
# mg1-mg4 describe the competitive procurement (licitacao) files and mg5-mg7
# the non-competitive procurement (dispensa/inexigibilidade) files.

# Renames shared by every file
_mg_comum = {"seq_orgao": "orgao", "num_ano_referencia": "ano"}

# Identifier renames of the licitacao files
_mg_ids_licitacao = {
    "seq_item_licitacao": "id_item",
    "seq_licitacao": "id_licitacao",
    **_mg_comum,
}

# Identifier renames of the dispensa files
_mg_ids_dispensa = {
    "seq_item_dispensa": "id_item",
    "seq_dispensa": "id_dispensa",
    **_mg_comum,
}

# Competitive procurement items

mg1_rename = {
    **_mg_ids_licitacao,
    "dsc_unid_medida": "unidade_medida",
    "dsc_item": "descricao",
    "num_lote": "numero_lote",
    "num_item": "numero",
}

mg1_drop = ["num_mes_referencia", "num_versao_arq", "cod_item", "dsc_lote"]

# Items price quotation

mg2_rename = {
    **_mg_ids_licitacao,
    "vlr_cot_preco_unit": "valor_unitario_cotacao",
    "num_quant_item_cotado": "quantidade_cotada",
}

mg2_drop = [
    "seq_cot_licitacao",
    "num_mes_referencia",
    "dat_cotacao",
    "vlr_percentual",
    "vlr_min_alien_bens",
    "num_versao_arq",
]

# Items with reference price

mg3_rename = {
    **_mg_ids_licitacao,
    "vlr_item": "valor_unitario_cotacao",
}

mg3_drop = ["seq_cred_licitacao", "num_mes_referencia", "num_versao_arq"]

# Homologated items (suppliers)

mg4_rename = {
    **_mg_ids_licitacao,
    "vlr_unitario": "valor_unitario",
    "num_quant_item": "quantidade",
    "num_doc_vencedor": "documento",
    "nom_vencedor": "nome_vencedor",
}

# "num_versao_arq" was listed twice; each label only needs to appear once
mg4_drop = [
    "seq_hom_licitacao",
    "num_mes_referencia",
    "num_versao_arq",
    "vlr_pct_desconto",
    "vlr_pct_tax_adm",
    "vlr_global",
]

# Non-competitive procurement (dispensa/inexigibilidade) items

mg5_rename = {
    **_mg_ids_dispensa,
    "dsc_unid_medida": "unidade_medida",
    "dsc_item": "descricao",
    "num_item": "numero",
}

mg5_drop = ["num_mes_referencia", "num_versao_arq", "cod_item"]

# Items price quotation

mg6_rename = {
    **_mg_ids_dispensa,
    "vlr_preco_unit": "valor_unitario_cotacao",
    "num_quant_item": "quantidade_cotada",
}

mg6_drop = ["seq_cot_dispensa", "num_mes_referencia", "num_versao_arq"]

# Suppliers

mg7_rename = {
    **_mg_ids_dispensa,
    "vlr_item": "valor_unitario",
    "num_quant_item": "quantidade",
    "num_doc_fornecedor": "documento",
    "dsc_nom_fornecedor": "nome_vencedor",
}

mg7_drop = [
    "seq_forn_dispensa",
    "num_inscr_estadual",
    "dsc_sigla_uf",
    "num_mes_referencia",
    "num_certidao_inss",
    "dat_emi_cert_inss",
    "dat_emi_cert_fgts",
    "dat_val_cert_fgts",
    "dat_val_cert_inss",
    "num_cert_fgts",
    "num_cndt",
    "dat_emi_cndt",
    "dat_val_cndt",
    "num_versao_arq",
]

# MG input folder; the loops below read "<year>/licitacao_<year>.zip" from it
folder = os.path.join(path, "input/MG")

In [None]:
anos_mg = ["2014", "2015", "2016"]

# Key columns shared by the competitive (licitacao) tables and by the
# non-competitive (dispensa) tables
chaves_licitacao = ["id_municipio", "id_item", "id_licitacao", "ano", "orgao"]
chaves_dispensa = ["id_municipio", "id_item", "id_dispensa", "ano", "orgao"]


def _read_mg_table(z, member, id_municipio, rename, drop, columns):
    """Read one CSV member of an MG archive and standardise its columns.

    Args:
        z: open ``ZipFile`` holding the member.
        member: name of the CSV file inside the archive.
        id_municipio: value written to the ``id_municipio`` column.
        rename: mapping from source column names to output names.
        drop: source columns to discard (applied after the rename).
        columns: columns to keep, in order; any that is absent is created
            empty by ``reindex``.

    Returns:
        DataFrame with exactly ``columns``; every value is read as a string.

    Raises:
        KeyError: if ``member`` is not in the archive, or if a column listed
            in ``drop`` is not in the file.
    """
    with z.open(member) as f:
        table = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
    table["id_municipio"] = id_municipio
    table = table.rename(columns=rename).drop(drop, axis=1)
    return table.reindex(columns=columns)


all_df_mg = []
for a in anos_mg:
    # `df` holds the path of the year's archive (not a DataFrame)
    df = os.path.join(folder, "{}/licitacao_{}.zip".format(a, a))

    # Partition folder of the output; created up front so the per-municipality
    # writes below do not fail on a missing directory
    out_dir = os.path.join(
        path, "output/licitacao_item/ano={}/sigla_uf=MG".format(a)
    )
    os.makedirs(out_dir, exist_ok=True)

    # The archive is opened once per year and shared by all municipalities
    with ZipFile(df) as z:
        for m in municipios_mg:
            # Every member of a municipality shares this name prefix
            prefix = "licitacao/{}/{}.{}.licitacao.".format(m, a, m)

            # Competitive procurement items
            mg1 = _read_mg_table(
                z,
                prefix + "itemLicitacao.csv",
                m,
                mg1_rename,
                mg1_drop,
                chaves_licitacao
                + ["descricao", "unidade_medida", "numero", "numero_lote"],
            )

            # Quoted items
            mg2 = _read_mg_table(
                z,
                prefix + "cotacaoLicitacao.csv",
                m,
                mg2_rename,
                mg2_drop,
                chaves_licitacao
                + ["quantidade_cotada", "valor_unitario_cotacao"],
            )

            # Items with reference price
            mg3 = _read_mg_table(
                z,
                prefix + "refLicitacao.csv",
                m,
                mg3_rename,
                mg3_drop,
                chaves_licitacao + ["valor_unitario_cotacao"],
            )

            # Homologated items (suppliers)
            mg4 = _read_mg_table(
                z,
                prefix + "homologLicitacao.csv",
                m,
                mg4_rename,
                mg4_drop,
                chaves_licitacao
                + ["quantidade", "valor_unitario", "nome_vencedor", "documento"],
            )

            # Non-competitive procurement (dispensa/inexigibilidade) items
            mg5 = _read_mg_table(
                z,
                prefix + "itemDispensa.csv",
                m,
                mg5_rename,
                mg5_drop,
                chaves_dispensa + ["descricao", "unidade_medida", "numero"],
            )

            # Quoted items
            mg6 = _read_mg_table(
                z,
                prefix + "cotDispensa.csv",
                m,
                mg6_rename,
                mg6_drop,
                chaves_dispensa
                + ["quantidade_cotada", "valor_unitario_cotacao"],
            )

            # Suppliers
            mg7 = _read_mg_table(
                z,
                prefix + "fornDispensa.csv",
                m,
                mg7_rename,
                mg7_drop,
                chaves_dispensa
                + ["quantidade", "valor_unitario", "nome_vencedor", "documento"],
            )

            # Merge competitive procurement

            # First - merge quoted items with items with reference price
            merge1 = pd.merge(
                mg2,
                mg3,
                how="outer",
                on=chaves_licitacao + ["valor_unitario_cotacao"],
            )

            # Second - merge items general information with their quoted or
            # reference price
            merge2 = pd.merge(mg1, merge1, how="left", on=chaves_licitacao)

            # Third - merge with homologated items to get suppliers
            mg_licitacao_1 = pd.merge(
                merge2, mg4, how="left", on=chaves_licitacao
            )

            mg_licitacao_1["documento"] = mg_licitacao_1[
                "documento"
            ].str.strip()

            # Merge to get id_unidade_gestora
            mg_licitacao_1 = pd.merge(
                mg_licitacao_1,
                ug_id1,
                how="left",
                on=["ano", "id_municipio", "id_licitacao"],
            )

            # Assign state acronym to the 'sigla_uf'
            mg_licitacao_1["sigla_uf"] = "MG"

            # Create a unique identifier for each purchase, as in licitacao table
            mg_licitacao_1["id_licitacao_bd"] = (
                mg_licitacao_1["id_licitacao"]
                + mg_licitacao_1["id_unidade_gestora"]
                + mg_licitacao_1["sigla_uf"]
            )

            # Keep only necessary variables
            mg_licitacao_1 = mg_licitacao_1.reindex(columns=ordem)

            # Merge non-competitive procurement

            # First - merge items general information with their quoted price
            merge3 = pd.merge(mg5, mg6, how="outer", on=chaves_dispensa)

            # Second - merge with the suppliers file
            mg_dispensa_1 = pd.merge(
                merge3, mg7, how="left", on=chaves_dispensa
            )

            mg_dispensa_1["documento"] = mg_dispensa_1["documento"].str.strip()

            # Merge to get id_unidade_gestora
            mg_dispensa_1 = pd.merge(
                mg_dispensa_1,
                ug_id2,
                how="left",
                on=["ano", "id_municipio", "id_dispensa"],
            )

            # Assign state acronym to the 'sigla_uf'
            mg_dispensa_1["sigla_uf"] = "MG"

            # Create a unique identifier for each purchase, as in licitacao table
            mg_dispensa_1["id_licitacao_bd"] = (
                mg_dispensa_1["id_dispensa"]
                + mg_dispensa_1["id_unidade_gestora"]
                + mg_dispensa_1["sigla_uf"]
            )

            # Keep only necessary variables
            mg_dispensa_1 = mg_dispensa_1.reindex(columns=ordem)

            # Stack competitive and non-competitive items. DataFrame.append
            # was removed in pandas 2.0; pd.concat is the equivalent call.
            mg_item_1 = pd.concat(
                [mg_licitacao_1, mg_dispensa_1], ignore_index=True
            )

            # Variables format
            floats = [
                "valor_unitario_cotacao",
                "valor_proposta",
                "valor_unitario",
                "valor_total",
                "quantidade",
                "quantidade_cotada",
            ]

            mg_item_1[floats] = mg_item_1[floats].astype(float)

            # astype(str) turns a missing value into the text "nan"; it is
            # restored to missing for all three columns so that neither the
            # columns themselves nor id_item_bd (built from
            # id_unidade_gestora below) carry a literal "nan".
            strings = ["id_licitacao", "id_dispensa", "id_unidade_gestora"]
            mg_item_1[strings] = (
                mg_item_1[strings].astype(str).replace("nan", np.nan)
            )

            # A lot number of "-1" is treated as missing
            mg_item_1["numero_lote"] = mg_item_1["numero_lote"].replace(
                "-1", np.nan
            )

            # Total value; the product is already missing whenever quantity
            # or unit value is missing
            mg_item_1["valor_total"] = (
                mg_item_1["quantidade"] * mg_item_1["valor_unitario"]
            )

            # Create a unique identifier for each item
            mg_item_1["id_item_bd"] = (
                mg_item_1["id_item"]
                + mg_item_1["id_unidade_gestora"]
                + mg_item_1["sigla_uf"]
            )

            # Duplicates only allowed for items supplied by different
            # suppliers: an id whose rows disagree on the description is blanked
            mg_item_1["id_item_bd"] = np.where(
                (mg_item_1.duplicated(["id_item_bd"], keep=False))
                & (
                    ~mg_item_1.duplicated(
                        ["id_item_bd", "descricao"], keep=False
                    )
                ),
                np.nan,
                mg_item_1["id_item_bd"],
            )

            # Drop duplicates in all variables
            mg_item_1.drop_duplicates(inplace=True)

            # Reorder columns
            mg_item_1 = mg_item_1.reindex(columns=ordem)

            # Partition by year and municipality: the partition columns are
            # encoded in the folder name, so they are left out of the file
            mg_item_1.drop(["ano", "sigla_uf"], axis=1, inplace=True)

            mg_item_1.to_csv(
                os.path.join(out_dir, "municipio_{}.csv".format(m)),
                index=False,
                encoding="utf-8",
                na_rep="",
                float_format="%.2f",
            )

            # Restore the partition columns before collecting the frame
            mg_item_1["ano"] = a
            mg_item_1["sigla_uf"] = "MG"

            all_df_mg.append(mg_item_1)

# All years and municipalities in a single frame (columns sorted by name)
mg_item_1 = pd.concat(all_df_mg, ignore_index=True, sort=True)

mg_item_1.to_csv(
    os.path.join(path, "output/temp/mg_item_1.csv"),
    index=False,
    na_rep="",
    float_format="%.2f",
)

In [None]:
anos_mg = ["2017"]

all_df_mg = []

for a in anos_mg:
    for m in municipios_mg:
        df = os.path.join(folder, "{}/licitacao_{}.zip".format(a, a))
        with ZipFile(df) as z:
            # Competitive procurement itens
            with z.open(
                "{}/licitacao/{}/{}.{}.licitacao.itemLicitacao.csv".format(
                    a, m, a, m
                )
            ) as f:
                mg1 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
                mg1["id_municipio"] = m
                mg1.rename(columns=mg1_rename, inplace=True)
                mg1.drop(mg1_drop, axis=1, inplace=True)
                mg1 = mg1.reindex(
                    columns=[
                        "id_municipio",
                        "id_item",
                        "id_licitacao",
                        "ano",
                        "orgao",
                        "descricao",
                        "unidade_medida",
                        "numero",
                        "numero_lote",
                    ]
                )

            # Quoted itens
            with z.open(
                "{}/licitacao/{}/{}.{}.licitacao.cotacaoLicitacao.csv".format(
                    a, m, a, m
                )
            ) as f:
                mg2 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
                mg2["id_municipio"] = m
                mg2.rename(columns=mg2_rename, inplace=True)
                mg2.drop(mg2_drop, axis=1, inplace=True)
                mg2 = mg2.reindex(
                    columns=[
                        "id_municipio",
                        "id_item",
                        "id_licitacao",
                        "ano",
                        "orgao",
                        "quantidade_cotada",
                        "valor_unitario_cotacao",
                    ]
                )

            # Items with reference price
            with z.open(
                "{}/licitacao/{}/{}.{}.licitacao.refLicitacao.csv".format(
                    a, m, a, m
                )
            ) as f:
                mg3 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
                mg3["id_municipio"] = m
                mg3.rename(columns=mg3_rename, inplace=True)
                mg3.drop(mg3_drop, axis=1, inplace=True)
                mg3 = mg3.reindex(
                    columns=[
                        "id_municipio",
                        "id_item",
                        "id_licitacao",
                        "ano",
                        "orgao",
                        "valor_unitario_cotacao",
                    ]
                )

            # Homologated itens (suppliers)
            with z.open(
                "{}/licitacao/{}/{}.{}.licitacao.homologLicitacao.csv".format(
                    a, m, a, m
                )
            ) as f:
                mg4 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
                mg4["id_municipio"] = m
                mg4.rename(columns=mg4_rename, inplace=True)
                mg4.drop(mg4_drop, axis=1, inplace=True)
                mg4 = mg4.reindex(
                    columns=[
                        "id_municipio",
                        "id_item",
                        "id_licitacao",
                        "ano",
                        "orgao",
                        "quantidade",
                        "valor_unitario",
                        "nome_vencedor",
                        "documento",
                    ]
                )

            # Non competitive procurement (dispensa/inexibilidade)
            with z.open(
                "{}/licitacao/{}/{}.{}.licitacao.itemDispensa.csv".format(
                    a, m, a, m
                )
            ) as f:
                mg5 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
                mg5["id_municipio"] = m
                mg5.rename(columns=mg5_rename, inplace=True)
                mg5.drop(mg5_drop, axis=1, inplace=True)
                mg5 = mg5.reindex(
                    columns=[
                        "id_municipio",
                        "id_item",
                        "id_dispensa",
                        "ano",
                        "orgao",
                        "descricao",
                        "unidade_medida",
                        "numero",
                    ]
                )

            # Quoted items
            with z.open(
                "{}/licitacao/{}/{}.{}.licitacao.cotDispensa.csv".format(
                    a, m, a, m
                )
            ) as f:
                mg6 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
                mg6["id_municipio"] = m
                mg6.rename(columns=mg6_rename, inplace=True)
                mg6.drop(mg6_drop, axis=1, inplace=True)
                mg6 = mg6.reindex(
                    columns=[
                        "id_municipio",
                        "id_item",
                        "id_dispensa",
                        "ano",
                        "orgao",
                        "quantidade_cotada",
                        "valor_unitario_cotacao",
                    ]
                )

            # Suppliers
            with z.open(
                "{}/licitacao/{}/{}.{}.licitacao.fornDispensa.csv".format(
                    a, m, a, m
                )
            ) as f:
                mg7 = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
                mg7["id_municipio"] = m
                mg7.rename(columns=mg7_rename, inplace=True)
                mg7.drop(mg7_drop, axis=1, inplace=True)
                mg7 = mg7.reindex(
                    columns=[
                        "id_municipio",
                        "id_item",
                        "id_dispensa",
                        "ano",
                        "orgao",
                        "quantidade",
                        "valor_unitario",
                        "nome_vencedor",
                        "documento",
                    ]
                )

        # Merge competitive procurement

        # First - merge quoted items with items with reference price
        merge1 = pd.merge(
            mg2,
            mg3,
            how="outer",
            left_on=[
                "id_municipio",
                "id_item",
                "id_licitacao",
                "ano",
                "orgao",
                "valor_unitario_cotacao",
            ],
            right_on=[
                "id_municipio",
                "id_item",
                "id_licitacao",
                "ano",
                "orgao",
                "valor_unitario_cotacao",
            ],
        )

        # Second - merge items general information with their quoted or reference price
        merge2 = pd.merge(
            mg1,
            merge1,
            how="left",
            left_on=[
                "id_municipio",
                "id_item",
                "id_licitacao",
                "ano",
                "orgao",
            ],
            right_on=[
                "id_municipio",
                "id_item",
                "id_licitacao",
                "ano",
                "orgao",
            ],
        )

        # Third - merge with homologated items to get suppliers
        mg_licitacao_1 = pd.merge(
            merge2,
            mg4,
            how="left",
            left_on=[
                "id_municipio",
                "id_item",
                "id_licitacao",
                "ano",
                "orgao",
            ],
            right_on=[
                "id_municipio",
                "id_item",
                "id_licitacao",
                "ano",
                "orgao",
            ],
        )

        mg_licitacao_1["documento"] = mg_licitacao_1["documento"].str.strip()

        # Merge to get id_unidade_gestora
        mg_licitacao_1 = pd.merge(
            mg_licitacao_1,
            ug_id1,
            how="left",
            left_on=["ano", "id_municipio", "id_licitacao"],
            right_on=["ano", "id_municipio", "id_licitacao"],
        )

        # Assign state acronym to the 'sigla_uf'
        mg_licitacao_1["sigla_uf"] = "MG"

        # Create a unique identifier for each purchase, as in licitacao table
        mg_licitacao_1["id_licitacao_bd"] = (
            mg_licitacao_1["id_licitacao"]
            + mg_licitacao_1["id_unidade_gestora"]
            + mg_licitacao_1["sigla_uf"]
        )

        # Keep only necessary variables
        mg_licitacao_1 = mg_licitacao_1.reindex(columns=ordem)

        # Merge non competitive procurement

        # First - merge items general information with their quoted price
        merge3 = pd.merge(
            mg5,
            mg6,
            how="outer",
            left_on=["id_municipio", "id_item", "id_dispensa", "ano", "orgao"],
            right_on=[
                "id_municipio",
                "id_item",
                "id_dispensa",
                "ano",
                "orgao",
            ],
        )

        # Second - merge with homologated items to get suppliers
        mg_dispensa_1 = pd.merge(
            merge3,
            mg7,
            how="left",
            left_on=["id_municipio", "id_item", "id_dispensa", "ano", "orgao"],
            right_on=[
                "id_municipio",
                "id_item",
                "id_dispensa",
                "ano",
                "orgao",
            ],
        )

        mg_dispensa_1["documento"] = mg_dispensa_1["documento"].str.strip()

        # Merge to get id_unidade_gestora
        mg_dispensa_1 = pd.merge(
            mg_dispensa_1,
            ug_id2,
            how="left",
            left_on=["ano", "id_municipio", "id_dispensa"],
            right_on=["ano", "id_municipio", "id_dispensa"],
        )

        # Assign state acronym to the 'sigla_uf'
        mg_dispensa_1["sigla_uf"] = "MG"

        # Create a unique identifier for each purchase, as in licitacao table
        mg_dispensa_1["id_licitacao_bd"] = (
            mg_dispensa_1["id_dispensa"]
            + mg_dispensa_1["id_unidade_gestora"]
            + mg_dispensa_1["sigla_uf"]
        )

        # Keep only necessary variables
        mg_dispensa_1 = mg_dispensa_1.reindex(columns=ordem)

        # Append competitive and non competitive
        mg_item_2 = mg_licitacao_1.append([mg_dispensa_1], ignore_index=True)

        # Variables format
        floats = [
            "valor_unitario_cotacao",
            "valor_proposta",
            "valor_unitario",
            "valor_total",
            "quantidade",
            "quantidade_cotada",
        ]

        mg_item_2[floats] = mg_item_2[floats].astype(float)

        strings = ["id_licitacao", "id_dispensa", "id_unidade_gestora"]
        mg_item_2[strings] = mg_item_2[strings].astype(str)

        mg_item_2["numero_lote"] = mg_item_2["numero_lote"].replace(
            "-1", np.nan
        )
        mg_item_2["id_licitacao"] = mg_item_2["id_licitacao"].replace(
            "nan", np.nan
        )
        mg_item_2["id_dispensa"] = mg_item_2["id_dispensa"].replace(
            "nan", np.nan
        )

        # Create total value
        mg_item_2["valor_total"] = np.where(
            (mg_item_2["valor_unitario"].notnull())
            | (mg_item_2["quantidade"].notnull()),
            mg_item_2["quantidade"] * mg_item_2["valor_unitario"],
            np.nan,
        )

        # Create a unique identifier for each item
        mg_item_2["id_item_bd"] = (
            mg_item_2["id_item"]
            + mg_item_2["id_unidade_gestora"]
            + mg_item_2["sigla_uf"]
        )

        # Duplicates only allowed for items supplied by different suppliers

        mg_item_2["id_item_bd"] = np.where(
            (mg_item_2.duplicated(["id_item_bd"], keep=False))
            & (~mg_item_2.duplicated(["id_item_bd", "descricao"], keep=False)),
            np.nan,
            mg_item_2["id_item_bd"],
        )

        mg_item_2["id_item_bd"] = np.where(
            (
                mg_item_2.duplicated(
                    ["id_item_bd", "descricao", "documento", "nome_vencedor"],
                    keep=False,
                )
            )
            & (
                ~mg_item_2.duplicated(
                    [
                        "id_item_bd",
                        "descricao",
                        "documento",
                        "nome_vencedor",
                        "quantidade",
                    ],
                    keep=False,
                )
            ),
            np.nan,
            mg_item_2["id_item_bd"],
        )

        # Few duplicates in id_item_bd
        mg_item_2.drop_duplicates(inplace=True)

        # Reorder columns
        mg_item_2 = mg_item_2.reindex(columns=ordem)

        # Partition by year and municipality
        mg_item_2.drop(["ano", "sigla_uf"], axis=1, inplace=True)

        exec(
            "mg_item_2.to_csv('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao_item/ano={}/sigla_uf=MG/municipio_{}.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')".format(
                a, m
            )
        )

        # Append all
        mg_item_2["ano"] = a
        mg_item_2["sigla_uf"] = "MG"

        all_df_mg.append(mg_item_2)

# Stack every frame collected in all_df_mg into a single table and cache it
# as a temporary CSV (two-decimal floats, blanks for missing values).
mg_item_2 = pd.concat(all_df_mg, sort=True, ignore_index=True)

mg_item_2_csv = os.path.join(path, "output/temp/mg_item_2.csv")
mg_item_2.to_csv(mg_item_2_csv, float_format="%.2f", na_rep="", index=False)

In [None]:
# MG, 2018-2021. In these yearly archives the members are addressed as
# "<year>.<municipality>.licitacao.<table>.csv" (no directory prefix).
anos_mg = ["2018", "2019", "2020", "2021"]

# Join keys shared by the competitive (licitacao) and the non competitive
# (dispensa/inexigibilidade) tables.
keys_licitacao = ["id_municipio", "id_item", "id_licitacao", "ano", "orgao"]
keys_dispensa = ["id_municipio", "id_item", "id_dispensa", "ano", "orgao"]

# One entry per raw table:
# (label used in log messages, table name in the archive, rename map,
#  columns to drop, columns to keep).
mg_tables = [
    # Competitive procurement items
    (
        "mg1",
        "itemLicitacao",
        mg1_rename,
        mg1_drop,
        keys_licitacao
        + ["descricao", "unidade_medida", "numero", "numero_lote"],
    ),
    # Quoted items
    (
        "mg2",
        "cotacaoLicitacao",
        mg2_rename,
        mg2_drop,
        keys_licitacao + ["quantidade_cotada", "valor_unitario_cotacao"],
    ),
    # Items with reference price
    (
        "mg3",
        "refLicitacao",
        mg3_rename,
        mg3_drop,
        keys_licitacao + ["valor_unitario_cotacao"],
    ),
    # Homologated items (suppliers)
    (
        "mg4",
        "homologLicitacao",
        mg4_rename,
        mg4_drop,
        keys_licitacao
        + ["quantidade", "valor_unitario", "nome_vencedor", "documento"],
    ),
    # Non competitive procurement items (dispensa/inexigibilidade)
    (
        "mg5",
        "itemDispensa",
        mg5_rename,
        mg5_drop,
        keys_dispensa + ["descricao", "unidade_medida", "numero"],
    ),
    # Quoted items (non competitive)
    (
        "mg6",
        "cotDispensa",
        mg6_rename,
        mg6_drop,
        keys_dispensa + ["quantidade_cotada", "valor_unitario_cotacao"],
    ),
    # Suppliers (non competitive)
    (
        "mg7",
        "fornDispensa",
        mg7_rename,
        mg7_drop,
        keys_dispensa
        + ["quantidade", "valor_unitario", "nome_vencedor", "documento"],
    ),
]


def _read_mg_table(z, a, m, label, table, rename, drop, columns):
    """Read one MG table for year `a` and municipality `m` from archive `z`.

    Returns the frame with `id_municipio` set to `m`, renamed with `rename`,
    without the `drop` columns and restricted to `columns`.

    If the member cannot be read, the usual message is printed and an empty
    frame with the expected columns is returned, so that a frame left over
    from the previous municipality is never merged into the current one.
    """
    member = "{}.{}.licitacao.{}.csv".format(a, m, table)
    try:
        with z.open(member) as f:
            raw = pd.read_csv(f, sep=";", encoding="utf-8", dtype=str)
    except (KeyError, OSError):
        # ZipFile.open raises KeyError (not IOError/OSError) when the member
        # is not in the archive, so both have to be handled here.
        print(
            "Erro de input/output para o município {} e ano {} - {}".format(
                m, a, label
            )
        )
        return pd.DataFrame(columns=columns)

    raw["id_municipio"] = m
    raw.rename(columns=rename, inplace=True)
    raw.drop(drop, axis=1, inplace=True)
    return raw.reindex(columns=columns)


def _finish_mg_purchases(items, ug, id_col):
    """Attach id_unidade_gestora and build id_licitacao_bd for MG items.

    `ug` is the lookup table merged on (ano, id_municipio, `id_col`);
    `id_col` is "id_licitacao" or "id_dispensa". The result is restricted to
    the columns listed in `ordem`.
    """
    items["documento"] = items["documento"].str.strip()

    # Merge to get id_unidade_gestora
    items = pd.merge(
        items, ug, how="left", on=["ano", "id_municipio", id_col]
    )

    # Assign state acronym to the 'sigla_uf'
    items["sigla_uf"] = "MG"

    # Create a unique identifier for each purchase, as in licitacao table
    items["id_licitacao_bd"] = (
        items[id_col] + items["id_unidade_gestora"] + items["sigla_uf"]
    )

    # Keep only necessary variables
    return items.reindex(columns=ordem)


all_df_mg = []

for a in anos_mg:
    df = os.path.join(folder, "{}/licitacao_{}.zip".format(a, a))

    # Open the yearly archive once instead of once per municipality
    with ZipFile(df) as z:
        for m in municipios_mg:
            mg1, mg2, mg3, mg4, mg5, mg6, mg7 = (
                _read_mg_table(z, a, m, *spec) for spec in mg_tables
            )

            # Merge competitive procurement

            # First - merge quoted items with items with reference price
            merge1 = pd.merge(
                mg2,
                mg3,
                how="outer",
                on=keys_licitacao + ["valor_unitario_cotacao"],
            )

            # Second - merge items general information with their quoted
            # or reference price
            merge2 = pd.merge(mg1, merge1, how="left", on=keys_licitacao)

            # Third - merge with homologated items to get suppliers
            mg_licitacao_1 = pd.merge(
                merge2, mg4, how="left", on=keys_licitacao
            )
            mg_licitacao_1 = _finish_mg_purchases(
                mg_licitacao_1, ug_id1, "id_licitacao"
            )

            # Merge non competitive procurement

            # First - merge items general information with their quoted price
            merge3 = pd.merge(mg5, mg6, how="outer", on=keys_dispensa)

            # Second - merge with homologated items to get suppliers
            mg_dispensa_1 = pd.merge(
                merge3, mg7, how="left", on=keys_dispensa
            )
            mg_dispensa_1 = _finish_mg_purchases(
                mg_dispensa_1, ug_id2, "id_dispensa"
            )

            # Append competitive and non competitive
            # (DataFrame.append was removed in pandas 2.0)
            mg_item_3 = pd.concat(
                [mg_licitacao_1, mg_dispensa_1], ignore_index=True
            )

            # Variables format
            floats = [
                "valor_unitario_cotacao",
                "valor_proposta",
                "valor_unitario",
                "valor_total",
                "quantidade",
                "quantidade_cotada",
            ]
            mg_item_3[floats] = mg_item_3[floats].astype(float)

            strings = ["id_licitacao", "id_dispensa", "id_unidade_gestora"]
            mg_item_3[strings] = mg_item_3[strings].astype(str)

            # astype(str) turns missing values into the text "nan"; restore
            # them as real missing values.
            # NOTE(review): id_unidade_gestora is not restored here, so a
            # missing value stays as the text "nan" - confirm this is wanted.
            mg_item_3["numero_lote"] = mg_item_3["numero_lote"].replace(
                "-1", np.nan
            )
            mg_item_3["id_licitacao"] = mg_item_3["id_licitacao"].replace(
                "nan", np.nan
            )
            mg_item_3["id_dispensa"] = mg_item_3["id_dispensa"].replace(
                "nan", np.nan
            )

            # Create total value (missing whenever quantity or unit value
            # is missing, since NaN propagates through the product)
            mg_item_3["valor_total"] = (
                mg_item_3["quantidade"] * mg_item_3["valor_unitario"]
            )

            # Create a unique identifier for each item
            mg_item_3["id_item_bd"] = (
                mg_item_3["id_item"]
                + mg_item_3["id_unidade_gestora"]
                + mg_item_3["sigla_uf"]
            )

            # Duplicates only allowed for items supplied by different
            # suppliers: blank the id when the same id carries different
            # descriptions...
            mg_item_3["id_item_bd"] = np.where(
                (mg_item_3.duplicated(["id_item_bd"], keep=False))
                & (
                    ~mg_item_3.duplicated(
                        ["id_item_bd", "descricao"], keep=False
                    )
                ),
                np.nan,
                mg_item_3["id_item_bd"],
            )

            # ...or when the same id/description/supplier carries different
            # quantities.
            mg_item_3["id_item_bd"] = np.where(
                (
                    mg_item_3.duplicated(
                        ["id_item_bd", "descricao", "documento"], keep=False
                    )
                )
                & (
                    ~mg_item_3.duplicated(
                        ["id_item_bd", "descricao", "documento", "quantidade"],
                        keep=False,
                    )
                ),
                np.nan,
                mg_item_3["id_item_bd"],
            )

            # Few duplicates in id_item_bd
            mg_item_3.drop_duplicates(inplace=True)

            # Reorder columns
            mg_item_3 = mg_item_3.reindex(columns=ordem)

            # Partition by year and municipality: 'ano' and 'sigla_uf' are
            # encoded in the output path, so they are left out of the file.
            mg_item_3.drop(["ano", "sigla_uf"], axis=1, inplace=True)

            mg_item_3.to_csv(
                os.path.join(
                    path,
                    "output/licitacao_item/ano={}/sigla_uf=MG/municipio_{}.csv".format(
                        a, m
                    ),
                ),
                index=False,
                encoding="utf-8",
                na_rep="",
                float_format="%.2f",
            )

            # Append all
            mg_item_3["ano"] = a
            mg_item_3["sigla_uf"] = "MG"

            all_df_mg.append(mg_item_3)

mg_item_3 = pd.concat(all_df_mg, ignore_index=True, sort=True)

mg_item_3.to_csv(
    os.path.join(path, "output/temp/mg_item_3.csv"),
    index=False,
    na_rep="",
    float_format="%.2f",
)

## PR


In [None]:
# PR

# Municipalities to loop over. The 6-digit codes below are left out because
# their XML files could not be converted to CSV.
pr_sem_csv = [
    "411915",
    "411370",
    "411535",
    "411710",
    "412627",
    "410140",
    "410350",
]

municipio_pr = municipio[municipio["sigla_uf"] == "PR"]
municipio_pr = municipio_pr[~municipio_pr["id_municipio_6"].isin(pr_sem_csv)]

municipios_pr = municipio_pr["id_municipio_6"].tolist()

# Raw column name -> name used in the output table
pr_rename = {
    "cdIBGE": "id_municipio",
    "idlicitacao": "id_licitacao",
    "idPessoa": "id_unidade_gestora",
    "nrAnoLicitacao": "ano",
    "dsItem": "descricao",
    "dsUnidadeMedida": "unidade_medida",
    "nmPessoa": "nome_vencedor",
    "nrDocumento": "documento",
    "nrItem": "numero",
    "nrLote": "numero_lote",
    "nrQuantidadePropostaLicitacao": "quantidade_proposta",
    "nrQuantidadeVencedorLicitacao": "quantidade",
    "vlLicitacaoVencedorLicitacao": "valor_vencedor",
    "vlPropostaItem": "valor_proposta",
    "nrClassificacao": "numero_classificacao",
}

# Raw columns of interest: exactly the keys of the rename map, in order
pr_columns = list(pr_rename)

In [None]:
# PR: one "LicitacaoVencedor" CSV per (year, municipality). Each file is
# cleaned, written to its own partition and collected in all_df_pr.
anos_pr = [
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
    "2018",
    "2019",
    "2020",
    "2021",
]

all_df_pr = []

for a in anos_pr:
    for m in municipios_pr:
        path_lic_venc = os.path.join(
            path,
            "input/PR/{}/Licitacao/{}/{}_{}_LicitacaoVencedor.csv".format(
                a, m, a, m
            ),
        )

        pr = pd.read_csv(
            path_lic_venc,
            sep=",",
            encoding="utf-8",
            dtype=str,
        )  # usecols = pr_columns
        pr.rename(pr_rename, axis=1, inplace=True)

        # Merge id_municipio 6 and 7 digits id. Both frames have an
        # 'id_municipio' column, so the merge yields id_municipio_x (from the
        # raw file) and id_municipio_y (from the auxiliary table, kept).
        pr = pd.merge(
            pr,
            municipio,
            how="left",
            left_on="id_municipio",
            right_on="id_municipio_6",
        )

        pr.drop(
            ["id_municipio_x", "id_municipio_6", "nome", "id_municipio_tce"],
            axis=1,
            inplace=True,
        )
        pr.rename({"id_municipio_y": "id_municipio"}, axis=1, inplace=True)

        # Format: strip punctuation from the supplier document.
        # regex=False makes "." a literal dot regardless of pandas version.
        pr["documento"] = (
            pr["documento"]
            .str.replace("-", "", regex=False)
            .str.replace(".", "", regex=False)
            .str.strip()
        )

        # Create a unique identifier for each purchase
        pr["id_licitacao_bd"] = (
            pr["id_licitacao"] + pr["id_unidade_gestora"] + pr["sigla_uf"]
        )

        # Create a unique identifier for each item
        pr["id_item"] = (
            pr["numero"]
            + " "
            + pr["numero_lote"]
            + " "
            + pr["id_licitacao"]
            + " "
            + pr["id_municipio"].str[4:]
        )

        # Drop non suppliers: keep only the best-ranked row(s) of each item.
        # The ranking is read as text, so it is compared numerically here;
        # a text minimum would rank "10" before "2".
        pr["classificacao_num"] = pd.to_numeric(
            pr["numero_classificacao"], errors="coerce"
        )
        pr["min_classificacao"] = pr.groupby(
            ["ano", "id_municipio", "id_licitacao", "numero_lote", "numero"]
        )["classificacao_num"].transform("min")
        # .copy() so the assignments below act on an independent frame
        pr = pr[pr["classificacao_num"] == pr["min_classificacao"]].copy()

        # Create a unique identifier for each item across states
        pr["id_item_bd"] = pr["id_item"] + pr["sigla_uf"]

        # Duplicates only allowed for items supplied by different suppliers
        pr["id_item_bd"] = np.where(
            (pr.duplicated(["id_item_bd"], keep=False))
            & (~pr.duplicated(["id_item_bd", "descricao"], keep=False)),
            np.nan,
            pr["id_item_bd"],
        )

        # Drop duplicated in all variables
        pr.drop_duplicates(inplace=True)

        # Adds zeros to the left, missing in some cnpjs (documents longer
        # than 11 characters are padded to 14)
        pr["documento"] = np.where(
            pr["documento"].str.len() > 11,
            pr["documento"].astype(str).str.zfill(14),
            pr["documento"],
        )

        # Variables format
        floats = ["valor_proposta", "valor_vencedor"]
        pr[floats] = pr[floats].astype(float)

        # Quantities are written as integers; values that cannot be parsed
        # go through a -1 placeholder and end up as blanks.
        for col in ["quantidade", "quantidade_proposta"]:
            pr[col] = (
                pd.to_numeric(pr[col], errors="coerce")
                .fillna(-1)
                .astype(int)
                .replace(-1, "")
            )

        # Reorder columns
        pr = pr.reindex(columns=ordem)

        # Partition by year and municipality: 'ano' and 'sigla_uf' are
        # encoded in the output path, so they are left out of the file.
        pr.drop(["ano", "sigla_uf"], axis=1, inplace=True)

        pr.to_csv(
            os.path.join(
                path,
                "output/licitacao_item/ano={}/sigla_uf=PR/municipio_{}.csv".format(
                    a, m
                ),
            ),
            index=False,
            encoding="utf-8",
            na_rep="",
            float_format="%.2f",
        )

        # Append all
        pr["ano"] = a
        pr["sigla_uf"] = "PR"

        pr = pr.reindex(columns=ordem)

        all_df_pr.append(pr)

item_pr = pd.concat(all_df_pr, ignore_index=True, sort=True)

item_pr.to_csv(
    os.path.join(path, "output/licitacao_item_pr.csv"),
    index=False,
    na_rep="",
    float_format="%.2f",
)

## RS


In [None]:
# Municipalities of RS (7-digit ids) to loop over

municipio_rs = municipio[municipio["sigla_uf"] == "RS"]
municipios_rs = list(municipio_rs["id_municipio"])

# Raw column name -> name used in the output table

rs_rename = {
    # Purchase and location
    "ANO_LICITACAO": "ano",
    "CD_MUNICIPIO_IBGE": "id_municipio",
    # Item description
    "DS_ITEM": "descricao",
    "NR_ITEM": "numero",
    "NR_LOTE": "numero_lote",
    "QT_ITENS": "quantidade_cotada",
    "SG_UNIDADE_MEDIDA": "unidade_medida",
    # Values
    "VL_UNITARIO_ESTIMADO": "valor_unitario_cotacao",
    "VL_UNITARIO_HOMOLOGADO": "valor_unitario",
    "VL_TOTAL_HOMOLOGADO": "valor_total",
    # Identifiers
    "NR_DOCUMENTO": "documento",
    "NR_LICITACAO": "id_licitacao",
    "TP_DOCUMENTO.1": "TP_DOCUMENTO_2",
    "CD_TIPO_MODALIDADE": "modalidade",
}

# Columns removed from the item table (names as they stand after renaming,
# hence TP_DOCUMENTO_2)

rs_drop = [
    "BL_COVID19",
    "CD_FONTE_REFERENCIA",
    "CD_TIPO_FAMILIA",
    "CD_TIPO_SUBFAMILIA",
    "DS_FONTE_REFERENCIA",
    "DT_REF_VALOR_ESTIMADO",
    "PC_ENCARGOS_SOCIAIS_ESTIMADO",
    "PC_ENCARGOS_SOCIAIS_HOMOLOGADO",
    "PC_TX_ESTIMADA",
    "TP_ORCAMENTO",
    "PC_TX_HOMOLOGADA",
    "TP_BENEFICIO_MICRO_EPP",
    "PC_BDI_ESTIMADO",
    "PC_BDI_HOMOLOGADO",
    "NR_ITEM_ORIGINAL",
    "TP_RESULTADO_ITEM",
    "NR_DOCUMENTO.1",
    "TP_DOCUMENTO",
    "TP_DOCUMENTO_2",
]

In [None]:
# item.csv
# licitacao.csv
# pessoas.csv

# RS folder
folder = os.path.join(path, "input/RS")

anos_rs = ["2016", "2017", "2018", "2019", "2020", "2021"]

all_df_rs = []
for a in anos_rs:
    df = os.path.join(
        folder,
        "Licitacao/{}.csv.zip".format(
            a,
        ),
    )
    with ZipFile(df) as z:
        with z.open("item.csv") as f:
            rs = pd.read_csv(f, sep=",", encoding="utf-8", dtype=str)

        with z.open("licitacao.csv") as f:
            rs2 = pd.read_csv(
                f,
                sep=",",
                encoding="utf-8",
                dtype=str,
                usecols=[
                    "CD_ORGAO",
                    "NR_LICITACAO",
                    "ANO_LICITACAO",
                    "CD_TIPO_MODALIDADE",
                    "TP_DOCUMENTO_FORNECEDOR",
                    "NR_DOCUMENTO_FORNECEDOR",
                ],
            )

            # Merge to get some cnpjs: the left join keeps every row of `rs` and
            # attaches the supplier document columns read into `rs2`
            # (TP_DOCUMENTO_FORNECEDOR, NR_DOCUMENTO_FORNECEDOR) on the four tender keys
            rs = pd.merge(
                rs,
                rs2,
                how="left",
                left_on=[
                    "CD_ORGAO",
                    "NR_LICITACAO",
                    "ANO_LICITACAO",
                    "CD_TIPO_MODALIDADE",
                ],
                right_on=[
                    "CD_ORGAO",
                    "NR_LICITACAO",
                    "ANO_LICITACAO",
                    "CD_TIPO_MODALIDADE",
                ],
            )

            # Concat document variables into one variable: NR_DOCUMENTO is kept when
            # present, otherwise it falls back to NR_DOCUMENTO.1 and, if that is also
            # missing, to NR_DOCUMENTO_FORNECEDOR
            rs["NR_DOCUMENTO"] = np.where(
                rs["NR_DOCUMENTO"].isna(),
                rs["NR_DOCUMENTO.1"],
                rs["NR_DOCUMENTO"],
            )
            rs["NR_DOCUMENTO"] = np.where(
                rs["NR_DOCUMENTO"].isna(),
                rs["NR_DOCUMENTO_FORNECEDOR"],
                rs["NR_DOCUMENTO"],
            )

            # NOTE(review): `rs_rename` is defined outside this section; the code below
            # relies on it producing (at least) the columns documento, id_licitacao,
            # ano, modalidade, numero and numero_lote - confirm against its definition
            rs.rename(rs_rename, axis=1, inplace=True)

            # Set as missing documents with strings (razao_social)
            rs["documento"] = np.where(
                rs["documento"].str.isnumeric(), rs["documento"], np.nan
            )

            # Add zeros to the left, missing in some cnpjs: only documents longer than
            # 11 characters and flagged "J" in any of the three type columns are
            # padded to 14 characters
            rs["length"] = rs["documento"].str.len()
            rs["documento"] = np.where(
                (rs["length"] > 11)
                & (
                    (rs["TP_DOCUMENTO"] == "J")
                    | (rs["TP_DOCUMENTO_2"] == "J")
                    | (rs["TP_DOCUMENTO_FORNECEDOR"] == "J")
                ),
                rs["documento"].str.zfill(14),
                rs["documento"],
            )

            # NOTE(review): `rs_drop` is defined outside this section; presumably it
            # lists the helper columns used above - confirm against its definition
            rs.drop(rs_drop, axis=1, inplace=True)

            # Assign state acronym to the 'sigla_uf'
            rs["sigla_uf"] = "RS"

            # Merge to get id_municipio

            rs = pd.merge(
                rs,
                orgao_municipio,
                how="left",
                left_on="CD_ORGAO",
                right_on="CD_ORGAO",
                indicator=True,
            )  # some ids are missing

            rs.rename(
                {"CD_MUNICIPIO_IBGE": "id_municipio", "CD_ORGAO": "orgao"},
                axis=1,
                inplace=True,
            )

            # Keep only rows whose CD_ORGAO was found in `orgao_municipio`, then drop
            # the merge indicator column
            rs = rs[rs["_merge"] == "both"]
            rs.drop("_merge", axis=1, inplace=True)

            # Create a unique identifier for each purchase, as in licitacao table
            # (plain string concatenation, no separator)
            rs["id_licitacao_bd"] = (
                rs["id_licitacao"]
                + rs["ano"]
                + rs["modalidade"]
                + rs["orgao"]
                + rs["sigla_uf"]
            )

            # Remove every row whose modalidade is "MAI"
            rs = rs.drop(rs[(rs["modalidade"] == "MAI")].index)

            # Create a unique identifier for each item (space-separated components)
            rs["id_item"] = (
                rs["numero"]
                + " "
                + rs["numero_lote"]
                + " "
                + rs["id_licitacao"]
                + " "
                + rs["ano"]
                + " "
                + rs["modalidade"]
                + " "
                + rs["orgao"]
            )  # few duplicates yet

            # Create a unique identifier for each item across states
            rs["id_item_bd"] = rs["id_item"] + rs["sigla_uf"]

        # Open file with participants information: read the supplier registry,
        # attach supplier names to `rs` by document number, then cast the value and
        # quantity columns and blank out ambiguous item identifiers

        with z.open("pessoas.csv") as f:
            rs3 = pd.read_csv(
                f,
                sep=",",
                encoding="utf-8",
                dtype=str,
                usecols=["NR_DOCUMENTO", "NM_PESSOA", "TP_PESSOA"],
            )

            rs3.rename(
                {
                    "NM_PESSOA": "nome_vencedor",
                    "NR_DOCUMENTO": "documento",
                    "TP_PESSOA": "tipo",
                },
                axis=1,
                inplace=True,
            )

            # Type of supplier - firm, person or international
            # (recoded J -> "1", F -> "2", E -> "3", P -> "")
            rs3["tipo"] = rs3["tipo"].replace(
                ["J", "F", "E", "P"], ["1", "2", "3", ""]
            )

            # Replace documents containing strings with missing (usually suppliers name)
            rs3["documento"] = np.where(
                rs3["documento"].str.isnumeric(), rs3["documento"], np.nan
            )

            # Add zeros to the left, missing in some cnpjs
            rs3["length"] = rs3["documento"].str.len()
            rs3["documento"] = np.where(
                (rs3["length"] > 11) & (rs3["tipo"] == "1"),
                rs3["documento"].str.zfill(14),
                rs3["documento"],
            )

            rs3.drop(["tipo", "length"], axis=1, inplace=True)

            # Rows without a usable document cannot identify a supplier. They must be
            # removed before the merge: pandas matches NaN keys with each other, so a
            # single leftover NaN row would hand its name to every `rs` row whose
            # document is missing.
            rs3 = rs3.dropna(subset=["documento"])

            # same document in many rows (same suppliers)
            rs3 = rs3.drop_duplicates(subset=["documento"])

            # Merge to get suppliers name
            rs = pd.merge(
                rs,
                rs3,
                how="left",
                on="documento",
            )

            # Format

            rs["valor_unitario"] = rs["valor_unitario"].astype(float)
            # "###############" is a placeholder found in the raw file, not a number
            rs["valor_total"] = rs["valor_total"].replace(
                "###############", np.nan
            )
            rs["valor_total"] = rs["valor_total"].astype(float)

            rs["quantidade"] = rs["valor_total"] / rs["valor_unitario"]

            # A zero unit price yields +/-inf, which astype(int) rejects; treat those
            # quantities as missing instead of aborting the whole run
            rs["quantidade"] = rs["quantidade"].replace([np.inf, -np.inf], np.nan)

            # Missing quantities go through a -1 sentinel so the column can be cast to
            # int, and are written back as empty strings afterwards.
            # NOTE(review): astype(int) truncates, so a float ratio such as 2.9999...
            # becomes 2 - confirm whether rounding is wanted here.
            rs["quantidade"] = rs["quantidade"].replace(np.nan, -1)
            rs["quantidade"] = rs["quantidade"].astype(int)
            rs["quantidade"] = rs["quantidade"].replace(-1, "")

            rs["quantidade_cotada"] = rs["quantidade_cotada"].astype(float)
            rs["quantidade_cotada"] = rs["quantidade_cotada"].astype(int)

            # Drop duplicates in all variables
            rs.drop_duplicates(inplace=True)

            # Duplicates not allowed for same id_item_bd, but different descriptions:
            # an id shared by several rows is set to missing on the rows whose
            # (id, description) pair occurs only once
            rs["id_item_bd"] = np.where(
                (rs.duplicated(["id_item_bd"], keep=False))
                & (~rs.duplicated(["id_item_bd", "descricao"], keep=False)),
                np.nan,
                rs["id_item_bd"],
            )

    # Reorder columns: keep only the columns listed in `ordem`, in that order
    # (reindex fills any listed column that `rs` lacks with NaN)
    rs = rs.reindex(columns=ordem)

    # Append all: collect this iteration's table; the collected tables are
    # concatenated once the loop has finished
    all_df_rs.append(rs)

# Stack every table collected in `all_df_rs` into one frame with a fresh index
rs = pd.concat(
    all_df_rs,
    sort=True,
    ignore_index=True,
)

# Save: missing values are written as empty fields, floats with two decimals
rs.to_csv(
    os.path.join(path, "output/licitacao_item_rs.csv"),
    float_format="%.2f",
    na_rep="",
    index=False,
)

## Partition


In [None]:
# Partition the state-level item tables into one CSV per year and UF, laid out as
#   output/licitacao_item/ano=<year>/sigla_uf=<UF>/microdados.csv

# List of UFs
ufs = ["CE", "RS"]

# Years written for each UF. CE starts at 2012 and RS at 2016, which are the
# years the previous nested loops effectively produced.
anos_por_uf = {
    "CE": range(2012, 2022),
    "RS": range(2016, 2022),
}

output_dir = "/content/gdrive/MyDrive/ComprasPublicas_Brasil/output"

# Loop over each UF. Each state's file is partitioned only into that state's own
# folders: the previous version re-looped over every UF for each loaded file, so
# one state's rows were also written into (and ended up overwriting) the other
# state's partitions.
for uf in ufs:
    # Load the corresponding CSV file for the UF
    file_path = os.path.join(output_dir, f"licitacao_item_{uf.lower()}.csv")
    df = pd.read_csv(file_path, dtype=str, encoding="utf-8")

    # Convert 'ano' column to integer
    df["ano"] = df["ano"].astype(int)

    # Save csv by year and state
    for ano in anos_por_uf[uf]:
        print("Particionando {} do {}".format(ano, uf))

        # 'ano' and 'sigla_uf' are encoded in the folder names, so they are
        # dropped from the file itself; drop() returns a new frame, which avoids
        # mutating a slice of `df` in place
        df2 = df[df["ano"] == ano].drop(["ano", "sigla_uf"], axis=1)

        # Every UF goes under the licitacao_item table folder (RS used to be
        # written to the 'licitacao' folder, which belongs to another table)
        partition_dir = os.path.join(
            output_dir, "licitacao_item", f"ano={ano}", f"sigla_uf={uf}"
        )
        # to_csv does not create missing directories
        os.makedirs(partition_dir, exist_ok=True)

        df2.to_csv(
            os.path.join(partition_dir, "microdados.csv"),
            index=False,
            encoding="utf-8",
            na_rep="",
            float_format="%.2f",
        )