In [48]:
# Importa bibliotecas essenciais para o notebook
# - pathlib: manipulação de caminhos de arquivos
# - pandas: manipulação de dados
# - scipy.stats: funções estatísticas como z-score
from pathlib import Path

import pandas as pd
from scipy import stats

In [49]:
# Display floats with 8 decimal places, using '_' as the thousands separator
# of the integer part (e.g. 1234.5 is rendered as 1_234.50000000).
pd.set_option("display.float_format", "{:_.8f}".format)

In [50]:
# Data directories, anchored at the current working directory:
# - DADOS: full vote files (population)
# - AMOSTRAS: where the extracted sample files are written
# Note: the parent of Path(".") is Path(".") itself, so both paths resolve
# relative to the working directory, not to its parent folder.
DADOS: Path = Path(".").resolve().joinpath("data", "raw")
AMOSTRAS: Path = DADOS.with_name("sample")

In [51]:
# Significance level (alpha = 5%) and the matching standard-normal quantile
# z_{1 - alpha/2}, used for two-sided 95% confidence intervals.
alpha = 0.05
z = stats.norm(loc=0, scale=1).ppf(1 - alpha / 2)

# Carga dos dados

In [52]:
# Lookup table from raw (title-cased) candidate names to one canonical label,
# so that votes recorded under different spellings are consolidated.
# Entries are grouped by canonical label.
names_map = {
    # Blank and null votes
    "Brancos": "Branco",
    "Voto Branco": "Branco",
    "Nulos": "Nulo",
    "Voto Nulo": "Nulo",
    # Candidates
    "Ciro Ferreira Gomes": "Ciro Gomes",
    "Constituinte Eymael": "Eymael",
    "Jose Maria Eymael": "Eymael",
    "Felipe D´Avila": "Felipe D'Avila",
    "Luiz Felipe Chaves D Avila": "Felipe D'Avila",
    "Jair Messias Bolsonaro": "Jair Bolsonaro",
    "Leonardo Péricles Vieira Roque": "Léo Péricles",
    "Luiz Inácio Lula Da Silva": "Lula",
    "Kelmon Luis Da Silva Souza": "Padre Kelmon",
    "Simone Nassar Tebet": "Simone Tebet",
    "Sofia Padua Manzano": "Sofia Manzano",
    "Soraia Thronicke": "Soraya Thronicke",
    "Soraya Vieira Thronicke": "Soraya Thronicke",
    "Vera": "Vera Lucia",
    "Vera Lucia Pereira Da Silva Salgado": "Vera Lucia",
}

In [None]:
# Load the population data: every CSV found in DADOS.
# The office ("cargo") and round ("turno") are taken from the first two
# "_"-separated parts of each file name. Only the last two columns of each
# file are kept; the one not called "estado" is renamed to "nome".
dfs = []

# Iterate in sorted order: glob() yields files in arbitrary, OS-dependent
# order, which would make the row order/index of `populacao` irreproducible.
for file in sorted(DADOS.glob("*.csv")):
    cargo, turno, *_ = file.stem.split("_")
    df = (
        pd.read_csv(file, dtype="category")
        .iloc[:, -2:]
        .rename(columns=str.casefold)
        .rename(columns=lambda name: "nome" if name != "estado" else name)
        .assign(
            # Title-case the raw name, then collapse known spelling variants
            # to their canonical label; names not in names_map are kept as is.
            nome=lambda df_: df_["nome"].str.title().replace(names_map),
            cargo=cargo,
            turno=int(turno),
        )
    )
    dfs.append(df)

populacao = pd.concat(dfs).astype({
    "estado": "category", "cargo": "category"
})

# Geração das amostras

In [54]:
# Population size (N) and sample size (n) for every
# (office, state, round) combination that is sampled.
sample_sizes = (
    pd.DataFrame.from_records(
        [
            # Governor / Senator
            ("AC", 1, "Governador", 455_438, 2_389),
            ("AC", 1, "Senador", 455_438, 2_389),
            ("RO", 1, "Governador", 925_763, 2_395),
            ("RO", 1, "Senador", 925_763, 2_395),
            # NOTE(review): N differs from the 1st-round value (925_763)
            # by exactly 200 -- confirm this is not a typo.
            ("RO", 2, "Governador", 925_563, 2_395),
            # President, 1st round
            ("AC", 1, "Presidente", 455_903, 110),
            ("RO", 1, "Presidente", 926_827, 224),
            ("AM", 1, "Presidente", 2_113_771, 512),
            ("RR", 1, "Presidente", 305_404, 74),
            ("AP", 1, "Presidente", 442_842, 107),
            ("PA", 1, "Presidente", 4_789_311, 1_159),
            ("TO", 1, "Presidente", 891_449, 216),
            # President, 2nd round
            ("AC", 2, "Presidente", 420_760, 104),
            ("RO", 2, "Presidente", 926_517, 230),
            ("AM", 2, "Presidente", 2_067_875, 513),
            ("RR", 2, "Presidente", 286_269, 71),
            ("AP", 2, "Presidente", 400_683, 99),
            ("PA", 2, "Presidente", 4_701_740, 1_167),
            ("TO", 2, "Presidente", 871_238, 216),
        ],
        columns=["estado", "turno", "cargo", "N", "n"],
    )
    .set_index(["cargo", "estado", "turno"])
)
sample_sizes

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,N,n
cargo,estado,turno,Unnamed: 3_level_1,Unnamed: 4_level_1
Governador,AC,1,455438,2389
Senador,AC,1,455438,2389
Governador,RO,1,925763,2395
Senador,RO,1,925763,2395
Governador,RO,2,925563,2395
Presidente,AC,1,455903,110
Presidente,RO,1,926827,224
Presidente,AM,1,2113771,512
Presidente,RR,1,305404,74
Presidente,AP,1,442842,107


In [None]:
# Create the samples: for each (office, state, round) row of sample_sizes,
# draw a simple random sample without replacement of size n from the
# population and save it as "<cargo>_<turno>_turno_<uf>.csv" in AMOSTRAS.
random_state = 789

# Make sure the output directory exists before writing into it.
AMOSTRAS.mkdir(parents=True, exist_ok=True)

# sample_sizes is indexed by (cargo, estado, turno), so the index tuple must
# be unpacked in that order; any other order puts the office name in the
# state filter and makes the query fail.
for (cargo, uf, turno), tamanho in sample_sizes.iterrows():
    sample = (
        populacao.query(
            "estado == @uf and turno == @turno and cargo == @cargo"
        )
        # Sort before sampling so the draw does not depend on the row order
        # in which the population files were concatenated.
        .sort_values(by="nome")
        .sample(int(tamanho["n"]), replace=False, random_state=random_state)
    )
    sample[["nome", "estado"]].to_csv(
        AMOSTRAS / f"{cargo}_{turno}_turno_{uf}.csv",
    )
    print(uf, turno, cargo, sample.shape[0])

In [None]:
# Load the sample data: every CSV found in AMOSTRAS.
# The office ("cargo") and round ("turno") are taken from the first two
# "_"-separated parts of each file name. Only the last two columns of each
# file are kept (this discards any leading index column); the one not called
# "estado" is renamed to "nome".
# NOTE(review): this mirrors the population loader; consider extracting a
# shared function parameterised by directory.
dfs = []

# Iterate in sorted order: glob() yields files in arbitrary, OS-dependent
# order, which would make the row order/index of `amostra` irreproducible.
for file in sorted(AMOSTRAS.glob("*.csv")):
    cargo, turno, *_ = file.stem.split("_")
    df = (
        pd.read_csv(file, dtype="category")
        .iloc[:, -2:]
        .rename(columns=str.casefold)
        .rename(columns=lambda name: "nome" if name != "estado" else name)
        .assign(
            # Title-case the raw name, then collapse known spelling variants
            # to their canonical label; names not in names_map are kept as is.
            nome=lambda df_: df_["nome"].str.title().replace(names_map),
            cargo=cargo,
            turno=int(turno),
        )
    )
    dfs.append(df)

amostra = (
    pd.concat(dfs)
    .astype({
        "estado": "category", "cargo": "category"
    })
)

# Proporções

## Amostrais

### Governador e Senador

In [57]:
# Sample proportions for Governor and Senator, one row per
# (office, state, round, candidate), with a normal-approximation
# confidence interval for each proportion.
proporcoes_aass = (
    amostra
    .query("cargo != 'Presidente'")
    # observed=True: the keys are categorical; being explicit avoids the
    # pandas FutureWarning about the changing default and keeps only the
    # combinations that actually occur in the data.
    .groupby(["cargo", "estado", "turno"], observed=True)
    ["nome"].apply(lambda s: s.value_counts(normalize=True))
    .to_frame("p")
    # Move the candidate name from the 4th index level into a column so the
    # remaining index matches the index of sample_sizes for the join.
    .assign(nome=lambda df_: df_.index.get_level_values(3))
    .droplevel(3)
    .join(sample_sizes)
    .assign(
        # Sampling fraction n / N.
        f=lambda df_: df_["n"] / df_["N"],
        q=lambda df_: 1 - df_["p"],
        # Estimated variance of p with finite population correction (1 - f).
        var_p=lambda df_: (1 - df_["f"]) * df_["p"] * df_["q"] / (df_["n"] - 1),
        # Standard error and interval p +/- z * ep_p.
        # The bounds are not clipped to [0, 1], so very small proportions
        # can yield a negative lower bound.
        ep_p =lambda df_: df_["var_p"].pow(.5),
        low_ci=lambda df_: df_["p"] - z * df_["ep_p"],
        upr_ci=lambda df_: df_["p"] + z * df_["ep_p"],
    )
)
proporcoes_aass

  .groupby(["cargo", "estado", "turno"])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,p,nome,N,n,f,q,var_p,ep_p,low_ci,upr_ci
cargo,estado,turno,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Governador,AC,1,0.53034743,Gladson Cameli,455438,2389,0.0052455,0.46965257,0.00010376,0.01018613,0.51038297,0.55031188
Governador,AC,1,0.22645458,Jorge Viana,455438,2389,0.0052455,0.77354542,7.297e-05,0.00854229,0.209712,0.24319716
Governador,AC,1,0.10590205,Mara Rocha,455438,2389,0.0052455,0.89409795,3.944e-05,0.00628037,0.09359274,0.11821136
Governador,AC,1,0.06111344,Petecão,455438,2389,0.0052455,0.93888656,2.39e-05,0.00488895,0.05153127,0.07069561
Governador,AC,1,0.04688154,Nulo,455438,2389,0.0052455,0.95311846,1.861e-05,0.00431435,0.03842557,0.05533751
Governador,AC,1,0.02051067,Branco,455438,2389,0.0052455,0.97948933,8.37e-06,0.00289288,0.01484073,0.02618062
Governador,AC,1,0.00669736,Marcio Bittar,455438,2389,0.0052455,0.99330264,2.77e-06,0.00166469,0.00343463,0.0099601
Governador,AC,1,0.00125576,Professor Nilson,455438,2389,0.0052455,0.99874424,5.2e-07,0.0007228,-0.00016091,0.00267243
Governador,AC,1,0.00083717,David Hall,455438,2389,0.0052455,0.99916283,3.5e-07,0.00059029,-0.00031978,0.00199412
Governador,RO,1,0.35866388,Coronel Marcos Rocha,925763,2395,0.00258706,0.64133612,9.584e-05,0.00978954,0.33947675,0.37785102


### Presidente

In [58]:
# Per-stratum sample proportions for President: one row per
# (office, state, round, candidate), where each state/round is a stratum.
proporcao_estrato = (
    amostra
    .query("cargo == 'Presidente'")
    # observed=True: the keys are categorical; being explicit avoids the
    # pandas FutureWarning about the changing default and keeps only the
    # combinations that actually occur in the data.
    .groupby(["cargo", "estado", "turno"], observed=True)
    ["nome"]
    .apply(lambda s: s.value_counts(normalize=True))
    .to_frame("ph")
    # Move the candidate name from the last index level into a column so the
    # remaining index matches the index of sample_sizes for the join.
    .assign(
        nome=lambda df_: df_.index.get_level_values(-1)
    )
    .droplevel(-1)
    .join(sample_sizes)
    .assign(
        # Sampling fraction n / N within the stratum.
        f=lambda df_: df_["n"] / df_["N"],
        qh=lambda df_: 1 - df_["ph"],
        # Estimated variance of ph with finite population correction (1 - f).
        var_ph=lambda df_: (1 - df_["f"]) * df_["ph"] * df_["qh"] / (df_["n"] - 1),
    )
)
proporcao_estrato

  .groupby(["cargo", "estado", "turno"])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ph,nome,N,n,f,qh,var_ph
cargo,estado,turno,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Presidente,AC,1,0.68181818,Jair Bolsonaro,455903,110,0.00024128,0.31818182,0.00198981
Presidente,AC,1,0.25454545,Lula,455903,110,0.00024128,0.74545455,0.00174042
Presidente,AC,1,0.02727273,Ciro Gomes,455903,110,0.00024128,0.97272727,0.00024333
Presidente,AC,1,0.01818182,Branco,455903,110,0.00024128,0.98181818,0.00016373
Presidente,AC,1,0.00909091,Simone Tebet,455903,110,0.00024128,0.99090909,0.00008262
Presidente,...,...,...,...,...,...,...,...,...
Presidente,TO,1,0.01388889,Branco,891449,216,0.00024230,0.98611111,0.00006369
Presidente,TO,2,0.52314815,Lula,871238,216,0.00024792,0.47685185,0.00116001
Presidente,TO,2,0.43055556,Jair Bolsonaro,871238,216,0.00024792,0.56944444,0.00114008
Presidente,TO,2,0.03703704,Nulo,871238,216,0.00024792,0.96296296,0.00016584


In [59]:
# Stratum weights for President: Wh = N_h / sum of N over all states in the
# same round. The result keeps the (cargo, estado, turno) index of
# sample_sizes and has a single column "Wh".
# transform("sum") broadcasts each round's total back onto its rows, so no
# extra group-key index level is created that would have to be dropped.
peso_estrato = (
    sample_sizes
    .query("cargo == 'Presidente'")
    .assign(
        Wh=lambda df_: df_["N"] / df_.groupby("turno")["N"].transform("sum")
    )
    [["Wh"]]
)

In [60]:
# Stratified estimate for President, per candidate and round:
#   p     = sum_h Wh * ph
#   var_p = sum_h Wh^2 * var_ph
# followed by the standard error and the interval p +/- z * ep_p.
(
    proporcao_estrato
    .join(peso_estrato)
    .assign(
        wh2varph=lambda df_: df_["Wh"].pow(2) * df_["var_ph"],
        whph=lambda df_: df_["Wh"] * df_["ph"],
    )
    # "nome" is a column and "turno" an index level; groupby accepts both.
    .groupby(["nome", "turno"])
    # Named aggregation with the built-in "sum" produces the final column
    # names directly, with no separate rename step.
    .agg(
        var_p=("wh2varph", "sum"),
        p=("whph", "sum"),
    )
    .assign(
        ep_p =lambda df_: df_["var_p"].pow(.5),
        low_ci=lambda df_: df_["p"] - z * df_["ep_p"],
        upr_ci=lambda df_: df_["p"] + z * df_["ep_p"],
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,var_p,p,ep_p,low_ci,upr_ci
nome,turno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Branco,1,4.12e-06,0.0099909,0.00203065,0.0060109,0.0139709
Branco,2,4.63e-06,0.01124825,0.00215208,0.00703024,0.01546625
Ciro Gomes,1,9.83e-06,0.0241483,0.00313498,0.01800386,0.03029274
Felipe D'Avila,1,1.04e-06,0.00249977,0.00101949,0.0005016,0.00449794
Jair Bolsonaro,1,9.825e-05,0.4496799,0.00991213,0.43025249,0.46910732
Jair Bolsonaro,2,9.887e-05,0.480463,0.00994335,0.46097438,0.49995162
Lula,1,9.879e-05,0.45124266,0.00993956,0.43176148,0.47072383
Lula,2,9.884e-05,0.4862042,0.00994162,0.46671899,0.50568942
Nulo,1,6.81e-06,0.01664724,0.00260986,0.01153201,0.02176248
Nulo,2,9.01e-06,0.02208455,0.00300205,0.01620064,0.02796846


## Populacionais

### Governador e Senador

In [62]:
# Population proportions (P) for Governor and Senator, per
# (office, state, round, candidate), for comparison with the sample estimates.
(
    populacao
    .query("cargo != 'Presidente'")
    # observed=True: the keys are categorical; being explicit avoids the
    # pandas FutureWarning about the changing default and keeps only the
    # combinations that actually occur in the data.
    .groupby(["cargo", "estado", "turno"], observed=True)
    ["nome"].apply(lambda s: s.value_counts(normalize=True))
    .to_frame("P")
)

  .groupby(["cargo", "estado", "turno"])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,P
cargo,estado,turno,Unnamed: 3_level_1,Unnamed: 4_level_1
Governador,AC,1,Gladson Cameli,0.53157620
Governador,AC,1,Jorge Viana,0.22673778
Governador,AC,1,Mara Rocha,0.10357722
Governador,AC,1,Petecão,0.06014650
Governador,AC,1,Nulo,0.04627853
...,...,...,...,...
Senador,TO,1,Pastor Claudemir Lopes,0.02350435
Senador,TO,1,Andrea Schimdt,0.00659412
Senador,TO,1,Lúcia Viana,0.00189190
Senador,TO,1,Marcelo Claudio,0.00084534


### Presidente

In [63]:
# Population proportions (P) for President per candidate, pooled over all
# states, with one column per round.
(
    populacao
    .query("cargo == 'Presidente'")
    # observed=True: "cargo" is categorical; being explicit avoids the
    # pandas FutureWarning about the changing default and keeps only the
    # combinations that actually occur in the data.
    .groupby(["cargo", "turno"], observed=True)
    ["nome"]
    .apply(lambda s: s.value_counts(normalize=True))
    .to_frame("P")
    # Candidates absent from a round show up as NaN in that round's column.
    .unstack("turno")
)

  .groupby(["cargo", "turno"])


Unnamed: 0_level_0,Unnamed: 1_level_0,P,P
Unnamed: 0_level_1,turno,1,2
cargo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Presidente,Branco,0.0097059,0.00942752
Presidente,Ciro Gomes,0.02342258,
Presidente,Eymael,0.000107,
Presidente,Felipe D'Avila,0.00181401,
Presidente,Jair Bolsonaro,0.4429136,0.49438392
Presidente,Lula,0.45910471,0.4747168
Presidente,Léo Péricles,0.00019203,
Presidente,Nulo,0.01541916,0.02147176
Presidente,Padre Kelmon,0.00061921,
Presidente,Simone Tebet,0.04113684,
