# 0. Setup

## Libraries

In [19]:
import geopandas as gpd
import folium
from branca.colormap import linear
from folium.plugins import GroupedLayerControl
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy as np
from matplotlib import cm
import seaborn as sns
from shapely.geometry import Point

## Datasets

### 🟨 1. Human Development Index (IDHM) Dataset
**Source**: `data/idh.csv`  
**Description**:  
This dataset provides municipality-level Human Development Index data for Brazil, broken down into components:

- `Overall IDHM`: Composite index (0 to 1), representing development level.  
- `IDHM Wage`: Economic indicator based on average income.  
- `IDHM Longevity`: Health component, linked to life expectancy.  
- `IDHM Education`: Education component, based on literacy and school attendance.

> 📌 *Note: Values are scaled by 1000 in the raw dataset (e.g. `862` means 0.862) and were later normalized to the 0–1 range for analysis.*

### 🟦 2. Municipal Vote Dataset (2022)  
**Source**: `data/votacao_candidato-municipio_2022.csv`  
**Description**:  
Contains official election results from the Brazilian presidential runoff in 2022, aggregated at the municipality level.

Key columns:

- `nm_municipio`: City name  
- `nm_urna_candidato`: Candidate (e.g., Jair Bolsonaro, Lula)  
- `sg_partido`: Political party  
- `pc_votos_validos`: Proportion of valid votes for that candidate  
- `qt_votos_nom_validos`: Raw vote count  
- `nm_regiao`: Brazilian macro-region (e.g., NORTE, SUDESTE)

This dataset allowed us to assign vote direction (left or right) and analyze support patterns across geography and demographics.

### 🟩 3. State-Level Aggregated Dataset with Geometries  
**Source**: `data/state_2022.gpkg`  
**Description**:  
This GeoDataFrame includes polygon boundaries and state-level election statistics. It enables spatial visualizations using maps.

Key fields:

- `total_votes_right` / `total_votes_left`: Total votes cast per political alignment  
- `percentage_right`: Proportion of votes for the right per state  
- `geometry`: Multipolygon spatial boundary for each state

This dataset powered the state-level choropleth map and regional breakdowns across the 2022 election.

### Explanation

In [6]:
idh_df = pd.read_csv('data/idh.csv', header=0, sep=',')
idh_df.head()

Unnamed: 0,Ranking,City (State),Overall IDHM,IDHM Wage,IDHM Longevity,IDHM Education
0,1 º,São Caetano do Sul (SP),862,891,887,811
1,2 º,Águas de São Pedro (SP),854,849,890,825
2,3 º,Florianópolis (SC),847,870,873,800
3,4 º,Balneário Camboriú (SC),845,854,894,789
4,4 º,Vitória (ES),845,876,855,805


In [15]:
vote_df = pd.read_csv('data/votacao_candidato-municipio_2022.csv')
vote_df.head()

Unnamed: 0,sg_uf,nm_municipio,nr_candidato,nm_urna_candidato,sg_partido,ds_sit_totalizacao,sg_ue,sq_candidato,nm_tipo_destinacao_votos,nm_regiao,pc_votos_validos,qt_votos_nom_validos,qt_votos_concorrentes
0,AC,ACRELÂNDIA,22,JAIR BOLSONARO,PL,Não Eleito,BR,280001618036,Válido,NORTE,0.7846,5317,6777
1,AC,ACRELÂNDIA,13,LULA,PT,Eleito,BR,280001607829,Válido,NORTE,0.2154,1460,6777
2,AC,ASSIS BRASIL,22,JAIR BOLSONARO,PL,Não Eleito,BR,280001618036,Válido,NORTE,0.5891,2717,4612
3,AC,ASSIS BRASIL,13,LULA,PT,Eleito,BR,280001607829,Válido,NORTE,0.4109,1895,4612
4,AC,BRASILÉIA,22,JAIR BOLSONARO,PL,Não Eleito,BR,280001618036,Válido,NORTE,0.7079,9593,13551


In [18]:
state_df = gpd.read_file('data/state_2022.gpkg')
state_df.head()

Unnamed: 0,code_state,abbrev_state,name_state,code_region,name_region,year,state,total_votes_right,total_votes_left,percentage_right,geometry
0,11.0,RO,Rondônia,1.0,Norte,2022,RO,633235.7,262918.3,0.706615,"MULTIPOLYGON (((-65.3815 -10.42907, -65.38013 ..."
1,12.0,AC,Acre,1.0,Norte,2022,AC,287748.8,121562.2,0.703008,"MULTIPOLYGON (((-71.07772 -9.82774, -71.07817 ..."
2,13.0,AM,Amazônas,1.0,Norte,2022,AM,961741.7,1004929.0,0.48902,"MULTIPOLYGON (((-69.83766 -3.68659, -69.82555 ..."
3,14.0,RR,Roraima,1.0,Norte,2022,RR,213518.4,67121.64,0.760827,"MULTIPOLYGON (((-63.96008 2.47313, -63.96041 2..."
4,15.0,PA,Pará,1.0,Norte,2022,PA,2073896.0,2509077.0,0.452522,"MULTIPOLYGON (((-51.43248 -0.47334, -51.42949 ..."


# 1. Motivation

Brazil’s recent presidential elections have exposed deep and growing political, economic, and regional divides. As the country swung from a left-leaning government in the early 2010s to a far-right presidency in 2018—and back again in 2022—the political polarization has increasingly mirrored underlying socioeconomic structures. Understanding how geography, development, and migration patterns intersect with voting behavior is essential for grasping the forces shaping Brazil's political landscape.

This project was motivated by a desire to go beyond simple vote tallies and investigate the structural patterns behind electoral outcomes. Do cities with higher development levels tend to vote differently than those with lower HDI scores? Has the political lean of Brazilians living abroad changed over time? Are there consistent patterns between urban and rural municipalities? These are the types of questions this analysis seeks to answer.

Through a combination of spatial visualizations, temporal comparisons, and socioeconomic correlations, we aim to tell a data-driven story of Brazil’s elections that is both accessible to a general audience and grounded in rigorous analysis.

# 2. Basic Statistics

Write about your choices in data cleaning and preprocessing
Write a short section that discusses the dataset stats, containing key points/plots from your exploratory data analysis.

# 3. Data Analysis

Describe your data analysis and explain what you've learned about the dataset.
If relevant, talk about your machine-learning.

# 4. Genre


Which tools did you use from each of the 3 categories of Visual Narrative (Figure 7 in Segel and Heer)? Why?
Which tools did you use from each of the 3 categories of Narrative Structure (Figure 7 in Segel and Heer)? Why?

# 5. Visualization

## City Analysis

In [None]:
# Municipality-level result maps, one panel per presidential election.
fig, ax = plt.subplots(1, 3, figsize=(20, 20 / 2.414))
gradient = True

# One shared 0-1 scale for all panels. "bwr" runs blue (0) -> white -> red (1),
# so values close to 1 are drawn in red.
norm = colors.Normalize(vmin=0, vmax=1)
cmap = plt.get_cmap("bwr")

for i, year in enumerate([2014, 2018, 2022]):
    file = gpd.read_file(f"data/merged_{year}.gpkg")
    panel = ax[i]

    if not gradient:
        # Flat colours taken from the "color" column of the merged file
        file.plot(color=file["color"], ax=panel)
    else:
        # Rows whose party is PT keep their vote share; every other row is
        # flipped to 1 - share, so the mapped value is always PT's side.
        share = file["pc_votos_validos"]
        left_share = share.where(file["sg_partido"] == "PT", 1 - share)
        file["color_grad"] = [cmap(norm(value)) for value in left_share]
        file.plot(color=file["color_grad"], ax=panel)

    panel.set_title(str(year))
    panel.axis("off")

# Lay out the panels before the colorbar is attached
fig.tight_layout()

if gradient:
    # "bwr_r" mirrors the map used above: the bar runs from red at tick 0
    # ("Left") to blue at tick 1 ("Right").
    sm = plt.cm.ScalarMappable(cmap=plt.get_cmap("bwr_r"), norm=norm)
    sm.set_array([])
    colorbar_layout = {
        "orientation": "horizontal",
        "fraction": 0.03,
        "pad": 0.07,
        "aspect": 30,
    }
    cbar = fig.colorbar(sm, ax=ax, **colorbar_layout)
    cbar.set_ticks([0, 1])
    cbar.set_ticklabels(["Left", "Right"])
    cbar.ax.tick_params(labelsize=10)
    cbar.set_label("Proportion of votes for the Left (red) vs Right (blue)", size=10, labelpad=6)

fig.suptitle("Results of Brazilian Presidential Elections years 2014-2022")

plt.savefig("outputs/brazil_elections.png", dpi=300, bbox_inches="tight")

Next step: see which cities switched party and investigate why.
Idea: Think about a nice (interactive) visualization of how the cities switched between left and right throughout the years.

Maybe we can try to find any available data for the cities and see which variable correlates the most with the switch (if any).

Data to look for: indicators of economic realities, public trust, media influence, and the national mood.

In [None]:
# Build one long table with the right-wing candidate's result in every
# municipality for each of the three elections.

# Ballot name of each candidate -> ideological side
ideology_map = {
    "AÉCIO NEVES": "right",
    "DILMA": "left",
    "JAIR BOLSONARO": "right",
    "FERNANDO HADDAD": "left",
    "LULA": "left",
}

yearly_frames = []
for year in [2014, 2018, 2022]:
    data = pd.read_csv(f'data/votacao_candidato-municipio_{year}.csv')
    data = data.rename(columns={'sg_uf': 'state', "nm_municipio": "city", "nm_urna_candidato": "candidate", "pc_votos_validos": "percentage", 'qt_votos_nom_validos': 'num of votes'})

    # Candidates missing from ideology_map become NaN and are dropped by the
    # "right" filter below.
    data["ideology"] = data["candidate"].map(ideology_map)

    # Total valid votes of the municipality, recovered from the candidate's
    # vote count and share. A share of exactly 0 is replaced by NaN so the
    # division yields NaN instead of +/-inf.
    share = data["percentage"].astype(float).replace(0, np.nan)
    data["total voters"] = (data["num of votes"] / share).round()
    data["year"] = year

    # All columns are added on the full frame and the final selection is an
    # explicit copy, so no assignment is ever made on a slice
    # (the original set 'year' on a column slice -> SettingWithCopyWarning).
    data = data.loc[
        data["ideology"] == "right",
        ["city", "ideology", "percentage", "total voters", "state", "year"],
    ].copy()

    yearly_frames.append(data)

right_data = pd.concat(yearly_frames, ignore_index=True)
right_data

In [None]:
# National right/left vote totals per election year.
# right_data holds only the right-wing candidate's rows, so
#   share * total voters        -> right-wing votes
#   (1 - share) * total voters  -> votes for the opponent
# The products are computed column-wise and summed with groupby.sum(); this
# avoids groupby.apply() on whole groups, which is deprecated in pandas 2.2.
vote_counts = right_data.assign(
    total_votes_right=right_data["percentage"] * right_data["total voters"],
    total_votes_left=(1 - right_data["percentage"]) * right_data["total voters"],
)

total_votes_by_year = (
    vote_counts.groupby("year")[["total_votes_right", "total_votes_left"]]
    .sum()
    .reset_index()
)

# Kept as separate frames because the original cell exposed these names
total_votes_right = total_votes_by_year[["year", "total_votes_right"]]
total_votes_left = total_votes_by_year[["year", "total_votes_left"]]

total_votes_by_year

In [None]:
# Right/left vote totals per election year and state, plus the right-wing
# share, written to data/state_elections.csv.
# right_data holds only the right-wing candidate's rows, so the left total is
# (1 - share) * total voters. The products are computed column-wise and
# summed with groupby.sum() instead of groupby.apply() on whole groups,
# which is deprecated in pandas 2.2.
state_vote_counts = right_data.assign(
    total_votes_right=right_data["percentage"] * right_data["total voters"],
    total_votes_left=(1 - right_data["percentage"]) * right_data["total voters"],
)

total_votes_by_year = (
    state_vote_counts.groupby(["year", "state"])[["total_votes_right", "total_votes_left"]]
    .sum()
    .reset_index()
)

# Kept as separate frames because the original cell exposed these names
total_votes_right = total_votes_by_year[["year", "state", "total_votes_right"]]
total_votes_left = total_votes_by_year[["year", "state", "total_votes_left"]]

# Right-wing share of the two-candidate total
total_votes_by_year["percentage_right"] = total_votes_by_year["total_votes_right"] / (
    total_votes_by_year["total_votes_right"] + total_votes_by_year["total_votes_left"]
)

# Save to CSV (no index column in the file)
total_votes_by_year.to_csv("data/state_elections.csv", index=False)

In [None]:
# Grouped bar charts of the right-wing result per state (share and absolute
# votes), one bar per election year, states ordered by their 2022 share.

# Explicit copy: the categorical conversion below must not write into a slice
# of total_votes_by_year (SettingWithCopyWarning).
right_votes = total_votes_by_year[["year", "state", "percentage_right", "total_votes_right"]].copy()

# 1. Sort states by 2022 percentage_right
top_2022 = right_votes[right_votes["year"] == 2022]
sorted_states = top_2022.sort_values(by="percentage_right", ascending=False)["state"].tolist()

# 2. Ordered categorical so sort_values follows the 2022 ranking
right_votes["state"] = pd.Categorical(right_votes["state"], categories=sorted_states, ordered=True)
right_votes = right_votes.sort_values(by=["state", "year"])

# 3. Bar geometry
years = sorted(right_votes["year"].unique())
states = sorted_states
x = np.arange(len(states))
width = 0.25

# Same blue for every year, more opaque for more recent elections.
# Named year_colors (not `colors`) so the imported matplotlib.colors module
# stays usable when other cells are re-run.
alphas = {years[0]: 0.3, years[1]: 0.6, years[2]: 1.0}
year_colors = {year: (0.2, 0.4, 1.0, alphas[year]) for year in years}

# 4. Two panels sharing the state axis
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7), sharex=True)

panels = [
    (ax1, "percentage_right", "Percentage of Right Votes by State", "Percentage"),
    (ax2, "total_votes_right", "Total Right Votes by State", "Total Votes"),
]
for panel, column, panel_title, y_label in panels:
    for i, year in enumerate(years):
        year_rows = right_votes[right_votes["year"] == year]
        # Offset so the three bars of a state are centred on its tick
        panel.bar(
            x + i * width - width,
            year_rows[column],
            width=width,
            label=str(year),
            color=year_colors[year],
        )
    panel.set_title(panel_title)
    panel.set_ylabel(y_label)
    panel.set_xlabel("State")
    panel.set_xticks(x)
    panel.set_xticklabels(states, rotation=45)

# Legend only once
ax1.legend(title="Year")
plt.tight_layout()
plt.savefig("outputs/right_votes_by_state.png", dpi=300, bbox_inches="tight")

In [None]:
# Size classes keyed by their lower bound on "total voters". Every class is
# half-open, [lower, next lower), and the last one is open-ended.
size_classes = [
    (500, "Village"),
    (2500, "Small Town"),
    (20000, "Medium Town"),
    (100000, "Large Town / Small City"),
    (250000, "Medium City"),
    (500000, "Large City"),
    (1_000_000, "Metropolis"),
    (5_000_000, "Megalopolis"),
]
bins = [lower_bound for lower_bound, _ in size_classes] + [float("inf")]
labels = [class_name for _, class_name in size_classes]

# right=False closes each interval on the left; values below the first edge
# (500) fall outside every interval and are left as NaN by pd.cut.
right_data["city_size"] = pd.cut(right_data["total voters"], bins=bins, labels=labels, right=False)
right_data

In [None]:
n_bins = 100
# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is the
# lookup already used elsewhere in this notebook.
cmap = plt.get_cmap("bwr_r")


def _plot_right_share_histograms(weight_column, y_label, figure_title):
    """Draw one histogram of the right-wing vote share per election year.

    Parameters:
        weight_column: Name of the right_data column used as histogram
            weights, or None to count municipalities.
        y_label: Label of the shared y axis.
        figure_title: Figure-level title.

    Returns:
        The (figure, axes) pair that was created.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

    for ax, year in zip(axes, [2014, 2018, 2022]):
        data_year = right_data[right_data["year"] == year]
        weights = None if weight_column is None else data_year[weight_column]

        counts, edges = np.histogram(data_year["percentage"], bins=n_bins, weights=weights)
        centers = 0.5 * (edges[:-1] + edges[1:])
        # Colour each bar by the share at its centre (bwr_r: 0 -> red, 1 -> blue).
        # Local names only: the module-level `colors`, `norm` and `bins` are
        # left untouched.
        bar_colors = cmap(np.clip(centers, 0, 1))

        ax.bar(centers, counts, width=edges[1] - edges[0], color=bar_colors, edgecolor="black")

        ax.set_title(year)
        ax.set_xlabel("Percentage of Valid Votes")
        ax.set_xticks(np.linspace(0, 1, 11))
        ax.grid(True, linestyle="--", alpha=0.5)

    axes[0].set_ylabel(y_label)
    fig.suptitle(figure_title)
    plt.subplots_adjust(top=0.85)
    plt.tight_layout()
    return fig, axes


# First plot: number of municipalities
fig, axes = _plot_right_share_histograms(
    None,
    "Number of Municipalities",
    "Right-Wing Vote Share per Municipality",
)
plt.show()

# Second plot: weighted by total voters
fig, axes = _plot_right_share_histograms(
    "total voters",
    "Total Votes",
    "Right-Wing Vote Share per Municipality (Weighted by Voters)",
)
plt.savefig("outputs/right_votes_histogram.png", dpi=300, bbox_inches="tight")

In [None]:
# Attach to every row of right_data the change in the right-wing share of its
# municipality between consecutive elections.
diff_columns = ["perc_diff_2014_vs_2018", "perc_diff_2018_vs_2022"]

# Step 1: Filter for years 2014, 2018, and 2022
filtered = right_data[right_data["year"].isin([2014, 2018, 2022])]

# Step 2: One row per (city, state) with one share column per year.
# pivot_table averages the values if a (city, state, year) key occurs more than once.
pivoted = filtered.pivot_table(
    index=["city", "state"],
    columns="year",
    values="percentage"
).reset_index()

# Step 3: Rename columns for clarity
pivoted = pivoted.rename(columns={2014: "percentage_2014", 2018: "percentage_2018", 2022: "percentage_2022"})

# Step 4: Compute the differences (later election minus earlier election)
pivoted["perc_diff_2014_vs_2018"] = pivoted["percentage_2018"] - pivoted["percentage_2014"]
pivoted["perc_diff_2018_vs_2022"] = pivoted["percentage_2022"] - pivoted["percentage_2018"]

# Step 5: Merge back into right_data.
# Any diff columns left over from a previous run of this cell are dropped
# first; otherwise the merge would create *_x / *_y duplicates and later
# cells would no longer find the expected column names.
right_data = pd.merge(
    right_data.drop(columns=diff_columns, errors="ignore"),
    pivoted[["city", "state", *diff_columns]],
    on=["city", "state"],
    how="left"
)
right_data.head()

In [None]:
# Three-panel figures (scatter, average change, share of total shift) showing
# how the right-wing share moved between two elections, broken down by city
# size. The same figure is produced for 2018 -> 2022 and for 2014 -> 2018.

# City size categories in display order
city_size_order = [
    "Village", "Small Town", "Medium Town", "Large Town / Small City",
    "Medium City", "Large City", "Metropolis", "Megalopolis"
]

# Shared color palette so a size class has the same colour in every panel
palette = sns.color_palette("Set2", n_colors=len(city_size_order))
palette_dict = dict(zip(city_size_order, palette))


def _plot_shift_by_city_size(data, diff_column, influence_column, period, title_suffix, outfile):
    """Plot the change in right-wing share by city size and save the figure.

    Parameters:
        data: Rows of right_data for the election that closes the period
            (one row per municipality).
        diff_column: Column holding the change in right-wing share.
        influence_column: Name of the column created for |change| * voters.
        period: Text such as "2018 vs 2022" used in the scatter title.
        title_suffix: Text appended to the titles of panels 2 and 3.
        outfile: Path of the PNG file to write.

    Returns:
        (data, influence_by_size, avg_diff_by_size): the cleaned input rows
        and the two per-size aggregates that were plotted.
    """
    data = data.dropna(subset=[diff_column]).copy()
    data["city_size"] = pd.Categorical(data["city_size"], categories=city_size_order, ordered=True)

    # Influence = size of the shift weighted by voters. The absolute value is
    # required: pie() rejects negative wedge sizes, and signed shifts of
    # opposite direction would cancel each other inside a size class.
    data[influence_column] = (data[diff_column] * data["total voters"]).abs()
    # observed=False keeps empty size classes (as 0), matching the default
    # the original relied on, and avoids the pandas FutureWarning.
    influence_by_size = data.groupby("city_size", observed=False)[influence_column].sum().reset_index()
    influence_by_size["influence_percent"] = (
        100 * influence_by_size[influence_column] / influence_by_size[influence_column].sum()
    )

    avg_diff_by_size = data.groupby("city_size", observed=False)[diff_column].mean().reset_index()
    avg_diff_by_size["city_size"] = pd.Categorical(
        avg_diff_by_size["city_size"], categories=city_size_order, ordered=True
    )
    avg_diff_by_size = avg_diff_by_size.sort_values("city_size")

    fig, axes = plt.subplots(1, 3, figsize=(24, 6))

    # --- Plot 1: Scatter plot ---
    sns.scatterplot(
        data=data,
        x="total voters",
        y=diff_column,
        hue="city_size",
        palette=palette_dict,
        edgecolor="none",
        ax=axes[0]
    )
    axes[0].set_xscale("log")
    axes[0].set_title(f"Change in Right Vote % ({period}) by City")
    axes[0].set_xlabel("Total Voters (log scale)")
    axes[0].set_ylabel("Change in % Right Vote")
    axes[0].axhline(0, color="gray", linestyle="--", linewidth=1)
    # Hide the legend; the bar chart's tick labels identify the size classes
    axes[0].legend([], frameon=False)

    # --- Plot 2: Bar chart of average change ---
    sns.barplot(
        data=avg_diff_by_size,
        y="city_size",
        x=diff_column,
        palette=palette_dict,
        ax=axes[1]
    )
    axes[1].set_title(f"Average Change in Right Vote % by City Size{title_suffix}")
    axes[1].set_xlabel("Avg. Change in % Right Vote")
    axes[1].set_ylabel("")
    axes[1].axvline(0, color="gray", linestyle="--", linewidth=1)
    axes[1].invert_yaxis()
    axes[1].yaxis.set_label_position("right")
    axes[1].yaxis.tick_right()

    # --- Plot 3: Pie chart of influence without labels ---
    axes[2].pie(
        influence_by_size["influence_percent"],
        colors=[palette_dict[cs] for cs in influence_by_size["city_size"]],
        autopct="%1.1f%%",
        startangle=90,
        counterclock=False
    )
    axes[2].set_title(f"Share of Total Shift (Weighted by Voters) by City Size{title_suffix}")

    plt.tight_layout()
    plt.savefig(outfile, dpi=300, bbox_inches="tight")
    return data, influence_by_size, avg_diff_by_size


# 2018 -> 2022, using each municipality's 2022 row
data_2022, influence_by_size, avg_diff_by_size = _plot_shift_by_city_size(
    right_data[right_data["year"] == 2022],
    diff_column="perc_diff_2018_vs_2022",
    influence_column="influence",
    period="2018 vs 2022",
    title_suffix="",
    outfile="outputs/right_votes_by_city_size_2022.png",
)

# 2014 -> 2018, using each municipality's 2018 row only.
# NOTE(review): the original selected both the 2014 and the 2018 rows, which
# plotted and weighted every municipality twice; the closing year is used
# here to mirror the 2018 -> 2022 figure. Confirm this is the intended view.
data_2014_2018, influence_by_size_2014_2018, avg_diff_by_size_2014_2018 = _plot_shift_by_city_size(
    right_data[right_data["year"] == 2018],
    diff_column="perc_diff_2014_vs_2018",
    influence_column="influence_2014_2018",
    period="2014 vs 2018",
    title_suffix=" (2014 vs 2018)",
    outfile="outputs/right_votes_by_city_size_2014_2018.png",
)

In [None]:
right_data.loc[right_data['total voters'].idxmax()]

In [None]:
right_data[right_data["city"].str.upper() == "SÃO PAULO"]

In [None]:
right_data[right_data["city"].str.upper() == "RIO DE JANEIRO"]

## IDH

In [7]:
idh_df.head()

Unnamed: 0,Ranking,City (State),Overall IDHM,IDHM Wage,IDHM Longevity,IDHM Education
0,1 º,São Caetano do Sul (SP),862,891,887,811
1,2 º,Águas de São Pedro (SP),854,849,890,825
2,3 º,Florianópolis (SC),847,870,873,800
3,4 º,Balneário Camboriú (SC),845,854,894,789
4,4 º,Vitória (ES),845,876,855,805


In [8]:
# Optionally, set custom column headers
idh_df.columns = ['Index','City', 'Overall', 'Income', 'Longevity', 'Education']  # Replace with actual column names
idh_df.drop(columns=['Index'], inplace=True)  # Drop the index column if not needed

In [9]:
# The raw file stores each index as an integer number of thousandths
# (e.g. 862 means 0.862), so dividing by 1000 gives the 0-1 value.
# Prefixing the digits with '0.' only works for exactly three digits:
# 85 would become 0.85 instead of 0.085, and 1000 would become 0.1.
for col in ['Overall', 'Income', 'Longevity', 'Education']:
    idh_df[col] = idh_df[col].astype(float) / 1000

In [10]:
# 'City' holds values such as "Vitória (ES)": capture the text before the
# final parenthesised group and the text inside it.
city_and_state = idh_df['City'].str.extract(r'^(.*)\s\((.*)\)$')

# Store both parts with surrounding whitespace trimmed
idh_df['City'] = city_and_state[0].str.strip()
idh_df['State'] = city_and_state[1].str.strip()

# Display the updated DataFrame
idh_df.head()

Unnamed: 0,City,Overall,Income,Longevity,Education,State
0,São Caetano do Sul,0.862,0.891,0.887,0.811,SP
1,Águas de São Pedro,0.854,0.849,0.89,0.825,SP
2,Florianópolis,0.847,0.87,0.873,0.8,SC
3,Balneário Camboriú,0.845,0.854,0.894,0.789,SC
4,Vitória,0.845,0.876,0.855,0.805,ES


In [None]:
idh_df['City'] = idh_df['City'].str.upper()

In [None]:
idh_df.head()

### Votes

In [None]:
def combine_datasets(*dataframes):
    """
    Stack the rows of several DataFrames into a single DataFrame.

    Parameters:
        *dataframes: The DataFrames to concatenate, in the order given.

    Returns:
        A new DataFrame containing every input row, with a fresh
        0..n-1 index (the original indexes are discarded).
    """
    frames = list(dataframes)
    return pd.concat(frames, ignore_index=True)

In [None]:
df_2014 = pd.read_csv('data/votacao_candidato-municipio_2014_2.csv')
df_2014['year'] = 2014

df_2018 = pd.read_csv('data/votacao_candidato-municipio_2018_2.csv')
df_2018['year'] = 2018

df_2022 = pd.read_csv('data/votacao_candidato-municipio_2022_2.csv')
df_2022['year'] = 2022

In [None]:
df_2014['nm_candidato'].value_counts()

In [None]:
# Example usage
combined_votes_df = combine_datasets(df_2014, df_2018, df_2022)

# Display the combined DataFrame
combined_votes_df.head()

In [None]:
def process_votes_data(df):
    """
    Keep the presidential results cast inside Brazil and tag each row with
    the candidate's political direction.

    The input frame is not modified. The result is an independent copy, so
    callers can add columns to it without a SettingWithCopyWarning.

    Parameters:
        df: Vote table with the columns 'nm_candidato', 'ds_cargo' and 'sg_uf'.

    Returns:
        A new DataFrame with the rows where 'ds_cargo' is 'Presidente' and
        'sg_uf' is not 'ZZ', plus a 'direction' column ('left' / 'right',
        NaN for candidates missing from the mapping).
    """
    # Full candidate name ('nm_candidato') -> political direction
    direction_by_candidate = {
        'JAIR MESSIAS BOLSONARO': 'right',
        'LUIZ INÁCIO LULA DA SILVA': 'left',
        'FERNANDO HADDAD': 'left',
        'AÉCIO NEVES DA CUNHA': 'right',
        'DILMA VANA ROUSSEFF': 'left'
    }

    # Presidential race only, excluding the 'ZZ' (abroad) pseudo-state
    keep = (df['ds_cargo'] == 'Presidente') & (df['sg_uf'] != 'ZZ')
    processed = df.loc[keep].copy()
    processed['direction'] = processed['nm_candidato'].map(direction_by_candidate)

    return processed

In [None]:
# Example usage for 2022
votes_df = process_votes_data(combined_votes_df)

# Display the processed DataFrame
votes_df.head()

In [None]:
# For every (state, municipality, year) pick the index label of the row with
# the highest share of valid votes, i.e. the local winner.
winner_index = (
    votes_df
    .groupby(['sg_uf', 'nm_municipio', 'year'])['pc_votos_validos']
    .idxmax()
)

# Keep those rows, restricted to the columns used by the analysis
winner_columns = ['sg_uf', 'nm_municipio', 'direction', 'pc_votos_validos', 'year']
result = votes_df.loc[winner_index][winner_columns]

# Display the result
result.head()

In [None]:
# Merge two DataFrames on a common column
merged_df_test = pd.merge(result, idh_df, left_on=['nm_municipio', 'sg_uf'], right_on=['City', 'State'], how='left')
# Display the merged DataFrame
merged_df_test.head()

In [None]:
# Check for NaN values in the entire DataFrame
nan_values = merged_df_test.isna()

# Display rows with any NaN values
rows_with_nan = merged_df_test[nan_values.any(axis=1)]
rows_with_nan

In [None]:
# Merge two DataFrames on a common column
merged_df = pd.merge(result, idh_df, left_on=['nm_municipio', 'sg_uf'], right_on=['City', 'State'], how='inner')
# Display the merged DataFrame
merged_df.head()

In [None]:
corr_df = merged_df[['sg_uf', 'nm_municipio', 'year', 'Overall', 'direction',]]

In [None]:
# Count total NaN values in the DataFrame
total_nan = merged_df.isna().sum()
print(total_nan)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Custom color palette: match "left" to red and "right" to blue
custom_palette = {
    'left': '#ff0000',
    'right': '#0000ff'
}

plt.figure(figsize=(10, 6))
sns.boxplot(
    data=corr_df,
    x='year',
    y='Overall',
    hue='direction',
    palette=custom_palette
)
plt.title('HDI Distribution by Vote Direction per Year')
plt.ylabel('HDI (Overall)')
plt.xlabel('Election Year')
plt.legend(title='Vote Direction')
plt.savefig('outputs/hdi_boxplot.png', dpi=300, bbox_inches='tight')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

# One histogram panel per election year, with the two vote directions overlaid
g = sns.FacetGrid(corr_df, col="year", hue="direction", palette={'left': '#ff0000', 'right': '#0000ff'}, height=4, aspect=1.2)
g.map(sns.histplot, "Overall", bins=30, kde=True, alpha=0.6)
g.add_legend(title="Vote Direction")

g.set_axis_labels("HDI (Overall)", "Count")
# y=1.05 places the title above the figure area ...
g.fig.suptitle("HDI Distribution by Vote Ideology per Year", y=1.05)

# ... so bbox_inches='tight' is required, otherwise the title is cropped out
# of the saved file. dpi matches the other figures written by this notebook.
plt.savefig('outputs/HDI_Distribution.png', dpi=300, bbox_inches='tight')

## Votes from emigrated population

In [None]:
def process_votes_data_zz(df):
    """
    Keep the presidential results cast abroad (sg_uf == 'ZZ') and tag each
    row with the candidate's political direction.

    The input frame is not modified. The result is an independent copy, so
    callers can add columns to it (e.g. a country column) without a
    SettingWithCopyWarning.

    Parameters:
        df: Vote table with the columns 'nm_candidato', 'ds_cargo' and 'sg_uf'.

    Returns:
        A new DataFrame with the rows where 'ds_cargo' is 'Presidente' and
        'sg_uf' is 'ZZ', plus a 'direction' column ('left' / 'right',
        NaN for candidates missing from the mapping).
    """
    # Full candidate name ('nm_candidato') -> political direction
    direction_by_candidate = {
        'JAIR MESSIAS BOLSONARO': 'right',
        'LUIZ INÁCIO LULA DA SILVA': 'left',
        'FERNANDO HADDAD': 'left',
        'AÉCIO NEVES DA CUNHA': 'right',
        'DILMA VANA ROUSSEFF': 'left'
    }

    # Presidential race only, restricted to the 'ZZ' (abroad) pseudo-state
    keep = (df['ds_cargo'] == 'Presidente') & (df['sg_uf'] == 'ZZ')
    processed = df.loc[keep].copy()
    processed['direction'] = processed['nm_candidato'].map(direction_by_candidate)

    return processed

In [None]:
# Example usage for 2022
votes_df_zz = process_votes_data_zz(combined_votes_df)

# Display the processed DataFrame
votes_df_zz.head()

In [None]:
# Mapping of polling-city names (as spelled in 'nm_municipio' for the 'ZZ'
# abroad rows) to English country names. Several cities appear under more
# than one spelling (e.g. 'HONG KONG-HONG' and 'HONG KONG').
city_country_mapping = {
    'ABIDJÃ': 'Ivory Coast',
    'ABU DHABI': 'United Arab Emirates',
    'AMÃ': 'Jordan',
    'ANCARA': 'Turkey',
    'ARTIGAS': 'Uruguay',
    'ASSUNÇÃO': 'Paraguay',
    'ATENAS': 'Greece',
    'ATLANTA': 'United States',
    'BANGKOK': 'Thailand',
    'BARCELONA': 'Spain',
    'BEIRUTE': 'Lebanon',
    'BELGRADO': 'Serbia',
    'BERLIM': 'Germany',
    'BISSAU': 'Guinea-Bissau',
    'BOGOTÁ': 'Colombia',
    'BOSTON': 'United States',
    'BRUXELAS': 'Belgium',
    'BUCARESTE': 'Romania',
    'BUDAPESTE': 'Hungary',
    'BUENOS AIRES': 'Argentina',
    'CAIENA': 'French Guiana',
    'CAIRO': 'Egypt',
    'CAMBERRA': 'Australia',
    'CARACAS': 'Venezuela',
    'CHICAGO': 'United States',
    'CIDADE DO CABO': 'South Africa',
    'CIUDAD DEL ESTE': 'Paraguay',
    'CIUDAD GUAYANA': 'Venezuela',
    'COCHABAMBA': 'Bolivia',
    'CONCEPCIÓN': 'Chile',
    'COPENHAGUE-DINA': 'Denmark',
    'CÓRDOBA': 'Argentina',
    'DACAR': 'Senegal',
    'DOHA': 'Qatar',
    'DUBLIN': 'Ireland',
    'DÍLI': 'East Timor',
    'ENCARNACIÓN': 'Paraguay',
    'ESTOCOLMO': 'Sweden',
    'FARO': 'Portugal',
    'FRANKFURT': 'Germany',
    'GENEBRA': 'Switzerland',
    'GEORGETOWN': 'Guyana',
    'GUATEMALA-GUAT': 'Guatemala',
    'HAMAMATSU': 'Japan',
    'HARTFORD': 'United States',
    'HAVANA': 'Cuba',
    'HELSINQUE': 'Finland',
    'HONG KONG-HONG': 'Hong Kong',
    'HOUSTON': 'United States',
    'ISTAMBUL': 'Turkey',
    'JACARTA': 'Indonesia',
    'KINGSTON-JAMA': 'Jamaica',
    'KINSHASA': 'Democratic Republic of the Congo',
    'KUAITE': 'Kuwait',
    'KUALA LUMPUR': 'Malaysia',
    'LA PAZ': 'Bolivia',
    'LAGOS': 'Nigeria',
    'LIMA': 'Peru',
    'LISBOA': 'Portugal',
    'LIUBLIANA': 'Slovenia',
    'LONDRES': 'United Kingdom',
    'LOS ANGELES': 'United States',
    'LUANDA': 'Angola',
    'MADRI': 'Spain',
    'MANILA': 'Philippines',
    'MANÁGUA': 'Nicaragua',
    'MAPUTO': 'Mozambique',
    'MENDOZA': 'Argentina',
    'MIAMI': 'United States',
    'MILÃO': 'Italy',
    'MITSUKAIDO-JAPA': 'Japan',
    'MONTEVIDÉU': 'Uruguay',
    'MONTREAL': 'Canada',
    'MOSCOU': 'Russia',
    'MUNIQUE': 'Germany',
    'MÉXICO-MEXI': 'Mexico',
    'NAGÓIA': 'Japan',
    'NAIRÓBI': 'Kenya',
    'NICOSIA': 'Cyprus',
    'NOVA DELHI': 'India',
    'NOVA YORK': 'United States',
    'OIZUMI-JAPA': 'Japan',
    'OSLO': 'Norway',
    'OTTAWA': 'Canada',
    'PANAMÁ-PAN': 'Panama',
    'PARAMARIBO': 'Suriname',
    'PARIS': 'France',
    'PEDRO JUAN CABALLERO': 'Paraguay',
    'PEQUIM': 'China',
    'PORT OF SPAIN': 'Trinidad and Tobago',
    'PORTO': 'Portugal',
    'PORTO PRÍNCIPE': 'Haiti',
    'PRAGA': 'Czech Republic',
    'PRAIA': 'Cape Verde',
    'PRETÓRIA': 'South Africa',
    'QUITO': 'Ecuador',
    'RABAT': 'Morocco',
    'RAMALLAH-PALE': 'Palestine',
    'RIADE': 'Saudi Arabia',
    'ROMA': 'Italy',
    'ROTTERDÃ': 'Netherlands',
    'SALTO DEL GUAIRÁ': 'Paraguay',
    'SANTIAGO': 'Chile',
    'SEUL': 'South Korea',
    'SINGAPURA-SING': 'Singapore',
    'STA C LA SIERRA-BOLI': 'Bolivia',
    'SUZUKA-JAPA': 'Japan',
    'SYDNEY': 'Australia',
    'SÃO DOMINGOS': 'Dominican Republic',
    'SÃO FRANCISCO': 'United States',
    'SÃO JOSÉ': 'Costa Rica',
    'SÃO SALVADOR': 'El Salvador',
    'TAIPÉ': 'Taiwan',
    'TAKAOKA-JAPA': 'Japan',
    'TEERÃ': 'Iran',
    'TEGUCIGALPA': 'Honduras',
    'TEL AVIV': 'Israel',
    'TORONTO': 'Canada',
    'TOYOHASHI-JAPA': 'Japan',
    'TUNIS': 'Tunisia',
    'TÓQUIO': 'Japan',
    'UEDA-JAPA': 'Japan',
    'VANCOUVER': 'Canada',
    'VARSÓVIA': 'Poland',
    'VIENA': 'Austria',
    'WASHINGTON': 'United States',
    'WELLINGTON': 'New Zealand',
    'WINDHOEK': 'Namibia',
    'XANGAI': 'China',
    'ZAGREB': 'Croatia',
    'ZURIQUE': 'Switzerland',
    'ACCRA-GANA': 'Ghana',
    'BRATISLAVA': 'Slovakia',
    'CANTÃO': 'China',
    'CHUY': 'Uruguay',
    'DAR ES SALAAM': 'Tanzania',
    'GUATEMALA': 'Guatemala',
    'HANÓI': 'Vietnam',
    'KATMANDU': 'Nepal',
    'KIEV': 'Ukraine',
    'KINGSTON-JAMAICA': 'Jamaica',
    'LUSACA': 'Zambia',
    'MASCATE': 'Oman',
    'NASSAU': 'Bahamas',
    # Abroad polling city: the border town of Río Branco in Uruguay, not the
    # Brazilian state capital of the same name.
    'RIO BRANCO': 'Uruguay',
    'TALIN': 'Estonia',
    'ABUJA': 'Nigeria',
    'ACCRA': 'Ghana',
    'ADIS ABEBA': 'Ethiopia',
    'AMSTERDÃ': 'Netherlands',
    'ARGEL': 'Algeria',
    'ASTANA': 'Kazakhstan',
    'BAGDÁ': 'Iraq',
    'BAKU': 'Azerbaijan',
    'BAMAKO': 'Mali',
    'BAREIN': 'Bahrain',
    'BELMOPAN': 'Belize',
    'BRAZZAVILLE': 'Republic of the Congo',
    'BRIDGETOWN': 'Barbados',
    'CASTRIES': 'Saint Lucia',
    'COBIJA': 'Bolivia',
    'COLOMBO': 'Sri Lanka',
    'CONACRI': 'Guinea',
    'COPENHAGUE': 'Denmark',
    'COTONOU': 'Benin',
    'DAMASCO': 'Syria',
    'GABORONE': 'Botswana',
    'HARARE': 'Zimbabwe',
    'HONG KONG': 'Hong Kong',
    'IAUNDÊ': 'Cameroon',
    'IEREVAN': 'Armenia',
    'IQUITOS': 'Peru',
    'ISLAMABADE': 'Pakistan',
    'LIBREVILLE': 'Gabon',
    'LILONGUE': 'Malawi',
    'LOMÉ': 'Togo',
    'MALABO': 'Equatorial Guinea',
    'MEXICO': 'Mexico',
    'MUMBAI': 'India',
    'PANAMA': 'Panama',
    'PASO LOS LIBRES': 'Argentina',
    'PUERTO IGUAZÚ': 'Argentina',
    'PUERTO QUIJARRO': 'Bolivia',
    'RAMALLAH': 'Palestine',
    'RIVERA': 'Uruguay',
    'SAINT JOHNS': 'Antigua and Barbuda',
    'SANTA CRUZ DE LA SIERRA': 'Bolivia',
    'SARAJEVO': 'Bosnia and Herzegovina',
    'SINGAPURA': 'Singapore',
    'ST GEORGES DE LOYAPOCK': 'French Guiana',
    'SÃO TOMÉ': 'São Tomé and Príncipe',
    'SÓFIA': 'Bulgaria',
    'TBILISI': 'Georgia',
    'TIRANA': 'Albania',
    'TRÍPOLI': 'Libya',
    'UAGADUGU': 'Burkina Faso',
    'VATICANO': 'Vatican City',
    'YANGON': 'Myanmar'
}


In [None]:
# Assuming your DataFrame is named df and has a column 'city'
votes_df_zz['country'] = votes_df_zz['nm_municipio'].map(city_country_mapping)

In [None]:
votes_df_zz.head()

In [None]:
votes_df_zz_plot = votes_df_zz[['country', 'direction', 'nm_municipio', 'year', 'pc_votos_validos']]
votes_df_zz_plot = votes_df_zz_plot.dropna()
votes_df_zz_plot.head()

In [None]:
# Group by 'sg_uf' and 'nm_municipio', and find the row with the maximum 'pc_votos_validos'
result_zz = votes_df_zz_plot.loc[
    votes_df_zz_plot.groupby(['country', 'nm_municipio', 'year'])['pc_votos_validos'].idxmax()
]

# Select only the relevant columns
result_zz = result_zz[['country', 'nm_municipio', 'direction', 'pc_votos_validos', 'year']]

# Display the result_zz
result_zz.head()

In [None]:
# import pandas as pd
# from geopy.geocoders import Nominatim
# import time

# # Your DataFrame
# # df = pd.read_csv("your_data.csv")  # or already loaded

# geolocator = Nominatim(user_agent="city_mapper")

# def get_location(row):
#     try:
#         location = geolocator.geocode(f"{row['nm_municipio']}, {row['country']}")
#         time.sleep(1)  # to respect usage limits
#         if location:
#             return pd.Series([location.latitude, location.longitude])
#     except:
#         return pd.Series([None, None])

# result_zz[['lat', 'lon']] = result_zz.apply(get_location, axis=1)


In [None]:
# result_zz.to_csv('result_zz.csv', index=False)

In [None]:
result_zz = pd.read_csv('data/result_zz.csv')
result_zz.head()

In [None]:
# Optional: only needed if cities have multiple entries per year
winner_df = result_zz.groupby(['nm_municipio', 'country', 'year', 'lat', 'lon'])['direction'] \
                     .agg(lambda x: x.value_counts().idxmax()).reset_index()

In [None]:
# World maps (one per election year) with a dot for every abroad polling
# city, coloured by the direction recorded in result_zz.

# Load base map
# NOTE(review): absolute, machine-specific path - the notebook only runs on a
# machine where this file exists. Consider moving the shapefile under data/.
world = gpd.read_file('/home/paulobeckhauser/Documents/dtu/social_data/finalAssignment/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp')

# Convert the DataFrame to a GeoDataFrame of lon/lat points (WGS84)
geometry = [Point(xy) for xy in zip(result_zz['lon'], result_zz['lat'])]
gdf = gpd.GeoDataFrame(result_zz, geometry=geometry, crs="EPSG:4326")

# Colors for directions
direction_colors = {'left': 'red', 'right': 'blue'}

# Plot one map per year.
# squeeze=False always returns a 2-D array of Axes, so the loop below also
# works when the data contain a single year (plt.subplots would otherwise
# return a bare Axes object, which cannot be zipped).
years = gdf['year'].unique()
fig, axes = plt.subplots(1, len(years), figsize=(6 * len(years), 6), constrained_layout=True, squeeze=False)
axes = axes.ravel()

for ax, year in zip(axes, sorted(years)):
    world.plot(ax=ax, color='lightgrey', edgecolor='white')
    year_data = gdf[gdf['year'] == year]
    year_data.plot(ax=ax,
                   color=year_data['direction'].map(direction_colors),
                   markersize=50, alpha=0.8)
    ax.set_title(f"Who Won – {year}", fontsize=14)
    ax.axis('off')

plt.suptitle("Global City-Level Vote (Left vs Right)", fontsize=18)
plt.savefig('outputs/global_vote_direction.png', dpi=300)

## Interactive Plot (City and State)

### State

In [None]:
def election_layer_state(year: int, gdf: gpd.GeoDataFrame) -> folium.FeatureGroup:
    """Build the folium layer holding the state choropleth for one year.

    Parameters
    ----------
    year
        Used as the layer name; only the 2022 layer is flagged to be shown.
    gdf
        State polygons with ``state`` and ``percentage_right`` columns.
        Rows are fetched by position from each feature's ``id``, so the
        frame is expected to carry a default 0..n-1 index.

    Returns
    -------
    folium.FeatureGroup
        Group containing one styled GeoJson overlay with a hover tooltip.
    """
    # 12-step red-to-blue scale over [0, 1]; the value fed in is
    # `percentage_right`.
    colour_scale = linear.RdBu_11.scale(0, 1).to_step(12)

    def _style_for(feature):
        # The feature id is used as a positional row index into `gdf`.
        share_right = gdf.iloc[int(feature["id"])]["percentage_right"]
        return dict(
            fillColor=colour_scale(share_right),
            color="black",
            weight=0.3,
            fillOpacity=0.7,
        )

    hover = folium.GeoJsonTooltip(
        fields=["state", "percentage_right"],
        aliases=["State", "% votes"],
        sticky=True,
    )

    layer = folium.FeatureGroup(name=str(year), show=(year == 2022), control=True)
    folium.GeoJson(gdf, tooltip=hover, style_function=_style_for).add_to(layer)
    return layer

In [None]:
def make_multi_year_map(
    outfile="elections_state.html",
):
    """Write an interactive state-level election map for 2014, 2018 and 2022.

    Reads ``data/state_{year}.gpkg`` for each year, turns each into a layer
    via ``election_layer_state``, adds a radio-button year picker and a
    shared colour bar, and saves the map as HTML.

    Parameters
    ----------
    outfile
        Path of the HTML file to write.

    Returns
    -------
    str
        ``outfile``, unchanged.
    """
    # The 2022 layer provides the bounding box used to centre and bound the map.
    probe_gdf = gpd.read_file("data/state_2022.gpkg").to_crs(4326)
    minx, miny, maxx, maxy = (float(v) for v in probe_gdf.total_bounds)
    brazil_bounds = [[miny, minx], [maxy, maxx]]
    centre = [(miny + maxy) / 2, (minx + maxx) / 2]

    m = folium.Map(
        location=centre,
        zoom_start=5,  # initial rough zoom
        tiles="CartoDB positron",
        control_scale=True,
        # max_bounds alone restricts panning to the default whole-world box;
        # the min/max lat/lon arguments are what bound it to brazil_bounds.
        max_bounds=True,
        min_lat=miny,
        max_lat=maxy,
        min_lon=minx,
        max_lon=maxx,
    )
    m.fit_bounds(brazil_bounds)

    KEEP = [
        "geometry",
        "state",
        "percentage_right",
    ]

    overlay_groups = []
    for yr in [2014, 2018, 2022]:
        # A fresh 0..n-1 index is required because election_layer_state
        # looks rows up by position from the GeoJSON feature id.
        gdf = (
            gpd.read_file(f"data/state_{yr}.gpkg")
            .to_crs(4326)
            .reset_index(drop=True)[KEEP]
        )
        # Simplify polygons (tolerance 0.02 in CRS units) before embedding.
        gdf["geometry"] = gdf["geometry"].simplify(
            0.02,
            preserve_topology=True,
        )
        fg = election_layer_state(yr, gdf)
        fg.add_to(m)
        overlay_groups.append(fg)

    # Radio-button picker: one election year at a time.
    GroupedLayerControl(
        groups={"Election year": overlay_groups},
        exclusive_groups=True,
        collapsed=False,
    ).add_to(m)

    # Re-adds the last (2022) group, which is already a child of the map.
    # NOTE(review): presumably meant to make 2022 the visible layer on load;
    # confirm against GroupedLayerControl's exclusive-group behaviour.
    overlay_groups[-1].add_to(m)

    # One colour bar shared by all years.
    colourbar = linear.RdBu_11.scale(0, 1).to_step(12)
    colourbar.caption = "Left ← vote share → Right"
    colourbar.position = "bottomleft"
    colourbar.add_to(m)

    m.save(outfile)
    return outfile

In [None]:
# Build the map with the default output file and report where it was written.
path = make_multi_year_map()
print("wrote:", path)

### City

In [None]:
def election_layer_city(year: int, gdf: gpd.GeoDataFrame) -> folium.FeatureGroup:
    """Build the folium layer holding the municipality choropleth for one year.

    Parameters
    ----------
    year
        Used as the layer name; only the 2022 layer is flagged to be shown.
    gdf
        Municipality polygons with ``nm_municipio``, ``sg_uf``,
        ``sg_partido`` and ``pc_votos_validos`` columns. Rows are fetched by
        position from each feature's ``id``, so the frame is expected to
        carry a default 0..n-1 index.
        NOTE(review): assumes each row holds the winning party and its share
        of valid votes in percent (the tooltip labels it "Winner party").

    Returns
    -------
    folium.FeatureGroup
        Group containing one styled GeoJson overlay with a hover tooltip.
    """
    fg = folium.FeatureGroup(
        name=str(year),
        show=(year == 2022),
        control=True,
    )

    # 12-step red-to-blue scale over [0, 1]: 0 = fully left, 1 = fully right,
    # matching the "Left ← vote share → Right" colour bar and the state map.
    cmap = linear.RdBu_11.scale(0, 1).to_step(12)

    def style_fn(feature):
        # The feature id is used as a positional row index into `gdf`.
        row = gdf.iloc[int(feature["id"])]
        pct = row["pc_votos_validos"] / 100.0
        # Convert the row's share into a right-wing share. PT is treated as
        # the left party, so its share is flipped; any other party's share is
        # used as-is. (Previously this was inverted, colouring PT wins blue.)
        value = 1 - pct if row["sg_partido"] == "PT" else pct
        return {
            "fillColor": cmap(value),
            "color": "black",
            "weight": 0.3,
            "fillOpacity": 0.7,
        }

    tooltip = folium.GeoJsonTooltip(
        fields=[
            "nm_municipio",
            "sg_uf",
            "sg_partido",
            "pc_votos_validos",
        ],
        aliases=[
            "Municipality",
            "State",
            "Winner party",
            "% votes",
        ],
        sticky=True,
    )

    folium.GeoJson(gdf, tooltip=tooltip, style_function=style_fn).add_to(fg)

    return fg

In [None]:
def make_multi_year_map(
    outfile="elections_br.html",
):
    """Write an interactive municipality-level election map for 2014-2022.

    Reads ``data/merged_{year}.gpkg`` for 2014, 2018 and 2022, turns each
    into a layer via ``election_layer_city``, adds a radio-button year picker
    and a shared colour bar, and saves the map as HTML.

    Parameters
    ----------
    outfile
        Path of the HTML file to write.

    Returns
    -------
    str
        ``outfile``, unchanged.
    """
    # 1. Use the 2022 layer to get the bounding box
    probe_gdf = gpd.read_file("data/merged_2022.gpkg").to_crs(4326)
    minx, miny, maxx, maxy = (float(v) for v in probe_gdf.total_bounds)
    brazil_bounds = [[miny, minx], [maxy, maxx]]
    centre = [(miny + maxy) / 2, (minx + maxx) / 2]

    # 2. Make the base map, locked to that bounding box
    m = folium.Map(
        location=centre,
        zoom_start=5,  # initial rough zoom
        tiles="CartoDB positron",
        control_scale=True,
        # max_bounds alone restricts panning to the default whole-world box;
        # the min/max lat/lon arguments are what bound it to brazil_bounds.
        max_bounds=True,
        min_lat=miny,
        max_lat=maxy,
        min_lon=minx,
        max_lon=maxx,
    )
    m.fit_bounds(brazil_bounds)

    KEEP = [
        "geometry",
        "sg_partido",
        "pc_votos_validos",
        "nm_municipio",
        "sg_uf",
    ]

    # 3. Add election layers
    overlay_groups = []
    for yr in [2014, 2018, 2022]:
        # A fresh 0..n-1 index is required because election_layer_city
        # looks rows up by position from the GeoJSON feature id.
        gdf = (
            gpd.read_file(f"data/merged_{yr}.gpkg")
            .to_crs(4326)
            .reset_index(drop=True)[KEEP]
        )
        # Simplify polygons (tolerance 0.02 in CRS units) before embedding.
        gdf["geometry"] = gdf["geometry"].simplify(
            0.02,
            preserve_topology=True,
        )
        fg = election_layer_city(yr, gdf)
        fg.add_to(m)
        overlay_groups.append(fg)

    # 4. Radio-button picker
    GroupedLayerControl(
        groups={"Election year": overlay_groups},
        exclusive_groups=True,
        collapsed=False,
    ).add_to(m)

    # Re-adds the last (2022) group, which is already a child of the map.
    # NOTE(review): presumably meant to make 2022 the visible layer on load;
    # confirm against GroupedLayerControl's exclusive-group behaviour.
    overlay_groups[-1].add_to(m)

    # 5. One shared colour bar
    colourbar = linear.RdBu_11.scale(0, 1).to_step(12)
    colourbar.caption = "Left ← vote share → Right"
    colourbar.position = "bottomleft"
    colourbar.add_to(m)

    # Save to the requested path (previously a hard-coded file name was
    # written while `outfile` was returned, so the two could disagree).
    m.save(outfile)
    return outfile

In [None]:
# Build the map with the default output file and report where it was written.
path = make_multi_year_map()
print("wrote:", path)

# 6. Discussion

What went well?
What is still missing? What could be improved, and why?

# 7. Contributions

# 8. References