To test how well topics can help in predicting vote results of previously unknown municipalities, based on
results from other municipalities, we trained the models on a subset of municipalities and tested them on
the remainder to assess their predictive accuracy. Specifically, we compared how well the models performed
when including topics as predictors against models that did not include topics. We established two baseline
scenarios: one using only municipality and subject identifiers, and another incorporating socio-economic
statistics from the FSO.
To ensure the method’s robustness, we experimented with different sizes of training and test sets. We used
test set sizes of 10%, 20%, 50%, 80%, and 90% of all municipalities. This variability allowed us to analyse
how varying sample sizes influenced the models’ predictive capabilities.

In [1]:
import random
import math

import pandas as pd
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error

In [2]:
n = 200

min_year = 1981

random.seed(42)

In [3]:
nv = pd.read_csv("../data/intermediate/national_votes.csv")
topics = pd.read_csv("../data/processed/summary_topics.csv")
controls = pd.read_parquet("../data/intermediate/controls/socioeconomic_0.parquet")

nv["id"] /= 10
nv = nv[nv["id"] >= topics["id"].min()]
nv = nv[nv["id"] <= topics["id"].max()]
nv["jaStimmenInProzent"] /= 100

In [4]:
nv.dropna(subset='jaStimmenInProzent',inplace=True)

In [5]:
# Prepare controls
controls.dropna(inplace=True)
controls["YEAR"] = controls["YEAR"].astype(int)
controls = controls[controls["YEAR"] >= min_year]
controls

Unnamed: 0,CAN_NAME,DIS_ID,DIS_NAME,MUN_ID,MUN_NAME,CAN_ID,YEAR,VALUE,DATA
0,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1981,DATA_Auswanderung_Frau_Ausland,1.0
1,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1981,DATA_Auswanderung_Frau_Schweiz,2.0
2,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1981,DATA_Auswanderung_Frau_Staatsangehörigkeit (Ka...,3.0
3,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1981,DATA_Auswanderung_Mann_Ausland,0.0
4,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1981,DATA_Auswanderung_Mann_Schweiz,4.0
...,...,...,...,...,...,...,...,...,...
9679819,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Frau_Schweiz,0.0
9679820,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Frau_Staats...,0.0
9679821,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Mann_Ausland,0.0
9679822,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Mann_Schweiz,0.0


In [6]:
index = ["YEAR", "MUN_ID"] if "MUN_ID" in controls.columns else ["YEAR"]
controls = controls.pivot(index=index, columns="VALUE", values="DATA")

In [7]:
nv.dropna(inplace=True)
nv["votedate"] = pd.DatetimeIndex(nv["votedate"])
nv["year"] = nv["votedate"].dt.year
nv["month"] = nv["votedate"].dt.month / 12

In [8]:
left_on = ["year", "mun_id"] if "MUN_ID" in index else ["year"]
nv_with_stats = pd.merge(nv, controls, left_on=left_on, right_index=True)

In [9]:
# Drop the columns we don't need...
nv_with_stats.drop(
    columns=[
        "Unnamed: 0",
        "name",
        "canton_name",
        "mun_name",
        "geoLevelParentnummer",
        "gebietAusgezaehlt",
        "jaStimmenAbsolut",
        "neinStimmenAbsolut",
        "stimmbeteiligungInProzent",
        "eingelegteStimmzettel",
        "anzahlStimmberechtigte",
        "gueltigeStimmen",
        "votedate",
    ]
, inplace=True)
# ... and prepare for XGBoost
nv_with_stats.dropna(inplace=True)
nv_with_stats["year"] -= nv_with_stats["year"].min()
nv_with_stats["mun_id"] = nv_with_stats["mun_id"].astype("category")
nv_with_stats["canton_id"] = nv_with_stats["canton_id"].astype("category")

In [10]:
nv_with_stats_topics = nv_with_stats.merge(topics, left_on="id", right_on="id")
nv_with_stats_topics

Unnamed: 0,id,canton_id,mun_id,jaStimmenInProzent,year,month,DATA_Auswanderung_Frau_Ausland,DATA_Auswanderung_Frau_Schweiz,DATA_Auswanderung_Frau_Staatsangehörigkeit (Kategorie) - Total,DATA_Auswanderung_Mann_Ausland,...,506 - Education Expansion,507 - Education Limitation,604 - Traditional Morality: Negative,607 - Multiculturalism: Positive,608 - Multiculturalism: Negative,701 - Labour Groups: Positive,702 - Labour Groups: Negative,703 - Agriculture and Farmers: Positive,704 - Middle Class and Professional Groups,705 - Underprivileged Minority Groups
0,338.0,1,1,0.315341,2,0.250000,2.0,0.0,2.0,2.0,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
1,338.0,1,2,0.270188,2,0.250000,41.0,13.0,54.0,46.0,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
2,338.0,1,3,0.305727,2,0.250000,7.0,4.0,11.0,5.0,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
3,338.0,1,4,0.285714,2,0.250000,6.0,12.0,18.0,7.0,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
4,338.0,1,5,0.291277,2,0.250000,9.0,6.0,15.0,8.0,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741346,657.0,26,6807,0.752044,38,0.416667,2.0,6.0,8.0,3.0,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667
741347,657.0,26,6808,0.623472,38,0.416667,0.0,5.0,5.0,5.0,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667
741348,657.0,26,6809,0.721698,38,0.416667,5.0,0.0,5.0,2.0,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667
741349,657.0,26,6810,0.718204,38,0.416667,3.0,1.0,4.0,1.0,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667


In [None]:
# including topics and stats
sizes = [0.1, 0.2, 0.5, 0.8, 0.9]
seeds = [17, 23, 32, 42, 47]
muns = nv["mun_id"].unique()
results = pd.DataFrame(columns=['size', 'seed', 'data', 'r2', 'rmse'])
cols = {'stats+topics': nv_with_stats_topics.columns[:],
       'stats': nv_with_stats_topics.columns[0:-56],
       'topics': (nv_with_stats_topics[['mun_id','id','jaStimmenInProzent']].columns[:]).append((nv_with_stats_topics.columns[-56:])),
       'none': nv_with_stats_topics[['mun_id','id','jaStimmenInProzent']].columns[:]}
for size in sizes:
    for seed in seeds:
        random.seed(seed)
        sample = random.sample(muns.tolist(), math.floor(size * len(muns)))
        Xy_train = nv_with_stats_topics[~nv_with_stats_topics["mun_id"].isin(sample)]
        Xy_test = nv_with_stats_topics[nv_with_stats_topics["mun_id"].isin(sample)]
        for data in cols.keys():
            col = cols[data]
            X_train = Xy_train[col].drop(columns='jaStimmenInProzent')
            X_test = Xy_test[col].drop(columns='jaStimmenInProzent')
            y_train = Xy_train['jaStimmenInProzent']
            y_test = Xy_test['jaStimmenInProzent']
            dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
            dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
            bst = xgb.train({"eta": 0.01}, dtrain, 1000)
            y_pred = bst.predict(dtest)
            r2 = r2_score(y_test, y_pred)
            rmse = root_mean_squared_error(y_test, y_pred)
            print(f'results for size {size}, seed {seed}, data {data}: R2: {r2}, rmse: {rmse}')
            results_dict = {'size': size, 'seed': seed, 'data': data, 'r2': r2, 'rmse': rmse}
            results = pd.concat([results, pd.DataFrame([results_dict])], ignore_index=True)

results for size 0.1, seed 17, data stats+topics: R2: 0.7717164680621518, rmse: 0.09622089188359345


  results = pd.concat([results, pd.DataFrame([results_dict])], ignore_index=True)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Group by 'size' and 'data', and calculate the mean of 'r2' and 'rmse'
grouped = results.groupby(['size', 'data']).agg({'r2': 'mean', 'rmse': 'mean'}).reset_index()

In [None]:
# Plot r2
plt.figure(figsize=(7.5, 6))
sns.lineplot(data=grouped, x='size', y='r2', hue='data', marker='o')
plt.title('Average R2 by Test Set Size and Data Category')
plt.xlabel('Test Set Size')
plt.ylabel('Average R2')
plt.legend(title='Data')
plt.grid(True)
plt.savefig('../plots/varying_test_size_r2.png')
plt.show()

In [None]:
# Plot rmse
plt.figure(figsize=(7.5, 6))
sns.lineplot(data=grouped, x='size', y='rmse', hue='data', marker='o')
plt.title('Average RMSE by Test Set Size and Data Category')
plt.xlabel('Test Set Size')
plt.ylabel('Average RMSE')
plt.legend(title='Data')
plt.grid(True)
plt.savefig('../plots/varying_test_size_rmse.png')
plt.show()