In [161]:
import pandas as pd
import xgboost as xgb

In [162]:
TEST_CUTOFF = 650  # Everything before this is training data

In [163]:
nv = pd.read_csv("../data/intermediate/national_votes.csv")
sv = pd.read_csv("../data/intermediate/swissvotes.csv")
topics = pd.read_csv("../data/processed/summary_topics.csv")

controls = pd.read_parquet("../data/intermediate/controls/socioeconomic_0.parquet")

In [164]:
# Swissvotes vote IDs ("anr" column) are multiplied by 10 for some reason
sv["anr"] /= 10

# Select the relevant rows from swissvotes 
sv = sv[sv["anr"] >= topics["id"].min()]

In [165]:
# Same for national votes from swissdd.
nv["id"] /= 10
nv = nv[nv["id"] >= topics["id"].min()]

In [166]:
# Join nv and topics
nv.dropna(inplace=True)
nv_and_topics = nv.merge(topics, left_on="id", right_on="id")
nv_and_topics

Unnamed: 0.1,Unnamed: 0,name,id,canton_id,canton_name,mun_id,mun_name,geoLevelParentnummer,gebietAusgezaehlt,jaStimmenInProzent,...,506 - Education Expansion,507 - Education Limitation,604 - Traditional Morality: Negative,607 - Multiculturalism: Positive,608 - Multiculturalism: Negative,701 - Labour Groups: Positive,702 - Labour Groups: Negative,703 - Agriculture and Farmers: Positive,704 - Middle Class and Professional Groups,705 - Underprivileged Minority Groups
0,1,Bundesbeschluss über den Beitritt der Schweiz ...,338.0,1,Zürich,1,Aeugst am Albis,101,True,31.534091,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
1,2,Bundesbeschluss über den Beitritt der Schweiz ...,338.0,1,Zürich,2,Affoltern am Albis,101,True,27.018752,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
2,3,Bundesbeschluss über den Beitritt der Schweiz ...,338.0,1,Zürich,3,Bonstetten,101,True,30.572687,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
3,4,Bundesbeschluss über den Beitritt der Schweiz ...,338.0,1,Zürich,4,Hausen am Albis,101,True,28.571429,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
4,5,Bundesbeschluss über den Beitritt der Schweiz ...,338.0,1,Zürich,5,Hedingen,101,True,29.127726,...,0.005833,0.000833,0.007222,0.009722,0.014167,0.003611,0.000278,0.000278,0.005833,0.004167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748447,793318,Übernahme der EU-Verordnung über die Europäisc...,657.0,26,Jura,6808,Clos du Doubs,2603,True,62.347188,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667
748448,793319,Übernahme der EU-Verordnung über die Europäisc...,657.0,26,Jura,6809,Haute-Ajoie,2603,True,72.169811,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667
748449,793320,Übernahme der EU-Verordnung über die Europäisc...,657.0,26,Jura,6810,La Baroche,2603,True,71.820449,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667
748450,793321,Übernahme der EU-Verordnung über die Europäisc...,657.0,26,Jura,6811,Damphreux-Lugnez,2603,True,69.291339,...,0.008667,0.004667,0.012667,0.009333,0.046000,0.032000,0.008667,0.009333,0.012000,0.024667


In [167]:
nv_and_topics["votedate"] = pd.DatetimeIndex(nv_and_topics["votedate"])
nv_and_topics["year"] = nv_and_topics["votedate"].dt.year
nv_and_topics["month"] = nv_and_topics["votedate"].dt.month / 12

In [168]:
# Prepare controls
controls.dropna(inplace=True)
controls["YEAR"] = controls["YEAR"].astype(int)
controls = controls[controls["YEAR"] >= nv_and_topics["year"].min()]
controls

Unnamed: 0,CAN_NAME,DIS_ID,DIS_NAME,MUN_ID,MUN_NAME,CAN_ID,YEAR,VALUE,DATA
691416,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1984,DATA_Auswanderung_Frau_Ausland,0.0
691417,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1984,DATA_Auswanderung_Frau_Schweiz,0.0
691418,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1984,DATA_Auswanderung_Frau_Staatsangehörigkeit (Ka...,0.0
691419,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1984,DATA_Auswanderung_Mann_Ausland,1.0
691420,ZH,101.0,Affoltern,1.0,Aeugst am Albis,1.0,1984,DATA_Auswanderung_Mann_Schweiz,1.0
...,...,...,...,...,...,...,...,...,...
9679819,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Frau_Schweiz,0.0
9679820,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Frau_Staats...,0.0
9679821,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Mann_Ausland,0.0
9679822,JU,2603.0,Porrentruy,6811.0,Damphreux-Lugnez,26.0,2022,DATA_Änderung des Bevölkerungstyps_Mann_Schweiz,0.0


In [169]:
index = ["YEAR", "MUN_ID"] if "MUN_ID" in controls.columns else ["YEAR"]
controls = controls.pivot(index=index, columns="VALUE", values="DATA")

In [170]:
left_on = ["year", "mun_id"] if "MUN_ID" in index else ["year"]
merged = pd.merge(nv_and_topics, controls, left_on=left_on, right_index=True)

In [171]:
# Drop the columns we don't need...
merged.drop(
    columns=[
        "Unnamed: 0",
        "name",
        "canton_name",
        "mun_name",
        "geoLevelParentnummer",
        "gebietAusgezaehlt",
        "jaStimmenAbsolut",
        "neinStimmenAbsolut",
        "stimmbeteiligungInProzent",
        "eingelegteStimmzettel",
        "anzahlStimmberechtigte",
        "gueltigeStimmen",
        "votedate",
    ]
, inplace=True)
# ... and prepare for XGBoost
merged.dropna(inplace=True)
merged["year"] -= merged["year"].min()
merged["jaStimmenInProzent"] /= 100
merged["mun_id"] = merged["mun_id"].astype("category")
merged["canton_id"] = merged["canton_id"].astype("category")

In [172]:
# Split into training and test data

train = merged[merged["id"] < TEST_CUTOFF]
test = merged[merged["id"] >= TEST_CUTOFF]

train.drop(columns=["id"], inplace=True)
test.drop(columns=["id"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(columns=["id"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns=["id"], inplace=True)


In [173]:
# Train the model
dtrain = xgb.DMatrix(
    train.drop(columns=["jaStimmenInProzent"]),
    label=train["jaStimmenInProzent"],
    enable_categorical=True,
)
dtest = xgb.DMatrix(
    test.drop(columns=["jaStimmenInProzent"]),
    label=test["jaStimmenInProzent"],
    enable_categorical=True,
)

# Gain back some desperately needed memory
del merged, nv_and_topics, controls

In [174]:
param = {
    # "objective": "binary:logistic",
    "max_depth": 12,
    "subsample": 0.8,
    "eta": 0.4
}
bst = xgb.train(param, dtrain, 100)

In [175]:
bst.eval(dtest)

'[0]\teval-rmse:0.17285323760245064'