In [1]:
import pandas as pd
import xgboost as xgb

In [2]:
TEST_CUTOFF = 650  # Everything before this is training data

In [3]:
nv = pd.read_csv("../data/intermediate/national_votes.csv")
sv = pd.read_csv("../data/intermediate/swissvotes.csv")
topics = pd.read_csv("../data/processed/summary_topics.csv")

In [4]:
controls = []
for i in range(3):
    controls.append(pd.read_parquet(f"../data/intermediate/controls/socioeconomic_{i}.parquet"))
controls = pd.concat(controls)

In [5]:
# Swissvotes vote IDs ("anr" column) are multiplied by 10 for some reason
sv["anr"] /= 10

# Select the relevant rows from swissvotes 
sv = sv[sv["anr"] >= topics["id"].min()]

In [6]:
# Same for national votes from swissdd.
nv["id"] /= 10
nv = nv[nv["id"] >= topics["id"].min()]

In [7]:
# Join nv and topics
nv.dropna(inplace=True)
nv_and_topics = nv.merge(topics, left_on="id", right_on="id")

In [8]:
nv_and_topics["votedate"] = pd.DatetimeIndex(nv_and_topics["votedate"])
nv_and_topics["year"] = nv_and_topics["votedate"].dt.year
nv_and_topics["month"] = nv_and_topics["votedate"].dt.month / 12

In [9]:
# Prepare controls
controls.dropna(inplace=True)
controls["YEAR"] = controls["YEAR"].astype(int)
controls = controls[controls["YEAR"] >= nv_and_topics["year"].min()]

In [10]:
index = ["YEAR", "MUN_ID"] if "MUN_ID" in controls.columns else ["YEAR"]
controls = controls.pivot(index=index, columns="VALUE", values="DATA")

In [11]:
left_on = ["year", "mun_id"] if "MUN_ID" in index else ["year"]
merged = pd.merge(nv_and_topics, controls, left_on=left_on, right_index=True)

In [12]:
# Drop the columns we don't need...
merged.drop(
    columns=[
        "Unnamed: 0",
        "name",
        "canton_name",
        "mun_name",
        "geoLevelParentnummer",
        "gebietAusgezaehlt",
        "jaStimmenAbsolut",
        "neinStimmenAbsolut",
        "stimmbeteiligungInProzent",
        "eingelegteStimmzettel",
        "anzahlStimmberechtigte",
        "gueltigeStimmen",
        "votedate",
    ]
, inplace=True)
# ... and prepare for XGBoost
#merged.dropna(inplace=True)
merged["year"] -= merged["year"].min()
merged["jaStimmenInProzent"] /= 100
merged["mun_id"] = merged["mun_id"].astype("category")
merged["canton_id"] = merged["canton_id"].astype("category")

In [13]:
# Split into training and test data

train = merged[merged["id"] < TEST_CUTOFF]
test = merged[merged["id"] >= TEST_CUTOFF]

train.drop(columns=["id"], inplace=True)
test.drop(columns=["id"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(columns=["id"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns=["id"], inplace=True)


In [14]:
# Train the model
dtrain = xgb.DMatrix(
    train.drop(columns=["jaStimmenInProzent"]),
    label=train["jaStimmenInProzent"],
    enable_categorical=True,
)
dtest = xgb.DMatrix(
    test.drop(columns=["jaStimmenInProzent"]),
    label=test["jaStimmenInProzent"],
    enable_categorical=True,
)

# Gain back some desperately needed memory
del merged, nv_and_topics, controls

: 

In [15]:
param = {
    # "objective": "binary:logistic",
    "max_depth": 15,
    "subsample": 0.8,
    "eta": 0.01
}
bst = xgb.train(param, dtrain, 200)

In [None]:
bst.eval(dtest)

'[0]\teval-rmse:0.14941583165096164'