In [176]:
import random

import pandas as pd
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [177]:
n = 20
cutoff = 650

random.seed(42)

In [178]:
nv = pd.read_csv("../data/intermediate/national_votes.csv")
topics = pd.read_csv("../data/processed/summary_topics.csv")

nv["id"] /= 10
nv = nv[nv["id"] >= topics["id"].min()]
nv = nv[nv["id"] <= topics["id"].max()]
nv["jaStimmenInProzent"] /= 100

In [179]:
muns = nv["mun_id"].unique()
sample = random.sample(muns.tolist(), n)

In [180]:
selected_entries = nv[nv["mun_id"].isin(sample)]
selected_entries = selected_entries[["id", "mun_id", "jaStimmenInProzent"]]

In [181]:
Xy = selected_entries.pivot(index="id", columns="mun_id", values="jaStimmenInProzent")
Xy_with_topics = Xy.merge(topics, left_on="id", right_on="id")

X = Xy_with_topics[Xy.columns[:-1]]
y = Xy_with_topics[Xy.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [182]:
bst = xgb.train({"eta": 0.01}, dtrain, 1000)

In [183]:
bst.eval(dtest, name="baseline (no topics)")

'[0]\tbaseline (no topics)-rmse:0.11417528175882152'

In [184]:
X_with_topics = Xy_with_topics.drop(columns=[Xy.columns[-1]])

X_train, X_test, y_train, y_test = train_test_split(X_with_topics, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [185]:
bst = xgb.train({"eta": 0.01}, dtrain, 1000)

In [186]:
bst.eval(dtest, name="with topics")

'[0]\twith topics-rmse:0.12144121752570519'