# Data preparation for regression analysis

## 1. Import packages

In [132]:
import pandas as pd
import numpy as np
from datetime import timedelta

## 2. Load data and create individual data frames

### 2.1 2-week repo rate

In [133]:
repo_rate = pd.read_csv("../../data/cnb_repo.txt", sep="|")

In [134]:
repo_rate["VALID_FROM"] = pd.to_datetime(repo_rate["VALID_FROM"], format="%Y%m%d")

In [135]:
repo_rate = repo_rate.rename(columns={"VALID_FROM": "date", "CNB_REPO_RATE_IN_%": "cnb_repo_rate"})

In [136]:
repo_rate = repo_rate[repo_rate["date"] >= "1998-01-22"]

In [137]:
repo_rate["rate_change"] = repo_rate["cnb_repo_rate"].diff()

In [138]:
repo_rate = repo_rate.drop("cnb_repo_rate", axis=1).dropna()

In [139]:
repo_rate

Unnamed: 0,date,rate_change
32,1998-07-17,-0.50
33,1998-08-14,-0.50
34,1998-09-25,-0.50
35,1998-10-27,-1.00
36,1998-11-13,-1.00
...,...,...
108,2024-03-21,-0.50
109,2024-05-03,-0.50
110,2024-06-28,-0.50
111,2024-08-02,-0.25


### Sentiments

In [140]:
sentiments = pd.read_csv(
    "../../predictions/sentiment_predictions.tsv", sep="\t", header=None, names=["date", "sentiment"]
)

In [141]:
dates = sentiments["date"].str.split(".")

In [142]:
sentiments["date"] = dates.apply(lambda x: x[0])

In [143]:
sentiments["date"] = pd.to_datetime(sentiments["date"], format="%Y%m%d")

In [144]:
sentiments.head()

Unnamed: 0,date,sentiment
0,1998-01-22,-0.589855
1,1998-02-19,0.148933
2,1998-03-19,0.690357
3,1998-04-30,-0.957909
4,1998-05-28,0.261321


### Voting records

Here, I am computing disagreement as the standard deviation of suggested policy rate changes among board members.

In [145]:
votings = pd.read_excel("../../data/voting_of_the_bank_board.xlsx", sheet_name="1998-2024", header=None)

In [146]:
votings = votings.T

In [147]:
votings[0] = votings[0].fillna(method="ffill")

  votings[0] = votings[0].fillna(method="ffill")
  votings[0] = votings[0].fillna(method="ffill")


In [148]:
votings.columns = ["year", "day_month"] + votings.iloc[0, 2:].to_list()

In [149]:
votings = votings.drop(0, axis=0)

In [150]:
votings["year"] = votings["year"].astype(int).astype(str)

In [151]:
votings["date"] = votings["day_month"].str.strip() + votings["year"].str.strip()

In [152]:
votings["date"] = pd.to_datetime(votings["date"], format="%d.%m.%Y")

In [153]:
votings = votings.drop(["year", "day_month"], axis=1)

In [154]:
votings = votings.replace("o", np.nan).replace(" ", np.nan)

  votings = votings.replace("o", np.nan).replace(" ", np.nan)


In [155]:
votings["disagreement"] = votings.drop("date", axis=1).std(axis=1)

In [156]:
votings["disagreement"] = votings["disagreement"].replace(np.nan, 0)

In [157]:
votings = votings.drop(columns=[col for col in votings.columns if col not in ["date", "disagreement"]])

In [158]:
votings = votings[votings["date"] <= "2024-09-25"]

In [159]:
votings.head()

Unnamed: 0,date,disagreement
1,1998-01-22,0.0
2,1998-02-19,0.0
3,1998-03-19,0.273861
4,1998-04-30,0.0
5,1998-05-28,0.0


## 3. Merge dataframes

In [160]:
sentiment_votings = pd.merge(sentiments, votings, how="left", on="date")

In [161]:
sentiment_votings = sentiment_votings.sort_values("date")

In [162]:
repo_rate = repo_rate.sort_values("date")

In [163]:
# Possibly change to direction="forward"!!
data = pd.merge_asof(
    sentiment_votings, 
    repo_rate, 
    on="date", 
    direction="nearest",
    tolerance=pd.Timedelta("7D")
)

In [166]:
data = data.fillna(0)

In [167]:
data.to_excel("/Users/bernhardbrunner/Desktop/test_test_test.xlsx")