In [None]:
import os
import warnings
warnings.simplefilter(action="ignore", category=UserWarning)

import pandas as pd
import psycopg

# Open the PostgreSQL connection shared by every query in this notebook.
# The connection string is read from the OPEN_DISCOURSE_DSN environment
# variable when it is set, so real credentials never have to be written into
# the notebook; the fallback is the original local development connection.
db = psycopg.connect(
    os.environ.get(
        "OPEN_DISCOURSE_DSN",
        "dbname=next host=localhost port=5432 user=postgres password=postgres",
    )
)

# Show the whole factions table (its `id` and `abbreviation` columns are used
# by the speech queries further down).
factions = pd.read_sql_query("select * from open_discourse.factions", db)
print(factions)

In [None]:
# Pull a ten-row sample of the speeches table to see what it looks like.
sample_query = "select * from open_discourse.speeches limit 10"
speeches = pd.read_sql_query(sql=sample_query, con=db)
print(speeches.head())

In [None]:
# Speeches from 2018 onwards, each row carrying the speech text, the
# abbreviation of the speaker's faction, the electoral term and the year.
# Faction ids -1 and 16 are filtered out by the WHERE clause.
recent_speeches_query = """
    SELECT speech_content, factions.abbreviation as faction, electoral_term, date_part('year', date)::int as year
    FROM open_discourse.speeches
    JOIN open_discourse.factions ON factions.id = speeches.faction_id
    WHERE faction_id <> -1 AND faction_id <> 16
    AND date_part('year', date)::int >= 2018
    """
speeches = pd.read_sql_query(sql=recent_speeches_query, con=db)

# Quick sanity check of the result size and the first rows.
print(speeches.shape)
print(speeches.head())

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import umap

# Sentence-embedding model used for all speeches in this notebook.
# Larger alternative, currently disabled:
#   SentenceTransformer("deutsche-telekom/gbert-large-paraphrase-euclidean")
embedding_model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(embedding_model_name)
print(f"Model loaded on {model.device}")

In [None]:
# Embed every speech text; `convert_to_numpy=True` makes the result a NumPy
# array, which the later cells index by row.
speech_texts = speeches["speech_content"].values
embeddings = model.encode(speech_texts, convert_to_numpy=True)
print("Encoded!")

# Numeric Similarity to AfD

In [None]:
from sentence_transformers.util import cos_sim

# Row labels of the speeches belonging to each faction.  Grouping by the bare
# column name (instead of a one-element list) guarantees that the keys are
# plain strings such as "AfD"; pandas is deprecating scalar keys for
# one-element-list groupers, which would break the "AfD" lookup below.
speech_indices_by_faction = speeches.groupby("faction").groups

# Mean embedding vector over all speeches of a faction.
# NOTE: the row labels are used as positions into `embeddings`, which assumes
# `speeches` still has the default 0..n-1 index it got from read_sql_query.
embedding_averages = {
    faction: np.mean(embeddings[indices], axis=0)
    for faction, indices in speech_indices_by_faction.items()
}
afd_average = embedding_averages["AfD"]

# Cosine similarity between every other faction's mean embedding and the AfD's.
similarities_to_afd = {
    faction: cos_sim(average, afd_average).item()
    for faction, average in embedding_averages.items() if faction != "AfD"
}

# Print the factions ordered from most to least similar.
print(dict(sorted(similarities_to_afd.items(), key=lambda item: item[1], reverse=True)))

In [None]:
# Row labels of the speeches for every (faction, year) combination.
speech_indices_by_faction_and_year = speeches.groupby(["faction", "year"]).groups

# One mean embedding per (faction, year) key.
embedding_averages = {
    group_key: np.mean(embeddings[row_labels], axis=0)
    for group_key, row_labels in speech_indices_by_faction_and_year.items()
}
print(embedding_averages.keys())

# Years and factions that are compared against the AfD.
year_range = range(2018, 2023)
factions = ["AfD", "CDU/CSU", "DIE LINKE.", "FDP", "Grüne", "SPD"]

# For each non-AfD faction: its cosine similarity to the AfD's mean embedding
# of the same year, listed in `year_range` order.
yearly_similarities_to_afd = {
    other: [
        cos_sim(embedding_averages[(other, yr)], embedding_averages[("AfD", yr)]).item()
        for yr in year_range
    ]
    for other in factions
    if other != "AfD"
}
print(yearly_similarities_to_afd)

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Colour per faction, shared by all charts in this notebook.
# "PDS" deliberately uses the same colour as "DIE LINKE.".
colours = {
    "AfD": "#009ee0",
    "CDU/CSU": "#1b191d",
    "DIE LINKE.": "#be3075",
    "PDS": "#be3075",
    "FDP": "#ffed00",
    "Grüne": "#56b45e",
    "SPD": "#e30613"
}

# One line per faction: its yearly cosine similarity to the AfD.
for faction, averages in yearly_similarities_to_afd.items():
    plt.plot(year_range, averages, c=colours[faction], label=faction)

plt.xticks(year_range)
# The y-axis is zoomed to 0.97-1; values below that are not visible.
plt.ylim(0.97, 1)
plt.legend()

# savefig does not create missing directories, so make sure the target exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs("output", exist_ok=True)
plt.savefig("output/similarity_to_afd.svg", bbox_inches="tight")

# UMAP Chart

In [None]:
# Project the mean embeddings onto UMAP coordinates (cosine metric, fixed seed
# so the layout is repeatable). Rows follow the order of `embedding_averages`.
year_reducer = umap.UMAP(metric="cosine", n_neighbors=50, random_state=5)
reduced_embeddings = year_reducer.fit_transform(list(embedding_averages.values()))
print(reduced_embeddings)

In [None]:
# Group keys in the same order as the rows of `reduced_embeddings`; element 0
# of a key is the faction, element 1 the label printed next to the point.
keys = list(embedding_averages.keys())

scatter = plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=[colours[key[0]] for key in keys],
    s=10,
)

# Label every point, nudged slightly upwards so it does not cover the marker.
for key, (x, y) in zip(keys, reduced_embeddings):
    plt.annotate(key[1], (x, y + 0.05), fontsize="xx-small")

# Hand-built legend with one coloured dot per faction; "PDS" is skipped
# because it shares its colour with "DIE LINKE.".
plt.legend(handles=[
    Line2D([0], [0], marker='o', color='w', label=faction_name, markerfacecolor=colour, markersize=8)
    for faction_name, colour in colours.items() if faction_name != "PDS"
], bbox_to_anchor=(1.3, 1))

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("output", exist_ok=True)
plt.savefig("output/last_term.svg", bbox_inches="tight")

In [None]:
# Experiment with PCA instead of UMAP for dimensionality reduction

from sklearn.decomposition import PCA

# Project every mean embedding onto its first two principal components.
# PCA expects one sample per row, so the (n_groups, embedding_dim) matrix is
# passed as-is and fit_transform returns the 2-D coordinates of each group.
# (Previously the matrix was transposed and the unit-norm `components_` were
# plotted, which are loadings rather than projected samples.)
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(np.array(list(embedding_averages.values())))

scatter = plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=[colours[key[0]] for key in keys],
    s=10,
)

# Label every point; the offset is given in points so it does not depend on
# the scale of the PCA coordinates.
for key, (x, y) in zip(keys, reduced_embeddings):
    plt.annotate(key[1], (x, y), xytext=(0, 3), textcoords="offset points", fontsize="xx-small")

# Hand-built legend with one coloured dot per faction; "PDS" is skipped
# because it shares its colour with "DIE LINKE.".
plt.legend(handles=[
    Line2D([0], [0], marker='o', color='w', label=faction_name, markerfacecolor=colour, markersize=8)
    for faction_name, colour in colours.items() if faction_name != "PDS"
], bbox_to_anchor=(1.3, 1))
plt.show()

In [None]:
# Dump the axes found by the PCA that was fitted in the previous cell.
principal_axes = pca.components_
print(principal_axes)

# All-time Embeddings

In [None]:
# All speeches (no year filter) of the seven listed factions, with the same
# columns as the 2018+ query: text, faction abbreviation, term and year.
all_time_speeches_query = """
    SELECT speech_content, factions.abbreviation as faction, electoral_term, date_part('year', date)::int as year
    FROM open_discourse.speeches
    JOIN open_discourse.factions ON factions.id = speeches.faction_id
    WHERE factions.abbreviation IN ('AfD', 'CDU/CSU', 'DIE LINKE.', 'PDS', 'FDP', 'Grüne', 'SPD')
    """
speeches = pd.read_sql_query(sql=all_time_speeches_query, con=db)

# Quick sanity check of the result size and the first rows.
print(speeches.shape)
print(speeches.head())

In [None]:
# Re-embed the (now all-time) speeches; this replaces the earlier `embeddings`.
speech_texts = speeches["speech_content"].values
embeddings = model.encode(speech_texts, convert_to_numpy=True)
print("Encoded!")

In [None]:
# Row labels of the speeches for every (faction, electoral_term) combination.
speech_indices_by_faction_and_term = speeches.groupby(["faction", "electoral_term"]).groups

# One mean embedding per (faction, electoral_term) key.
embedding_averages = {
    group_key: np.mean(embeddings[row_labels], axis=0)
    for group_key, row_labels in speech_indices_by_faction_and_term.items()
}

# UMAP projection of those means (cosine metric, fixed seed); rows follow the
# order of `embedding_averages`.
term_reducer = umap.UMAP(metric="cosine", n_neighbors=50, random_state=13)
reduced_embeddings = term_reducer.fit_transform(list(embedding_averages.values()))

# Group keys in the same order as the rows of `reduced_embeddings`; element 0
# of a key is the faction, element 1 the label printed next to the point.
keys = list(embedding_averages.keys())
scatter = plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=[colours[key[0]] for key in keys],
    s=10,
)

# Label every point, nudged up and to the right of the marker.
for key, (x, y) in zip(keys, reduced_embeddings):
    plt.annotate(key[1], (x + 0.02, y + 0.05), fontsize="xx-small")

# Hand-built legend with one coloured dot per faction; "PDS" is skipped
# because it shares its colour with "DIE LINKE.".
plt.legend(handles=[
    Line2D([0], [0], marker='o', color='w', label=faction_name, markerfacecolor=colour, markersize=8)
    for faction_name, colour in colours.items() if faction_name != "PDS"
], loc="upper left")

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("output", exist_ok=True)
plt.savefig("output/all_terms_factions.svg")

In [None]:
# Governing factions per electoral term, built once instead of on every call.
# Terms 5 and 9 list three factions (presumably because the coalition changed
# during the term -- confirm before relying on it).
_GOVERNMENTS_BY_TERM = {
    1: ("CDU/CSU", "FDP"),
    2: ("CDU/CSU", "FDP"),
    3: ("CDU/CSU",),
    4: ("CDU/CSU", "FDP"),
    5: ("CDU/CSU", "FDP", "SPD"),
    6: ("SPD", "FDP"),
    7: ("SPD", "FDP"),
    8: ("SPD", "FDP"),
    9: ("SPD", "FDP", "CDU/CSU"),
    10: ("CDU/CSU", "FDP"),
    11: ("CDU/CSU", "FDP"),
    12: ("CDU/CSU", "FDP"),
    13: ("CDU/CSU", "FDP"),
    14: ("SPD", "Grüne"),
    15: ("SPD", "Grüne"),
    16: ("CDU/CSU", "SPD"),
    17: ("CDU/CSU", "FDP"),
    18: ("CDU/CSU", "SPD"),
    19: ("CDU/CSU", "SPD"),
    20: ("SPD", "Grüne", "FDP"),
}


def is_governing(faction, term):
    """Return whether `faction` is listed as a governing faction in `term`.

    Parameters
    ----------
    faction : str
        Faction abbreviation as used in the lookup table, e.g. "SPD".
    term : int
        Electoral term number; the table covers terms 1 through 20.

    Returns
    -------
    bool
        True if the faction appears in the table entry for that term.

    Raises
    ------
    KeyError
        If `term` has no entry in the table.
    """
    return faction in _GOVERNMENTS_BY_TERM[term]

def faction_color(key):
    """Pick the marker colour for a (faction, term) key.

    `key[0]` is the faction and `key[1]` the electoral term; the result is
    "green" when `is_governing` reports the faction as governing in that
    term and "red" otherwise.
    """
    faction, term = key[0], key[1]
    if is_governing(faction, term):
        return "green"
    return "red"

# Scatter of the current `reduced_embeddings`, coloured by whether the faction
# was governing (green) or in opposition (red) in the term given by its key.
scatter = plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=[faction_color(key) for key in keys],
    s=10,
)

# Label every point with element 1 of its key, nudged up and to the right.
for key, (x, y) in zip(keys, reduced_embeddings):
    plt.annotate(key[1], (x + 0.02, y + 0.05), fontsize="xx-small")

# Two-entry legend: government vs. opposition.
plt.legend(handles=[
    Line2D([0], [0], marker='o', color='w', label="Regierung", markerfacecolor="green", markersize=8),
    Line2D([0], [0], marker='o', color='w', label="Opposition", markerfacecolor="red", markersize=8)
], loc="upper left")

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("output", exist_ok=True)
plt.savefig("output/all_terms_governments.svg")