In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
import pyarrow.parquet as pq
import pyarrow as pa

In [3]:
data_speech1 = pd.read_parquet('./data/data_speech1.parquet')
data_speech2 = pd.read_parquet('./data/data_speech2.parquet')
data_speech3 = pd.read_parquet('./data/data_speech3.parquet')
dspeech = pd.concat([data_speech1, data_speech2, data_speech3], axis=0)
dmeeting = pd.read_parquet('./data/data_meeting.parquet')
parMem = pd.read_parquet('./data/parliament_members.parquet')
dspeech = pd.merge(dspeech, dmeeting[['meeting_id', 'date']])

We create a dataset with speakers that had missing party

In [6]:
missing_party = dspeech[dspeech['speaker_party'].isnull()] 
missing_party = pd.merge(missing_party, dmeeting[['meeting_id', 'date']])
missing_party = missing_party[['speaker_name', 'date']]

In [7]:
parMem

Unnamed: 0,speaker_name,speaker_party,period_start,period_end
0,Thomas Adelskov,Socialdemokratiet,2005-02-08,2007-11-13
1,Simon Emil Ammitzbøll,Det Radikale Venstre,2005-02-08,2007-11-13
2,Hans Andersen,Venstre,2005-02-08,2007-11-13
3,Jytte Andersen,Socialdemokratiet,2005-02-08,2007-11-13
4,Kim Andersen,Venstre,2005-02-08,2007-11-13
...,...,...,...,...
1069,Theresa Scavenius,Alternativet,2022-11-01,2023-09-07
1070,Anna Falkenberg,Sambandsflokkurin,2022-11-01,2023-09-07
1071,Sjúrður Skaale,Javnaðarflokkurin,2022-11-01,2023-09-07
1072,Aaja Chemnitz Driefer,Inuit Ataqatigiit,2022-11-01,2023-09-07


### Add speakers that were not in parliament members dataset

Some speakers with missing party were not in the parliament members dataset.
We find these and scrape them

In [9]:
periods_start = []
periods_end = []
persons_missing_in_parMem = missing_party[missing_party['speaker_name'].apply(lambda x: x not in parMem['speaker_name'].tolist())]['speaker_name'].unique().tolist()
for person in persons_missing_in_parMem:
    print(person)
    print(f"First day {missing_party[missing_party['speaker_name'] == person]['date'].min()}")
    print(f"Last day {missing_party[missing_party['speaker_name'] == person]['date'].max()}")
    periods_start.append(str(missing_party[missing_party['speaker_name'] == person]['date'].min())[0:10])
    periods_end.append(str(missing_party[missing_party['speaker_name'] == person]['date'].max())[0:10])
    print("\n")

Charlotte Sahl-Madsen
First day 2010-02-25 00:00:00
Last day 2011-05-26 00:00:00


Joy Mogensen
First day 2020-01-15 00:00:00
Last day 2021-05-26 00:00:00


Thor Möger Pedersen
First day 2011-10-12 00:00:00
Last day 2012-10-10 00:00:00


Lars Aagaard
First day 2022-12-20 00:00:00
Last day 2023-05-23 00:00:00


Jørn Neergaard Larsen
First day 2015-07-03 00:00:00
Last day 2016-11-23 00:00:00




For these five persons, we find their political party on wikipedia and add them to the parliament members dataset

In [10]:
persons_missing_in_parMem_wiki = [e.replace(" ", "_") for e in persons_missing_in_parMem]
persons_missing_in_parMem_wiki = ['https://da.wikipedia.org/wiki/' + name for name in persons_missing_in_parMem]
persons_missing_in_parMem_wiki

speaker_partys = []
for url in persons_missing_in_parMem_wiki:
    page = requests.get(url)
    page = BeautifulSoup(page.text, "html.parser")
    table = page.find('table')
    try:
        party_label = table.find('th', {'scope': 'row', 'style': 'text-align:left'}, string=lambda s: 'Politisk' in str(s))
        speaker_party = party_label.find_next('a').get_text(strip=True)
    except AttributeError: #fails for Lars Aagaard, manually enter party
        speaker_party = 'Moderaterne'
    speaker_partys.append(speaker_party)

In [11]:
persons_missing_in_parMem_df = pd.DataFrame({'speaker_name': persons_missing_in_parMem,
                                             'speaker_party': speaker_partys,
                                             'period_start': periods_start,
                                             'period_end': periods_end})
parMem = pd.concat([parMem, persons_missing_in_parMem_df], ignore_index=True)

### Check if we have all the information in parliament members dataset (First merge)

We merge the dataset of speakers with missing party and parliament members dataset on two conditions:
1. speaker_name has to match
2. The date from the dataset of speakers with missing party has to be between the period_start and period_end

Then we merge this merged dataset back onto the dataset of speakers with missing party

In [13]:
missing_party['date'] = pd.to_datetime(missing_party['date'])
parMem['period_start'] = pd.to_datetime(parMem['period_start'])
parMem['period_end'] = pd.to_datetime(parMem['period_end'])
merged_df = pd.merge(missing_party, parMem, on='speaker_name', how='left')
merged_df = merged_df[(merged_df['date'] >= merged_df['period_start']) & (merged_df['date'] <= merged_df['period_end'])]
merged_df = merged_df.drop_duplicates()

missing_party = pd.merge(missing_party, merged_df[['speaker_name', 'date', 'speaker_party']], on=['speaker_name', 'date'], how='left')

We make a basic check to see if the dates worked well. We see that Lars Løkke Rasmussen has speech items both for Venstre and Moderaterne which is good!

In [14]:
missing_party[missing_party['speaker_name'] == 'Lars Løkke Rasmussen']['speaker_party'].value_counts()

Venstre        2547
Moderaterne     114
Name: speaker_party, dtype: int64

We loop over the rows in the full speech dataset and insert the value from missing_party dataset if speaker_party is empty

In [15]:
speaker_party_dict = {(row['speaker_name'], row['date']): row['speaker_party'] for _, row in missing_party.iterrows()}
dspeech_copy = dspeech.copy()
for index, row in dspeech_copy.iterrows():
    speaker_name = row['speaker_name']
    date = row['date']
    if pd.isna(row['speaker_party']) and (speaker_name, date) in speaker_party_dict:
        dspeech_copy.at[index, 'speaker_party'] = speaker_party_dict[(speaker_name, date)]

But there are still missing values in speaker_party! Apparently there were 18 persons in the full speech dataset, who were speaking in the parliament as ministers at dates where the wikipedia pages did not inform that they were ministers.

In [16]:
periods_start = []
periods_end = []
persons_still_missing = dspeech_copy[dspeech_copy['speaker_party'].isnull()]['speaker_name'].unique().tolist()
for person in persons_still_missing:
    print(person)
    periods_start.append(missing_party[missing_party['speaker_name'] == person]['date'].min())
    periods_end.append(missing_party[missing_party['speaker_name'] == person]['date'].max())

Benedikte Kiær
Lykke Friis
Simon Emil Ammitzbøll-Bille
Kaare Dybvad Bek
Søren Pape Poulsen
Thyra Frank
Lars Christian Lilleholt
Peter Hummelgaard
Jeppe Kofod
Tommy Ahlers
Ulla Tørnæs
Martin Lidegaard
Christina Egelund
Jeppe Bruus
Dan Jørgensen
Simon Emil Ammitzbøll
Peter Christensen
Karen Jespersen


Indeed they were all ministers when speaking.

In [18]:
dspeech_copy[dspeech_copy['speaker_party'].isnull()]['speaker_role'].value_counts()

minister               6290
fungerende minister       3
Name: speaker_role, dtype: int64

For these 18 persons, we find their political party on wikipedia and add them to the parliament members dataset

In [22]:
persons_still_missing_wiki = [e.replace(" ", "_") for e in persons_still_missing]
persons_still_missing_wiki = ['https://da.wikipedia.org/wiki/' + name for name in persons_still_missing_wiki]
idx_peter_christensen = persons_still_missing_wiki.index('https://da.wikipedia.org/wiki/Peter_Christensen')
persons_still_missing_wiki[idx_peter_christensen] = persons_still_missing_wiki[idx_peter_christensen] + '_(politiker)'

speaker_partys = []
for url in persons_still_missing_wiki:
    page = requests.get(url)
    page = BeautifulSoup(page.text, "html.parser")
    table = page.find('table')
    try:
        party_label = table.find('th', {'scope': 'row', 'style': 'text-align:left'}, string=lambda s: 'Politisk' in str(s))
        speaker_party = party_label.find_next('a').get_text(strip=True)
        if speaker_party == '':
            speaker_party = party_label.find_next('a')
            speaker_party = speaker_party.find_next('a').get_text(strip=True)   
    except AttributeError:
        party_label = table.find('th', {'scope': 'row', 'style': 'text-align:left;vertical-align:top;'}, string=lambda s: 'Politisk' in str(s))
        speaker_party = party_label.find_next('a').get_text(strip=True)
    speaker_partys.append(speaker_party)

SyntaxError: cannot assign to function call here. Maybe you meant '==' instead of '='? (348842453.py, line 3)

In [21]:
persons_still_missing_wiki

['https://da.wikipedia.org/wiki/Benedikte_Kiær',
 'https://da.wikipedia.org/wiki/Lykke_Friis',
 'https://da.wikipedia.org/wiki/Simon_Emil_Ammitzbøll-Bille',
 'https://da.wikipedia.org/wiki/Kaare_Dybvad_Bek',
 'https://da.wikipedia.org/wiki/Søren_Pape_Poulsen',
 'https://da.wikipedia.org/wiki/Thyra_Frank',
 'https://da.wikipedia.org/wiki/Lars_Christian_Lilleholt',
 'https://da.wikipedia.org/wiki/Peter_Hummelgaard',
 'https://da.wikipedia.org/wiki/Jeppe_Kofod',
 'https://da.wikipedia.org/wiki/Tommy_Ahlers',
 'https://da.wikipedia.org/wiki/Ulla_Tørnæs',
 'https://da.wikipedia.org/wiki/Martin_Lidegaard',
 'https://da.wikipedia.org/wiki/Christina_Egelund',
 'https://da.wikipedia.org/wiki/Jeppe_Bruus',
 'https://da.wikipedia.org/wiki/Dan_Jørgensen',
 'https://da.wikipedia.org/wiki/Simon_Emil_Ammitzbøll',
 'https://da.wikipedia.org/wiki/Peter_Christensen',
 'https://da.wikipedia.org/wiki/Karen_Jespersen']

In [145]:
persons_missing_in_parMem_df = pd.DataFrame({'speaker_name': persons_still_missing,
                                             'speaker_party': speaker_partys,
                                             'period_start': periods_start,
                                             'period_end': periods_end})
parMem = pd.concat([parMem, persons_missing_in_parMem_df], ignore_index=True)

We then map party names to their short name, e.g. Venstre to V

In [146]:
party_letters = ['S',
               'RV',
               'V',
               'EL',
               'SF', 
               'KF', 
               'DF', 
               'T',
               'SIU',
               'A',
               'IA',
               'SP',
               'Y',
               'LA',
               'RV',
               'JF',
               'ALT',
               'NB',
               'M',
               'DD',
               'S'
               ]
party_letters_dict = dict(zip(parMem['speaker_party'].unique().tolist(), party_letters))
party_letters_dict

{'Socialdemokratiet': 'S',
 'Det Radikale Venstre': 'RV',
 'Venstre': 'V',
 'Enhedslisten': 'EL',
 'Socialistisk Folkeparti': 'SF',
 'Det Konservative Folkeparti': 'KF',
 'Dansk Folkeparti': 'DF',
 'Tjóðveldi': 'T',
 'Siumut': 'SIU',
 'Fólkaflokkurin': 'A',
 'Inuit Ataqatigiit': 'IA',
 'Sambandsflokkurin': 'SP',
 'Ny Alliance': 'Y',
 'Liberal Alliance': 'LA',
 'Radikale Venstre': 'RV',
 'Javnaðarflokkurin': 'JF',
 'Alternativet': 'ALT',
 'Nye Borgerlige': 'NB',
 'Moderaterne': 'M',
 'Danmarksdemokraterne': 'DD',
 'Socialdemokraterne': 'S'}

In [147]:
parMem['speaker_party'] = parMem['speaker_party'].replace(party_letters_dict)

### Second merge

We do the same as earlier: merge the dataset of speakers with missing party and parliament members dataset on two conditions:
1. speaker_name has to match
2. The date from the dataset of speakers with missing party has to be between the period_start and period_end

Then we merge this merged dataset back onto the dataset of speakers with missing party

In [148]:
missing_party['date'] = pd.to_datetime(missing_party['date'])
missing_party = missing_party.drop(columns=['speaker_party'])
parMem['period_start'] = pd.to_datetime(parMem['period_start'])
parMem['period_end'] = pd.to_datetime(parMem['period_end'])
merged_df = pd.merge(missing_party, parMem, on='speaker_name', how='left')
merged_df = merged_df[(merged_df['date'] >= merged_df['period_start']) & (merged_df['date'] <= merged_df['period_end'])]
merged_df = merged_df.drop_duplicates()

missing_party = pd.merge(missing_party, merged_df[['speaker_name', 'date', 'speaker_party']], on=['speaker_name', 'date'], how='left')

In [149]:
missing_party['speaker_party'].unique()

array(['V', 'S', 'RV', 'M', 'KF', 'SF', 'LA'], dtype=object)

We loop over the rows in the full speech dataset and insert the value from missing_party dataset if speaker_party is empty

In [150]:
speaker_party_dict = {(row['speaker_name'], row['date']): row['speaker_party'] for _, row in missing_party.iterrows()}
for index, row in dspeech.iterrows():
    speaker_name = row['speaker_name']
    date = row['date']
    if pd.isna(row['speaker_party']) and (speaker_name, date) in speaker_party_dict:
        dspeech.at[index, 'speaker_party'] = speaker_party_dict[(speaker_name, date)]

Now we have 0 missing values in speaker_party!

In [155]:
dspeech['speaker_party'].isnull().sum()

0

In [154]:
dspeech.drop(columns=["date"], inplace=True)

In [157]:
#split intwo two datasets and save
dspeech_1 = pa.Table.from_pandas(dspeech.iloc[: (len(dspeech) // 2)])
dspeech_2 = pa.Table.from_pandas(dspeech.iloc[(len(dspeech) // 2) :])
pq.write_table(dspeech_1, "./data/data_speech1.parquet")
pq.write_table(dspeech_2, "./data/data_speech2.parquet")