In [1]:
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
import requests
import re
import datetime 
import math
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

## Slice full transcripts

In [99]:
directory = 'data/raw/transcripts_full'
transcripts_list = []
 
# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and directory + '/transcripts_' in file:
        print(file)
        
        with open(file, encoding='utf-8') as f:
            transcript_df = pd.read_csv(f)
            
            # replace NaN in Text field by empty string ''
            transcript_df['Text'] = transcript_df['Text'].fillna('')

            # replace automatic placeholder for empty cells by NaN
            transcript_df = transcript_df.replace('#PersonNumber#', np.nan)
            transcript_df = transcript_df.replace('#SpeakerFunction#', np.nan)
            transcript_df = transcript_df.replace('#CouncilId#', np.nan)
            transcript_df = transcript_df.replace('#Start#', np.nan)
            transcript_df = transcript_df.replace('#End#', np.nan)
            transcript_df = transcript_df.replace('#Function#', np.nan)
            transcript_df = transcript_df.replace('#LanguageOfText#', np.nan)
            
            # append df to list of df
            transcripts_list.append(transcript_df)
            
len(transcripts_list)

data/raw/transcripts_full/transcripts_48.csv
data/raw/transcripts_full/transcripts_49.csv
data/raw/transcripts_full/transcripts_47.csv
data/raw/transcripts_full/transcripts_46.csv
data/raw/transcripts_full/transcripts_50.csv
data/raw/transcripts_full/transcripts_51.csv


6

In [4]:
# count number of empty cells for each column

for transcript_df in transcripts_list:
    print('size: ', len(transcript_df))
    for col in transcript_df.columns:
        print(col, transcript_df[col].isnull().sum())
        
    print('-----')

size:  51852
ID 0
IdSubject 0
PersonNumber 10300
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 10300
CouncilId 10300
Start 7406
End 7406
Function 10305
LanguageOfText 5919
-----
size:  53019
ID 0
IdSubject 0
PersonNumber 11054
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 11054
CouncilId 11058
Start 7280
End 7280
Function 11056
LanguageOfText 12237
-----
size:  46051
ID 0
IdSubject 0
PersonNumber 5875
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 5875
CouncilId 5908
Start 4305
End 4305
Function 5917
LanguageOfText 5417
-----
size:  39969
ID 0
IdSubject 0
PersonNumber 5025
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 24941
CouncilId 5026
Start 3238
End 3238
Function 31133
LanguageOfText 5537
-----
size:  51716
ID 0
IdSubject 0
PersonNumber 11708
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 11708
CouncilId 11713
Start 6778
End 6

In [5]:
# remove html tags from texts
count = 0

for transcript_df in transcripts_list:
    print('transcript idx: ', count)
    
    transcript_df['Text'] = transcript_df['Text'].apply(lambda text: BeautifulSoup(text, 'html.parser').text)
    count += 1

transcript idx:  0
transcript idx:  1
transcript idx:  2
transcript idx:  3
transcript idx:  4
transcript idx:  5


In [6]:
# save all transcripts by session

for transcript_df in transcripts_list:
    sessions_list = list(transcript_df['IdSession'].unique())
    for session_id in sessions_list:
        transcript_session = transcript_df.loc[transcript_df['IdSession'] == session_id]
        transcript_session.to_csv('data/raw/transcripts/transcript_' + str(session_id) + '.csv', encoding='utf-8')

## Pre-process transcript files

In [2]:
# get all transcripts file names
directory = 'data/raw/transcripts'
transcripts_files_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and file.endswith('.csv'):
        transcripts_files_list.append(file)

# sort transcripts chronologically 
transcripts_files_list.sort()
len(transcripts_files_list)

117

In [4]:
# load all transcripts data
for file in tqdm(transcripts_files_list):
    with open(file, encoding='utf-8') as f:
        transcript_df = pd.read_csv(f)
        # drop column 'Unnamed'
        transcript_df = transcript_df.drop(columns=['Unnamed: 0'])
        # append df to list of df
        transcripts_list.append(transcript_df)
            
len(transcripts_list)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 117/117 [00:05<00:00, 22.19it/s]


117

In [20]:
all_sessions_df = pd.DataFrame()
all_sessions_list = list()

for session_df in tqdm(transcripts_list):
    session_id = session_df.loc[0, 'IdSession']
    
    # sort df by council, date, time and subject
    session_df = session_df.sort_values(by=['MeetingCouncilAbbreviation', 'MeetingDate', 'Start', 'IdSubject'])
    
    #print(session_df['SpeakerFunction'].unique())
    #print(session_df['MeetingCouncilAbbreviation'].unique())
    #print(session_df['CouncilId'].unique())
    
    # drop rows without a PersonNumber
    session_df = session_df.dropna(subset=['PersonNumber'])
    # drop rows when federal chancelor or council president is speaking
    #functions_to_drop = ['BK-M', 'BK-F', 'P-M', 'P-F']
    #session_df = session_df[~session_df['SpeakerFunction'].isin(functions_to_drop)]
    # drop rows when meeting is for both chambers (MeetingCouncilAbbreviation=V, for "Vereinigte Bundesversammlung")
    session_df = session_df[session_df['MeetingCouncilAbbreviation'] != 'V']
    
    # replace "[NB]" by a single space
    session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[NB\]', value=' ')
    # replace "[...]" by an empty string
    session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[.+?\]', value='')
    # set column PersonNumber type to int
    session_df = session_df.astype({"PersonNumber": int})
    
    # sort session by transcript id
    session_df = session_df.sort_values(by='ID')
    session_df = session_df.reset_index(drop=True)
    
    all_sessions_df = pd.concat([all_sessions_df, session_df])
    all_sessions_list.append(session_df)
    
    # save df
    session_df.to_csv('data/transcripts/transcript_' + str(session_id) + '.csv', encoding='utf-8')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 117/117 [00:09<00:00, 12.00it/s]


In [19]:
all_sessions_list[1]

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,1541,272,205,Die Kommission beantragt einstimmig bei 3 Enth...,N,20000306,4602,P-M,1.0,2000-03-06T18:05:56,2000-03-06T18:06:35,,
1,1542,273,205,Die Kommission beantragt einstimmig bei 3 Enth...,N,20000306,4602,P-M,1.0,2000-03-06T18:05:11,2000-03-06T18:05:56,,
2,1543,274,111,98.3480\n\nÜberwiesen als Postulat - Transmis ...,N,20000306,4602,1VP-M,1.0,2000-03-06T19:18:25,2000-03-06T19:18:51,,
3,1544,274,429,"Die Fragen, die hier aufgeworfen worden sind, ...",N,20000306,4602,BR-M,99.0,2000-03-06T19:07:18,2000-03-06T19:18:25,,DE
4,1545,274,111,Ich kann mich grundsätzlich den Worten von Her...,N,20000306,4602,1VP-M,1.0,2000-03-06T19:04:09,2000-03-06T19:07:18,Mit-M,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156,4032,669,195,Bundesgesetz über den Abschluss von Schuldenko...,S,20000324,4602,P-M,2.0,2000-03-24T08:08:45,2000-03-24T08:09:13,,
2157,4033,670,195,Am Schluss dieser Session möchte ich einen Mit...,S,20000324,4602,P-M,2.0,2000-03-24T08:13:14,2000-03-24T08:17:57,,DE
2158,4034,671,195,Bundesgesetz über eine Berichtigung von Artike...,S,20000324,4602,P-M,2.0,2000-03-24T08:12:50,2000-03-24T08:13:14,,
2159,4035,672,195,"Bundesbeschluss über die Volksinitiative ""für ...",S,20000324,4602,P-M,2.0,2000-03-24T08:05:02,2000-03-24T08:05:31,,


### Verification df processing

In [12]:
session_df = transcripts_list[4]
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,8039,1595,205.0,[VS]\nDie Beratung dieses Geschäftes wird unte...,N,20001127,4605,,1.0,2000-11-27T19:58:29,2000-11-27T19:59:47,,
1,8040,1595,140.0,Gestatten Sie mir vier Bemerkungen zum Budget ...,N,20001127,4605,,1.0,2000-11-27T19:52:02,2000-11-27T19:58:29,,DE
2,8041,1595,367.0,Nach den Beschlüssen der Finanzkommission bela...,N,20001127,4605,,1.0,2000-11-27T19:47:44,2000-11-27T19:52:02,,DE
3,8042,1595,440.0,"Ich glaube, wir alle sind froh, dass wir diese...",N,20001127,4605,,1.0,2000-11-27T19:45:23,2000-11-27T19:47:44,,DE
4,8043,1595,332.0,Das Budget 2001 ist mehr oder weniger ausgegli...,N,20001127,4605,,1.0,2000-11-27T19:40:16,2000-11-27T19:45:23,,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2026,10065,1940,111.0,Vierter Wahlgang - Quatrième tour de scrutin\n...,V,20001206,4605,P-M,1.0,2000-12-06T09:44:54,2000-12-06T09:46:41,,
2027,10066,1940,111.0,Fünfter Wahlgang - Cinquième tour de scrutin\n...,V,20001206,4605,P-M,1.0,2000-12-06T09:59:25,2000-12-06T10:01:07,,
2028,10067,1941,111.0,Die Sitzung der Vereinigten Bundesversammlung ...,V,20001206,4605,P-M,1.0,2000-12-06T08:00:05,2000-12-06T08:01:30,,DE
2029,10068,1942,111.0,Ich erkläre die Sitzung der Vereinigten Bundes...,V,20001213,4605,P-M,1.0,2000-12-13T08:00:38,2000-12-13T08:01:49,,DE


In [13]:
# sort df
session_df = session_df.sort_values(by=['MeetingCouncilAbbreviation', 'MeetingDate', 'Start', 'IdSubject'])
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205.0,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205.0,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
35,8074,1597,111.0,Für die ehrenvolle Wahl zum Nationalratspräsid...,N,20001127,4605,P-M,1.0,2000-11-27T14:57:47,2000-11-27T15:15:21,,
34,8073,1597,111.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,P-M,1.0,2000-11-27T15:28:08,2000-11-27T15:30:11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,10052,1940,111.0,"Herr Bundesrat Schmid, ich gratuliere Ihnen zu...",V,20001206,4605,P-M,1.0,2000-12-06T10:20:19,2000-12-06T10:22:52,,DE
2012,10051,1940,111.0,3. Wahl des Bundespräsidenten für 2001 \n3. El...,V,20001206,4605,P-M,1.0,2000-12-06T10:35:58,2000-12-06T10:37:55,,
2011,10050,1940,111.0,4. Wahl des Vizepräsidenten des Bundesrates fü...,V,20001206,4605,P-M,1.0,2000-12-06T10:49:24,2000-12-06T10:50:57,,
2029,10068,1942,111.0,Ich erkläre die Sitzung der Vereinigten Bundes...,V,20001213,4605,P-M,1.0,2000-12-13T08:00:38,2000-12-13T08:01:49,,DE


In [14]:
# only keep rows with a PersonNumber
session_df = session_df.dropna(subset=['PersonNumber'])
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205.0,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205.0,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
35,8074,1597,111.0,Für die ehrenvolle Wahl zum Nationalratspräsid...,N,20001127,4605,P-M,1.0,2000-11-27T14:57:47,2000-11-27T15:15:21,,
34,8073,1597,111.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,P-M,1.0,2000-11-27T15:28:08,2000-11-27T15:30:11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,10052,1940,111.0,"Herr Bundesrat Schmid, ich gratuliere Ihnen zu...",V,20001206,4605,P-M,1.0,2000-12-06T10:20:19,2000-12-06T10:22:52,,DE
2012,10051,1940,111.0,3. Wahl des Bundespräsidenten für 2001 \n3. El...,V,20001206,4605,P-M,1.0,2000-12-06T10:35:58,2000-12-06T10:37:55,,
2011,10050,1940,111.0,4. Wahl des Vizepräsidenten des Bundesrates fü...,V,20001206,4605,P-M,1.0,2000-12-06T10:49:24,2000-12-06T10:50:57,,
2029,10068,1942,111.0,Ich erkläre die Sitzung der Vereinigten Bundes...,V,20001213,4605,P-M,1.0,2000-12-13T08:00:38,2000-12-13T08:01:49,,DE


In [15]:
print(session_df['SpeakerFunction'].unique())
print(session_df['MeetingCouncilAbbreviation'].unique())
print(session_df['CouncilId'].unique())
session_df.loc[session_df['MeetingCouncilAbbreviation'] == 'V']

[nan 'P-M' 'BR-M' '1VP-F' 'BR-F' 'BK-F' '2VP-M' 'BPR-M' 'P-F' '1VP-M']
['N' 'S' 'V']
[ 1. 99. 98.  2. nan]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
2028,10067,1941,111.0,Die Sitzung der Vereinigten Bundesversammlung ...,V,20001206,4605,P-M,1.0,2000-12-06T08:00:05,2000-12-06T08:01:30,,DE
2016,10055,1940,111.0,1. Rücktritt von Herrn Bundesrat Adolf Ogi\n1....,V,20001206,4605,P-M,1.0,2000-12-06T08:01:30,2000-12-06T08:15:03,,
2017,10056,1940,428.0,"Herr Präsident, Ihre Worte haben mich gerührt....",V,20001206,4605,BPR-M,99.0,2000-12-06T08:15:03,2000-12-06T08:19:23,,DE
2018,10057,1940,111.0,Ich danke Herrn Bundespräsident Ogi für seine ...,V,20001206,4605,P-M,1.0,2000-12-06T08:19:23,2000-12-06T08:20:53,,DE
2019,10058,1940,84.0,Ich möchte im Namen der Fraktion der Schweizer...,V,20001206,4605,,1.0,2000-12-06T08:20:53,2000-12-06T08:23:32,,DE
2020,10059,1940,9.0,"Stellen Sie sich vor, die Zauberformel würde h...",V,20001206,4605,,1.0,2000-12-06T08:23:32,2000-12-06T08:25:31,,DE
2021,10060,1940,241.0,"Nous allons vivre, et le pays avec nous, aujou...",V,20001206,4605,,1.0,2000-12-06T08:25:31,2000-12-06T08:30:54,,FR
2022,10061,1940,70.0,"Le Parti libéral, vous le savez, n'est pas un ...",V,20001206,4605,,1.0,2000-12-06T08:30:54,2000-12-06T08:34:37,,FR
2023,10062,1940,111.0,Erster Wahlgang - Premier tour de scrutin\n[VS...,V,20001206,4605,P-M,1.0,2000-12-06T08:55:37,2000-12-06T08:57:44,,
2024,10063,1940,111.0,Zweiter Wahlgang - Deuxième tour de scrutin\n[...,V,20001206,4605,P-M,1.0,2000-12-06T09:12:05,2000-12-06T09:14:34,,


In [16]:
# drop rows when federal chancelor or council president is speaking
functions_to_drop = ['BK-M', 'BK-F', 'P-M', 'P-F']
session_df = session_df[~session_df['SpeakerFunction'].isin(functions_to_drop)]
# drop rows when meeting is for both chambers (MeetingCouncilAbbreviation=V, for "Vereinigte Bundesversammlung")
session_df = session_df[session_df['MeetingCouncilAbbreviation'] != 'V']
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205.0,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205.0,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
19,8058,1595,476.0,Zum ersten Mal seit 1991 unterbreitet der Bund...,N,20001127,4605,,1.0,2000-11-27T16:25:08,2000-11-27T16:47:08,*,DE
17,8056,1595,492.0,Le budget 2001 de la Confédération se présente...,N,20001127,4605,,1.0,2000-11-27T16:47:27,2000-11-27T17:12:14,*,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1976,10015,1921,12.0,Sie haben wieder einen ausführlichen Bericht I...,S,20001214,4605,,2.0,2000-12-14T09:30:31,2000-12-14T09:32:58,*,DE
1975,10014,1921,30.0,Comme notre présidente de la commission l'a di...,S,20001214,4605,,2.0,2000-12-14T09:32:58,2000-12-14T09:37:33,,FR
1974,10013,1921,375.0,Il est toujours délicat d'observer qu'une loi ...,S,20001214,4605,,2.0,2000-12-14T09:37:33,2000-12-14T09:39:31,,FR
1962,10001,1916,194.0,Sie haben den Bericht der Delegation bei der I...,S,20001214,4605,,2.0,2000-12-14T09:40:58,2000-12-14T09:44:39,*,DE


In [17]:
# replace "[NB]" by a single space
session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[NB\]', value=' ')
# replace "[...]" by an empty string
session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[.+?\]', value='')
# set column PersonNumber type to int
session_df = session_df.astype({"PersonNumber": int})
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
19,8058,1595,476,Zum ersten Mal seit 1991 unterbreitet der Bund...,N,20001127,4605,,1.0,2000-11-27T16:25:08,2000-11-27T16:47:08,*,DE
17,8056,1595,492,Le budget 2001 de la Confédération se présente...,N,20001127,4605,,1.0,2000-11-27T16:47:27,2000-11-27T17:12:14,*,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1976,10015,1921,12,Sie haben wieder einen ausführlichen Bericht I...,S,20001214,4605,,2.0,2000-12-14T09:30:31,2000-12-14T09:32:58,*,DE
1975,10014,1921,30,Comme notre présidente de la commission l'a di...,S,20001214,4605,,2.0,2000-12-14T09:32:58,2000-12-14T09:37:33,,FR
1974,10013,1921,375,Il est toujours délicat d'observer qu'une loi ...,S,20001214,4605,,2.0,2000-12-14T09:37:33,2000-12-14T09:39:31,,FR
1962,10001,1916,194,Sie haben den Bericht der Delegation bei der I...,S,20001214,4605,,2.0,2000-12-14T09:40:58,2000-12-14T09:44:39,*,DE


In [273]:
session_df.loc[session_df['ID'] == 71340]

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
66,71340,11355,326,"Das Büro hat die Wahl von Herrn Marc Suter, ge...",N,20070305,4716,Mit-M,1.0,2007-03-05T14:32:41,2007-03-05T14:34:33,B,DE


## Get subjects and businesses

In [273]:
# open data files
# https://ws.parlament.ch/odata.svc/Business?$top=20&$filter=Language eq 'FR'&$select=ID,BusinessShortNumber,BusinessType,Description,Title,Tags,TagNames,SubmissionDate,SubmissionLegislativePeriod,SubmissionSession,ResponsibleDepartmentAbbreviation,BusinessStatusDate,BusinessStatusText,BusinessTypeName

with open('data/raw/data_persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file)

with open('data/raw/data_businesses.csv', encoding='utf-8') as file:
    businesses_df = pd.read_csv(file)

with open('data/raw/data_subjects.csv', encoding='utf-8') as file:
    subjects_df = pd.read_csv(file)
    
businesses_df

Unnamed: 0,ID,BusinessShortNumber,BusinessType,BusinessTypeName,Title,Description,BusinessStatusText,BusinessStatusDate,ResponsibleDepartmentAbbreviation,Tags,SubmissionDate,SubmissionSession,SubmissionLegislativePeriod,TagNames
0,19780222,78.2220,4,Initiative parlementaire,Code pénal. Interruption de la grossesse (Gira...,Rapport de la Commission du 27.08.1979,Liquidé,1981-03-10T00:00:00,#ResponsibleDepartmentAbbreviation#,#Tags#,1978-06-05T00:00:00,4311,40,#TagNames#
1,19800226,80.2260,4,Initiative parlementaire,Constitutiion fédérale. Droit de l'entreprise ...,#Description#,Liquidé,1983-03-03T00:00:00,Parl,#Tags#,1980-06-02T00:00:00,4311,41,#TagNames#
2,19850019,85.0190,1,Objet du Conseil fédéral,Utilisation pacifique de l'énergie nucléaire. ...,Message et projet d'arrêté du 1er mai 1985 con...,Liquidé,2003-12-16T00:00:00,DFAE,#Tags#,1985-05-01T00:00:00,4311,42,#TagNames#
3,19850227,85.2270,4,Initiative parlementaire,Droit des assurances sociales,#Description#,Liquidé,2000-10-06T00:00:00,Parl,#Tags#,1985-02-07T00:00:00,4311,42,#TagNames#
4,19870069,87.0690,1,Objet du Conseil fédéral,Loi sur les chemins de fer. Modification,Message et projets de lois du 18 novembre 1987...,Liquidé,1987-11-18T00:00:00,DETEC,#Tags#,1987-11-18T00:00:00,4311,42,#TagNames#
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57504,20237463,23.7463,14,Heure des questions. Question,Pascal Hollenstein peut-il rester en poste ?,#Description#,Liquidé,2023-06-12T00:00:00,DFF,4|34,2023-06-07T00:00:00,5121,51,Politique d'Etat|Médias et communication
57505,20237464,23.7464,14,Heure des questions. Question,Scandaleuse récolte de bulletins de vote par L...,#Description#,Liquidé,2023-06-12T00:00:00,DETEC,4|8|9|34|48|52,2023-06-07T00:00:00,5121,51,Politique d'Etat|Politique internationale|Poli...
57506,20237465,23.7465,14,Heure des questions. Question,Le cabinet d'audit PwC est-il coresponsable de...,#Description#,Liquidé,2023-06-12T00:00:00,DFF,15|24,2023-06-07T00:00:00,5121,51,Économie|Finances
57507,20237466,23.7466,14,Heure des questions. Question,Effondrement de Credit Suisse et défaillance d...,#Description#,Liquidé,2023-06-12T00:00:00,DFF,15|24,2023-06-07T00:00:00,5121,51,Économie|Finances


In [274]:
subjects_df

Unnamed: 0,IdSubject,BusinessNumber,BusinessShortNumber,Title,TitleFR
0,6688,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
1,6905,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
2,645,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
3,775,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
4,1234,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
...,...,...,...,...,...
62273,60486,20239002,23.9002,Communications de la présidente,Communications de la présidente
62274,60501,20239002,23.9002,Communications de la présidente,Communications de la présidente
62275,59643,20239003,23.9003,Eloge funèbre,Eloge funèbre
62276,59647,20239003,23.9003,Eloge funèbre,Eloge funèbre


In [275]:
persons_df

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,ParlGroupName,PartyName,PartyAbbreviation,DateJoining,DateLeaving,DateElection,DateOath,DateResignation,DateOfBirth,DateOfDeath
0,1,Pierre,Aguet,m,Groupe socialiste,Parti socialiste suisse,PSS,1995-12-04T00:00:00,1999-12-05T00:00:00,1995-12-04T00:00:00,1995-12-04T00:00:00,1999-12-05T00:00:00,1938-03-02T00:00:00,
1,2,Heinz,Allenspach,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,1979-11-26T00:00:00,1995-12-03T00:00:00,1979-11-26T00:00:00,1979-11-26T00:00:00,1995-12-03T00:00:00,1928-02-22T00:00:00,2022-09-16T00:00:00
2,6,Manfred,Aregger,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,1995-12-04T00:00:00,1999-12-05T00:00:00,1995-12-04T00:00:00,1995-12-04T00:00:00,1999-12-05T00:00:00,1931-01-27T00:00:00,
3,7,Geneviève,Aubry,f,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,1979-11-26T00:00:00,1995-12-03T00:00:00,1979-11-26T00:00:00,1979-11-26T00:00:00,1995-12-03T00:00:00,1928-03-04T00:00:00,
4,8,Rosmarie,Bär,f,Groupe des VERT-E-S,Liste libre (BE),LL,1987-11-30T00:00:00,1995-12-03T00:00:00,1987-11-30T00:00:00,1987-11-30T00:00:00,1995-12-03T00:00:00,1947-12-01T00:00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3623,4329,Daniel,Ruch,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2022-06-13T00:00:00,,2019-10-20T00:00:00,2022-06-13T00:00:00,,1963-03-20T00:00:00,
3624,4330,Alexandre,Berthoud,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2022-06-13T00:00:00,,2019-10-20T00:00:00,2022-06-13T00:00:00,,1977-06-29T00:00:00,
3625,4331,Marc,Jost,m,Le Groupe du Centre. Le Centre. PEV.,Parti évangélique suisse,PEV,2022-11-28T00:00:00,,2019-10-20T00:00:00,2022-11-28T00:00:00,,1974-02-06T00:00:00,
3626,4332,Mathilde,Crevoisier Crelier,f,Groupe socialiste,Parti socialiste suisse,PSS,2022-12-15T00:00:00,,2019-10-20T00:00:00,2022-12-15T00:00:00,,1980-01-05T00:00:00,


In [276]:
all_sessions_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
2,3,1,498,"Madame la Présidente de la Confédération, \nSe...",N,19991206,4601,AP-M,1.0,1999-12-06T14:32:24,1999-12-06T14:33:39,,FR
0,1,1,498,La séance d'aujourd'hui s'ouvre sous le signe ...,N,19991206,4601,AP-M,1.0,1999-12-06T14:36:50,1999-12-06T14:50:21,,FR
47,53,6,498,Il appartenait au doyen d'âge de constituer le...,N,19991206,4601,AP-M,1.0,1999-12-06T14:50:21,1999-12-06T14:52:13,,FR
46,52,6,273,Das provisorische Büro hat an seiner Sitzung v...,N,19991206,4601,,1.0,1999-12-06T14:52:13,1999-12-06T15:01:02,b,DE
45,51,6,305,Le Bureau provisoire de notre Assemblée a siég...,N,19991206,4601,,1.0,1999-12-06T15:01:02,1999-12-06T15:07:43,b,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,319686,60670,4238,"Sehr geschätzter Herr Nationalrat Egger, Sie h...",N,20230504,5120,BR-F,99.0,2023-05-04T17:04:38,2023-05-04T17:06:20,BR-F,DE
816,319809,60674,4268,La questione della mediatizzazione dei process...,N,20230504,5120,Mit-F,1.0,2023-05-04T17:06:39,2023-05-04T17:11:27,Mit-F,IT
734,319699,60674,1122,"Frau Kollegin Gysin, Sie wollen eine Priorisie...",N,20230504,5120,Mit-M,1.0,2023-05-04T17:11:29,2023-05-04T17:11:46,Mit-M,DE
815,319807,60674,4268,"Collega Fluri, sono molto consapevole del prob...",N,20230504,5120,Mit-F,1.0,2023-05-04T17:11:46,2023-05-04T17:12:44,Mit-F,IT


In [277]:
# get all id of persons and subjects appearing in transcripts

all_person_id = all_sessions_df['PersonNumber'].unique()
print(len(all_person_id))
all_subject_id = all_sessions_df['IdSubject'].unique()
print(len(all_subject_id))

710
25820


In [279]:
# filter persons to keep only those appearing in transcripts

persons_filtered = persons_df[persons_df['PersonNumber'].isin(all_person_id)]
persons_filtered

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,ParlGroupName,PartyName,PartyAbbreviation,DateJoining,DateLeaving,DateElection,DateOath,DateResignation,DateOfBirth,DateOfDeath
5,9,Ruedi,Baumann,m,Groupe des VERT-E-S,Les VERT-E-S suisses,VERT-E-S,1995-12-04T00:00:00,2003-11-30T00:00:00,1995-12-04T00:00:00,1995-12-04T00:00:00,2003-11-30T00:00:00,1947-11-11T00:00:00,
8,12,Christine,Beerli,f,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2002-11-25T00:00:00,2003-11-30T00:00:00,1995-12-04T00:00:00,1995-12-04T00:00:00,2003-11-30T00:00:00,1953-03-26T00:00:00,
10,14,Duri,Bezzola,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2003-12-01T00:00:00,2007-03-04T00:00:00,2003-10-19T00:00:00,2003-12-01T00:00:00,2007-03-04T00:00:00,1942-06-23T00:00:00,
11,15,Max,Binder,m,Groupe de l'Union démocratique du Centre,Union Démocratique du Centre,UDC,2011-12-05T00:00:00,2015-11-29T00:00:00,2011-10-23T00:00:00,2011-12-05T00:00:00,2015-11-29T00:00:00,1947-11-26T00:00:00,
17,21,Christoph,Blocher,m,Groupe de l'Union démocratique du Centre,Union Démocratique du Centre,UDC,2011-12-05T00:00:00,2014-05-31T00:00:00,2011-10-23T00:00:00,2011-12-05T00:00:00,2014-05-31T00:00:00,1940-10-11T00:00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3623,4329,Daniel,Ruch,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2022-06-13T00:00:00,,2019-10-20T00:00:00,2022-06-13T00:00:00,,1963-03-20T00:00:00,
3624,4330,Alexandre,Berthoud,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2022-06-13T00:00:00,,2019-10-20T00:00:00,2022-06-13T00:00:00,,1977-06-29T00:00:00,
3625,4331,Marc,Jost,m,Le Groupe du Centre. Le Centre. PEV.,Parti évangélique suisse,PEV,2022-11-28T00:00:00,,2019-10-20T00:00:00,2022-11-28T00:00:00,,1974-02-06T00:00:00,
3626,4332,Mathilde,Crevoisier Crelier,f,Groupe socialiste,Parti socialiste suisse,PSS,2022-12-15T00:00:00,,2019-10-20T00:00:00,2022-12-15T00:00:00,,1980-01-05T00:00:00,


In [280]:
# filter subjects to keep only those appearing in transcripts

subjects_filtered = subjects_df[subjects_df['IdSubject'].isin(all_subject_id)]
all_business_id = subjects_filtered['BusinessNumber'].unique()
print(len(all_business_id))
subjects_filtered

19619


Unnamed: 0,IdSubject,BusinessNumber,BusinessShortNumber,Title,TitleFR
0,6688,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
1,6905,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
2,645,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
3,775,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
4,1234,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
...,...,...,...,...,...
62225,60010,20237245,23.7245,Pénurie de médicaments. Degré d’efficacité des...,Heure des questions.\nQuestion Feller Olivier....
62232,59975,20237252,23.7252,Le collaborateur de l'OFSP responsable de la g...,Heure des questions.\nQuestion Büchel Roland R...
62233,59999,20237253,23.7253,Chômage en hausse malgré la pénurie de main-d’...,Heure des questions.\nQuestion Strupler Manuel...
62238,59993,20237258,23.7258,Comment les recommandations du comité scientif...,Heure des questions.\nQuestion Python Valentin...


In [281]:
# filter businesses to keep only those appearing in transcripts

businesses_filtered = businesses_df[businesses_df['ID'].isin(all_business_id)]
businesses_filtered = businesses_filtered.drop(columns=['Description'], errors='ignore')
businesses_filtered

Unnamed: 0,ID,BusinessShortNumber,BusinessType,BusinessTypeName,Title,BusinessStatusText,BusinessStatusDate,ResponsibleDepartmentAbbreviation,Tags,SubmissionDate,SubmissionSession,SubmissionLegislativePeriod,TagNames
2,19850019,85.0190,1,Objet du Conseil fédéral,Utilisation pacifique de l'énergie nucléaire. ...,Liquidé,2003-12-16T00:00:00,DFAE,#Tags#,1985-05-01T00:00:00,4311,42,#TagNames#
3,19850227,85.2270,4,Initiative parlementaire,Droit des assurances sociales,Liquidé,2000-10-06T00:00:00,Parl,#Tags#,1985-02-07T00:00:00,4311,42,#TagNames#
42,19910411,91.4110,4,Initiative parlementaire,Prestations familiales,Liquidé,2006-03-24T09:07:22,DFI,#Tags#,1991-03-13T00:00:00,4317,43,#TagNames#
44,19910419,91.4190,4,Initiative parlementaire,Ratification de la Charte sociale européenne,Liquidé,2004-12-17T08:34:21,Parl,#Tags#,1991-06-19T00:00:00,4319,43,#TagNames#
72,19920312,92.3120,3,Initiative déposée par un canton,Légalisation de la consommation de drogue et m...,Liquidé,2000-03-07T00:00:00,#ResponsibleDepartmentAbbreviation#,#Tags#,1992-12-07T00:00:00,4407,44,#TagNames#
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57286,20237245,23.7245,14,Heure des questions. Question,Pénurie de médicaments. Degré d’efficacité des...,Liquidé,2023-03-13T00:00:00,DEFR,9|2841,2023-03-08T00:00:00,5118,51,Politique de sécurité|Santé
57293,20237252,23.7252,14,Heure des questions. Question,Le collaborateur de l'OFSP responsable de la g...,Liquidé,2023-03-13T00:00:00,ChF,2841,2023-03-08T00:00:00,5118,51,Santé
57294,20237253,23.7253,14,Heure des questions. Question,Chômage en hausse malgré la pénurie de main-d’...,Liquidé,2023-03-13T00:00:00,DEFR,44,2023-03-08T00:00:00,5118,51,Emploi et travail
57299,20237258,23.7258,14,Heure des questions. Question,Comment les recommandations du comité scientif...,Liquidé,2023-03-13T00:00:00,DEFR,32|2841,2023-03-08T00:00:00,5118,51,Éducation|Santé


In [282]:
# save filtered dfs
persons_filtered.to_csv('data/persons.csv', encoding='utf-8')
businesses_filtered.to_csv('data/businesses.csv', encoding='utf-8')
subjects_filtered.to_csv('data/subjects.csv', encoding='utf-8')

## Extract business tags

In [33]:
with open('data/businesses.csv', encoding='utf-8') as file:
    businesses_df = pd.read_csv(file).drop(columns='Unnamed: 0')

businesses_df

Unnamed: 0,ID,BusinessShortNumber,BusinessType,BusinessTypeName,Title,BusinessStatusText,BusinessStatusDate,ResponsibleDepartmentAbbreviation,Tags,SubmissionDate,SubmissionSession,SubmissionLegislativePeriod,TagNames
0,19850019,85.0190,1,Objet du Conseil fédéral,Utilisation pacifique de l'énergie nucléaire. ...,Liquidé,2003-12-16T00:00:00,DFAE,#Tags#,1985-05-01T00:00:00,4311,42,#TagNames#
1,19850227,85.2270,4,Initiative parlementaire,Droit des assurances sociales,Liquidé,2000-10-06T00:00:00,Parl,#Tags#,1985-02-07T00:00:00,4311,42,#TagNames#
2,19910411,91.4110,4,Initiative parlementaire,Prestations familiales,Liquidé,2006-03-24T09:07:22,DFI,#Tags#,1991-03-13T00:00:00,4317,43,#TagNames#
3,19910419,91.4190,4,Initiative parlementaire,Ratification de la Charte sociale européenne,Liquidé,2004-12-17T08:34:21,Parl,#Tags#,1991-06-19T00:00:00,4319,43,#TagNames#
4,19920312,92.3120,3,Initiative déposée par un canton,Légalisation de la consommation de drogue et m...,Liquidé,2000-03-07T00:00:00,#ResponsibleDepartmentAbbreviation#,#Tags#,1992-12-07T00:00:00,4407,44,#TagNames#
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19465,20237245,23.7245,14,Heure des questions. Question,Pénurie de médicaments. Degré d’efficacité des...,Liquidé,2023-03-13T00:00:00,DEFR,9|2841,2023-03-08T00:00:00,5118,51,Politique de sécurité|Santé
19466,20237252,23.7252,14,Heure des questions. Question,Le collaborateur de l'OFSP responsable de la g...,Liquidé,2023-03-13T00:00:00,ChF,2841,2023-03-08T00:00:00,5118,51,Santé
19467,20237253,23.7253,14,Heure des questions. Question,Chômage en hausse malgré la pénurie de main-d’...,Liquidé,2023-03-13T00:00:00,DEFR,44,2023-03-08T00:00:00,5118,51,Emploi et travail
19468,20237258,23.7258,14,Heure des questions. Question,Comment les recommandations du comité scientif...,Liquidé,2023-03-13T00:00:00,DEFR,32|2841,2023-03-08T00:00:00,5118,51,Éducation|Santé


In [287]:
businesses_df = businesses_df.replace('#TagNames#', np.nan)
businesses_df = businesses_df.replace('#Tags#', np.nan)
businesses_df

Unnamed: 0,ID,BusinessShortNumber,BusinessType,BusinessTypeName,Title,BusinessStatusText,BusinessStatusDate,ResponsibleDepartmentAbbreviation,Tags,SubmissionDate,SubmissionSession,SubmissionLegislativePeriod,TagNames
0,19850019,85.0190,1,Objet du Conseil fédéral,Utilisation pacifique de l'énergie nucléaire. ...,Liquidé,2003-12-16T00:00:00,DFAE,,1985-05-01T00:00:00,4311,42,
1,19850227,85.2270,4,Initiative parlementaire,Droit des assurances sociales,Liquidé,2000-10-06T00:00:00,Parl,,1985-02-07T00:00:00,4311,42,
2,19910411,91.4110,4,Initiative parlementaire,Prestations familiales,Liquidé,2006-03-24T09:07:22,DFI,,1991-03-13T00:00:00,4317,43,
3,19910419,91.4190,4,Initiative parlementaire,Ratification de la Charte sociale européenne,Liquidé,2004-12-17T08:34:21,Parl,,1991-06-19T00:00:00,4319,43,
4,19920312,92.3120,3,Initiative déposée par un canton,Légalisation de la consommation de drogue et m...,Liquidé,2000-03-07T00:00:00,#ResponsibleDepartmentAbbreviation#,,1992-12-07T00:00:00,4407,44,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19465,20237245,23.7245,14,Heure des questions. Question,Pénurie de médicaments. Degré d’efficacité des...,Liquidé,2023-03-13T00:00:00,DEFR,9|2841,2023-03-08T00:00:00,5118,51,Politique de sécurité|Santé
19466,20237252,23.7252,14,Heure des questions. Question,Le collaborateur de l'OFSP responsable de la g...,Liquidé,2023-03-13T00:00:00,ChF,2841,2023-03-08T00:00:00,5118,51,Santé
19467,20237253,23.7253,14,Heure des questions. Question,Chômage en hausse malgré la pénurie de main-d’...,Liquidé,2023-03-13T00:00:00,DEFR,44,2023-03-08T00:00:00,5118,51,Emploi et travail
19468,20237258,23.7258,14,Heure des questions. Question,Comment les recommandations du comité scientif...,Liquidé,2023-03-13T00:00:00,DEFR,32|2841,2023-03-08T00:00:00,5118,51,Éducation|Santé


In [322]:
businesses_df = businesses_df.dropna(subset='TagNames')
businesses_df = businesses_df.set_index('ID')
businesses_df.index.name = None
businesses_df

Unnamed: 0,BusinessShortNumber,BusinessType,BusinessTypeName,Title,BusinessStatusText,BusinessStatusDate,ResponsibleDepartmentAbbreviation,Tags,SubmissionDate,SubmissionSession,SubmissionLegislativePeriod,TagNames
19970419,97.4190,4,Initiative parlementaire,Article constitutionnel sur l'éducation,Liquidé,2005-12-16T06:51:38,#ResponsibleDepartmentAbbreviation#,32,1997-04-30T00:00:00,4507,45,Éducation
19980038,98.0380,1,Objet du Conseil fédéral,"CP, CPM et loi fédérale sur le droit pénal des...",Liquidé,2003-06-20T00:00:00,DFJP,12,1998-06-08T00:00:00,4514,45,Droit
19990083,99.0830,1,Objet du Conseil fédéral,Conseil national. Vérification des pouvoirs,Liquidé,1999-12-08T00:00:00,Parl,421,1999-11-10T00:00:00,4601,45,Parlement
20000056,0.0560,1,Objet du Conseil fédéral,"""Pour une durée du travail réduite"". Initiativ...",Liquidé,2001-06-22T00:00:00,DEFR,15,2000-06-28T00:00:00,4604,46,Économie
20000065,0.0650,2,Objet du Parlement,Délégation auprès de l'Union interparlementair...,Liquidé,2000-12-15T00:00:00,Parl,8|421,1999-12-31T00:00:00,4602,46,Politique internationale|Parlement
...,...,...,...,...,...,...,...,...,...,...,...,...
20237245,23.7245,14,Heure des questions. Question,Pénurie de médicaments. Degré d’efficacité des...,Liquidé,2023-03-13T00:00:00,DEFR,9|2841,2023-03-08T00:00:00,5118,51,Politique de sécurité|Santé
20237252,23.7252,14,Heure des questions. Question,Le collaborateur de l'OFSP responsable de la g...,Liquidé,2023-03-13T00:00:00,ChF,2841,2023-03-08T00:00:00,5118,51,Santé
20237253,23.7253,14,Heure des questions. Question,Chômage en hausse malgré la pénurie de main-d’...,Liquidé,2023-03-13T00:00:00,DEFR,44,2023-03-08T00:00:00,5118,51,Emploi et travail
20237258,23.7258,14,Heure des questions. Question,Comment les recommandations du comité scientif...,Liquidé,2023-03-13T00:00:00,DEFR,32|2841,2023-03-08T00:00:00,5118,51,Éducation|Santé


In [323]:
businesses_dict = businesses_df['TagNames'].to_dict()
len(businesses_dict)

18194

In [324]:
business_tag_dict = dict()

for idx, tags_str in businesses_dict.items():
    tags_list = tags_str.split('|')
    business_tag_dict[idx] = dict()
    
    for tag in tags_list:
        business_tag_dict[idx][tag] = 1
        
len(business_tag_dict)

18194

In [328]:
business_tag_df = pd.DataFrame.from_dict(business_tag_dict, orient='index').fillna(0).astype('int')
business_tag_df

Unnamed: 0,Éducation,Droit,Parlement,Économie,Politique internationale,Finances,Transports,Environnement,Santé,Politique européenne,...,Politique migratoire,Culture,Droit civil,Emploi et travail,Droits de l'homme,Droit pénal,Droit international,Fiscalité,Protection sociale,Justice
19970419,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20000086,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20003605,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20003606,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20003647,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
20204399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
20210002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
20210401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [326]:
business_tag_df.to_csv('data/businesses_tags.csv', encoding='utf-8')

## Get members history

In [35]:
with open('data/raw/raw_sessions.csv', encoding='utf-8') as file:
    sessions_df = pd.read_csv(file)
    sessions_df = sessions_df.rename(columns={'ID': 'IdSession'})
    
    # convert date columns to datetime objects
    sessions_df['StartDate'] = pd.to_datetime(sessions_df['StartDate']).dt.date
    sessions_df['EndDate'] = pd.to_datetime(sessions_df['EndDate']).dt.date
    
sessions_df

Unnamed: 0,IdSession,SessionNumber,StartDate,EndDate,LegislativePeriodNumber
0,4601,1,1999-12-06,1999-12-22,46
1,4602,2,2000-03-06,2000-03-24,46
2,4603,3,2000-06-05,2000-06-23,46
3,4604,4,2000-09-18,2000-10-06,46
4,4605,5,2000-11-27,2000-12-15,46
...,...,...,...,...,...
113,5117,17,2022-11-28,2022-12-16,51
114,5118,18,2023-02-27,2023-03-17,51
115,5119,19,2023-04-11,2023-04-13,51
116,5120,20,2023-05-02,2023-05-05,51


In [36]:
with open('data/raw/raw_members.csv', encoding='utf-8') as file:
    members_df = pd.read_csv(file)
    members_df = members_df.sort_values(by='PersonNumber').reset_index(drop=True)
    
    # convert date columns to date objects
    members_df['DateJoining'] = pd.to_datetime(members_df['DateJoining']).dt.date
    members_df['DateLeaving'] = pd.to_datetime(members_df['DateLeaving']).dt.date
    
members_df

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,CantonAbbreviation,CouncilAbbreviation,PartyAbbreviation,DateJoining,DateLeaving
0,1,Pierre,Aguet,m,VD,CN,PSS,1987-11-30,1995-12-03
1,1,Pierre,Aguet,m,VD,CN,PSS,1995-12-04,1999-12-05
2,2,Heinz,Allenspach,m,ZH,CN,PRD,1979-11-26,1995-12-03
3,6,Manfred,Aregger,m,LU,CN,PRD,1995-12-04,1999-12-05
4,6,Manfred,Aregger,m,LU,CN,PRD,1979-11-26,1995-12-03
...,...,...,...,...,...,...,...,...,...
6453,4331,Marc,Jost,m,BE,CN,PEV,2022-11-28,
6454,4332,Mathilde,Crevoisier Crelier,f,JU,CE,PSS,2022-12-15,
6455,4333,Andreas,Meier,m,AG,CN,M-E,2023-02-27,
6456,4334,Thomas,Bläsi,m,GE,CN,UDC,2023-05-30,


In [186]:
# keep only members who left after 12-12-1999 (after begining of 46th leg) or who haven't left yet
members_df = members_df.loc[(members_df['DateLeaving'] > datetime.date(1999, 12, 12)) | (members_df['DateLeaving'].isna())]
# drop duplicate rows
members_df = members_df.drop_duplicates(keep='first')
# drop members who are cancellors
members_df = members_df.dropna(subset='CouncilAbbreviation')
members_df = members_df.reset_index(drop=True)
members_df

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,CantonAbbreviation,CouncilAbbreviation,PartyAbbreviation,DateJoining,DateLeaving
0,9,Ruedi,Baumann,m,BE,CN,PES,1995-12-04,2003-11-30
1,12,Christine,Beerli,f,BE,CE,PRD,1995-12-04,2003-11-30
2,14,Duri,Bezzola,m,GR,CN,PRD,1995-12-04,2003-11-30
3,14,Duri,Bezzola,m,GR,CN,PRD,2003-12-01,2007-03-04
4,15,Max,Binder,m,ZH,CN,UDC,2007-12-03,2011-12-04
...,...,...,...,...,...,...,...,...,...
1727,4331,Marc,Jost,m,BE,CN,PEV,2022-11-28,NaT
1728,4332,Mathilde,Crevoisier Crelier,f,JU,CE,PSS,2022-12-15,NaT
1729,4333,Andreas,Meier,m,AG,CN,M-E,2023-02-27,NaT
1730,4334,Thomas,Bläsi,m,GE,CN,UDC,2023-05-30,NaT


In [187]:
council_list = ['CN', 'CE']
members_council_dict = dict()

for council in council_list:
    members_council_dict[council] = members_df.query('CouncilAbbreviation == @council').reset_index(drop=True)
    
members_council_dict['CN']

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,CantonAbbreviation,CouncilAbbreviation,PartyAbbreviation,DateJoining,DateLeaving
0,9,Ruedi,Baumann,m,BE,CN,PES,1995-12-04,2003-11-30
1,14,Duri,Bezzola,m,GR,CN,PRD,1995-12-04,2003-11-30
2,14,Duri,Bezzola,m,GR,CN,PRD,2003-12-01,2007-03-04
3,15,Max,Binder,m,ZH,CN,UDC,2007-12-03,2011-12-04
4,15,Max,Binder,m,ZH,CN,UDC,2003-12-01,2007-12-02
...,...,...,...,...,...,...,...,...,...
1327,4330,Alexandre,Berthoud,m,VD,CN,PLR,2022-06-13,NaT
1328,4331,Marc,Jost,m,BE,CN,PEV,2022-11-28,NaT
1329,4333,Andreas,Meier,m,AG,CN,M-E,2023-02-27,NaT
1330,4334,Thomas,Bläsi,m,GE,CN,UDC,2023-05-30,NaT


In [188]:
members_council_dict['CN'].nunique()

PersonNumber           609
FirstName              364
LastName               558
GenderAsString           2
CantonAbbreviation      26
CouncilAbbreviation      1
PartyAbbreviation       30
DateJoining             97
DateLeaving            113
dtype: int64

In [189]:
all_members_sessions_dict = dict()

for council in council_list:
    members_sessions_dict = dict()
    for idx_sessions, row_sessions in (sessions_df.iloc[:].iterrows()):
        session_id = row_sessions['IdSession']
        session_start = row_sessions['StartDate']
        session_end = row_sessions['EndDate']
        members_sessions_dict[session_id] = dict()

        for idx_members, row_members in members_council_dict[council].iterrows():
            member_start = row_members['DateJoining']
            member_end = row_members['DateLeaving']
            member_id = row_members['PersonNumber']

            # if member has a end date
            if not pd.isnull(member_end):
                # if start of session is within the dates of the member and member doesn't leave during session
                if (session_start >= member_start) & (session_start <= member_end) & (member_end >= session_end):
                    members_sessions_dict[session_id][member_id] = 1
                # if member dates start during a session and member doesn't leave during session
                elif (member_start >= session_start) & (member_start <= session_end) & (member_end >= session_end):
                    members_sessions_dict[session_id][member_id] = 1
            # if member doesn't have a end date (= still in office)
            else:
                # if session starts after member start
                if (session_start >= member_start):
                    members_sessions_dict[session_id][member_id] = 1
                # if member dates start during a session
                elif (member_start >= session_start) & (member_start <= session_end):
                    members_sessions_dict[session_id][member_id] = 1
    members_sessions_df = pd.DataFrame.from_dict(members_sessions_dict).fillna(0).astype('int')
    all_members_sessions_dict[council] = members_sessions_df
    
all_members_sessions_dict['CN']

Unnamed: 0,4601,4602,4603,4604,4605,4606,4607,4608,4609,4610,...,5112,5113,5114,5115,5116,5117,5118,5119,5120,5121
9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
14,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
15,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
21,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
26,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4330,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,1,1
4331,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
4333,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
4334,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [196]:
# get the duration of each session
sessions_df['duration'] = sessions_df['EndDate'] - sessions_df['StartDate']
sessions_df['duration'] = sessions_df['duration'].apply(lambda x: x.days + 1)
# make a dict with duration for each session
sessions_duration_dict = sessions_df.set_index('IdSession')['duration'].to_dict()
sessions_duration_dict[4601]

17

In [199]:
# get list of legislative numbers
legis_list = list(sessions_df['LegislativePeriodNumber'].unique())

# initalize legislative duration dict with legislative numbers
legis_duration = dict()
for legis in legis_list:
    legis_duration[legis] = 0

# get the total duration of each legislative
for session, duration in sessions_duration_dict.items():
    for legis in legis_list:
        if str(session).startswith(str(legis)):
            legis_duration[legis] += duration
            
print(legis_duration)

# get the weight of each session in the legislative 
session_weight = dict()

for session, duration in sessions_duration_dict.items():
    legis = str(session)[:2]
    session_weight[session] = duration/legis_duration[int(legis)]
    
session_weight[4601]

{46: 313, 47: 309, 48: 314, 49: 317, 50: 313, 51: 303}


0.054313099041533544

In [202]:
all_members_weighted_dict = dict()

for council, df in all_members_sessions_dict.items():
    members_weighted_df = df.copy()

    # get the duration of each session for each member
    for session, duration in session_ratio.items():
        members_weighted_df[session] = members_weighted_df[session] * duration

    # transpose df
    members_weighted_df = members_weighted_df.T
    # match the legislative number for each session
    members_weighted_df = members_weighted_df.merge(sessions_df[['IdSession', 'LegislativePeriodNumber']], left_index=True, right_on='IdSession')
    members_weighted_df = members_weighted_df.drop(columns='IdSession')

    members_weighted_df = round(members_weighted_df.groupby('LegislativePeriodNumber').sum(), 2).T
    members_weighted_df.columns.name = None
    all_members_weighted_dict[council] = members_weighted_df
    
all_members_weighted_dict['CN']

Unnamed: 0,46,47,48,49,50,51
9,1.0,0.00,0.0,0.00,0.0,0.00
14,1.0,0.82,0.0,0.00,0.0,0.00
15,1.0,1.00,1.0,1.00,0.0,0.00
21,1.0,0.00,0.0,0.62,0.0,0.00
26,1.0,1.00,1.0,1.00,0.0,0.00
...,...,...,...,...,...,...
4330,0.0,0.00,0.0,0.00,0.0,0.33
4331,0.0,0.00,0.0,0.00,0.0,0.21
4333,0.0,0.00,0.0,0.00,0.0,0.15
4334,0.0,0.00,0.0,0.00,0.0,0.06


In [208]:
all_members_weighted_dict['CN'].to_csv('data/members_weighted_cn.csv')
all_members_weighted_dict['CE'].to_csv('data/members_weighted_ce.csv')

In [207]:
all_members_weighted_dict['CN'].sum()

46    200.00
47    199.76
48    200.00
49    199.83
50    199.88
51    199.94
dtype: float64

In [239]:
test = members_df.drop_duplicates(subset=['PersonNumber', 'PartyAbbreviation'])
test.loc[test.duplicated(subset=['PersonNumber'], keep=False)].sort_values(by=['PersonNumber', 'DateJoining'])

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,CantonAbbreviation,CouncilAbbreviation,PartyAbbreviation,DateJoining,DateLeaving
14,26,Roland F.,Borer,m,SO,CN,,1995-12-04,2003-11-30
12,26,Roland F.,Borer,m,SO,CN,UDC,2003-12-01,2007-12-02
57,74,Christoph,Eymann,m,BS,CN,PLS,1995-12-04,2001-09-16
55,74,Christoph,Eymann,m,BS,CN,PLD,2019-12-02,2021-11-28
69,91,Ulrich,Giezendanner,m,AG,CN,,1995-12-04,2003-11-30
...,...,...,...,...,...,...,...,...,...
1194,3934,Eveline,Widmer-Schlumpf,f,GR,CF,PBD,2011-01-01,2011-12-31
1199,3939,Pankraz,Freitag,m,GL,CE,PRD,2008-03-03,2011-12-04
1198,3939,Pankraz,Freitag,m,GL,CE,PLR,2011-12-05,2013-10-05
1464,4137,Roger,Golay,m,GE,CN,MCR,2013-12-02,2015-11-29


In [228]:
test = all_members_weighted_dict['CN'].merge(members_df[['PersonNumber', 'GenderAsString', 'PartyAbbreviation']], left_index=True, right_on='PersonNumber', how='left')
test

Unnamed: 0,46,47,48,49,50,51,PersonNumber,GenderAsString,PartyAbbreviation
0,1.0,0.00,0.0,0.0,0.0,0.00,9,m,PES
2,1.0,0.82,0.0,0.0,0.0,0.00,14,m,PRD
3,1.0,0.82,0.0,0.0,0.0,0.00,14,m,PRD
4,1.0,1.00,1.0,1.0,0.0,0.00,15,m,UDC
5,1.0,1.00,1.0,1.0,0.0,0.00,15,m,UDC
...,...,...,...,...,...,...,...,...,...
1726,0.0,0.00,0.0,0.0,0.0,0.33,4330,m,PLR
1727,0.0,0.00,0.0,0.0,0.0,0.21,4331,m,PEV
1729,0.0,0.00,0.0,0.0,0.0,0.15,4333,m,M-E
1730,0.0,0.00,0.0,0.0,0.0,0.06,4334,m,UDC


### Verification

In [25]:
all_members_sessions_dict['CN'].sum().value_counts()

200    109
199      8
198      1
Name: count, dtype: int64

In [26]:
all_members_sessions_dict['CE'].sum().value_counts()

46    96
45    19
44     3
Name: count, dtype: int64

In [27]:
all_members_sessions_dict['CF'].sum().value_counts()

7    117
6      1
Name: count, dtype: int64

In [29]:
a = 5117
b = 5118
df_to_compare = all_members_sessions_dict['CN']
df_to_compare[a].compare(df_to_compare[b], result_names=(a, b))

Unnamed: 0,5117,5118
1071,1.0,0.0
4100,1.0,0.0
4157,0.0,1.0
4333,0.0,1.0


In [181]:
members_df.query('PersonNumber == 21')

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,CantonAbbreviation,CouncilAbbreviation,PartyAbbreviation,DateJoining,DateLeaving
8,21,Christoph,Blocher,m,ZH,CF,UDC,2004-01-01,2007-12-31
9,21,Christoph,Blocher,m,ZH,CN,UDC,2003-12-01,2003-12-10
10,21,Christoph,Blocher,m,ZH,CN,UDC,1995-12-04,2003-11-30
11,21,Christoph,Blocher,m,ZH,CN,UDC,2011-12-05,2014-05-31


In [212]:
members_df.query('LastName == "Maret"')

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,CantonAbbreviation,CouncilAbbreviation,PartyAbbreviation,DateJoining,DateLeaving
1701,4303,Marianne,Maret,f,VS,CE,PDC,2019-12-02,NaT


In [326]:
sessions_df.query('IdSession == 5117')

Unnamed: 0,IdSession,SessionNumber,StartDate,EndDate,LegislativePeriodNumber
113,5117,17,2022-11-28,2022-12-16,51


## Scrap business tags

In [248]:
# open data files

with open('data/raw/data_businesses.csv', encoding='utf-8') as file:
    businesses_df = pd.read_csv(file)

with open('data/raw/data_subjects.csv', encoding='utf-8') as file:
    subjects_df = pd.read_csv(file)
    
with open('data/raw/data_business_tags.csv', encoding='utf-8') as file:
    all_tags_list = list(pd.read_csv(file)['TagName'])
    
businesses_df

Unnamed: 0,ID,BusinessShortNumber,BusinessType,BusinessTypeName,Title,Description,BusinessStatusText,BusinessStatusDate,ResponsibleDepartmentAbbreviation,Tags,SubmissionDate,SubmissionSession,SubmissionLegislativePeriod,TagNames
0,19780222,78.2220,4,Initiative parlementaire,Code pénal. Interruption de la grossesse (Gira...,Rapport de la Commission du 27.08.1979,Liquidé,1981-03-10T00:00:00,#ResponsibleDepartmentAbbreviation#,#Tags#,1978-06-05T00:00:00,4311,40,#TagNames#
1,19800226,80.2260,4,Initiative parlementaire,Constitutiion fédérale. Droit de l'entreprise ...,#Description#,Liquidé,1983-03-03T00:00:00,Parl,#Tags#,1980-06-02T00:00:00,4311,41,#TagNames#
2,19850019,85.0190,1,Objet du Conseil fédéral,Utilisation pacifique de l'énergie nucléaire. ...,Message et projet d'arrêté du 1er mai 1985 con...,Liquidé,2003-12-16T00:00:00,DFAE,#Tags#,1985-05-01T00:00:00,4311,42,#TagNames#
3,19850227,85.2270,4,Initiative parlementaire,Droit des assurances sociales,#Description#,Liquidé,2000-10-06T00:00:00,Parl,#Tags#,1985-02-07T00:00:00,4311,42,#TagNames#
4,19870069,87.0690,1,Objet du Conseil fédéral,Loi sur les chemins de fer. Modification,Message et projets de lois du 18 novembre 1987...,Liquidé,1987-11-18T00:00:00,DETEC,#Tags#,1987-11-18T00:00:00,4311,42,#TagNames#
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57504,20237463,23.7463,14,Heure des questions. Question,Pascal Hollenstein peut-il rester en poste ?,#Description#,Liquidé,2023-06-12T00:00:00,DFF,4|34,2023-06-07T00:00:00,5121,51,Politique d'Etat|Médias et communication
57505,20237464,23.7464,14,Heure des questions. Question,Scandaleuse récolte de bulletins de vote par L...,#Description#,Liquidé,2023-06-12T00:00:00,DETEC,4|8|9|34|48|52,2023-06-07T00:00:00,5121,51,Politique d'Etat|Politique internationale|Poli...
57506,20237465,23.7465,14,Heure des questions. Question,Le cabinet d'audit PwC est-il coresponsable de...,#Description#,Liquidé,2023-06-12T00:00:00,DFF,15|24,2023-06-07T00:00:00,5121,51,Économie|Finances
57507,20237466,23.7466,14,Heure des questions. Question,Effondrement de Credit Suisse et défaillance d...,#Description#,Liquidé,2023-06-12T00:00:00,DFF,15|24,2023-06-07T00:00:00,5121,51,Économie|Finances


In [167]:
directory = 'data/raw/transcripts_full'
transcripts_list = []
 
# iterate over files in that directory
for filename in sorted(os.listdir(directory)):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and directory + '/transcripts_' in file:
        print(file)
        
        with open(file, encoding='utf-8') as f:
            transcript_df = pd.read_csv(f)
            
            # replace NaN in Text field by empty string ''
            transcript_df['Text'] = transcript_df['Text'].fillna('')

            # replace automatic placeholder for empty cells by NaN
            transcript_df = transcript_df.replace('#PersonNumber#', np.nan)
            transcript_df = transcript_df.replace('#SpeakerFunction#', np.nan)
            transcript_df = transcript_df.replace('#CouncilId#', np.nan)
            transcript_df = transcript_df.replace('#Start#', np.nan)
            transcript_df = transcript_df.replace('#End#', np.nan)
            transcript_df = transcript_df.replace('#Function#', np.nan)
            transcript_df = transcript_df.replace('#LanguageOfText#', np.nan)
            
            # append df to list of df
            transcripts_list.append(transcript_df)
            
len(transcripts_list)

data/raw/transcripts_full/transcripts_46.csv
data/raw/transcripts_full/transcripts_47.csv
data/raw/transcripts_full/transcripts_48.csv
data/raw/transcripts_full/transcripts_49.csv
data/raw/transcripts_full/transcripts_50.csv
data/raw/transcripts_full/transcripts_51.csv


6

In [177]:
col_to_keep = ['ID', 'IdSubject', 'IdSession']
all_transcripts = pd.DataFrame()
for df in transcripts_list:
    all_transcripts = pd.concat([all_transcripts, df])
    
all_transcripts = all_transcripts.reset_index(drop=True)[col_to_keep]
all_transcripts

Unnamed: 0,ID,IdSubject,IdSession
0,1,1,4601
1,2,1,4601
2,3,1,4601
3,8,2,4601
4,10,2,4601
...,...,...,...
295925,319803,60642,5120
295926,319804,60648,5120
295927,319805,60652,5120
295928,319807,60674,5120


In [269]:
filtered_transcripts = all_transcripts.query('IdSession > 4000')
filtered_transcripts

Unnamed: 0,ID,IdSubject,IdSession
0,1,1,4601
1,2,1,4601
2,3,1,4601
3,8,2,4601
4,10,2,4601
...,...,...,...
295925,319803,60642,5120
295926,319804,60648,5120
295927,319805,60652,5120
295928,319807,60674,5120


In [270]:
subject_id_list = list(filtered_transcripts['IdSubject'].unique())
len(subject_id_list)

59131

In [271]:
business_id_list = list(subjects_df.loc[subjects_df['IdSubject'].isin(subject_id_list)]['BusinessNumber'].unique())
len(business_id_list)

45244

In [239]:
def retrieve_url(url):
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    try:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "span[data-ng-repeat='tagName in tagNames | orderBy']")))
        #print('retrieved url')
    except:
        print('timeout')
        print(url)
        pass

    
    return driver

In [244]:
def get_tags(driver):
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    
    tag_dict = dict()
    #for tag in all_tags_list:
    #    tag_dict[tag] = 0
    
    soup_tags = soup.find_all('span', {'data-ng-repeat': 'tagName in tagNames | orderBy'})
    for tag in soup_tags:
        tag = tag.text
        # remove whitespace around tag
        tag = tag.replace('\t', '').replace('\n', '')
        if tag not in all_tags_list:
            print('#################')
            print(i, tag)
        tag_dict[tag] = 1
    #print(tag_dict)
        
    return tag_dict

In [245]:
base_url = 'https://www.parlament.ch/fr/ratsbetrieb/suche-curia-vista/geschaeft?AffairId='
business_tag_dict = dict()

for i in tqdm(business_id_list[:500]):
    query_url = base_url + str(i)
    
    # get web page from url
    driver = retrieve_url(query_url)
    
    # get tags from web page
    tag_dict = get_tags(driver)
    
    # if business has at least one tag, add tags to the dict
    if tag_dict:
        business_tag_dict[i] = tag_dict
    # else if business has no tag, add to the dict
    else:
        business_tag_dict[i] = {'no_tag': 1}
    #print(business_tag_dict)
    
len(business_tag_dict)

 14%|██████████████████████▊                                                                                                                                              | 69/500 [04:10<36:39,  5.10s/it]

timeout
https://www.parlament.ch/fr/ratsbetrieb/suche-curia-vista/geschaeft?AffairId=20152023


 22%|████████████████████████████████████                                                                                                                                | 110/500 [06:43<32:44,  5.04s/it]

timeout
https://www.parlament.ch/fr/ratsbetrieb/suche-curia-vista/geschaeft?AffairId=20162003


 29%|██████████████████████████████████████████████▉                                                                                                                     | 143/500 [08:55<22:18,  3.75s/it]


KeyboardInterrupt: 

In [237]:
business_tag_df = pd.DataFrame.from_dict(business_tag_dict, orient='index').fillna(0).astype('int')
business_tag_df

Unnamed: 0,no_tag
20182021,1
20182022,1
20182026,1
20182027,1


In [211]:
#business_tag_list.append(business_tag_df)
print(len(business_tag_list))
business_tag_list[0]

1


Unnamed: 0,Droit,Questions sociales,Finances,Santé,Politique internationale,Politique de sécurité,Agriculture,Aménagement du territoire et logement,Économie,Politique migratoire,...,Environnement,Droits de l'homme,Science et recherche,Médias et communication,Emploi et travail,Éducation,Énergie,Justice,Transports,Droit international
20030424,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20080334,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20090313,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20090314,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20090332,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20191059,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
20191060,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
20193480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
20193511,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
