In [2]:
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
import requests
import re

## Slice full transcripts

In [3]:
directory = 'data/raw/transcripts_full'
transcripts_list = []
 
# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and directory + '/transcripts_' in file:
        print(file)
        
        with open(file, encoding='utf-8') as f:
            transcript_df = pd.read_csv(f)
            
            # replace NaN in Text field by empty string ''
            transcript_df['Text'] = transcript_df['Text'].fillna('')

            # replace automatic placeholder for empty cells by NaN
            transcript_df = transcript_df.replace('#PersonNumber#', np.nan)
            transcript_df = transcript_df.replace('#SpeakerFunction#', np.nan)
            transcript_df = transcript_df.replace('#CouncilId#', np.nan)
            transcript_df = transcript_df.replace('#Start#', np.nan)
            transcript_df = transcript_df.replace('#End#', np.nan)
            transcript_df = transcript_df.replace('#Function#', np.nan)
            transcript_df = transcript_df.replace('#LanguageOfText#', np.nan)
            
            # append df to list of df
            transcripts_list.append(transcript_df)
            
len(transcripts_list)

data/raw/transcripts_full/transcripts_48.csv
data/raw/transcripts_full/transcripts_49.csv
data/raw/transcripts_full/transcripts_47.csv
data/raw/transcripts_full/transcripts_46.csv
data/raw/transcripts_full/transcripts_50.csv
data/raw/transcripts_full/transcripts_51.csv


6

In [4]:
# count number of empty cells for each column

for transcript_df in transcripts_list:
    print('size: ', len(transcript_df))
    for col in transcript_df.columns:
        print(col, transcript_df[col].isnull().sum())
        
    print('-----')

size:  51852
ID 0
IdSubject 0
PersonNumber 10300
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 10300
CouncilId 10300
Start 7406
End 7406
Function 10305
LanguageOfText 5919
-----
size:  53019
ID 0
IdSubject 0
PersonNumber 11054
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 11054
CouncilId 11058
Start 7280
End 7280
Function 11056
LanguageOfText 12237
-----
size:  46051
ID 0
IdSubject 0
PersonNumber 5875
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 5875
CouncilId 5908
Start 4305
End 4305
Function 5917
LanguageOfText 5417
-----
size:  39969
ID 0
IdSubject 0
PersonNumber 5025
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 24941
CouncilId 5026
Start 3238
End 3238
Function 31133
LanguageOfText 5537
-----
size:  51716
ID 0
IdSubject 0
PersonNumber 11708
Text 0
MeetingCouncilAbbreviation 0
MeetingDate 0
IdSession 0
SpeakerFunction 11708
CouncilId 11713
Start 6778
End 6

In [5]:
# remove html tags from texts
count = 0

for transcript_df in transcripts_list:
    print('transcript idx: ', count)
    
    transcript_df['Text'] = transcript_df['Text'].apply(lambda text: BeautifulSoup(text, 'html.parser').text)
    count += 1

transcript idx:  0
transcript idx:  1
transcript idx:  2
transcript idx:  3
transcript idx:  4
transcript idx:  5


In [6]:
# save all transcripts by session

for transcript_df in transcripts_list:
    sessions_list = list(transcript_df['IdSession'].unique())
    for session_id in sessions_list:
        transcript_session = transcript_df.loc[transcript_df['IdSession'] == session_id]
        transcript_session.to_csv('data/raw/transcripts/transcript_' + str(session_id) + '.csv', encoding='utf-8')

## Pre-process transcript files

In [7]:
# open data files

with open('data/raw/data_persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file)

with open('data/raw/data_businesses.csv', encoding='utf-8') as file:
    businesses_df = pd.read_csv(file)

with open('data/raw/data_subjects.csv', encoding='utf-8') as file:
    subjects_df = pd.read_csv(file)
    
businesses_df

Unnamed: 0,ID,BusinessShortNumber,BusinessTypeName,Title,Description,InitialSituation,Proceedings,SubmittedText,SubmittedBy,BusinessStatusText,BusinessStatusDate,ResponsibleDepartmentAbbreviation,SubmissionDate,SubmissionSession
0,19780222,78.2220,Initiative parlementaire,Code pénal. Interruption de la grossesse (Gira...,Rapport de la Commission du 27.08.1979,,,,Girard Gertrude,Liquidé,1981-03-10T00:00:00,,1978-06-05T00:00:00,4311
1,19800226,80.2260,Initiative parlementaire,Constitutiion fédérale. Droit de l'entreprise ...,,,,,Jelmini Camillo,Liquidé,1983-03-03T00:00:00,Parl,1980-06-02T00:00:00,4311
2,19850019,85.0190,Objet du Conseil fédéral,Utilisation pacifique de l'énergie nucléaire. ...,Message et projet d'arrêté du 1er mai 1985 con...,<text><p>L'accord établit le cadre de droit in...,<text><p></p><p>Les deux conseils ont décidé s...,,,Liquidé,2003-12-16T00:00:00,DFAE,1985-05-01T00:00:00,4311
3,19850227,85.2270,Initiative parlementaire,Droit des assurances sociales,,<text><p>Le droit suisse des assurances social...,<text><p>Le <b>Conseil des Etats</b> a accepté...,<text><p>A la suite de la motion visant une me...,Meier Josi J.,Liquidé,2000-10-06T00:00:00,Parl,1985-02-07T00:00:00,4311
4,19870069,87.0690,Objet du Conseil fédéral,Loi sur les chemins de fer. Modification,Message et projets de lois du 18 novembre 1987...,,,,,Liquidé,1987-11-18T00:00:00,DETEC,1987-11-18T00:00:00,4311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56869,20237263,23.7263,Heure des questions. Question,Assouplir les règles limitant la hauteur des i...,,,,<text><p>La Suisse recèle un important potenti...,Gugger Niklaus-Samuel,Liquidé,2023-03-13T00:00:00,DETEC,2023-03-08T00:00:00,5118
56870,20237264,23.7264,Heure des questions. Question,Connaissances actuelles en matière de précipit...,,,,<text><p>- De quelles informations le Conseil ...,Haab Martin,Liquidé,2023-03-13T00:00:00,DFI,2023-03-08T00:00:00,5118
56871,20237265,23.7265,Heure des questions. Question,Le langage inclusif est-il autorisé au Parleme...,,,,"<text><p>En réponse à la question 23.7122, con...",Roduit Benjamin,Liquidé,2023-03-13T00:00:00,Parl,2023-03-08T00:00:00,5118
56872,20237266,23.7266,Heure des questions. Question,L'hydraulique comme réservoir d'énergie renouv...,,,,<text><p>La Suisse doit tout mettre en oeuvre ...,Gmür Alois,Liquidé,2023-03-13T00:00:00,DETEC,2023-03-08T00:00:00,5118


In [11]:
subjects_df

Unnamed: 0,IdSubject,BusinessNumber,BusinessShortNumber,Title,TitleFR
0,6688,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
1,6905,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
2,645,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
3,775,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
4,1234,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
...,...,...,...,...,...
62273,60486,20239002,23.9002,Communications de la présidente,Communications de la présidente
62274,60501,20239002,23.9002,Communications de la présidente,Communications de la présidente
62275,59643,20239003,23.9003,Eloge funèbre,Eloge funèbre
62276,59647,20239003,23.9003,Eloge funèbre,Eloge funèbre


In [257]:
persons_df

Unnamed: 0,PersonNumber,FirstName,LastName,GenderAsString,ParlGroupName,PartyName,PartyAbbreviation,DateJoining,DateLeaving,DateElection,DateOath,DateResignation,DateOfBirth,DateOfDeath
0,1,Pierre,Aguet,m,Groupe socialiste,Parti socialiste suisse,PSS,1995-12-04T00:00:00,1999-12-05T00:00:00,1995-12-04T00:00:00,1995-12-04T00:00:00,1999-12-05T00:00:00,1938-03-02T00:00:00,
1,2,Heinz,Allenspach,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,1979-11-26T00:00:00,1995-12-03T00:00:00,1979-11-26T00:00:00,1979-11-26T00:00:00,1995-12-03T00:00:00,1928-02-22T00:00:00,2022-09-16T00:00:00
2,6,Manfred,Aregger,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,1995-12-04T00:00:00,1999-12-05T00:00:00,1995-12-04T00:00:00,1995-12-04T00:00:00,1999-12-05T00:00:00,1931-01-27T00:00:00,
3,7,Geneviève,Aubry,f,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,1979-11-26T00:00:00,1995-12-03T00:00:00,1979-11-26T00:00:00,1979-11-26T00:00:00,1995-12-03T00:00:00,1928-03-04T00:00:00,
4,8,Rosmarie,Bär,f,Groupe des VERT-E-S,Liste libre (BE),LL,1987-11-30T00:00:00,1995-12-03T00:00:00,1987-11-30T00:00:00,1987-11-30T00:00:00,1995-12-03T00:00:00,1947-12-01T00:00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3623,4329,Daniel,Ruch,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2022-06-13T00:00:00,,2019-10-20T00:00:00,2022-06-13T00:00:00,,1963-03-20T00:00:00,
3624,4330,Alexandre,Berthoud,m,Groupe libéral-radical,PLR.Les Libéraux-Radicaux,PLR,2022-06-13T00:00:00,,2019-10-20T00:00:00,2022-06-13T00:00:00,,1977-06-29T00:00:00,
3625,4331,Marc,Jost,m,Le Groupe du Centre. Le Centre. PEV.,Parti évangélique suisse,PEV,2022-11-28T00:00:00,,2019-10-20T00:00:00,2022-11-28T00:00:00,,1974-02-06T00:00:00,
3626,4332,Mathilde,Crevoisier Crelier,f,Groupe socialiste,Parti socialiste suisse,PSS,2022-12-15T00:00:00,,2019-10-20T00:00:00,2022-12-15T00:00:00,,1980-01-05T00:00:00,


In [8]:
# get all transcripts file names
directory = 'data/raw/transcripts'
transcripts_files_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file):
        transcripts_files_list.append(file)

# sort transcripts chronologically 
transcripts_files_list.sort()
len(transcripts_files_list)

117

In [9]:
# load all transcripts data
for file in transcripts_files_list:
    with open(file, encoding='utf-8') as f:
        transcript_df = pd.read_csv(f)
        # drop column 'Unnamed'
        transcript_df = transcript_df.drop(columns=['Unnamed: 0'])
        # append df to list of df
        transcripts_list.append(transcript_df)
            
len(transcripts_list)

117

In [25]:
all_sessions_df = pd.DataFrame()

for session_df in transcripts_list:
    session_id = session_df.loc[0, 'IdSession']
    print(session_id)
    
    # sort df
    session_df = session_df.sort_values(by=['MeetingCouncilAbbreviation', 'MeetingDate', 'Start', 'IdSubject'])
    
    # only keep rows with a PersonNumber
    session_df = session_df.dropna(subset=['PersonNumber'])
    
    #print(session_df['SpeakerFunction'].unique())
    #print(session_df['MeetingCouncilAbbreviation'].unique())
    #print(session_df['CouncilId'].unique())
    
    # drop rows when federal chancelor or council president is speaking
    functions_to_drop = ['BK-M', 'BK-F', 'P-M', 'P-F']
    session_df = session_df[~session_df['SpeakerFunction'].isin(functions_to_drop)]
    # drop rows when meeting is for both chambers (MeetingCouncilAbbreviation=V, for "Vereinigte Bundesversammlung")
    session_df = session_df[session_df['MeetingCouncilAbbreviation'] != 'V']
    
    # replace "[NB]" by a single space
    session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[NB\]', value=' ')
    # replace "[...]" by an empty string
    session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[.+?\]', value='')
    # set column PersonNumber type to int
    session_df = session_df.astype({"PersonNumber": int})
    
    all_sessions_df = pd.concat([all_sessions_df, session_df])
    
    # save df
    session_df.to_csv('data/transcripts/transcript_' + str(session_id) + '.csv', encoding='utf-8')

4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120


### Verification df processing

In [12]:
session_df = transcripts_list[4]
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,8039,1595,205.0,[VS]\nDie Beratung dieses Geschäftes wird unte...,N,20001127,4605,,1.0,2000-11-27T19:58:29,2000-11-27T19:59:47,,
1,8040,1595,140.0,Gestatten Sie mir vier Bemerkungen zum Budget ...,N,20001127,4605,,1.0,2000-11-27T19:52:02,2000-11-27T19:58:29,,DE
2,8041,1595,367.0,Nach den Beschlüssen der Finanzkommission bela...,N,20001127,4605,,1.0,2000-11-27T19:47:44,2000-11-27T19:52:02,,DE
3,8042,1595,440.0,"Ich glaube, wir alle sind froh, dass wir diese...",N,20001127,4605,,1.0,2000-11-27T19:45:23,2000-11-27T19:47:44,,DE
4,8043,1595,332.0,Das Budget 2001 ist mehr oder weniger ausgegli...,N,20001127,4605,,1.0,2000-11-27T19:40:16,2000-11-27T19:45:23,,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2026,10065,1940,111.0,Vierter Wahlgang - Quatrième tour de scrutin\n...,V,20001206,4605,P-M,1.0,2000-12-06T09:44:54,2000-12-06T09:46:41,,
2027,10066,1940,111.0,Fünfter Wahlgang - Cinquième tour de scrutin\n...,V,20001206,4605,P-M,1.0,2000-12-06T09:59:25,2000-12-06T10:01:07,,
2028,10067,1941,111.0,Die Sitzung der Vereinigten Bundesversammlung ...,V,20001206,4605,P-M,1.0,2000-12-06T08:00:05,2000-12-06T08:01:30,,DE
2029,10068,1942,111.0,Ich erkläre die Sitzung der Vereinigten Bundes...,V,20001213,4605,P-M,1.0,2000-12-13T08:00:38,2000-12-13T08:01:49,,DE


In [13]:
# sort df
session_df = session_df.sort_values(by=['MeetingCouncilAbbreviation', 'MeetingDate', 'Start', 'IdSubject'])
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205.0,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205.0,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
35,8074,1597,111.0,Für die ehrenvolle Wahl zum Nationalratspräsid...,N,20001127,4605,P-M,1.0,2000-11-27T14:57:47,2000-11-27T15:15:21,,
34,8073,1597,111.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,P-M,1.0,2000-11-27T15:28:08,2000-11-27T15:30:11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,10052,1940,111.0,"Herr Bundesrat Schmid, ich gratuliere Ihnen zu...",V,20001206,4605,P-M,1.0,2000-12-06T10:20:19,2000-12-06T10:22:52,,DE
2012,10051,1940,111.0,3. Wahl des Bundespräsidenten für 2001 \n3. El...,V,20001206,4605,P-M,1.0,2000-12-06T10:35:58,2000-12-06T10:37:55,,
2011,10050,1940,111.0,4. Wahl des Vizepräsidenten des Bundesrates fü...,V,20001206,4605,P-M,1.0,2000-12-06T10:49:24,2000-12-06T10:50:57,,
2029,10068,1942,111.0,Ich erkläre die Sitzung der Vereinigten Bundes...,V,20001213,4605,P-M,1.0,2000-12-13T08:00:38,2000-12-13T08:01:49,,DE


In [14]:
# only keep rows with a PersonNumber
session_df = session_df.dropna(subset=['PersonNumber'])
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205.0,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205.0,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
35,8074,1597,111.0,Für die ehrenvolle Wahl zum Nationalratspräsid...,N,20001127,4605,P-M,1.0,2000-11-27T14:57:47,2000-11-27T15:15:21,,
34,8073,1597,111.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,P-M,1.0,2000-11-27T15:28:08,2000-11-27T15:30:11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,10052,1940,111.0,"Herr Bundesrat Schmid, ich gratuliere Ihnen zu...",V,20001206,4605,P-M,1.0,2000-12-06T10:20:19,2000-12-06T10:22:52,,DE
2012,10051,1940,111.0,3. Wahl des Bundespräsidenten für 2001 \n3. El...,V,20001206,4605,P-M,1.0,2000-12-06T10:35:58,2000-12-06T10:37:55,,
2011,10050,1940,111.0,4. Wahl des Vizepräsidenten des Bundesrates fü...,V,20001206,4605,P-M,1.0,2000-12-06T10:49:24,2000-12-06T10:50:57,,
2029,10068,1942,111.0,Ich erkläre die Sitzung der Vereinigten Bundes...,V,20001213,4605,P-M,1.0,2000-12-13T08:00:38,2000-12-13T08:01:49,,DE


In [15]:
print(session_df['SpeakerFunction'].unique())
print(session_df['MeetingCouncilAbbreviation'].unique())
print(session_df['CouncilId'].unique())
session_df.loc[session_df['MeetingCouncilAbbreviation'] == 'V']

[nan 'P-M' 'BR-M' '1VP-F' 'BR-F' 'BK-F' '2VP-M' 'BPR-M' 'P-F' '1VP-M']
['N' 'S' 'V']
[ 1. 99. 98.  2. nan]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
2028,10067,1941,111.0,Die Sitzung der Vereinigten Bundesversammlung ...,V,20001206,4605,P-M,1.0,2000-12-06T08:00:05,2000-12-06T08:01:30,,DE
2016,10055,1940,111.0,1. Rücktritt von Herrn Bundesrat Adolf Ogi\n1....,V,20001206,4605,P-M,1.0,2000-12-06T08:01:30,2000-12-06T08:15:03,,
2017,10056,1940,428.0,"Herr Präsident, Ihre Worte haben mich gerührt....",V,20001206,4605,BPR-M,99.0,2000-12-06T08:15:03,2000-12-06T08:19:23,,DE
2018,10057,1940,111.0,Ich danke Herrn Bundespräsident Ogi für seine ...,V,20001206,4605,P-M,1.0,2000-12-06T08:19:23,2000-12-06T08:20:53,,DE
2019,10058,1940,84.0,Ich möchte im Namen der Fraktion der Schweizer...,V,20001206,4605,,1.0,2000-12-06T08:20:53,2000-12-06T08:23:32,,DE
2020,10059,1940,9.0,"Stellen Sie sich vor, die Zauberformel würde h...",V,20001206,4605,,1.0,2000-12-06T08:23:32,2000-12-06T08:25:31,,DE
2021,10060,1940,241.0,"Nous allons vivre, et le pays avec nous, aujou...",V,20001206,4605,,1.0,2000-12-06T08:25:31,2000-12-06T08:30:54,,FR
2022,10061,1940,70.0,"Le Parti libéral, vous le savez, n'est pas un ...",V,20001206,4605,,1.0,2000-12-06T08:30:54,2000-12-06T08:34:37,,FR
2023,10062,1940,111.0,Erster Wahlgang - Premier tour de scrutin\n[VS...,V,20001206,4605,P-M,1.0,2000-12-06T08:55:37,2000-12-06T08:57:44,,
2024,10063,1940,111.0,Zweiter Wahlgang - Deuxième tour de scrutin\n[...,V,20001206,4605,P-M,1.0,2000-12-06T09:12:05,2000-12-06T09:14:34,,


In [16]:
# drop rows when federal chancelor or council president is speaking
functions_to_drop = ['BK-M', 'BK-F', 'P-M', 'P-F']
session_df = session_df[~session_df['SpeakerFunction'].isin(functions_to_drop)]
# drop rows when meeting is for both chambers (MeetingCouncilAbbreviation=V, for "Vereinigte Bundesversammlung")
session_df = session_df[session_df['MeetingCouncilAbbreviation'] != 'V']
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205.0,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205.0,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205.0,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
19,8058,1595,476.0,Zum ersten Mal seit 1991 unterbreitet der Bund...,N,20001127,4605,,1.0,2000-11-27T16:25:08,2000-11-27T16:47:08,*,DE
17,8056,1595,492.0,Le budget 2001 de la Confédération se présente...,N,20001127,4605,,1.0,2000-11-27T16:47:27,2000-11-27T17:12:14,*,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1976,10015,1921,12.0,Sie haben wieder einen ausführlichen Bericht I...,S,20001214,4605,,2.0,2000-12-14T09:30:31,2000-12-14T09:32:58,*,DE
1975,10014,1921,30.0,Comme notre présidente de la commission l'a di...,S,20001214,4605,,2.0,2000-12-14T09:32:58,2000-12-14T09:37:33,,FR
1974,10013,1921,375.0,Il est toujours délicat d'observer qu'une loi ...,S,20001214,4605,,2.0,2000-12-14T09:37:33,2000-12-14T09:39:31,,FR
1962,10001,1916,194.0,Sie haben den Bericht der Delegation bei der I...,S,20001214,4605,,2.0,2000-12-14T09:40:58,2000-12-14T09:44:39,*,DE


In [17]:
# replace "[NB]" by a single space
session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[NB\]', value=' ')
# replace "[...]" by an empty string
session_df.loc[:, 'Text'] = session_df.loc[:, 'Text'].replace(regex=r'\[.+?\]', value='')
# set column PersonNumber type to int
session_df = session_df.astype({"PersonNumber": int})
session_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
32,8071,1596,205,Ich begrüsse Sie sehr herzlich zur Wintersessi...,N,20001127,4605,,1.0,2000-11-27T14:30:33,2000-11-27T14:44:36,P-M,DE
37,8076,1597,205,1. Wahl des Präsidenten des Nationalrates für ...,N,20001127,4605,,1.0,2000-11-27T14:44:36,2000-11-27T14:45:28,,
36,8075,1597,205,Ergebnis der Wahl - Résultat du scrutin\nAusge...,N,20001127,4605,,1.0,2000-11-27T14:55:57,2000-11-27T14:57:47,,
19,8058,1595,476,Zum ersten Mal seit 1991 unterbreitet der Bund...,N,20001127,4605,,1.0,2000-11-27T16:25:08,2000-11-27T16:47:08,*,DE
17,8056,1595,492,Le budget 2001 de la Confédération se présente...,N,20001127,4605,,1.0,2000-11-27T16:47:27,2000-11-27T17:12:14,*,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1976,10015,1921,12,Sie haben wieder einen ausführlichen Bericht I...,S,20001214,4605,,2.0,2000-12-14T09:30:31,2000-12-14T09:32:58,*,DE
1975,10014,1921,30,Comme notre présidente de la commission l'a di...,S,20001214,4605,,2.0,2000-12-14T09:32:58,2000-12-14T09:37:33,,FR
1974,10013,1921,375,Il est toujours délicat d'observer qu'une loi ...,S,20001214,4605,,2.0,2000-12-14T09:37:33,2000-12-14T09:39:31,,FR
1962,10001,1916,194,Sie haben den Bericht der Delegation bei der I...,S,20001214,4605,,2.0,2000-12-14T09:40:58,2000-12-14T09:44:39,*,DE


In [273]:
session_df.loc[session_df['ID'] == 71340]

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
66,71340,11355,326,"Das Büro hat die Wahl von Herrn Marc Suter, ge...",N,20070305,4716,Mit-M,1.0,2007-03-05T14:32:41,2007-03-05T14:34:33,B,DE


## Get subjects and businesses

In [27]:
all_sessions_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
2,3,1,498,"Madame la Présidente de la Confédération, \nSe...",N,19991206,4601,AP-M,1.0,1999-12-06T14:32:24,1999-12-06T14:33:39,,FR
0,1,1,498,La séance d'aujourd'hui s'ouvre sous le signe ...,N,19991206,4601,AP-M,1.0,1999-12-06T14:36:50,1999-12-06T14:50:21,,FR
47,53,6,498,Il appartenait au doyen d'âge de constituer le...,N,19991206,4601,AP-M,1.0,1999-12-06T14:50:21,1999-12-06T14:52:13,,FR
46,52,6,273,Das provisorische Büro hat an seiner Sitzung v...,N,19991206,4601,,1.0,1999-12-06T14:52:13,1999-12-06T15:01:02,b,DE
45,51,6,305,Le Bureau provisoire de notre Assemblée a siég...,N,19991206,4601,,1.0,1999-12-06T15:01:02,1999-12-06T15:07:43,b,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,319686,60670,4238,"Sehr geschätzter Herr Nationalrat Egger, Sie h...",N,20230504,5120,BR-F,99.0,2023-05-04T17:04:38,2023-05-04T17:06:20,BR-F,DE
816,319809,60674,4268,La questione della mediatizzazione dei process...,N,20230504,5120,Mit-F,1.0,2023-05-04T17:06:39,2023-05-04T17:11:27,Mit-F,IT
734,319699,60674,1122,"Frau Kollegin Gysin, Sie wollen eine Priorisie...",N,20230504,5120,Mit-M,1.0,2023-05-04T17:11:29,2023-05-04T17:11:46,Mit-M,DE
815,319807,60674,4268,"Collega Fluri, sono molto consapevole del prob...",N,20230504,5120,Mit-F,1.0,2023-05-04T17:11:46,2023-05-04T17:12:44,Mit-F,IT


In [29]:
all_subject_id = all_sessions_df['IdSubject'].unique()
len(all_subject_id)

25820

In [37]:
subjects_filtered = subjects_df[subjects_df['IdSubject'].isin(all_subject_id)]
subjects_filtered

Unnamed: 0,IdSubject,BusinessNumber,BusinessShortNumber,Title,TitleFR
0,6688,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
1,6905,19850019,85.0190,Utilisation pacifique de l'énergie nucléaire. ...,Utilisation pacifique\nde l'énergie nucléaire....
2,645,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
3,775,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
4,1234,19850227,85.2270,Droit des assurances sociales,Initiative parlementaire\nMeier Josi.\nDroit d...
...,...,...,...,...,...
62225,60010,20237245,23.7245,Pénurie de médicaments. Degré d’efficacité des...,Heure des questions.\nQuestion Feller Olivier....
62232,59975,20237252,23.7252,Le collaborateur de l'OFSP responsable de la g...,Heure des questions.\nQuestion Büchel Roland R...
62233,59999,20237253,23.7253,Chômage en hausse malgré la pénurie de main-d’...,Heure des questions.\nQuestion Strupler Manuel...
62238,59993,20237258,23.7258,Comment les recommandations du comité scientif...,Heure des questions.\nQuestion Python Valentin...


In [40]:
subjects_filtered[subjects_filtered.duplicated(subset=['IdSubject'], keep=False)].sort_values(by=['IdSubject'])

Unnamed: 0,IdSubject,BusinessNumber,BusinessShortNumber,Title,TitleFR
625,18,19990059,99.0590,"""Pour le libre choix du médecin et de l'établi...",Pour le libre choix du médecin\net de l'établi...
658,18,19990072,99.0720,Pour des coûts hospitaliers moins élevés. Init...,Pour des coûts hospitaliers\nmoins élevés.\nIn...
1108,26,19993462,99.3462,Retour des réfugiés Kosovars,Interpellation\ngroupe radical-démocratique.\n...
496,26,19983649,98.3649,Retour des réfugiés du Kosovo,Interpellation Freund Jakob.\nRetour\ndes réfu...
394,26,19983225,98.3225,Halte aux renvois de réfugiés dans les régions...,Interpellation\ngroupe socialiste.\nHalte aux ...
...,...,...,...,...,...
60778,60636,20224565,22.4565,De quoi les victimes de violence sexualisée on...,Postulat von Falkenstein Patricia.\nDe quoi le...
57658,60673,20214535,21.4535,Réintroduire dans la LAVI la possibilité d'ind...,Motion Marti Min Li.\nRéintroduire dans la LAV...
57657,60673,20214534,21.4534,Réintroduire dans la LAVI la possibilité d'ind...,Motion Porchet Léonore.\nRéintroduire dans la ...
57656,60673,20214533,21.4533,Réintroduire dans la LAVI la possibilité d'ind...,Motion de Quattro Jacqueline.\nRéintroduire da...
