In [3]:
# Data processing
import os
import pandas as pd
import numpy as np
import ssl
import string
from collections import Counter
from transformers.pipelines import pipeline
import altair as alt
from tqdm import tqdm
import random

# Text preprocessiong
import nltk
from nltk.corpus import wordnet as wn
import spacy
import spacy_fastlang
from top2vec import Top2Vec
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap.umap_ import UMAP

In [4]:
# disable ssl check (to be able to download nltk packages)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [5]:
# download stanza package

import stanza
stanza.download(lang="multilingual")
stanza.download(lang="de")
stanza.download(lang="fr")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 76.8MB/s]                                                                              
2023-06-05 10:16:03 INFO: Downloading default packages for language: multilingual (multilingual) ...
2023-06-05 10:16:03 INFO: File exists: /Users/cyrille/stanza_resources/multilingual/default.zip
2023-06-05 10:16:03 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 56.7MB/s]                                                                              
2023-06-05 10:16:03 INFO: Downloading default packages for language: de (German) ...
2023-06-05 10:16:05 INFO: File exists: /Users/cyrille/stanza_resources/de/default.zip
2023-06-05 10:16:09 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.

In [6]:
# download nltk packages

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cyrille/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Lemmatize transcripts

In [53]:
transcript_df = pd.DataFrame()

directory = 'data/transcripts'
filepaths_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and file.endswith('.csv'):
        filepaths_list.append(file)

# sort transcripts chronologically 
filepaths_list.sort()
print(len(filepaths_list))

for filepath in tqdm(filepaths_list[97:100]):
    with open(filepath, encoding='utf-8') as file:
        session_df = pd.read_csv(file).drop(columns='Unnamed: 0')
        transcript_df = pd.concat([transcript_df, session_df])

transcript_df = transcript_df.reset_index(drop=True)
transcript_df

117


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 32.91it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE
2,253998,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): Ge...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:10,2019-12-02T14:52:30,P-F,
3,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE
4,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3210,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE
3211,261022,48951,3879,Je ne vais répéter en français ce qui vient d'...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:40:28,2020-05-06T17:43:59,Mit-M,FR
3212,261018,48951,4240,Je serai très bref. Je soutiendrai évidemment ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:43:59,2020-05-06T17:45:01,Mit-M,FR
3213,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE


In [93]:
selected_language = 'FR'

# only keep french texts
filtered_transcript = transcript_df.loc[transcript_df['LanguageOfText'] == selected_language]
# only keep texts longer than 300 char
filtered_transcript['text_length'] = filtered_transcript['Text'].apply(lambda x: len(x))
filtered_transcript = filtered_transcript[filtered_transcript['text_length'] > 300]
# reset index
filtered_transcript = filtered_transcript.reset_index(drop=True)
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,260916,48948,1108,Nous vivons cet après-midi un nouvel épisode d...,S,20200506,5103,VPBR-M,99.0,2020-05-06T15:24:31,2020-05-06T15:28:47,VPBR-M,FR,3454
763,261019,48949,4240,Emotions et pesée d'intérêts: il y a la politi...,S,20200506,5103,Mit-M,2.0,2020-05-06T16:19:19,2020-05-06T16:22:20,Mit-M,FR,2784
764,260946,48949,1108,"Tout d'abord, le Conseil fédéral, il faut bien...",S,20200506,5103,VPBR-M,99.0,2020-05-06T16:22:32,2020-05-06T16:31:04,VPBR-M,FR,7134
765,261022,48951,3879,Je ne vais répéter en français ce qui vient d'...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:40:28,2020-05-06T17:43:59,Mit-M,FR,2737


In [94]:
nlp_sent = spacy.load('xx_sent_ud_sm')
nlp = spacy.load(selected_language.lower() + "_core_news_md")
nlp.add_pipe("language_detector")
lemmatizer = nlp.get_pipe("lemmatizer")

additional_stopwords = {
    'FR': {'-t', 'avez', 'être', 'aujourd', 'hui'},
    'DE': set(),
}
removed_stopwords = {
    'FR': {'hui'},
    'DE': set(),
}
specific_stopwords = {
    'FR': {
        'accord', 'alinéa', 'an', 'année', 'article', 'avis', 'cadre', 'canton', 'cas', 
        'collègue', 'commission', 'conseil', 'débat', 'décision', 'discussion', 'disposition', 'domaine', 'droit', 
        'fédéral', 'franc', 'groupe', 'initiative', 'législature', 'loi', 'majorité', 'matière', 'mesure', 'milliard', 'million', 'minorité', 
        'monsieur', 'motion', 'parlementaire', 'pays', 'postulat', 'politique', 'position', 'président', 'proposition', 
        'projet', 'question', 'rapport', 'rapporteur', 'réponse', 'session', 'situation', 'suisse', 'voix'
    },
    'DE': {
        'Vereinbarung', 'Absatz', 'Jahr', 'Artikel', 'Stellungnahme', 'Rahmen', 'Kanton', 'Fall', 
        'Kollege', 'Kommission', 'Rat', 'Debatte', 'Entscheidung', 'Diskussion', 'Bestimmung', 'Bereich', 'Recht', 
        'föderal', 'Franken', 'Gruppe', 'Initiative', 'Legislaturperiode', 'Gesetz', 'Mehrheit', 'Materie', 'Massnahme', 'Milliarde', 'Million', 'Minderheit', 
        'Herr', 'Frau', 'Dame' 'Motion', 'Parlamentarier', 'Land', 'Postulat', 'Politik', 'Position', 'Präsident', 'Präsidentin', 'Vorschlag', 
        'Projekt', 'Frage', 'Bericht', 'Berichterstatter', 'Antwort', 'Sitzung', 'Lage', 'Schweiz', 'Stimme',
        'Bundesrat', 'Nationalrat', 'Bundeskanzler', 'Urne'
    },
}

if selected_language == 'DE':    
    # add stopwords 
    nlp.Defaults.stop_words |= additional_stopwords[selected_language]
    nlp.Defaults.stop_words |= specific_stopwords[selected_language]
    
    # remove stopwords
    nlp.Defaults.stop_words -= additional_stopwords['FR']
    nlp.Defaults.stop_words -= specific_stopwords['FR']
elif selected_language == 'FR':
    # add stopwords 
    nlp.Defaults.stop_words |= additional_stopwords[selected_language]
    nlp.Defaults.stop_words |= specific_stopwords[selected_language]
    
    # remove stopwords
    nlp.Defaults.stop_words -= additional_stopwords['DE']
    nlp.Defaults.stop_words -= specific_stopwords['DE']

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

558
{'moi-même', 'jusque', 'ouste', 'déja', 'une', 'puisque', 'quelques', 'trente', 'président', 'facon', 'pays', 'toi', 'dixième', 'lesquelles', 'lès', 'auxquels', 'dont', 'au', 'est', 'jusqu', 'o', 'exactement', 'vôtres', 'celui-là', 'chaque', 'quelconque', 'etc', 'alinéa', 'je', 'desquels', 'faisant', 'vu', 'tellement', "m'", 'egalement', 'toi-meme', 'pendant', 'cent', 'ait', 'cependant', "j'", 'n’', 'ou', 'préalable', 'les', 'surtout', 'directement', 'matière', 'réponse', 'autre', 'partant', 'dit', 'mêmes', 'dehors', 'canton', 'auxquelles', 'seuls', 'ont', 'cinquante', 'seul', "s'", 'celle-ci', 'celle-la', 'votres', 'minorité', 'un', 'dire', 'ouverte', 'donc', 'tente', 'antérieur', 'cinquième', 'autrui', "c'", 'lequel', 'car', 'aujourd', 'enfin', 'douze', 'deja', 'ouvert', 'droit', 'quelque', 'ni', 'soi', 'entre', 'seule', 'ceux', 'antérieure', 'importe', 'ayant', 'tres', 'soi-meme', 'na', 'd’', 'neanmoins', 'suffisante', 'pourrait', 'pourquoi', 'revoila', 'seules', 'parle', 'rend'



In [95]:
example_row = 288
text_id = filtered_transcript.iloc[example_row]['ID']
text = filtered_transcript.iloc[example_row]['Text']

print(text_id)
print('length:', len(text))
print(text)
print('---')

entity_dict = {}

for ent in nlp(text).ents:
    entity_dict[ent.text] = ent.label_
    
print(entity_dict)
print('---')

lemma_list = [
    token.lemma_ for sent in nlp_sent(text).sents 
    if (doc := nlp(sent.text))._.language == selected_language.lower() 
    for token in doc 
    if not any([
        token.is_stop, 
        token.is_punct, 
        token.is_space, 
        token.ent_type_, 
        not token.is_alpha, 
        token.lemma_ in nlp.Defaults.stop_words, 
        token.pos_ != 'NOUN',
    ])
]

#lemma_list.sort()
print(lemma_list)

255705
length: 733
Je m'exprimerai une seule fois et de manière brève pour vous dire que le Conseil fédéral est très sensible à cette problématique. La contribution de solidarité doit revenir, dans son intégralité, aux victimes de mesures de coercition à des fins d'assistance et de placements extrafamiliaux. De l'avis du Conseil fédéral, il est incohérent, d'une part, que l'Etat accorde un dédommagement en signe de reconnaissance des torts causés et, d'autre part, qu'une prestation sociale soit réduite en conséquence.
Afin d'exclure aussi vite que possible toute prise en compte de la contribution de solidarité dans le calcul de la prestation complémentaire, le Conseil fédéral soutient l'initiative de la commission et vous invite à l'adopter.

---
{'Conseil fédéral': 'ORG', 'Etat': 'LOC'}
---
['fois', 'manière', 'problématique', 'contribution', 'solidarité', 'intégralité', 'victime', 'coercition', 'fin', 'assistance', 'placement', 'part', 'dédommagement', 'signe', 'reconnaissance', 'tor

In [63]:
# remove stopwords, punctuation and then lemmatize
tqdm.pandas()
processed_transcript = filtered_transcript.copy()
processed_transcript['text_lemmatized'] = processed_transcript['Text'].progress_apply(
    lambda x: ' '.join([
        token.lemma_ for sent in nlp_sent(x).sents 
        if (doc := nlp(sent.text))._.language == selected_language.lower() 
        for token in doc 
        if not any([
            token.is_stop, 
            token.is_punct, 
            token.is_space, 
            token.ent_type_, 
            not token.is_alpha, 
            token.lemma_ in nlp.Defaults.stop_words, 
            token.pos_ != 'NOUN',
        ])
    ])
)
# Take a look at the data
processed_transcript

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2104/2104 [05:03<00:00,  6.94it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE,7926,Präsidentin Graf Alterspräsidentin Frau Bundes...
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE,5563,Frau Präsidentin Dame Kollegin Dame Tradition ...
2,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE,744,Präsidentin Graf Alterspräsidentin Büro Nation...
3,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE,2284,Bundesrat Büro Antrag Konstituierung Unvereinb...
4,254006,47804,1139,Das provisorische Büro hat die Prüfung der Unv...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:09:59,2019-12-02T15:11:40,B,DE,1344,Büro Prüfung Unvereinbarkeit Gesetzesbestimmun...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099,260955,48950,4062,"Der Bundesrat ist sich bewusst, dass sich inne...",S,20200506,5103,BR-F,99.0,2020-05-06T16:40:15,2020-05-06T16:44:22,BR-F,DE,3481,Bundesrat Reisebranche Reisebüro Reiseveransta...
2100,260974,48951,4153,Es ist schon fast so spannend und interessant ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:32:58,2020-05-06T17:38:33,*,DE,3520,Antrag Differenz Session Heimweg Differenz Tou...
2101,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE,1301,Kommissionssprecher Lösung Lösung Ausgang Fina...
2102,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE,3560,Bezug Tourismus Commitment Tourismus Tourismus...


In [64]:
processed_transcript['text_lemma_list'] = processed_transcript['text_lemmatized'].apply(lambda x: x.split())
processed_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE,7926,Präsidentin Graf Alterspräsidentin Frau Bundes...,"[Präsidentin, Graf, Alterspräsidentin, Frau, B..."
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE,5563,Frau Präsidentin Dame Kollegin Dame Tradition ...,"[Frau, Präsidentin, Dame, Kollegin, Dame, Trad..."
2,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE,744,Präsidentin Graf Alterspräsidentin Büro Nation...,"[Präsidentin, Graf, Alterspräsidentin, Büro, N..."
3,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE,2284,Bundesrat Büro Antrag Konstituierung Unvereinb...,"[Bundesrat, Büro, Antrag, Konstituierung, Unve..."
4,254006,47804,1139,Das provisorische Büro hat die Prüfung der Unv...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:09:59,2019-12-02T15:11:40,B,DE,1344,Büro Prüfung Unvereinbarkeit Gesetzesbestimmun...,"[Büro, Prüfung, Unvereinbarkeit, Gesetzesbesti..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099,260955,48950,4062,"Der Bundesrat ist sich bewusst, dass sich inne...",S,20200506,5103,BR-F,99.0,2020-05-06T16:40:15,2020-05-06T16:44:22,BR-F,DE,3481,Bundesrat Reisebranche Reisebüro Reiseveransta...,"[Bundesrat, Reisebranche, Reisebüro, Reisevera..."
2100,260974,48951,4153,Es ist schon fast so spannend und interessant ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:32:58,2020-05-06T17:38:33,*,DE,3520,Antrag Differenz Session Heimweg Differenz Tou...,"[Antrag, Differenz, Session, Heimweg, Differen..."
2101,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE,1301,Kommissionssprecher Lösung Lösung Ausgang Fina...,"[Kommissionssprecher, Lösung, Lösung, Ausgang,..."
2102,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE,3560,Bezug Tourismus Commitment Tourismus Tourismus...,"[Bezug, Tourismus, Commitment, Tourismus, Tour..."


In [65]:
processed_transcript = processed_transcript[processed_transcript['text_lemma_list'].map(len) >= 10]
processed_transcript = processed_transcript.reset_index().rename(columns={'index': 'transcript_idx'})
processed_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE,7926,Präsidentin Graf Alterspräsidentin Frau Bundes...,"[Präsidentin, Graf, Alterspräsidentin, Frau, B..."
1,1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE,5563,Frau Präsidentin Dame Kollegin Dame Tradition ...,"[Frau, Präsidentin, Dame, Kollegin, Dame, Trad..."
2,2,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE,744,Präsidentin Graf Alterspräsidentin Büro Nation...,"[Präsidentin, Graf, Alterspräsidentin, Büro, N..."
3,3,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE,2284,Bundesrat Büro Antrag Konstituierung Unvereinb...,"[Bundesrat, Büro, Antrag, Konstituierung, Unve..."
4,4,254006,47804,1139,Das provisorische Büro hat die Prüfung der Unv...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:09:59,2019-12-02T15:11:40,B,DE,1344,Büro Prüfung Unvereinbarkeit Gesetzesbestimmun...,"[Büro, Prüfung, Unvereinbarkeit, Gesetzesbesti..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,2099,260955,48950,4062,"Der Bundesrat ist sich bewusst, dass sich inne...",S,20200506,5103,BR-F,99.0,2020-05-06T16:40:15,2020-05-06T16:44:22,BR-F,DE,3481,Bundesrat Reisebranche Reisebüro Reiseveransta...,"[Bundesrat, Reisebranche, Reisebüro, Reisevera..."
1896,2100,260974,48951,4153,Es ist schon fast so spannend und interessant ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:32:58,2020-05-06T17:38:33,*,DE,3520,Antrag Differenz Session Heimweg Differenz Tou...,"[Antrag, Differenz, Session, Heimweg, Differen..."
1897,2101,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE,1301,Kommissionssprecher Lösung Lösung Ausgang Fina...,"[Kommissionssprecher, Lösung, Lösung, Ausgang,..."
1898,2102,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE,3560,Bezug Tourismus Commitment Tourismus Tourismus...,"[Bezug, Tourismus, Commitment, Tourismus, Tour..."


In [67]:
processed_transcript.iloc[0]['text_lemma_list']

['Präsidentin',
 'Graf',
 'Alterspräsidentin',
 'Frau',
 'Bundesrat',
 'Dame',
 'Frau',
 'Nationalrat',
 'Bundeskanzler',
 'Kollegin',
 'Gast',
 'Ehre',
 'Legislatur',
 'Zufall',
 'Parlamentarierin',
 'Frau',
 'Parlamentarierin',
 'Legislatur',
 'Zufäll',
 'Leben',
 'Zufall',
 'Art',
 'Luft',
 'Legislatur',
 'Parlament',
 'Nationalrätinne',
 'Frau',
 'Stauffacherin',
 'Verstärkung',
 'Trägerin',
 'Idee',
 'Arbeit',
 'linke',
 'Aufsicht',
 'Nationalratssaal',
 'Parlament',
 'Kraft',
 'Aufbruch',
 'Geschichte',
 'Bürgerin',
 'Bürger',
 'Zeitenwende',
 'Manifestation',
 'Mal',
 'Umweltbewegung',
 'Anfang',
 'Achtzigerjahre',
 'Mensch',
 'Strasse',
 'Forderung',
 'jugendliche',
 'Zukunft',
 'handeln',
 'Klimawandel',
 'Mensch',
 'Eltern',
 'Grosselter',
 'Unterstützung',
 'Solidarität',
 'Einfluss',
 'Gesellschaftsbereich',
 'Bewegung',
 'Frauenbewegung',
 'Teilnehmend',
 'Juni',
 'Wegmarke',
 'Frau',
 'Mann',
 'Strasse',
 'Schub',
 'Gleichstellung',
 'Geschlecht',
 'Demokratie',
 'Moment'

In [68]:
processed_transcript.to_csv('data/lemmatized/transcripts_lemmatized_de_51.csv', encoding='utf-8')

### Load lemmatized transcript

In [124]:
selected_language = 'fr'
selected_sessions = [50, 51]
processed_transcript = pd.DataFrame()

for session in selected_sessions:
    with open('data/lemmatized/transcripts_lemmatized_' + selected_language + '_' + str(session) + '.csv', encoding='utf-8') as file:
        transcript = pd.read_csv(file).drop(columns='Unnamed: 0')
    
    processed_transcript = pd.concat([processed_transcript, transcript])
   
processed_transcript = processed_transcript.reset_index(drop=True)
processed_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"['élection', 'constitution', 'incompatibilité'..."
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,principe explication service conseiller mandat...,"['principe', 'explication', 'service', 'consei..."
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate soupçon fusil client ba...,"['carte', 'visite', 'diplomate', 'soupçon', 'f..."
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"['dossier', 'bout', 'possibilité', 'début', 'c..."
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,sujet intervention législation heure exemple c...,"['sujet', 'intervention', 'législation', 'heur..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"['modèle', 'affaire', 'entreprise', 'numérique..."
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"['doigt', 'problème', 'recours', 'collecte', '..."
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"['suite', 'attentat', 'vie', 'aide', 'victime'..."
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"['victime', 'infraction', 'soutien', 'face', '..."


### Transcripts by person, party and gender

In [125]:
with open('data/persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file).drop(columns='Unnamed: 0')[['PersonNumber', 'LastName', 'FirstName', 'GenderAsString', 'PartyAbbreviation']]
    
persons_df

Unnamed: 0,PersonNumber,LastName,FirstName,GenderAsString,PartyAbbreviation
0,9,Baumann,Ruedi,m,VERT-E-S
1,12,Beerli,Christine,f,PLR
2,14,Bezzola,Duri,m,PLR
3,15,Binder,Max,m,UDC
4,21,Blocher,Christoph,m,UDC
...,...,...,...,...,...
705,4329,Ruch,Daniel,m,PLR
706,4330,Berthoud,Alexandre,m,PLR
707,4331,Jost,Marc,m,PEV
708,4332,Crevoisier Crelier,Mathilde,f,PSS


In [126]:
transcript_by_person = processed_transcript.reset_index().merge(persons_df, on='PersonNumber', how='left').set_index('index')
transcript_by_person.index.name = None
transcript_by_person

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,LastName,FirstName,GenderAsString,PartyAbbreviation
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"['élection', 'constitution', 'incompatibilité'...",Maire,Jacques-André,m,PSS
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T15:23:08,B,FR,1643,principe explication service conseiller mandat...,"['principe', 'explication', 'service', 'consei...",Maire,Jacques-André,m,PSS
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate soupçon fusil client ba...,"['carte', 'visite', 'diplomate', 'soupçon', 'f...",Nidegger,Yves,m,UDC
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,...,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"['dossier', 'bout', 'possibilité', 'début', 'c...",Burkhalter,Didier,m,PLR
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,...,2015-11-30T18:32:48,BR-M,FR,5020,sujet intervention législation heure exemple c...,"['sujet', 'intervention', 'législation', 'heur...",Burkhalter,Didier,m,PLR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"['modèle', 'affaire', 'entreprise', 'numérique...",Fivaz,Fabien,m,VERT-E-S
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"['doigt', 'problème', 'recours', 'collecte', '...",Baume-Schneider,Elisabeth,f,PSS
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"['suite', 'attentat', 'vie', 'aide', 'victime'...",de Quattro,Jacqueline,f,PLR
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"['victime', 'infraction', 'soutien', 'face', '...",Baume-Schneider,Elisabeth,f,PSS


In [127]:
# number of transcripts by party
transcript_by_person.groupby('PartyAbbreviation').count()['Text']

PartyAbbreviation
-             95
CSPO           8
EàG           35
Lega           2
M-E          916
MCG           46
PDC          488
PLR         2445
PSS         4898
PdT           60
UDC         2336
VERT-E-S    1544
pvl          276
Name: Text, dtype: int64

In [128]:
# number of transcripts by gender
transcript_by_person.groupby('GenderAsString').count()['Text']

GenderAsString
f    3244
m    9905
Name: Text, dtype: int64

In [129]:
transcript_by_party = transcript_by_person.groupby('PartyAbbreviation').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_party.index.name = None
transcript_by_party = transcript_by_party.drop(['Al', 'Lega', 'PLD', '-'], errors='ignore')
transcript_by_party

Unnamed: 0,text_lemmatized
CSPO,juin arrêté vien essentiel rejet arrêté crédit...
EàG,conseiller reprise plupart étude partie étude ...
M-E,bloc cause objet cours répétition abstention o...
MCG,armée menace amélioration évolution sécurité a...
PDC,thème avenir planète ressource manière objecti...
PLR,dossier bout possibilité début carte visite go...
PSS,élection constitution incompatibilité déroulem...
PdT,reprise membre sujet respect rente assurance v...
UDC,carte visite diplomate soupçon fusil client ba...
VERT-E-S,peuple salle problème manière ressource contra...


In [130]:
transcript_by_gender = transcript_by_person.groupby('GenderAsString').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_gender.index.name = None
transcript_by_gender

Unnamed: 0,text_lemmatized
f,peuple salle problème manière ressource contra...
m,élection constitution incompatibilité déroulem...


### TF-IDF 

In [131]:
group_transcript = processed_transcript.copy()

In [132]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(group_transcript['text_lemmatized'].to_list())
len(vectorizer.get_feature_names_out())

12002

In [133]:
tfidf_df = pd.DataFrame(vectors.toarray(), index=group_transcript.index, columns=vectorizer.get_feature_names_out())
tfidf_df.index.name = None
tfidf_df

Unnamed: 0,aaa,ab,abaissement,abandon,abandonniste,abat,abattage,abattement,abattoir,abattu,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,évêque,île,îlot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
tfidf_df.sort_values(axis='columns', by=tfidf_df.iloc[1].name, ascending=False)

Unnamed: 0,incompatibilité,élection,vraisemblance,mandat,renonciation,communiqué,député,conseiller,échéance,élu,...,emprise,emprisonnement,emprisonné,emprisonnée,emprunt,emprunteur,empêchement,empêcher,encablure,îlot
0,0.210354,0.753928,0.000000,0.107582,0.000000,0.000000,0.000000,0.083449,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.648544,0.309925,0.223094,0.221125,0.201816,0.172794,0.172511,0.171521,0.164424,0.157396,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.084758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.032137,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13145,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13146,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
tfidf_score_df = tfidf_df.stack().reset_index()
tfidf_score_df = tfidf_score_df.rename(columns={'ID': 'transcript','level_1': 'term', 0:'score', 'level_0': 'idx'})
tfidf_score_df

Unnamed: 0,idx,term,score
0,0,aaa,0.0
1,0,ab,0.0
2,0,abaissement,0.0
3,0,abandon,0.0
4,0,abandonniste,0.0
...,...,...,...
157814293,13148,événementielle,0.0
157814294,13148,évêché,0.0
157814295,13148,évêque,0.0
157814296,13148,île,0.0


In [136]:
top_tfidf = tfidf_score_df.sort_values(by=['idx','score'], ascending=[True,False]).groupby(['idx']).head(100)
top_tfidf

Unnamed: 0,idx,term,score
11782,0,élection,0.753928
2558,0,constitution,0.510427
5631,0,incompatibilité,0.210354
3801,0,déroulement,0.184457
4522,0,feuille,0.176052
...,...,...,...
157802352,13148,abyssale,0.000000
157802353,13148,abécédaire,0.000000
157802354,13148,abîme,0.000000
157802355,13148,acabit,0.000000


In [137]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
252869,21,assurance,0.032042
264871,22,assurance,0.067857
636933,53,assurance,0.211130
660937,55,assurance,0.135955
840967,70,assurance,0.321194
...,...,...,...
156974985,13079,assurance,0.281580
156998989,13081,assurance,0.106092
157010991,13082,assurance,0.062815
157779119,13146,assurance,0.044613


In [138]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]
top_tfidf[top_tfidf['idx'] == 838]

Unnamed: 0,idx,term,score
10065094,838,objectif,0.503116
10067096,838,revenu,0.363402
10061582,838,développement,0.359346
10066517,838,période,0.253587
10067930,838,sorte,0.243882
...,...,...,...
10057743,838,acceptabilité,0.000000
10057744,838,acceptable,0.000000
10057745,838,acceptation,0.000000
10057746,838,accepte,0.000000


In [139]:
top_10_tfidf = top_tfidf.groupby('idx').head(10)
top_10_tfidf

Unnamed: 0,idx,term,score
11782,0,élection,0.753928
2558,0,constitution,0.510427
5631,0,incompatibilité,0.210354
3801,0,déroulement,0.184457
4522,0,feuille,0.176052
...,...,...,...
157813393,13148,tribunal,0.189952
157813740,13148,victimisation,0.134157
157806974,13148,fondement,0.085421
157809855,13148,optique,0.085153


In [140]:
top_10_tfidf.query('idx > 6870 & idx < 6879 & score > 0')

Unnamed: 0,idx,term,score
82473393,6871,ouverture,0.571813
82469006,6871,discothèque,0.270737
82477674,6871,étape,0.241162
82468112,6871,concept,0.238186
82468118,6871,concert,0.209831
...,...,...,...
82558140,6878,pratique,0.195454
82556330,6878,malade,0.192267
82560258,6878,suite,0.191063
82558633,6878,quarantaine,0.184826


In [141]:
top_10_tfidf = top_10_tfidf.query('score > 0')
top_10_tfidf

Unnamed: 0,idx,term,score
11782,0,élection,0.753928
2558,0,constitution,0.510427
5631,0,incompatibilité,0.210354
3801,0,déroulement,0.184457
4522,0,feuille,0.176052
...,...,...,...
157813393,13148,tribunal,0.189952
157813740,13148,victimisation,0.134157
157806974,13148,fondement,0.085421
157809855,13148,optique,0.085153


In [142]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_10_tfidf.query('idx > -1 & idx < 10 & score > 0')
#top_tfidf_plusRand = top_10_tfidf.copy()
top_tfidf_plusRand['score'] = top_tfidf_plusRand['score'] + np.random.rand(top_tfidf_plusRand.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'idx:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("score", order="descending")],
    groupby = ["idx"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'score:Q'
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + text).properties(width = 1000)

In [838]:
top_10_tfidf.to_csv('top_tfidf.csv', encoding='utf-8')
top_10_tfidf

Unnamed: 0,idx,term,score
1579,0,bureau,0.642947
9859,0,élection,0.528889
2434,0,constitution,0.351921
5705,0,mandat,0.157655
4918,0,incompatibilité,0.143407
...,...,...,...
76828223,7649,tribunal,0.191068
76828483,7649,victimisation,0.132253
76825365,7649,optique,0.089030
76823050,7649,fondement,0.088222


In [143]:
top_terms = top_10_tfidf.groupby('idx')['term'].apply(list)
top_terms.name = 'top_terms'
top_terms = pd.DataFrame(top_terms)
top_terms.index.name = None
top_terms

Unnamed: 0,top_terms
0,"[élection, constitution, incompatibilité, déro..."
1,"[incompatibilité, élection, vraisemblance, man..."
2,"[potentat, carte, visite, blocage, fusil, dipl..."
3,"[notion, nécessité, détournement, réputation, ..."
4,"[confiscation, entraide, restitution, prescrip..."
...,...
13144,"[algorithme, collecte, solution, publicité, op..."
13145,"[plateforme, publicité, donnée, algorithme, in..."
13146,"[victime, acte, terrorisme, citoyen, violence,..."
13147,"[victime, infraction, étranger, indemnisation,..."


In [144]:
top_terms.loc[100]

top_terms    [poste, veto, desserte, fermeture, compétitivi...
Name: 100, dtype: object

In [145]:
top_terms.loc[sorted(random.sample(list(processed_transcript.index), 500))]

Unnamed: 0,top_terms
17,"[coût, délai, responsabilité, transition, temp..."
33,"[renseignement, surveillance, collaboration, s..."
49,"[préservation, intérêt, soulagement, coexisten..."
62,"[budget, femme, mécène, homme, donnée, entité,..."
78,"[exclusivité, principe, médicament, incitatif,..."
...,...
13039,"[compagnie, apparence, denier, aéroport, urgen..."
13076,"[examen, travail, stabilisation, acceptation, ..."
13102,"[réinstallation, tradition, réfugié, compétenc..."
13118,"[puberté, enfantine, imprescriptibilité, porno..."


### BERTopic

#### Semi-supervised

In [1185]:
with open('sample_cat.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None
    sample_cat = sample_cat[['cat']]
    
sample_cat

Unnamed: 0,cat
120,économie
126,santé
127,santé
324,travail
563,santé
...,...
7138,économie
7197,santé
7274,immigration
7278,travail


In [1182]:
category_names = list(sample_cat['cat'].unique())
category_names

['économie',
 'santé',
 'travail',
 'écologie',
 'société',
 'sécurité',
 'formation',
 'politique',
 'recherche',
 'agriculture',
 'finance',
 'sport',
 'média',
 'immigration',
 'justice',
 'armée',
 'extérieur',
 'transport',
 'retraite',
 'énergie',
 'culture',
 'immobilier']

In [1202]:
category_dict = dict()
for i in category_names:
    idx = category_names.index(i)
    category_dict[i] = idx
    
category_dict

{'économie': 0,
 'santé': 1,
 'travail': 2,
 'écologie': 3,
 'société': 4,
 'sécurité': 5,
 'formation': 6,
 'politique': 7,
 'recherche': 8,
 'agriculture': 9,
 'finance': 10,
 'sport': 11,
 'média': 12,
 'immigration': 13,
 'justice': 14,
 'armée': 15,
 'extérieur': 16,
 'transport': 17,
 'retraite': 18,
 'énergie': 19,
 'culture': 20,
 'immobilier': 21}

In [1206]:
sample_cat = sample_cat.replace(category_dict)
sample_cat

Unnamed: 0,cat
120,0
126,1
127,1
324,2
563,1
...,...
7138,0
7197,1
7274,13
7278,2


In [1209]:
supervised_transcript = processed_transcript.merge(sample_cat, left_index=True, right_index=True, how='left')
supervised_transcript['cat'] = supervised_transcript['cat'].fillna(-1)
supervised_transcript = supervised_transcript.astype({'cat':'int'})
supervised_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,cat
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...",-1
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic...",-1
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774,session été session automne octobre fois objet...,"[session, été, session, automne, octobre, fois...",-1
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...,"[révision, protection, population, protection,...",-1
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...",-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...",-1
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...",-1
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...",-1
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...",-1


In [1210]:
y = supervised_transcript['cat'].to_list()
y

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1

In [1212]:
# Initiate BERTopic
test_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = test_model.fit_transform(docs, y=y)
test_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [01:29<00:00,  2.57it/s]
2023-05-31 20:52:02,070 - BERTopic - Transformed documents to Embeddings
2023-05-31 20:52:16,499 - BERTopic - Reduced dimensionality
2023-05-31 20:52:18,536 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,3252,-1_entreprise_raison_travail_manière
1,0,301,0_peine_procédure_code_crime
2,1,168,1_chômage_salaire_travail_travailleur
3,2,142,2_vaccin_pandémie_vaccination_épidémie
4,3,134,3_émission_gaz_serre_climat
...,...,...,...
93,92,11,92_entreprise_rigueur_affaire_chiffre
94,93,11,93_législature_programme_stratégie_pauvreté
95,94,11,94_naturalisation_nationalité_parent_génération
96,95,10,95_amortissement_compte_supplément_crédit


In [1213]:
new_topics = test_model.reduce_outliers(docs, topics)
test_model.update_topics(docs, topics=new_topics)
test_model.get_topic_info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.52it/s]


Unnamed: 0,Topic,Count,Name
0,-1,1,-1_tre_lingue_jau_discur
1,0,404,0_peine_procédure_code_juge
2,1,244,1_chômage_travail_salaire_travailleur
3,2,187,2_vaccin_pandémie_épidémie_vaccination
4,3,166,3_émission_gaz_climat_serre
...,...,...,...
93,92,65,92_entreprise_affaire_rigueur_chiffre
94,93,47,93_législature_programme_stratégie_message
95,94,18,94_naturalisation_nationalité_parent_génération
96,95,28,95_dette_endettement_amortissement_compte


In [1229]:
test_model.get_topic(60)

[('solution', 0.09677835543136502),
 ('rigueur', 0.03574394611154022),
 ('problème', 0.021230163538695623),
 ('compétence', 0.02032276557531324),
 ('acceptabilité', 0.019431546625913617),
 ('chose', 0.019346144216684034),
 ('bail', 0.017948352413834988),
 ('compromis', 0.017648671639580445),
 ('dossier', 0.017087540159614324),
 ('manoeuvre', 0.016391057593906477)]

In [1228]:
test_model.get_document_info(docs).loc[2449]

Document                   discussion air base action mois contour crise ...
Topic                                                                     60
Name                                 60_solution_rigueur_problème_compétence
Top_n_words                solution - rigueur - problème - compétence - a...
Probability                                                         0.012841
Representative_document                                                False
Name: 2449, dtype: object

#### Training

In [1516]:
loaded_model = BERTopic.load("my_model")
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1126,0_santé_assurance_coût_soin
1,1,772,1_transparence_vote_divergence_surveillance
2,2,728,2_procédure_peine_code_juge
3,3,507,3_entreprise_prix_marché_produit
4,4,469,4_enfant_femme_parent_accueil
5,5,391,5_animal_agriculture_production_vin
6,6,331,6_crédit_budget_dépense_impôt
7,7,314,7_biodiversité_émission_eau_produit
8,8,290,8_armée_guerre_service_matériel
9,9,216,9_travail_chômage_travailleur_convention


In [156]:
# Initiate UMAP
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=100,
)

# Initiate BERTopic
topic_model = BERTopic(calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 411/411 [02:49<00:00,  2.42it/s]
2023-06-05 14:09:47,753 - BERTopic - Transformed documents to Embeddings
2023-06-05 14:09:51,192 - BERTopic - Reduced dimensionality
2023-06-05 14:10:01,495 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5343,-1_travail_entreprise_raison_assurance
1,0,446,0_peine_juge_procédure_crime
2,1,328,1_chômage_travailleur_salaire_travail
3,2,322,2_vaccin_pandémie_coronavirus_crise
4,3,286,3_armée_civil_service_munition
...,...,...,...
149,148,10,148_présentation_loyer_insertion_offre
150,149,10,149_expérimentation_animal_recherche_médicament
151,150,10,150_donnée_télécommunication_télécommunication...
152,151,10,151_impôt_provision_réforme_taux


In [154]:
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)
topic_model.get_topic_info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.95it/s]


Unnamed: 0,Topic,Count,Name
0,0,577,0_peine_juge_procédure_code
1,1,392,1_pandémie_vaccin_crise_coronavirus
2,2,412,2_budget_dépense_crédit_banque
3,3,376,3_chômage_travailleur_salaire_travail
4,4,290,4_animal_loup_élevage_cheval
...,...,...,...
139,139,40,139_corruption_blanchiment_argent_entraide
140,140,24,140_dispositif_fabricant_règlement_produit
141,141,25,141_test_hiver_mars_charge
142,142,26,142_sphère_économie_relation_citoyen


In [155]:
topic_model.get_topic(0)

[('peine', 0.028929184412578662),
 ('juge', 0.02735071924902853),
 ('procédure', 0.023212295855029166),
 ('code', 0.0221015509283725),
 ('infraction', 0.021164680505027466),
 ('tribunal', 0.02007863165116139),
 ('justice', 0.0193380000482554),
 ('crime', 0.01837074736075378),
 ('victime', 0.015965651086658247),
 ('avocat', 0.014620977962386775)]

In [119]:
topic_model.get_document_info(docs, df=processed_transcript)

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,LanguageOfText,text_length,text_lemmatized,text_lemma_list,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,...,FR,1159,élection constitution incompatibilité déroulem...,"['élection', 'constitution', 'incompatibilité'...",élection constitution incompatibilité déroulem...,5,5_vote_référendum_élection_votation,vote - référendum - élection - votation - élec...,0.039621,False
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,...,FR,1643,principe explication service conseiller mandat...,"['principe', 'explication', 'service', 'consei...",principe explication service conseiller mandat...,5,5_vote_référendum_élection_votation,vote - référendum - élection - votation - élec...,0.029537,False
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,...,FR,3925,carte visite diplomate soupçon fusil client ba...,"['carte', 'visite', 'diplomate', 'soupçon', 'f...",carte visite diplomate soupçon fusil client ba...,86,86_texte_procédure_modification_restitution,texte - procédure - modification - restitution...,1.000000,True
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,...,FR,4090,dossier bout possibilité début carte visite go...,"['dossier', 'bout', 'possibilité', 'début', 'c...",dossier bout possibilité début carte visite go...,86,86_texte_procédure_modification_restitution,texte - procédure - modification - restitution...,0.009178,False
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,...,FR,5020,sujet intervention législation heure exemple c...,"['sujet', 'intervention', 'législation', 'heur...",sujet intervention législation heure exemple c...,14,14_sanction_amende_peine_délit,sanction - amende - peine - délit - casier - r...,0.066848,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6213,6611,253518,47419,305,Je souhaiterais préciser qu'il appartiendra au...,S,20190926,5019,Mit-M,2.0,...,FR,1044,dangerosité cigarette point vue recul cigarett...,"['dangerosité', 'cigarette', 'point', 'vue', '...",dangerosité cigarette point vue recul cigarett...,20,20_tabac_cigarette_publicité_cannabis,tabac - cigarette - publicité - cannabis - con...,0.186574,False
6214,6612,253392,47419,1161,"Actuellement, les produits qui se fument comme...",S,20190926,5019,BR-M,99.0,...,FR,2199,produit cigarette produit snu cigarette produi...,"['produit', 'cigarette', 'produit', 'snu', 'ci...",produit cigarette produit snu cigarette produi...,20,20_tabac_cigarette_publicité_cannabis,tabac - cigarette - publicité - cannabis - con...,0.781917,False
6215,6613,253516,47421,1267,"Je peux être très bref. La commission, par 7 v...",S,20190926,5019,1VP-M,2.0,...,FR,769,abstention décembre abstention différence résu...,"['abstention', 'décembre', 'abstention', 'diff...",abstention décembre abstention différence résu...,0,0_patient_maladie_soin_prime,patient - maladie - soin - prime - coût - médi...,0.064971,False
6216,6614,253445,47422,1161,La motion vise l'introduction d'une nouvelle d...,S,20190926,5019,BR-M,99.0,...,FR,3130,introduction prévoyance contrôle vérification ...,"['introduction', 'prévoyance', 'contrôle', 'vé...",introduction prévoyance contrôle vérification ...,78,78_juge_tir_surveillance_tribunal,juge - tir - surveillance - tribunal - affaire...,0.074977,False


In [120]:
topic_model.get_topic_info().to_csv('topics_50.csv', encoding='utf-8')

In [1374]:
with open('topics_cat.csv', encoding='utf-8') as file:
    topics_df = pd.read_csv(file).drop(columns='Unnamed: 0')

topics_df

Unnamed: 0,Topic,Count,Name,cat
0,0,683,0_peine_procédure_code_infraction,justice
1,1,391,1_pandémie_vaccin_crise_épidémie,santé
2,2,237,2_crédit_budget_dépense_endettement,finance
3,3,180,3_rente_retraite_pilier_réforme,retraite
4,4,167,4_énergie_électricité_installation_approvision...,énergie
...,...,...,...,...
85,85,20,85_surface_sol_zone_estivage,agriculture
86,86,20,86_prix_énergie_électricité_essence,énergie
87,87,24,87_médicament_catégorie_prix_approvisionnement,santé
88,88,23,88_moratoire_betterave_génie_culture,agriculture


In [1509]:
test_merge = topics_df.merge(topic_model.get_topic_info(), left_index=True, right_index=True)
test_merge.loc[test_merge['Count_x'] != test_merge['Count_y']]

Unnamed: 0,Topic_x,Count_x,Name_x,cat,Topic_y,Count_y,Name_y


In [1510]:
topic_groups = list(topics_df.groupby('cat')['Topic'].apply(list))
topic_groups

[[8, 20, 39, 59, 75, 85, 88, 89],
 [26, 27, 31, 36],
 [29, 38, 83],
 [25, 44, 63],
 [2, 77, 81, 84],
 [23, 64],
 [15, 45],
 [0, 43],
 [14],
 [42],
 [5, 7, 12, 21, 32, 37],
 [16, 74],
 [3],
 [1, 18, 24, 30, 41, 49, 50, 53, 54, 57, 58, 61, 67, 69, 72, 87],
 [10, 11, 22, 48, 73, 76],
 [40],
 [33, 35, 52],
 [17, 55, 65, 80],
 [19, 28, 68, 82],
 [6, 13, 34],
 [9, 46, 47, 51, 56, 60, 62, 70, 71, 79],
 [4, 86]]

In [1511]:
topic_model.merge_topics(docs, topic_groups)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1126,0_santé_assurance_coût_soin
1,1,772,1_transparence_vote_divergence_surveillance
2,2,728,2_procédure_peine_code_juge
3,3,507,3_entreprise_prix_marché_produit
4,4,469,4_enfant_femme_parent_accueil
5,5,391,5_animal_agriculture_production_vin
6,6,331,6_crédit_budget_dépense_impôt
7,7,314,7_biodiversité_émission_eau_produit
8,8,290,8_armée_guerre_service_matériel
9,9,216,9_travail_chômage_travailleur_convention


In [1517]:
sample = []
sample_dict = dict()

for i in sorted(random.sample(list(processed_transcript.index), 100)):
    doc_topic = loaded_model.get_document_info(docs).loc[i]
    
    topic_idx = doc_topic['Topic']
    topic_score = round(doc_topic['Probability'], 3)
    topic_words = doc_topic['Top_n_words']
    transcript_words = top_terms.loc[i]['top_terms']

    sample_dict[i] = {
        'topic_idx': topic_idx,
        'topic_score': topic_score,
        'topic_words': topic_words,
        'transcript_words': transcript_words
    }
    
    if topic_idx >= 0:
        sample.append('TRANSCRIPT IDX: ' + str(i))
        sample.append('TOPIC: ' + str(topic_idx))
        sample.append(topic_words)
        sample.append('\nTOP TERMS')
        sample.append(str(transcript_words))
        sample.append('score: ' + str(topic_score))

        if topic_score < 0.01:
            sample.append('----- LOW SCORE -----')
            
        sample.append('\n///////////////////////////////////////\n')
    else:
        sample.append('===========================================')
        sample.append('IDX: ' + str(i))
        sample.append(str(transcript_words))
        sample.append('===========================================\n')
        
print(len(sample))
sample = '\n'.join(sample)
sample_df = pd.DataFrame.from_dict(sample_dict, orient='index')
sample_df

701


Unnamed: 0,topic_idx,topic_score,topic_words,transcript_words
10,1,0.043,transparence - vote - divergence - surveillanc...,"[tiret, paragraphe, immigration, législation, ..."
64,11,0.029,asile - donnée - requérant - réfugié - informa...,"[note, information, tradition, publication, do..."
128,2,0.131,procédure - peine - code - juge - infraction -...,"[organe, convention, trafic, punissabilité, in..."
184,16,0.203,formation - école - apprentissage - étudiant -...,"[formation, soin, personnel, besoin, planifica..."
189,2,0.298,procédure - peine - code - juge - infraction -...,"[liberté, assureur, infirmier, patient, servic..."
...,...,...,...,...
6661,2,0.198,procédure - peine - code - juge - infraction -...,"[mineur, délinquant, internement, code, assass..."
6671,17,0.124,culture - jeu - film - vidéo - message - mineu...,"[insatisfaction, déclamatoire, évaluation, pré..."
6726,13,0.058,rente - retraite - pilier - réforme - femme - ...,"[pilier, réforme, travail, résultat, chiffré, ..."
6812,0,0.036,santé - assurance - coût - soin - prime - mala...,"[cigarette, terre, usage, commerce, produit, c..."


In [1518]:
sample_df.to_csv('sample.csv', encoding='utf-8')

with open('sample.txt', 'w', encoding='utf-8') as file:
    file.write(sample)

In [1492]:
#topic_model.save("my_model")

In [1417]:
topic_model.get_topic_info().to_csv('topics_to_rename.csv', encoding='utf-8')

In [1418]:
with open('topics_renamed.csv', encoding='utf-8') as file:
    topics_renamed = pd.read_csv(file).drop(columns='Unnamed: 0')
    
topics_renamed

Unnamed: 0,Topic,Count,Name,new_name
0,0,1126,0_santé_assurance_coût_soin,santé
1,1,772,1_transparence_vote_divergence_surveillance,politique
2,2,728,2_procédure_peine_code_juge,justice
3,3,507,3_entreprise_prix_marché_produit,économie
4,4,469,4_enfant_femme_parent_accueil,société
5,5,391,5_animal_agriculture_production_vin,agriculture
6,6,331,6_crédit_budget_dépense_impôt,finance
7,7,314,7_biodiversité_émission_eau_produit,écologie
8,8,290,8_armée_guerre_service_matériel,armée
9,9,216,9_travail_chômage_travailleur_convention,travail


In [1436]:
topics_name_dict = dict()

for idx, row in topics_renamed.iterrows():
    count = idx
    if count < 10:
        count = '0' + str(count)
    else:
        count = str(count)
        
    topics_name_dict[row['Name']] = count + '_' + row['new_name']
    
topics_name_dict

{'0_santé_assurance_coût_soin': '00_santé',
 '1_transparence_vote_divergence_surveillance': '01_politique',
 '2_procédure_peine_code_juge': '02_justice',
 '3_entreprise_prix_marché_produit': '03_économie',
 '4_enfant_femme_parent_accueil': '04_société',
 '5_animal_agriculture_production_vin': '05_agriculture',
 '6_crédit_budget_dépense_impôt': '06_finance',
 '7_biodiversité_émission_eau_produit': '07_écologie',
 '8_armée_guerre_service_matériel': '08_armée',
 '9_travail_chômage_travailleur_convention': '09_travail',
 '10_sanction_développement_coopération_neutralité': '10_extérieur',
 '11_asile_donnée_requérant_réfugié': '11_immigration',
 '12_énergie_électricité_installation_approvisionnement': '12_énergie',
 '13_rente_retraite_pilier_réforme': '13_retraite',
 '14_transport_trafic_véhicule_vélo': '14_transport',
 '15_violence_terrorisme_sécurité_menace': '15_sécurité',
 '16_formation_école_apprentissage_étudiant': '16_formation',
 '17_culture_jeu_film_vidéo': '17_culture',
 '18_recher

In [1382]:
topic_by_person = topic_model.get_document_info(docs, df=transcript_by_person)
topic_by_person

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,LastName,FirstName,GenderAsString,PartyAbbreviation,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,...,Piller Carrard,Valérie,f,PSS,élection bureau constitution incompatibilité b...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.091612,False
1,1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,...,Piller Carrard,Valérie,f,PSS,bureau principe bureau explication service con...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.102823,False
2,2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,...,Buffat,Michaël,m,UDC,été automne octobre fois objet création regist...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.045494,False
3,3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,...,Addor,Jean-Luc,m,UDC,révision protection population protection juin...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.320561,False
4,4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,...,Bourgeois,Jacques,m,PLR,séance février juin aménagement territoire éne...,5,5_animal_agriculture_production_vin,animal - agriculture - production - vin - sucr...,0.435514,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,Fivaz,Fabien,m,VERT-E-S,modèle affaire entreprise numérique collecte d...,3,3_entreprise_prix_marché_produit,entreprise - prix - marché - produit - consomm...,0.009871,False
6927,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,Baume-Schneider,Elisabeth,f,PSS,doigt problème recours collecte donnée surveil...,0,0_santé_assurance_coût_soin,santé - assurance - coût - soin - prime - mala...,0.005182,False
6928,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,de Quattro,Jacqueline,f,PLR,suite attentat vie aide victime infraction ind...,2,2_procédure_peine_code_juge,procédure - peine - code - juge - infraction -...,0.137881,False
6929,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,Baume-Schneider,Elisabeth,f,PSS,victime infraction soutien face traumatisme in...,2,2_procédure_peine_code_juge,procédure - peine - code - juge - infraction -...,0.032463,False


In [1438]:
topic_by_party = topic_by_person.groupby(['PartyAbbreviation', 'Name']).count()[['Text']].stack().unstack(level=1)
topic_by_party = topic_by_party.droplevel(level=1)

# set index and column row name to None
topic_by_party.index.name = None
topic_by_party.columns.name = None

# set all NaN to 0
topic_by_party = topic_by_party.fillna(0)

# set 
topic_by_party = topic_by_party.rename(columns=topics_name_dict)
topic_by_party = topic_by_party.reindex(sorted(topic_by_party.columns), axis=1)
topic_by_party = topic_by_party.T

for col in topic_by_party.columns:
    try:
        topic_by_party[[col]] = topic_by_party[[col]].astype(float).astype(int)
    except:
        pass
    
topic_by_party = topic_by_party.drop(columns=['Lega'])
    
for col in topic_by_party.columns:
    topic_by_party[col] = round(topic_by_party[col]/topic_by_party[col].sum()*100, 0).astype(int)
    
topic_by_party = topic_by_party.reindex(sorted(topic_by_party.columns), axis=1)
topic_by_party

Unnamed: 0,EàG,M-E,PLR,PSS,PdT,UDC,VERT-E-S,pvl
00_santé,9,18,14,23,16,10,11,9
01_politique,0,9,15,11,8,12,10,6
02_justice,11,12,13,11,0,7,11,5
03_économie,3,7,8,6,16,9,7,4
04_société,11,7,5,9,0,4,8,5
05_agriculture,3,5,4,4,4,11,6,4
06_finance,3,4,7,3,4,6,4,11
07_écologie,6,3,3,3,4,5,10,7
08_armée,6,6,3,3,12,5,5,11
09_travail,3,3,2,3,4,6,1,1


### Most frequent words

In [381]:
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(filtered_transcript['text_lemmatized'].to_list())
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
count_df

Unnamed: 0,00,0122,0202,0210,0219,0229,024,0244,026,0276,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,être,île,îlot
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7648,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [394]:
example_row = 1000
count_df.sort_values(axis='columns', by=example_row, ascending=False).iloc[example_row-5:example_row+5, 0:20]

Unnamed: 0,co2,objectif,émission,véhicule,automobiliste,progrès,principe,parc,dispositif,mise,voiture,emploi,heure,domaine,habitant,décarbonée,niveau,réponse,choix,polluant
995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
996,13,6,17,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0
997,9,7,6,4,0,0,0,2,0,0,4,0,0,0,0,0,0,0,0,0
998,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000,5,5,5,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1
1001,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0
1002,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1004,10,10,11,6,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0


### LDA gensim

In [810]:
filtered_transcript['text_lemm_list'] = filtered_transcript['text_lemmatized'].apply(lambda x: x.split())
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_lemmatized,text_lemma_list,text_lemm_list
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...","[élection, bureau, constitution, incompatibili..."
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,décision bureau principe bureau explication se...,"[décision, bureau, principe, bureau, explicati...","[décision, bureau, principe, bureau, explicati..."
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,session été session automne position octobre f...,"[session, été, session, automne, position, oct...","[session, été, session, automne, position, oct..."
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,révision protection population protection juin...,"[révision, protection, population, protection,...","[révision, protection, population, protection,..."
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...","[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...","[modèle, affaire, entreprise, numérique, colle..."
7646,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...","[doigt, problème, recours, collecte, donnée, s..."
7647,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...","[suite, attentat, vie, aide, victime, infracti..."
7648,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...","[victime, infraction, soutien, face, traumatis..."


In [811]:
docs = filtered_transcript['text_lemm_list'].to_list()

dictionary = corpora.Dictionary(docs)

DT_matrix = [dictionary.doc2bow(doc) for doc in docs]

Lda_object = gensim.models.ldamodel.LdaModel

In [812]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word = dictionary)

lda_model_1.print_topics(num_topics=10, num_words=5)

[(0,
  '0.018*"sanction" + 0.009*"guerre" + 0.009*"organe" + 0.008*"contrôle" + 0.008*"sécurité"'),
 (1,
  '0.013*"système" + 0.011*"objectif" + 0.010*"cadre" + 0.009*"point" + 0.008*"coût"'),
 (2,
  '0.018*"locataire" + 0.015*"loyer" + 0.014*"femme" + 0.010*"violence" + 0.009*"bailleur"'),
 (3,
  '0.017*"procédure" + 0.016*"assurance" + 0.011*"prime" + 0.010*"raison" + 0.008*"manière"'),
 (4,
  '0.009*"cadre" + 0.008*"point" + 0.007*"recherche" + 0.007*"budget" + 0.007*"moyen"'),
 (5,
  '0.025*"prix" + 0.015*"énergie" + 0.013*"patient" + 0.013*"coût" + 0.011*"santé"'),
 (6,
  '0.016*"produit" + 0.008*"raison" + 0.007*"contre-projet" + 0.007*"travail" + 0.007*"accord"'),
 (7,
  '0.022*"enfant" + 0.009*"décision" + 0.009*"parent" + 0.007*"rente" + 0.007*"procédure"'),
 (8,
  '0.026*"entreprise" + 0.018*"travail" + 0.009*"crédit" + 0.008*"aide" + 0.007*"milliard"'),
 (9,
  '0.013*"formation" + 0.012*"travail" + 0.010*"programme" + 0.009*"cadre" + 0.008*"soutien"')]

### top2vec

In [199]:
docs = filtered_transcript["text_lemmatized"].tolist()
len(docs)

324

In [196]:
topic_model = Top2Vec(
    docs,
    #embedding_model="universal-sentence-encoder-multilingual",
    speed="deep-learn",
)

2023-05-12 16:25:30,943 - top2vec - INFO - Pre-processing documents for training
2023-05-12 16:25:31,080 - top2vec - INFO - Creating joint document/word embedding
2023-05-12 16:25:38,074 - top2vec - INFO - Creating lower dimension embedding of documents
2023-05-12 16:25:39,207 - top2vec - INFO - Finding dense areas of documents
2023-05-12 16:25:39,213 - top2vec - INFO - Finding topics


In [201]:
model.get_topics()

{0: [('de', 0.13273781851509409),
  ('la', 0.10522084530715155),
  ('le', 0.07682557991849537),
  ('des', 0.06496992795926745),
  ('et', 0.06409332138178707),
  ('les', 0.061870529632359866),
  ('en', 0.055698036565708625),
  ('que', 0.055082912584038426),
  ('est', 0.05186673777975805),
  ('une', 0.042562060578284655)],
 1: [('de', 0.13792948109435965),
  ('la', 0.07491411465697119),
  ('et', 0.06830473202530185),
  ('les', 0.06791204410365591),
  ('le', 0.06573522710988462),
  ('pour', 0.06565243157193269),
  ('avs', 0.06497054756109226),
  ('des', 0.0634781534130419),
  ('est', 0.05652146656288921),
  ('une', 0.05409859315631972)]}