In [3]:
# Data processing
import os
import pandas as pd
import numpy as np
import ssl
import string
from collections import Counter
from transformers.pipelines import pipeline
import altair as alt
from tqdm import tqdm
import random

# Text preprocessiong
import nltk
from nltk.corpus import wordnet as wn
import spacy
import spacy_fastlang
from top2vec import Top2Vec
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap.umap_ import UMAP

In [4]:
# disable ssl check (to be able to download nltk packages)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [5]:
# download stanza package

import stanza
stanza.download(lang="multilingual")
stanza.download(lang="de")
stanza.download(lang="fr")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 76.8MB/s]                                                                              
2023-06-05 10:16:03 INFO: Downloading default packages for language: multilingual (multilingual) ...
2023-06-05 10:16:03 INFO: File exists: /Users/cyrille/stanza_resources/multilingual/default.zip
2023-06-05 10:16:03 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 56.7MB/s]                                                                              
2023-06-05 10:16:03 INFO: Downloading default packages for language: de (German) ...
2023-06-05 10:16:05 INFO: File exists: /Users/cyrille/stanza_resources/de/default.zip
2023-06-05 10:16:09 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.

In [6]:
# download nltk packages

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cyrille/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Lemmatize transcripts

In [53]:
transcript_df = pd.DataFrame()

directory = 'data/transcripts'
filepaths_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and file.endswith('.csv'):
        filepaths_list.append(file)

# sort transcripts chronologically 
filepaths_list.sort()
print(len(filepaths_list))

for filepath in tqdm(filepaths_list[97:100]):
    with open(filepath, encoding='utf-8') as file:
        session_df = pd.read_csv(file).drop(columns='Unnamed: 0')
        transcript_df = pd.concat([transcript_df, session_df])

transcript_df = transcript_df.reset_index(drop=True)
transcript_df

117


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 32.91it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE
2,253998,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): Ge...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:10,2019-12-02T14:52:30,P-F,
3,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE
4,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3210,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE
3211,261022,48951,3879,Je ne vais répéter en français ce qui vient d'...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:40:28,2020-05-06T17:43:59,Mit-M,FR
3212,261018,48951,4240,Je serai très bref. Je soutiendrai évidemment ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:43:59,2020-05-06T17:45:01,Mit-M,FR
3213,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE


In [93]:
selected_language = 'FR'

# only keep french texts
filtered_transcript = transcript_df.loc[transcript_df['LanguageOfText'] == selected_language]
# only keep texts longer than 300 char
filtered_transcript['text_length'] = filtered_transcript['Text'].apply(lambda x: len(x))
filtered_transcript = filtered_transcript[filtered_transcript['text_length'] > 300]
# reset index
filtered_transcript = filtered_transcript.reset_index(drop=True)
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,260916,48948,1108,Nous vivons cet après-midi un nouvel épisode d...,S,20200506,5103,VPBR-M,99.0,2020-05-06T15:24:31,2020-05-06T15:28:47,VPBR-M,FR,3454
763,261019,48949,4240,Emotions et pesée d'intérêts: il y a la politi...,S,20200506,5103,Mit-M,2.0,2020-05-06T16:19:19,2020-05-06T16:22:20,Mit-M,FR,2784
764,260946,48949,1108,"Tout d'abord, le Conseil fédéral, il faut bien...",S,20200506,5103,VPBR-M,99.0,2020-05-06T16:22:32,2020-05-06T16:31:04,VPBR-M,FR,7134
765,261022,48951,3879,Je ne vais répéter en français ce qui vient d'...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:40:28,2020-05-06T17:43:59,Mit-M,FR,2737


In [94]:
nlp_sent = spacy.load('xx_sent_ud_sm')
nlp = spacy.load(selected_language.lower() + "_core_news_md")
nlp.add_pipe("language_detector")
lemmatizer = nlp.get_pipe("lemmatizer")

additional_stopwords = {
    'FR': {'-t', 'avez', 'être', 'aujourd', 'hui'},
    'DE': set(),
}
removed_stopwords = {
    'FR': {'hui'},
    'DE': set(),
}
specific_stopwords = {
    'FR': {
        'accord', 'alinéa', 'an', 'année', 'article', 'avis', 'cadre', 'canton', 'cas', 
        'collègue', 'commission', 'conseil', 'débat', 'décision', 'discussion', 'disposition', 'domaine', 'droit', 
        'fédéral', 'franc', 'groupe', 'initiative', 'législature', 'loi', 'majorité', 'matière', 'mesure', 'milliard', 'million', 'minorité', 
        'monsieur', 'motion', 'parlementaire', 'pays', 'postulat', 'politique', 'position', 'président', 'proposition', 
        'projet', 'question', 'rapport', 'rapporteur', 'réponse', 'session', 'situation', 'suisse', 'voix'
    },
    'DE': {
        'Vereinbarung', 'Absatz', 'Jahr', 'Artikel', 'Stellungnahme', 'Rahmen', 'Kanton', 'Fall', 
        'Kollege', 'Kommission', 'Rat', 'Debatte', 'Entscheidung', 'Diskussion', 'Bestimmung', 'Bereich', 'Recht', 
        'föderal', 'Franken', 'Gruppe', 'Initiative', 'Legislaturperiode', 'Gesetz', 'Mehrheit', 'Materie', 'Massnahme', 'Milliarde', 'Million', 'Minderheit', 
        'Herr', 'Frau', 'Dame' 'Motion', 'Parlamentarier', 'Land', 'Postulat', 'Politik', 'Position', 'Präsident', 'Präsidentin', 'Vorschlag', 
        'Projekt', 'Frage', 'Bericht', 'Berichterstatter', 'Antwort', 'Sitzung', 'Lage', 'Schweiz', 'Stimme',
        'Bundesrat', 'Nationalrat', 'Bundeskanzler', 'Urne'
    },
}

if selected_language == 'DE':    
    # add stopwords 
    nlp.Defaults.stop_words |= additional_stopwords[selected_language]
    nlp.Defaults.stop_words |= specific_stopwords[selected_language]
    
    # remove stopwords
    nlp.Defaults.stop_words -= additional_stopwords['FR']
    nlp.Defaults.stop_words -= specific_stopwords['FR']
elif selected_language == 'FR':
    # add stopwords 
    nlp.Defaults.stop_words |= additional_stopwords[selected_language]
    nlp.Defaults.stop_words |= specific_stopwords[selected_language]
    
    # remove stopwords
    nlp.Defaults.stop_words -= additional_stopwords['DE']
    nlp.Defaults.stop_words -= specific_stopwords['DE']

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

558
{'moi-même', 'jusque', 'ouste', 'déja', 'une', 'puisque', 'quelques', 'trente', 'président', 'facon', 'pays', 'toi', 'dixième', 'lesquelles', 'lès', 'auxquels', 'dont', 'au', 'est', 'jusqu', 'o', 'exactement', 'vôtres', 'celui-là', 'chaque', 'quelconque', 'etc', 'alinéa', 'je', 'desquels', 'faisant', 'vu', 'tellement', "m'", 'egalement', 'toi-meme', 'pendant', 'cent', 'ait', 'cependant', "j'", 'n’', 'ou', 'préalable', 'les', 'surtout', 'directement', 'matière', 'réponse', 'autre', 'partant', 'dit', 'mêmes', 'dehors', 'canton', 'auxquelles', 'seuls', 'ont', 'cinquante', 'seul', "s'", 'celle-ci', 'celle-la', 'votres', 'minorité', 'un', 'dire', 'ouverte', 'donc', 'tente', 'antérieur', 'cinquième', 'autrui', "c'", 'lequel', 'car', 'aujourd', 'enfin', 'douze', 'deja', 'ouvert', 'droit', 'quelque', 'ni', 'soi', 'entre', 'seule', 'ceux', 'antérieure', 'importe', 'ayant', 'tres', 'soi-meme', 'na', 'd’', 'neanmoins', 'suffisante', 'pourrait', 'pourquoi', 'revoila', 'seules', 'parle', 'rend'



In [727]:
example_row = 288
text_id = filtered_transcript.iloc[example_row]['ID']
text = filtered_transcript.iloc[example_row]['Text']

print(text_id)
print('length:', len(text))
print(text)
print('---')

entity_dict = {}

for ent in nlp(text).ents:
    entity_dict[ent.text] = ent.label_
    
print(entity_dict)
print('---')

lemma_list = [
    token.lemma_ for sent in nlp_sent(text).sents 
    if (doc := nlp(sent.text))._.language == selected_language.lower() 
    for token in doc 
    if not any([
        token.is_stop, 
        token.is_punct, 
        token.is_space, 
        token.ent_type_, 
        not token.is_alpha, 
        token.lemma_ in nlp.Defaults.stop_words, 
        #token.pos_ != 'NOUN',
    ])
]

#lemma_list.sort()
print(lemma_list)

255705
length: 733
Je m'exprimerai une seule fois et de manière brève pour vous dire que le Conseil fédéral est très sensible à cette problématique. La contribution de solidarité doit revenir, dans son intégralité, aux victimes de mesures de coercition à des fins d'assistance et de placements extrafamiliaux. De l'avis du Conseil fédéral, il est incohérent, d'une part, que l'Etat accorde un dédommagement en signe de reconnaissance des torts causés et, d'autre part, qu'une prestation sociale soit réduite en conséquence.
Afin d'exclure aussi vite que possible toute prise en compte de la contribution de solidarité dans le calcul de la prestation complémentaire, le Conseil fédéral soutient l'initiative de la commission et vous invite à l'adopter.

---
{'Conseil fédéral': 'ORG', 'Etat': 'LOC'}
---
['exprimer', 'fois', 'manière', 'bref', 'sensible', 'problématique', 'contribution', 'solidarité', 'revenir', 'intégralité', 'victime', 'coercition', 'fin', 'assistance', 'placement', 'extrafamilia

In [63]:
# remove stopwords, punctuation and then lemmatize
tqdm.pandas()
processed_transcript = filtered_transcript.copy()
processed_transcript['text_lemmatized'] = processed_transcript['Text'].progress_apply(
    lambda x: ' '.join([
        token.lemma_ for sent in nlp_sent(x).sents 
        if (doc := nlp(sent.text))._.language == selected_language.lower() 
        for token in doc 
        if not any([
            token.is_stop, 
            token.is_punct, 
            token.is_space, 
            token.ent_type_, 
            not token.is_alpha, 
            token.lemma_ in nlp.Defaults.stop_words, 
            token.pos_ != 'NOUN',
        ])
    ])
)
# Take a look at the data
processed_transcript

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2104/2104 [05:03<00:00,  6.94it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE,7926,Präsidentin Graf Alterspräsidentin Frau Bundes...
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE,5563,Frau Präsidentin Dame Kollegin Dame Tradition ...
2,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE,744,Präsidentin Graf Alterspräsidentin Büro Nation...
3,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE,2284,Bundesrat Büro Antrag Konstituierung Unvereinb...
4,254006,47804,1139,Das provisorische Büro hat die Prüfung der Unv...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:09:59,2019-12-02T15:11:40,B,DE,1344,Büro Prüfung Unvereinbarkeit Gesetzesbestimmun...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099,260955,48950,4062,"Der Bundesrat ist sich bewusst, dass sich inne...",S,20200506,5103,BR-F,99.0,2020-05-06T16:40:15,2020-05-06T16:44:22,BR-F,DE,3481,Bundesrat Reisebranche Reisebüro Reiseveransta...
2100,260974,48951,4153,Es ist schon fast so spannend und interessant ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:32:58,2020-05-06T17:38:33,*,DE,3520,Antrag Differenz Session Heimweg Differenz Tou...
2101,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE,1301,Kommissionssprecher Lösung Lösung Ausgang Fina...
2102,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE,3560,Bezug Tourismus Commitment Tourismus Tourismus...


In [64]:
processed_transcript['text_lemma_list'] = processed_transcript['text_lemmatized'].apply(lambda x: x.split())
processed_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE,7926,Präsidentin Graf Alterspräsidentin Frau Bundes...,"[Präsidentin, Graf, Alterspräsidentin, Frau, B..."
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE,5563,Frau Präsidentin Dame Kollegin Dame Tradition ...,"[Frau, Präsidentin, Dame, Kollegin, Dame, Trad..."
2,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE,744,Präsidentin Graf Alterspräsidentin Büro Nation...,"[Präsidentin, Graf, Alterspräsidentin, Büro, N..."
3,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE,2284,Bundesrat Büro Antrag Konstituierung Unvereinb...,"[Bundesrat, Büro, Antrag, Konstituierung, Unve..."
4,254006,47804,1139,Das provisorische Büro hat die Prüfung der Unv...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:09:59,2019-12-02T15:11:40,B,DE,1344,Büro Prüfung Unvereinbarkeit Gesetzesbestimmun...,"[Büro, Prüfung, Unvereinbarkeit, Gesetzesbesti..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099,260955,48950,4062,"Der Bundesrat ist sich bewusst, dass sich inne...",S,20200506,5103,BR-F,99.0,2020-05-06T16:40:15,2020-05-06T16:44:22,BR-F,DE,3481,Bundesrat Reisebranche Reisebüro Reiseveransta...,"[Bundesrat, Reisebranche, Reisebüro, Reisevera..."
2100,260974,48951,4153,Es ist schon fast so spannend und interessant ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:32:58,2020-05-06T17:38:33,*,DE,3520,Antrag Differenz Session Heimweg Differenz Tou...,"[Antrag, Differenz, Session, Heimweg, Differen..."
2101,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE,1301,Kommissionssprecher Lösung Lösung Ausgang Fina...,"[Kommissionssprecher, Lösung, Lösung, Ausgang,..."
2102,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE,3560,Bezug Tourismus Commitment Tourismus Tourismus...,"[Bezug, Tourismus, Commitment, Tourismus, Tour..."


In [65]:
processed_transcript = processed_transcript[processed_transcript['text_lemma_list'].map(len) >= 10]
processed_transcript = processed_transcript.reset_index().rename(columns={'index': 'transcript_idx'})
processed_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE,7926,Präsidentin Graf Alterspräsidentin Frau Bundes...,"[Präsidentin, Graf, Alterspräsidentin, Frau, B..."
1,1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE,5563,Frau Präsidentin Dame Kollegin Dame Tradition ...,"[Frau, Präsidentin, Dame, Kollegin, Dame, Trad..."
2,2,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE,744,Präsidentin Graf Alterspräsidentin Büro Nation...,"[Präsidentin, Graf, Alterspräsidentin, Büro, N..."
3,3,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE,2284,Bundesrat Büro Antrag Konstituierung Unvereinb...,"[Bundesrat, Büro, Antrag, Konstituierung, Unve..."
4,4,254006,47804,1139,Das provisorische Büro hat die Prüfung der Unv...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:09:59,2019-12-02T15:11:40,B,DE,1344,Büro Prüfung Unvereinbarkeit Gesetzesbestimmun...,"[Büro, Prüfung, Unvereinbarkeit, Gesetzesbesti..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,2099,260955,48950,4062,"Der Bundesrat ist sich bewusst, dass sich inne...",S,20200506,5103,BR-F,99.0,2020-05-06T16:40:15,2020-05-06T16:44:22,BR-F,DE,3481,Bundesrat Reisebranche Reisebüro Reiseveransta...,"[Bundesrat, Reisebranche, Reisebüro, Reisevera..."
1896,2100,260974,48951,4153,Es ist schon fast so spannend und interessant ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:32:58,2020-05-06T17:38:33,*,DE,3520,Antrag Differenz Session Heimweg Differenz Tou...,"[Antrag, Differenz, Session, Heimweg, Differen..."
1897,2101,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE,1301,Kommissionssprecher Lösung Lösung Ausgang Fina...,"[Kommissionssprecher, Lösung, Lösung, Ausgang,..."
1898,2102,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE,3560,Bezug Tourismus Commitment Tourismus Tourismus...,"[Bezug, Tourismus, Commitment, Tourismus, Tour..."


In [298]:
processed_transcript.iloc[12884]['Text']

'Si je partage en tout point les inquiétudes et les préoccupations des auteures des motions, de même que les considérations de la commission et du Conseil fédéral, je voudrais juste m\'arrêter sur deux formulations employées par le rapporteur: "la Confédération doit soutenir", "on veillera à régler le problème du financement". Selon moi, c\'est assez simple. Nous sommes les représentants des cantons. Si la Confédération impose, elle doit aussi participer financièrement à ce qu\'elle impose, ne serait-ce que pour respecter un tant soit peu le principe de l\'équivalence fiscale, ce qui malheureusement, on le constate dans les cantons, n\'est pas si souvent réalisé.\nSur le fond, j\'accepterai ces deux motions, mais avec justement la condition qu\'au moment où le Conseil fédéral devra formuler des propositions, il tienne compte en particulier du problème du financement.\n'

In [486]:
processed_transcript.to_csv('data/lemmatized/transcripts_lemmatized_fr_50_51.csv', encoding='utf-8')

### Load lemmatized transcript

In [124]:
selected_language = 'fr'
selected_sessions = [50, 51]
processed_transcript = pd.DataFrame()

for session in selected_sessions:
    with open('data/lemmatized/transcripts_lemmatized_' + selected_language + '_' + str(session) + '.csv', encoding='utf-8') as file:
        transcript = pd.read_csv(file).drop(columns='Unnamed: 0')
    
    processed_transcript = pd.concat([processed_transcript, transcript])
   
processed_transcript = processed_transcript.reset_index(drop=True)
processed_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"['élection', 'constitution', 'incompatibilité'..."
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,principe explication service conseiller mandat...,"['principe', 'explication', 'service', 'consei..."
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate soupçon fusil client ba...,"['carte', 'visite', 'diplomate', 'soupçon', 'f..."
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"['dossier', 'bout', 'possibilité', 'début', 'c..."
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,sujet intervention législation heure exemple c...,"['sujet', 'intervention', 'législation', 'heur..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"['modèle', 'affaire', 'entreprise', 'numérique..."
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"['doigt', 'problème', 'recours', 'collecte', '..."
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"['suite', 'attentat', 'vie', 'aide', 'victime'..."
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"['victime', 'infraction', 'soutien', 'face', '..."


### Transcripts by person, party and gender

In [125]:
with open('data/persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file).drop(columns='Unnamed: 0')[['PersonNumber', 'LastName', 'FirstName', 'GenderAsString', 'PartyAbbreviation']]
    
persons_df

Unnamed: 0,PersonNumber,LastName,FirstName,GenderAsString,PartyAbbreviation
0,9,Baumann,Ruedi,m,VERT-E-S
1,12,Beerli,Christine,f,PLR
2,14,Bezzola,Duri,m,PLR
3,15,Binder,Max,m,UDC
4,21,Blocher,Christoph,m,UDC
...,...,...,...,...,...
705,4329,Ruch,Daniel,m,PLR
706,4330,Berthoud,Alexandre,m,PLR
707,4331,Jost,Marc,m,PEV
708,4332,Crevoisier Crelier,Mathilde,f,PSS


In [126]:
transcript_by_person = processed_transcript.reset_index().merge(persons_df, on='PersonNumber', how='left').set_index('index')
transcript_by_person.index.name = None
transcript_by_person

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,LastName,FirstName,GenderAsString,PartyAbbreviation
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"['élection', 'constitution', 'incompatibilité'...",Maire,Jacques-André,m,PSS
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T15:23:08,B,FR,1643,principe explication service conseiller mandat...,"['principe', 'explication', 'service', 'consei...",Maire,Jacques-André,m,PSS
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate soupçon fusil client ba...,"['carte', 'visite', 'diplomate', 'soupçon', 'f...",Nidegger,Yves,m,UDC
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,...,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"['dossier', 'bout', 'possibilité', 'début', 'c...",Burkhalter,Didier,m,PLR
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,...,2015-11-30T18:32:48,BR-M,FR,5020,sujet intervention législation heure exemple c...,"['sujet', 'intervention', 'législation', 'heur...",Burkhalter,Didier,m,PLR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"['modèle', 'affaire', 'entreprise', 'numérique...",Fivaz,Fabien,m,VERT-E-S
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"['doigt', 'problème', 'recours', 'collecte', '...",Baume-Schneider,Elisabeth,f,PSS
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"['suite', 'attentat', 'vie', 'aide', 'victime'...",de Quattro,Jacqueline,f,PLR
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"['victime', 'infraction', 'soutien', 'face', '...",Baume-Schneider,Elisabeth,f,PSS


In [127]:
# number of transcripts by party
transcript_by_person.groupby('PartyAbbreviation').count()['Text']

PartyAbbreviation
-             95
CSPO           8
EàG           35
Lega           2
M-E          916
MCG           46
PDC          488
PLR         2445
PSS         4898
PdT           60
UDC         2336
VERT-E-S    1544
pvl          276
Name: Text, dtype: int64

In [128]:
# number of transcripts by gender
transcript_by_person.groupby('GenderAsString').count()['Text']

GenderAsString
f    3244
m    9905
Name: Text, dtype: int64

In [129]:
transcript_by_party = transcript_by_person.groupby('PartyAbbreviation').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_party.index.name = None
transcript_by_party = transcript_by_party.drop(['Al', 'Lega', 'PLD', '-'], errors='ignore')
transcript_by_party

Unnamed: 0,text_lemmatized
CSPO,juin arrêté vien essentiel rejet arrêté crédit...
EàG,conseiller reprise plupart étude partie étude ...
M-E,bloc cause objet cours répétition abstention o...
MCG,armée menace amélioration évolution sécurité a...
PDC,thème avenir planète ressource manière objecti...
PLR,dossier bout possibilité début carte visite go...
PSS,élection constitution incompatibilité déroulem...
PdT,reprise membre sujet respect rente assurance v...
UDC,carte visite diplomate soupçon fusil client ba...
VERT-E-S,peuple salle problème manière ressource contra...


In [130]:
transcript_by_gender = transcript_by_person.groupby('GenderAsString').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_gender.index.name = None
transcript_by_gender

Unnamed: 0,text_lemmatized
f,peuple salle problème manière ressource contra...
m,élection constitution incompatibilité déroulem...


### TF-IDF 

In [131]:
group_transcript = processed_transcript.copy()

In [132]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(group_transcript['text_lemmatized'].to_list())
len(vectorizer.get_feature_names_out())

12002

In [133]:
tfidf_df = pd.DataFrame(vectors.toarray(), index=group_transcript.index, columns=vectorizer.get_feature_names_out())
tfidf_df.index.name = None
tfidf_df

Unnamed: 0,aaa,ab,abaissement,abandon,abandonniste,abat,abattage,abattement,abattoir,abattu,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,évêque,île,îlot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
tfidf_df.sort_values(axis='columns', by=tfidf_df.iloc[1].name, ascending=False)

Unnamed: 0,incompatibilité,élection,vraisemblance,mandat,renonciation,communiqué,député,conseiller,échéance,élu,...,emprise,emprisonnement,emprisonné,emprisonnée,emprunt,emprunteur,empêchement,empêcher,encablure,îlot
0,0.210354,0.753928,0.000000,0.107582,0.000000,0.000000,0.000000,0.083449,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.648544,0.309925,0.223094,0.221125,0.201816,0.172794,0.172511,0.171521,0.164424,0.157396,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.084758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.032137,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13145,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13146,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
tfidf_score_df = tfidf_df.stack().reset_index()
tfidf_score_df = tfidf_score_df.rename(columns={'ID': 'transcript','level_1': 'term', 0:'score', 'level_0': 'idx'})
tfidf_score_df

Unnamed: 0,idx,term,score
0,0,aaa,0.0
1,0,ab,0.0
2,0,abaissement,0.0
3,0,abandon,0.0
4,0,abandonniste,0.0
...,...,...,...
157814293,13148,événementielle,0.0
157814294,13148,évêché,0.0
157814295,13148,évêque,0.0
157814296,13148,île,0.0


In [136]:
top_tfidf = tfidf_score_df.sort_values(by=['idx','score'], ascending=[True,False]).groupby(['idx']).head(100)
top_tfidf

Unnamed: 0,idx,term,score
11782,0,élection,0.753928
2558,0,constitution,0.510427
5631,0,incompatibilité,0.210354
3801,0,déroulement,0.184457
4522,0,feuille,0.176052
...,...,...,...
157802352,13148,abyssale,0.000000
157802353,13148,abécédaire,0.000000
157802354,13148,abîme,0.000000
157802355,13148,acabit,0.000000


In [137]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
252869,21,assurance,0.032042
264871,22,assurance,0.067857
636933,53,assurance,0.211130
660937,55,assurance,0.135955
840967,70,assurance,0.321194
...,...,...,...
156974985,13079,assurance,0.281580
156998989,13081,assurance,0.106092
157010991,13082,assurance,0.062815
157779119,13146,assurance,0.044613


In [138]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]
top_tfidf[top_tfidf['idx'] == 838]

Unnamed: 0,idx,term,score
10065094,838,objectif,0.503116
10067096,838,revenu,0.363402
10061582,838,développement,0.359346
10066517,838,période,0.253587
10067930,838,sorte,0.243882
...,...,...,...
10057743,838,acceptabilité,0.000000
10057744,838,acceptable,0.000000
10057745,838,acceptation,0.000000
10057746,838,accepte,0.000000


In [139]:
top_10_tfidf = top_tfidf.groupby('idx').head(10)
top_10_tfidf

Unnamed: 0,idx,term,score
11782,0,élection,0.753928
2558,0,constitution,0.510427
5631,0,incompatibilité,0.210354
3801,0,déroulement,0.184457
4522,0,feuille,0.176052
...,...,...,...
157813393,13148,tribunal,0.189952
157813740,13148,victimisation,0.134157
157806974,13148,fondement,0.085421
157809855,13148,optique,0.085153


In [140]:
top_10_tfidf.query('idx > 6870 & idx < 6879 & score > 0')

Unnamed: 0,idx,term,score
82473393,6871,ouverture,0.571813
82469006,6871,discothèque,0.270737
82477674,6871,étape,0.241162
82468112,6871,concept,0.238186
82468118,6871,concert,0.209831
...,...,...,...
82558140,6878,pratique,0.195454
82556330,6878,malade,0.192267
82560258,6878,suite,0.191063
82558633,6878,quarantaine,0.184826


In [141]:
top_10_tfidf = top_10_tfidf.query('score > 0')
top_10_tfidf

Unnamed: 0,idx,term,score
11782,0,élection,0.753928
2558,0,constitution,0.510427
5631,0,incompatibilité,0.210354
3801,0,déroulement,0.184457
4522,0,feuille,0.176052
...,...,...,...
157813393,13148,tribunal,0.189952
157813740,13148,victimisation,0.134157
157806974,13148,fondement,0.085421
157809855,13148,optique,0.085153


In [142]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_10_tfidf.query('idx > -1 & idx < 10 & score > 0')
#top_tfidf_plusRand = top_10_tfidf.copy()
top_tfidf_plusRand['score'] = top_tfidf_plusRand['score'] + np.random.rand(top_tfidf_plusRand.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'idx:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("score", order="descending")],
    groupby = ["idx"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'score:Q'
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + text).properties(width = 1000)

In [838]:
top_10_tfidf.to_csv('top_tfidf.csv', encoding='utf-8')
top_10_tfidf

Unnamed: 0,idx,term,score
1579,0,bureau,0.642947
9859,0,élection,0.528889
2434,0,constitution,0.351921
5705,0,mandat,0.157655
4918,0,incompatibilité,0.143407
...,...,...,...
76828223,7649,tribunal,0.191068
76828483,7649,victimisation,0.132253
76825365,7649,optique,0.089030
76823050,7649,fondement,0.088222


In [143]:
top_terms = top_10_tfidf.groupby('idx')['term'].apply(list)
top_terms.name = 'top_terms'
top_terms = pd.DataFrame(top_terms)
top_terms.index.name = None
top_terms

Unnamed: 0,top_terms
0,"[élection, constitution, incompatibilité, déro..."
1,"[incompatibilité, élection, vraisemblance, man..."
2,"[potentat, carte, visite, blocage, fusil, dipl..."
3,"[notion, nécessité, détournement, réputation, ..."
4,"[confiscation, entraide, restitution, prescrip..."
...,...
13144,"[algorithme, collecte, solution, publicité, op..."
13145,"[plateforme, publicité, donnée, algorithme, in..."
13146,"[victime, acte, terrorisme, citoyen, violence,..."
13147,"[victime, infraction, étranger, indemnisation,..."


In [296]:
top_terms.loc[12884]

top_terms    [financement, équivalence, auteure, inquiétude...
Name: 12884, dtype: object

In [145]:
top_terms.loc[sorted(random.sample(list(processed_transcript.index), 500))]

Unnamed: 0,top_terms
17,"[coût, délai, responsabilité, transition, temp..."
33,"[renseignement, surveillance, collaboration, s..."
49,"[préservation, intérêt, soulagement, coexisten..."
62,"[budget, femme, mécène, homme, donnée, entité,..."
78,"[exclusivité, principe, médicament, incitatif,..."
...,...
13039,"[compagnie, apparence, denier, aéroport, urgen..."
13076,"[examen, travail, stabilisation, acceptation, ..."
13102,"[réinstallation, tradition, réfugié, compétenc..."
13118,"[puberté, enfantine, imprescriptibilité, porno..."


### BERTopic

#### Manual annotation

In [403]:
sample_top_terms = top_terms.loc[sorted(random.sample(list(top_terms.index), 1500))]
sample_top_terms.to_csv('sample_top_terms.csv', encoding='utf-8')
sample_top_terms

Unnamed: 0,top_terms
17,"[coût, délai, responsabilité, transition, temp..."
21,"[médecin, patient, assureur, confiance, symétr..."
34,"[homme, service, obligation, armée, optimisati..."
44,"[dépense, budget, personnel, exploitation, eff..."
57,"[exclusion, potentielle, cotisation, contact, ..."
...,...
13107,"[asile, centre, mineur, accompagnement, accuei..."
13113,"[mineur, plateforme, média, parent, prévention..."
13121,"[local, extérieur, armurier, sécurité, degré, ..."
13133,"[visa, collaborateur, taliban, militant, femme..."


In [653]:
text_id = 5365
print(top_terms.loc[text_id]['top_terms'], '\n')
print(processed_transcript.loc[text_id]['Text'])

['échange', 'message', 'mobilité', 'culture', 'vision', 'élément', 'consultation', 'volonté', 'soutien', 'création'] 

Comme vous l'avez rappelé, Madame Marchand-Balet, les échanges entre les régions linguistiques constituent un élément central de la politique linguistique et culturelle de la Confédération. Ils sont aussi d'une très grande importance pour les cantons. Ils contribuent à assurer la compréhension entre les communautés, à garantir le respect des autres langues et cultures de notre pays, et, par là même, à maintenir la cohésion nationale.
Vous demandez, par voie de motion, que le crédit destiné au soutien des échanges linguistiques dans le cadre de l'enveloppe 2016-2020 soit augmenté. Nous partageons avec vous la volonté de renforcer ces échanges. Il s'est passé pas mal de choses depuis le dépôt de votre motion. Un des éléments a été la collaboration avec les cantons, qui a abouti à la mise en place en 2017 de Movetia, l'agence nationale pour la  promotion des échanges et d

In [735]:
with open('sample_1500.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None

sample_cat = sample_cat[['cat', 'transcript_words']]
sample_cat

Unnamed: 0,cat,transcript_words
1,politique,"['incompatibilité', 'élection', 'vraisemblance..."
2,extérieur,"['potentat', 'carte', 'visite', 'blocage', 'fu..."
25,armée-sécurité,"['armée', 'effectif', 'commandement', 'deva', ..."
32,armée-sécurité,"['armée', 'arrêté', 'deva', 'référendum', 'pla..."
46,extérieur,"['aide', 'développement', 'profit', 'taux', 'b..."
...,...,...
13121,armée-sécurité,"['local', 'extérieur', 'armurier', 'sécurité',..."
13124,société,"['égalité', 'sanction', 'travailleuse', 'analy..."
13129,immigration,"['contingent', 'réinstallation', 'femme', 'réf..."
13143,justice,"['corruption', 'infraction', 'blanchiment', 'c..."


In [736]:
sample_checked = sample_cat.merge(processed_transcript, left_index=True, right_index=True, how='left')[['cat', 'transcript_words', 'transcript_idx', 'IdSession']]
sample_checked

Unnamed: 0,cat,transcript_words,transcript_idx,IdSession
1,politique,"['incompatibilité', 'élection', 'vraisemblance...",1,5001
2,extérieur,"['potentat', 'carte', 'visite', 'blocage', 'fu...",2,5001
25,armée-sécurité,"['armée', 'effectif', 'commandement', 'deva', ...",26,5001
32,armée-sécurité,"['armée', 'arrêté', 'deva', 'référendum', 'pla...",34,5001
46,extérieur,"['aide', 'développement', 'profit', 'taux', 'b...",49,5001
...,...,...,...,...
13121,armée-sécurité,"['local', 'extérieur', 'armurier', 'sécurité',...",7293,5120
13124,société,"['égalité', 'sanction', 'travailleuse', 'analy...",7296,5120
13129,immigration,"['contingent', 'réinstallation', 'femme', 'réf...",7301,5120
13143,justice,"['corruption', 'infraction', 'blanchiment', 'c...",7315,5120


In [737]:
sample_checked.to_csv('sample_1500_checked.csv', encoding='utf-8')

#### Semi-supervised

In [656]:
with open('sample_1500.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None

sample_cat = sample_cat[['cat']]
sample_cat

Unnamed: 0,cat
1,politique
2,extérieur
25,armée-sécurité
32,armée-sécurité
46,extérieur
...,...
13121,armée-sécurité
13124,société
13129,immigration
13143,justice


In [659]:
category_names = sorted(sample_cat['cat'].unique())
category_names

['agriculture-faune',
 'armée-sécurité',
 'culture-média',
 'emploi',
 'extérieur',
 'finance',
 'formation-recherche',
 'immigration',
 'justice',
 'logement-territoire',
 'manifestation-loisirs',
 'politique',
 'retraite-rentes',
 'santé',
 'société',
 'transport',
 'télécommunications',
 'écologie',
 'économie',
 'énergie']

In [660]:
category_dict = dict()
for i in category_names:
    idx = category_names.index(i)
    category_dict[i] = idx
    
category_dict

{'agriculture-faune': 0,
 'armée-sécurité': 1,
 'culture-média': 2,
 'emploi': 3,
 'extérieur': 4,
 'finance': 5,
 'formation-recherche': 6,
 'immigration': 7,
 'justice': 8,
 'logement-territoire': 9,
 'manifestation-loisirs': 10,
 'politique': 11,
 'retraite-rentes': 12,
 'santé': 13,
 'société': 14,
 'transport': 15,
 'télécommunications': 16,
 'écologie': 17,
 'économie': 18,
 'énergie': 19}

In [661]:
sample_cat = sample_cat.replace(category_dict)
sample_cat

Unnamed: 0,cat
1,11
2,4
25,1
32,1
46,4
...,...
13121,1
13124,14
13129,7
13143,8


In [662]:
supervised_transcript = processed_transcript.merge(sample_cat, left_index=True, right_index=True, how='left')
supervised_transcript['cat'] = supervised_transcript['cat'].fillna(-1)
supervised_transcript = supervised_transcript.astype({'cat':'int'})
supervised_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,cat
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"['élection', 'constitution', 'incompatibilité'...",-1
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,principe explication service conseiller mandat...,"['principe', 'explication', 'service', 'consei...",11
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate soupçon fusil client ba...,"['carte', 'visite', 'diplomate', 'soupçon', 'f...",4
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"['dossier', 'bout', 'possibilité', 'début', 'c...",-1
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,sujet intervention législation heure exemple c...,"['sujet', 'intervention', 'législation', 'heur...",-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"['modèle', 'affaire', 'entreprise', 'numérique...",-1
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"['doigt', 'problème', 'recours', 'collecte', '...",-1
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"['suite', 'attentat', 'vie', 'aide', 'victime'...",1
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"['victime', 'infraction', 'soutien', 'face', '...",-1


In [664]:
y = supervised_transcript['cat'].to_list()
len(y)

13149

In [665]:
# Initiate BERTopic
test_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
test_docs = processed_transcript['text_lemmatized'].to_list()
test_topics, test_probabilities = test_model.fit_transform(test_docs, y=y)
test_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 411/411 [02:43<00:00,  2.51it/s]
2023-06-13 14:57:18,108 - BERTopic - Transformed documents to Embeddings
2023-06-13 14:57:29,607 - BERTopic - Reduced dimensionality
2023-06-13 14:57:36,902 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5786,-1_entreprise_raison_travail_manière
1,0,359,0_patient_soin_prime_santé
2,1,338,1_chômage_travailleur_travail_salaire
3,2,323,2_vaccin_pandémie_coronavirus_crise
4,3,298,3_peine_crime_infraction_code
...,...,...,...
125,124,10,124_parité_clause_réservation_plateforme
126,125,10,125_anamnèse_clinique_conduite_aptitude
127,126,10,126_numéro_identification_utilisation_identifiant
128,127,10,127_budget_endettement_dépense_coupe


In [666]:
new_topics = test_model.reduce_outliers(test_docs, test_topics)
test_model.update_topics(test_docs, topics=new_topics)
test_model.get_topic_info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.48it/s]


Unnamed: 0,Topic,Count,Name
0,0,561,0_prime_soin_patient_maladie
1,1,415,1_chômage_travailleur_travail_salaire
2,2,401,2_pandémie_vaccin_crise_coronavirus
3,3,476,3_peine_juge_code_infraction
4,4,405,4_formation_recherche_école_innovation
...,...,...,...
124,124,29,124_clause_parité_réservation_plateforme
125,125,26,125_expertise_expert_médecin_invalidité
126,126,35,126_numéro_donnée_utilisation_appariement
127,127,23,127_endettement_frein_coupe_dette


In [672]:
test_model.get_topic(44)

[('échange', 0.0694052739470887),
 ('huile', 0.047540602439647764),
 ('palme', 0.04199848172583792),
 ('commerce', 0.032317765735509436),
 ('durabilité', 0.03204221188794351),
 ('négociation', 0.028128387238272125),
 ('exportation', 0.026121057268966727),
 ('marché', 0.02518303633311117),
 ('concession', 0.020325969145048677),
 ('produit', 0.01831738725502245)]

In [671]:
test_model.get_document_info(test_docs).loc[12892]

Document                   convention sécurité coopération coopération mi...
Topic                                                                     44
Name                                         44_échange_huile_palme_commerce
Top_n_words                échange - huile - palme - commerce - durabilit...
Probability                                                         0.129491
Representative_document                                                False
Name: 12892, dtype: object

#### Supervised

In [687]:
with open('sample_1500.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None

sample_cat = sample_cat[['cat']]
sample_cat

Unnamed: 0,cat
1,politique
2,extérieur
25,armée-sécurité
32,armée-sécurité
46,extérieur
...,...
13121,armée-sécurité
13124,société
13129,immigration
13143,justice


In [688]:
category_names = sorted(sample_cat['cat'].unique())
category_names

['agriculture-faune',
 'armée-sécurité',
 'culture-média',
 'emploi',
 'extérieur',
 'finance',
 'formation-recherche',
 'immigration',
 'justice',
 'logement-territoire',
 'manifestation-loisirs',
 'politique',
 'retraite-rentes',
 'santé',
 'société',
 'transport',
 'télécommunications',
 'écologie',
 'économie',
 'énergie']

In [689]:
category_dict = dict()
for i in category_names:
    idx = category_names.index(i)
    category_dict[i] = idx
    
category_dict

{'agriculture-faune': 0,
 'armée-sécurité': 1,
 'culture-média': 2,
 'emploi': 3,
 'extérieur': 4,
 'finance': 5,
 'formation-recherche': 6,
 'immigration': 7,
 'justice': 8,
 'logement-territoire': 9,
 'manifestation-loisirs': 10,
 'politique': 11,
 'retraite-rentes': 12,
 'santé': 13,
 'société': 14,
 'transport': 15,
 'télécommunications': 16,
 'écologie': 17,
 'économie': 18,
 'énergie': 19}

In [707]:
y = sample_cat.replace(category_dict)['cat'].to_list()
len(y)

1500

In [708]:
X = processed_transcript.loc[sample_cat.index]['text_lemmatized'].to_list()
len(X)

1500

In [709]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [710]:
print(len(X_test), len(X_train))

375 1125


In [724]:
# Initiate BERTopic
test_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
test_model.fit(X_train, y=y_train)

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:13<00:00,  2.75it/s]
2023-06-13 16:06:36,056 - BERTopic - Transformed documents to Embeddings
2023-06-13 16:06:37,831 - BERTopic - Reduced dimensionality
2023-06-13 16:06:37,886 - BERTopic - Clustered reduced embeddings


<bertopic._bertopic.BERTopic at 0x32a4f5100>

In [725]:
predic_topics, predic_probs = test_model.transform(documents=X_test)

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:04<00:00,  2.69it/s]
2023-06-13 16:08:48,279 - BERTopic - Reduced dimensionality
2023-06-13 16:08:48,306 - BERTopic - Calculated probabilities with HDBSCAN
2023-06-13 16:08:48,306 - BERTopic - Predicted clusters


In [726]:
pd.DataFrame([predic_topics, y_test])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,365,366,367,368,369,370,371,372,373,374
0,9,-1,-1,8,-1,5,0,1,14,-1,...,11,4,0,13,-1,-1,1,-1,6,11
1,19,6,3,3,2,8,4,12,7,7,...,10,1,5,9,14,10,13,14,18,2


#### Reduce outliers

In [369]:
# Initiate BERTopic
test_model = BERTopic(calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
test_docs = processed_transcript['text_lemmatized'].to_list()
test_topics, test_probabilities = test_model.fit_transform(test_docs)
test_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 411/411 [02:45<00:00,  2.48it/s]
2023-06-08 12:17:07,114 - BERTopic - Transformed documents to Embeddings
2023-06-08 12:17:10,459 - BERTopic - Reduced dimensionality
2023-06-08 12:17:19,287 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5584,-1_raison_travail_entreprise_manière
1,0,316,0_peine_code_crime_procédure
2,1,310,1_vaccin_pandémie_coronavirus_épidémie
3,2,306,2_chômage_travailleur_salaire_travail
4,3,268,3_formation_recherche_innovation_école
...,...,...,...
138,137,11,137_pétrole_énergie_prix_carburant
139,138,11,138_installation_solaire_bâtiment_énergie
140,139,11,139_robot_robotisation_emploi_opportunité
141,140,10,140_guerre_paix_conflit_arme


In [357]:
# outliers: probabilities

new_topics = test_model.reduce_outliers(test_docs, test_topics, probabilities=test_probabilities, strategy="probabilities")
test_model.update_topics(test_docs, topics=new_topics)
outliers_probs = test_model.get_topic_info()
outliers_probs

Unnamed: 0,Topic,Count,Name
0,0,537,0_peine_juge_infraction_code
1,1,335,1_vaccin_pandémie_coronavirus_crise
2,2,375,2_chômage_travailleur_travail_salaire
3,3,330,3_formation_école_innovation_recherche
4,4,259,4_animal_loup_élevage_viande
...,...,...,...
134,134,50,134_crise_budget_endettement_dépense
135,135,42,135_couple_femme_imposition_rente
136,136,59,136_enfant_détention_famille_mineur
137,137,46,137_convention_ratification_accident_intégration


In [359]:
# outliers: distributions

new_topics = test_model.reduce_outliers(test_docs, test_topics, strategy="distributions")
test_model.update_topics(test_docs, topics=new_topics)
outliers_distrib = test_model.get_topic_info()
outliers_distrib

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.63it/s]


Unnamed: 0,Topic,Count,Name
0,0,446,0_peine_infraction_code_crime
1,1,420,1_pandémie_vaccin_crise_coronavirus
2,2,391,2_chômage_travailleur_travail_salaire
3,3,404,3_formation_recherche_école_innovation
4,4,286,4_animal_loup_élevage_cheval
...,...,...,...
128,128,40,128_sexe_genre_identité_discernement
129,129,50,129_tarif_partenaire_forfait_structure
130,130,30,130_expertise_expert_médecin_assurance
131,131,45,131_traité_ratification_circulation_extension


In [370]:
# outliers: c-tf-idf

new_topics = test_model.reduce_outliers(test_docs, test_topics, strategy="c-tf-idf", threshold=0.15)
test_model.update_topics(test_docs, topics=new_topics)
outliers_ctfidf = test_model.get_topic_info()
outliers_ctfidf

Unnamed: 0,Topic,Count,Name
0,-1,1137,-1_raison_hymne_conseiller_chose
1,0,389,0_peine_infraction_juge_code
2,1,365,1_pandémie_vaccin_crise_épidémie
3,2,372,2_chômage_travailleur_salaire_travail
4,3,374,3_formation_recherche_école_innovation
...,...,...,...
138,137,22,137_énergie_carburant_prix_pétrole
139,138,33,138_installation_bâtiment_énergie_chauffage
140,139,11,139_robot_robotisation_emploi_opportunité
141,140,32,140_paix_guerre_conflit_neutralité


In [371]:
outliers_ctfidf.to_csv('test_topics.csv', encoding='utf-8')

In [385]:
test_model.get_topic(11)

[('divergence', 0.0671863110618691),
 ('version', 0.024658409576647826),
 ('unanimité', 0.012802318008709158),
 ('élimination', 0.012514145726758753),
 ('formulation', 0.011469777543849716),
 ('abstention', 0.011201342683248204),
 ('prestation', 0.010995381866594139),
 ('matin', 0.009459244165364304),
 ('montant', 0.00888220063535126),
 ('objet', 0.008716841969784131)]

In [386]:
with open('test_topics.csv', encoding='utf-8') as file:
    test_topics_df = pd.read_csv(file).drop(columns='Unnamed: 0')

test_topics_df

Unnamed: 0,Topic,Count,Name,cat
0,-1,1137,-1_raison_hymne_conseiller_chose,
1,0,389,0_peine_infraction_juge_code,justice
2,1,365,1_pandémie_vaccin_crise_épidémie,santé
3,2,372,2_chômage_travailleur_salaire_travail,travail
4,3,374,3_formation_recherche_école_innovation,formation
...,...,...,...,...
138,137,22,137_énergie_carburant_prix_pétrole,énergie
139,138,33,138_installation_bâtiment_énergie_chauffage,énergie
140,139,11,139_robot_robotisation_emploi_opportunité,travail
141,140,32,140_paix_guerre_conflit_neutralité,armée


In [387]:
test_topic_groups = list(test_topics_df.groupby('cat')['Topic'].apply(list))
test_topic_groups

[[13, 38, 41, 47, 49, 59, 66, 70, 89, 109, 111, 125],
 [14, 25, 28, 34, 50, 103, 123, 140],
 [11, 118, 120, 122],
 [75, 126, 141],
 [52, 65, 130],
 [19, 57, 63, 102],
 [8, 10, 99, 104, 107, 110, 128],
 [3, 56],
 [15, 33, 73, 82, 85],
 [0, 58, 129, 132],
 [16, 31, 64, 116],
 [23, 81],
 [18],
 [5, 12, 29, 32, 40, 86, 98, 105, 131],
 [4],
 [1,
  22,
  26,
  37,
  39,
  46,
  48,
  54,
  67,
  68,
  72,
  76,
  79,
  92,
  93,
  95,
  106,
  127,
  135],
 [7, 24, 27, 43, 53, 74, 88, 96, 97, 100, 115, 121],
 [30, 51],
 [6, 62, 94, 113, 119],
 [2, 78, 83, 84, 139],
 [21, 35, 45, 61, 77, 87, 90, 91, 136],
 [9, 20, 36, 42, 44, 55, 69, 71, 80, 101, 108, 112, 114, 117, 124, 133, 134],
 [17, 60, 137, 138]]

In [388]:
test_model.merge_topics(test_docs, test_topic_groups)
test_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1137,-1_raison_manière_travail_problème
1,0,1622,0_assurance_santé_soin_coût
2,1,1107,1_entreprise_marché_prix_banque
3,2,1041,2_surveillance_vote_transparence_travail
4,3,849,3_enfant_femme_congé_parent
5,4,713,4_armée_arme_service_guerre
6,5,707,5_budget_impôt_dépense_imposition
7,6,704,6_animal_agriculture_production_produit
8,7,513,7_peine_procédure_juge_code
9,8,495,8_émission_eau_biodiversité_produit


In [389]:
test_model.get_topic_info().to_csv('test_topics_to_rename.csv', encoding='utf-8')

In [397]:
with open('test_topics_renamed.csv', encoding='utf-8') as file:
    test_topics_renamed = pd.read_csv(file).drop(columns='Unnamed: 0').fillna('unknown')
    
test_topics_renamed

Unnamed: 0,Topic,Count,Name,new_name
0,-1,1137,-1_raison_manière_travail_problème,unknown
1,0,1622,0_assurance_santé_soin_coût,santé
2,1,1107,1_entreprise_marché_prix_banque,économie
3,2,1041,2_surveillance_vote_transparence_travail,politique
4,3,849,3_enfant_femme_congé_parent,société
5,4,713,4_armée_arme_service_guerre,armée
6,5,707,5_budget_impôt_dépense_imposition,finance
7,6,704,6_animal_agriculture_production_produit,agriculture
8,7,513,7_peine_procédure_juge_code,justice
9,8,495,8_émission_eau_biodiversité_produit,écologie


In [395]:
test_topics_name_dict = dict()
test_topics_name_nbr_dict = dict()

for idx, row in test_topics_renamed.iterrows():
    count = idx
    if count < 10:
        count = '0' + str(count)
    else:
        count = str(count)
        
    test_topics_name_nbr_dict[row['Name']] = count + '_' + row['new_name']
    test_topics_name_dict[row['Name']] = row['new_name']
    
test_topics_name_dict

{'0_assurance_santé_soin_coût': 'santé',
 '1_entreprise_marché_prix_banque': 'économie',
 '2_surveillance_vote_transparence_travail': 'politique',
 '3_enfant_femme_congé_parent': 'société',
 '4_armée_arme_service_guerre': 'armée',
 '5_budget_impôt_dépense_imposition': 'finance',
 '6_animal_agriculture_production_produit': 'agriculture',
 '7_peine_procédure_juge_code': 'justice',
 '8_émission_eau_biodiversité_produit': 'écologie',
 '9_travail_chômage_travailleur_salaire': 'travail',
 '10_coopération_développement_sanction_aide': 'extérieur',
 '11_formation_école_recherche_langue': 'formation',
 '12_asile_réfugié_immigration_étranger': 'immigration',
 '13_transport_véhicule_trafic_route': 'transport',
 '14_loyer_locataire_logement_bail': 'logement',
 '15_divergence_numéro_version_donnée': 'autre',
 '16_rente_pilier_retraite_réforme': 'retraite',
 '17_énergie_électricité_installation_approvisionnement': 'énergie',
 '18_renseignement_terrorisme_échange_organisation': 'sécurité',
 '19_média

In [396]:
sample = []
sample_dict = dict()

for i in sorted(random.sample(list(processed_transcript.index), 100)):
    doc_topic = test_model.get_document_info(test_docs).loc[i]
    
    topic_idx = doc_topic['Topic']
    topic_score = round(doc_topic['Probability'], 3)
    topic_words = doc_topic['Top_n_words']
    topic_name = test_topics_renamed.loc[topic_idx]['new_name']
    transcript_words = top_terms.loc[i]['top_terms']
    
    sample_dict[i] = {
        'topic_idx': topic_idx,
        'topic_score': topic_score,
        'topic_name': topic_name,
        'topic_words': topic_words,
        'transcript_words': transcript_words
    }
    
    if topic_idx >= 0:
        sample.append('TRANSCRIPT IDX: ' + str(i))
        sample.append('TOPIC: ' + str(topic_idx))
        sample.append(topic_words)
        sample.append('\nTOP TERMS')
        sample.append(str(transcript_words))
        sample.append('score: ' + str(topic_score))

        if topic_score < 0.01:
            sample.append('----- LOW SCORE -----')
            
        sample.append('\n///////////////////////////////////////\n')
    else:
        sample.append('===========================================')
        sample.append('IDX: ' + str(i))
        sample.append(str(transcript_words))
        sample.append('===========================================\n')
        
print(len(sample))
sample = '\n'.join(sample)
sample_df = pd.DataFrame.from_dict(sample_dict, orient='index')
sample_df

KeyError: -1

#### Training

In [1516]:
loaded_model = BERTopic.load("my_model")
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1126,0_santé_assurance_coût_soin
1,1,772,1_transparence_vote_divergence_surveillance
2,2,728,2_procédure_peine_code_juge
3,3,507,3_entreprise_prix_marché_produit
4,4,469,4_enfant_femme_parent_accueil
5,5,391,5_animal_agriculture_production_vin
6,6,331,6_crédit_budget_dépense_impôt
7,7,314,7_biodiversité_émission_eau_produit
8,8,290,8_armée_guerre_service_matériel
9,9,216,9_travail_chômage_travailleur_convention


In [156]:
# Initiate UMAP
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=100,
)

# Initiate BERTopic
topic_model = BERTopic(calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 411/411 [02:49<00:00,  2.42it/s]
2023-06-05 14:09:47,753 - BERTopic - Transformed documents to Embeddings
2023-06-05 14:09:51,192 - BERTopic - Reduced dimensionality
2023-06-05 14:10:01,495 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5343,-1_travail_entreprise_raison_assurance
1,0,446,0_peine_juge_procédure_crime
2,1,328,1_chômage_travailleur_salaire_travail
3,2,322,2_vaccin_pandémie_coronavirus_crise
4,3,286,3_armée_civil_service_munition
...,...,...,...
149,148,10,148_présentation_loyer_insertion_offre
150,149,10,149_expérimentation_animal_recherche_médicament
151,150,10,150_donnée_télécommunication_télécommunication...
152,151,10,151_impôt_provision_réforme_taux


In [160]:
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)
topic_model.get_topic_info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.81it/s]


Unnamed: 0,Topic,Count,Name
0,0,561,0_peine_juge_procédure_infraction
1,1,402,1_chômage_travailleur_salaire_travail
2,2,388,2_pandémie_vaccin_crise_coronavirus
3,3,370,3_armée_service_civil_munition
4,4,403,4_formation_école_recherche_innovation
...,...,...,...
148,148,26,148_insertion_offre_appel_marché
149,149,21,149_expérimentation_animal_recherche_méthode
150,150,78,150_donnée_préposé_protection_information
151,151,38,151_taux_impôt_brevet_bénéfice


In [208]:
topic_model.get_topic(29)

[('surveillance', 0.08194764663302176),
 ('autorité', 0.024068383823485053),
 ('organe', 0.023058589726662884),
 ('délégation', 0.019469412200937185),
 ('indépendance', 0.018016198535371182),
 ('inspection', 0.01594476321555098),
 ('gestion', 0.015121594760010643),
 ('administration', 0.014140305047241546),
 ('gouvernance', 0.014108909716112424),
 ('exécution', 0.01374710085461718)]

In [209]:
topic_model.get_document_info(docs, df=processed_transcript)
topic_model.get_document_info(docs).query('Topic >= 0')

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,élection constitution incompatibilité déroulem...,8,8_vote_référendum_élection_votation,vote - référendum - élection - votation - cito...,0.124050,False
1,principe explication service conseiller mandat...,8,8_vote_référendum_élection_votation,vote - référendum - élection - votation - cito...,0.040459,False
2,carte visite diplomate soupçon fusil client ba...,22,22_blanchiment_argent_locataire_plafond,blanchiment - argent - locataire - plafond - l...,0.027174,False
3,dossier bout possibilité début carte visite go...,22,22_blanchiment_argent_locataire_plafond,blanchiment - argent - locataire - plafond - l...,0.015727,False
4,sujet intervention législation heure exemple c...,0,0_peine_juge_procédure_infraction,peine - juge - procédure - infraction - code -...,0.051627,False
...,...,...,...,...,...,...
13144,modèle affaire entreprise numérique collecte d...,20,20_cartel_prix_concurrence_marché,cartel - prix - concurrence - marché - clause ...,0.027230,False
13145,doigt problème recours collecte donnée surveil...,150,150_donnée_préposé_protection_information,donnée - préposé - protection - information - ...,0.004716,False
13146,suite attentat vie aide victime infraction ind...,152,152_victime_réparation_soupçon_harcèlement,victime - réparation - soupçon - harcèlement -...,0.623754,False
13147,victime infraction soutien face traumatisme in...,152,152_victime_réparation_soupçon_harcèlement,victime - réparation - soupçon - harcèlement -...,0.022225,False


In [164]:
topic_model.get_topic_info().to_csv('topics_50_51.csv', encoding='utf-8')

In [210]:
with open('topics_50_51_cat.csv', encoding='utf-8') as file:
    topics_df = pd.read_csv(file).drop(columns='Unnamed: 0')

topics_df

Unnamed: 0,Topic,Count,Name,cat
0,0,561,0_peine_juge_procédure_infraction,justice
1,1,402,1_chômage_travailleur_salaire_travail,travail
2,2,388,2_pandémie_vaccin_crise_coronavirus,santé
3,3,370,3_armée_service_civil_munition,armée
4,4,403,4_formation_école_recherche_innovation,formation
...,...,...,...,...
148,148,26,148_insertion_offre_appel_marché,économie
149,149,21,149_expérimentation_animal_recherche_méthode,recherche
150,150,78,150_donnée_préposé_protection_information,sécurité
151,151,38,151_taux_impôt_brevet_bénéfice,finance


In [1509]:
test_merge = topics_df.merge(topic_model.get_topic_info(), left_index=True, right_index=True)
test_merge.loc[test_merge['Count_x'] != test_merge['Count_y']]

Unnamed: 0,Topic_x,Count_x,Name_x,cat,Topic_y,Count_y,Name_y


In [211]:
topic_groups = list(topics_df.groupby('cat')['Topic'].apply(list))
topic_groups

[[15, 25, 36, 39, 41, 56, 57, 60, 63, 99, 102, 129, 132, 135, 139],
 [3, 27, 34, 77],
 [7, 142, 143],
 [98, 121, 144],
 [53, 68, 92, 137],
 [19, 42, 51, 79],
 [9, 11, 88, 97, 116, 136, 151],
 [4, 62],
 [12, 50, 65, 80, 83, 113],
 [0, 58, 94, 152],
 [14, 22, 67, 73],
 [23, 85],
 [17],
 [8, 16, 18, 29, 70, 86, 87, 95, 119],
 [149],
 [5],
 [2,
  21,
  31,
  37,
  38,
  40,
  47,
  49,
  55,
  61,
  74,
  75,
  81,
  90,
  101,
  117,
  124,
  125,
  130,
  146],
 [6, 24, 28, 44, 96, 104, 105, 106, 109, 110, 111, 112, 141],
 [35, 45, 46, 66, 103, 114, 128, 145, 150],
 [30, 48, 64, 76, 91, 126],
 [1, 72, 84, 93],
 [10, 26, 33, 52, 69, 82, 89, 127, 147],
 [20,
  32,
  43,
  54,
  59,
  78,
  100,
  107,
  108,
  115,
  118,
  120,
  122,
  123,
  131,
  134,
  138,
  140,
  148],
 [13, 71, 133]]

In [212]:
topic_model.merge_topics(docs, topic_groups)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1736,0_assurance_santé_soin_coût
1,1,1115,1_vote_transparence_surveillance_membre
2,2,1011,2_entreprise_prix_marché_banque
3,3,919,3_enfant_femme_congé_parent
4,4,872,4_animal_agriculture_production_produit
5,5,719,5_budget_dépense_impôt_crédit
6,6,670,6_procédure_peine_juge_code
7,7,646,7_armée_arme_service_guerre
8,8,642,8_émission_eau_objectif_biodiversité
9,9,570,9_coopération_développement_aide_sanction


In [1492]:
#topic_model.save("my_model")

In [213]:
topic_model.get_topic_info().to_csv('topics_to_rename.csv', encoding='utf-8')

In [434]:
with open('topics_renamed_2.csv', encoding='utf-8') as file:
    topics_renamed = pd.read_csv(file).drop(columns='Unnamed: 0')
    
topics_renamed

Unnamed: 0,Topic,Count,Name,new_name
0,0,1736,0_assurance_santé_soin_coût,santé
1,1,1115,1_vote_transparence_surveillance_membre,politique
2,2,1011,2_entreprise_prix_marché_banque,économie
3,3,919,3_enfant_femme_congé_parent,société
4,4,872,4_animal_agriculture_production_produit,agriculture
5,5,719,5_budget_dépense_impôt_crédit,finance
6,6,670,6_procédure_peine_juge_code,justice
7,7,646,7_armée_arme_service_guerre,armée
8,8,642,8_émission_eau_objectif_biodiversité,écologie
9,9,570,9_coopération_développement_aide_sanction,extérieur


In [435]:
topics_name_dict = dict()
topics_name_nbr_dict = dict()

for idx, row in topics_renamed.iterrows():
    count = idx
    if count < 10:
        count = '0' + str(count)
    else:
        count = str(count)
        
    topics_name_nbr_dict[row['Name']] = count + '_' + row['new_name']
    topics_name_dict[row['Name']] = row['new_name']
    
topics_name_dict

{'0_assurance_santé_soin_coût': 'santé',
 '1_vote_transparence_surveillance_membre': 'politique',
 '2_entreprise_prix_marché_banque': 'économie',
 '3_enfant_femme_congé_parent': 'société',
 '4_animal_agriculture_production_produit': 'agriculture',
 '5_budget_dépense_impôt_crédit': 'finance',
 '6_procédure_peine_juge_code': 'justice',
 '7_armée_arme_service_guerre': 'armée',
 '8_émission_eau_objectif_biodiversité': 'écologie',
 '9_coopération_développement_aide_sanction': 'extérieur',
 '10_donnée_violence_sécurité_renseignement': 'sécurité',
 '11_travail_chômage_travailleur_salaire': 'travail',
 '12_formation_école_recherche_innovation': 'formation',
 '13_asile_réfugié_immigration_intégration': 'immigration',
 '14_loyer_locataire_logement_bail': 'logement',
 '15_transport_véhicule_trafic_route': 'transport',
 '16_rente_pilier_retraite_réforme': 'retraite',
 '17_divergence_version_prestation_montant': 'autre',
 '18_énergie_électricité_installation_approvisionnement': 'énergie',
 '19_médi

In [438]:
sample = []
sample_dict = dict()

for i in tqdm(sorted(random.sample(list(processed_transcript.index), 1500))):
    doc_topic = topic_model.get_document_info(docs).loc[i]
    
    topic_idx = doc_topic['Topic']
    topic_score = round(doc_topic['Probability'], 3)
    topic_words = doc_topic['Top_n_words']
    topic_name = topics_renamed.loc[topic_idx]['new_name']
    transcript_words = top_terms.loc[i]['top_terms']
    
    sample_dict[i] = {
        'topic_idx': topic_idx,
        'topic_score': topic_score,
        'topic_name': topic_name,
        'transcript_words': transcript_words,
        'topic_words': topic_words,
    }
    
    if topic_idx >= 0:
        sample.append('TRANSCRIPT IDX: ' + str(i))
        sample.append('TOPIC: ' + str(topic_idx))
        sample.append(topic_words)
        sample.append('\nTOP TERMS')
        sample.append(str(transcript_words))
        sample.append('score: ' + str(topic_score))

        if topic_score < 0.01:
            sample.append('----- LOW SCORE -----')
            
        sample.append('\n///////////////////////////////////////\n')
    else:
        sample.append('===========================================')
        sample.append('IDX: ' + str(i))
        sample.append(str(transcript_words))
        sample.append('===========================================\n')
        
print(len(sample))
sample = '\n'.join(sample)
sample_df = pd.DataFrame.from_dict(sample_dict, orient='index')
sample_df

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:35<00:00, 41.85it/s]

10537





Unnamed: 0,topic_idx,topic_score,topic_name,transcript_words,topic_words
1,1,0.131,politique,"[incompatibilité, élection, vraisemblance, man...",vote - transparence - surveillance - membre - ...
2,14,0.134,logement,"[potentat, carte, visite, blocage, fusil, dipl...",loyer - locataire - logement - bail - bailleur...
25,7,0.226,armée,"[armée, effectif, commandement, deva, soldat, ...",armée - arme - service - guerre - matériel - a...
32,7,1.000,armée,"[armée, arrêté, deva, référendum, plafond, réa...",armée - arme - service - guerre - matériel - a...
46,9,1.000,extérieur,"[aide, développement, profit, taux, budget, ab...",coopération - développement - aide - sanction ...
...,...,...,...,...,...
13121,7,0.053,armée,"[local, extérieur, armurier, sécurité, degré, ...",armée - arme - service - guerre - matériel - a...
13124,3,0.174,société,"[égalité, sanction, travailleuse, analyse, tra...",enfant - femme - congé - parent - famille - ég...
13129,13,0.036,immigration,"[contingent, réinstallation, femme, réfugié, m...",asile - réfugié - immigration - intégration - ...
13143,6,0.144,justice,"[corruption, infraction, blanchiment, crime, p...",procédure - peine - juge - code - tribunal - i...


In [439]:
sample_df.to_csv('sample.csv', encoding='utf-8')

with open('sample.txt', 'w', encoding='utf-8') as file:
    file.write(sample)

In [287]:
topic_by_person = topic_model.get_document_info(docs, df=transcript_by_person)
# add legislature column
topic_by_person['legislature'] = topic_by_person['IdSession'].apply(lambda x: int(str(x)[:2]))
topic_by_person

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,FirstName,GenderAsString,PartyAbbreviation,Document,Topic,Name,Top_n_words,Probability,Representative_document,legislature
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,...,Jacques-André,m,PSS,élection constitution incompatibilité déroulem...,1,1_vote_transparence_surveillance_membre,vote - transparence - surveillance - membre - ...,0.229817,False,50
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,...,Jacques-André,m,PSS,principe explication service conseiller mandat...,1,1_vote_transparence_surveillance_membre,vote - transparence - surveillance - membre - ...,0.131285,False,50
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,...,Yves,m,UDC,carte visite diplomate soupçon fusil client ba...,14,14_loyer_locataire_logement_bail,loyer - locataire - logement - bail - bailleur...,0.134175,False,50
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,...,Didier,m,PLR,dossier bout possibilité début carte visite go...,14,14_loyer_locataire_logement_bail,loyer - locataire - logement - bail - bailleur...,0.098286,False,50
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,...,Didier,m,PLR,sujet intervention législation heure exemple c...,6,6_procédure_peine_juge_code,procédure - peine - juge - code - tribunal - i...,0.089176,False,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,Fabien,m,VERT-E-S,modèle affaire entreprise numérique collecte d...,2,2_entreprise_prix_marché_banque,entreprise - prix - marché - banque - produit ...,0.122128,False,51
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,Elisabeth,f,PSS,doigt problème recours collecte donnée surveil...,10,10_donnée_violence_sécurité_renseignement,donnée - violence - sécurité - renseignement -...,0.021083,False,51
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,Jacqueline,f,PLR,suite attentat vie aide victime infraction ind...,6,6_procédure_peine_juge_code,procédure - peine - juge - code - tribunal - i...,0.623754,False,51
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,Elisabeth,f,PSS,victime infraction soutien face traumatisme in...,6,6_procédure_peine_juge_code,procédure - peine - juge - code - tribunal - i...,0.099029,False,51


In [398]:
party = 'PartyAbbreviation'
legislature = 'legislature'
gender = 'GenderAsString'

ratio_by_col = True
group = legislature

# filter legislature
#filtered_topic_by_person = topic_by_person.query('legislature == 51')
filtered_topic_by_person = topic_by_person.copy()

topic_by_group = filtered_topic_by_person.groupby([group, 'Name']).count()[['Text']].stack().unstack(level=1)
topic_by_group = topic_by_group.droplevel(level=1)

# set index and column row name to None
topic_by_group.index.name = None
topic_by_group.columns.name = None

# set all NaN to 0
topic_by_group = topic_by_group.fillna(0)

# rename and sort topics
topic_by_group = topic_by_group.rename(columns=topics_name_dict)
topic_by_group = topic_by_group.reindex(sorted(topic_by_group.columns), axis=1)

# transpose df
topic_by_group = topic_by_group.T

# drop parties not in last legislature
topic_by_group = topic_by_group.drop(columns=['Lega', '-', 'MCG', 'CSPO'], errors='ignore')

# drop minor parties
topic_by_group = topic_by_group.drop(columns=['EàG', 'PdT'], errors='ignore')
    
if ratio_by_col:
    topic_by_group = round(topic_by_group.loc[:,:].div(topic_by_group.sum(axis=0), axis=1), 2)
else:
    topic_by_group = round(topic_by_group.loc[:,:].div(topic_by_group.sum(axis=1), axis=0), 2)

def make_pretty(styler):
    max_value = topic_by_group.values.max()
    styler.format(precision=2)
    styler.background_gradient(axis=None, vmin=0, vmax=max_value, cmap="YlGnBu")
    return styler

topic_by_group = topic_by_group.reindex(sorted(topic_by_group.columns), axis=1)
topic_by_group.style.pipe(make_pretty)

Unnamed: 0,50,51
agriculture,0.07,0.06
armée,0.06,0.04
autre,0.03,0.02
communication,0.01,0.01
culture,0.01,0.02
extérieur,0.05,0.04
finance,0.06,0.05
formation,0.03,0.04
immigration,0.05,0.03
justice,0.04,0.06


### Most frequent words

In [381]:
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(filtered_transcript['text_lemmatized'].to_list())
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
count_df

Unnamed: 0,00,0122,0202,0210,0219,0229,024,0244,026,0276,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,être,île,îlot
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7648,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [394]:
example_row = 1000
count_df.sort_values(axis='columns', by=example_row, ascending=False).iloc[example_row-5:example_row+5, 0:20]

Unnamed: 0,co2,objectif,émission,véhicule,automobiliste,progrès,principe,parc,dispositif,mise,voiture,emploi,heure,domaine,habitant,décarbonée,niveau,réponse,choix,polluant
995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
996,13,6,17,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0
997,9,7,6,4,0,0,0,2,0,0,4,0,0,0,0,0,0,0,0,0
998,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000,5,5,5,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1
1001,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0
1002,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1004,10,10,11,6,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0


### LDA gensim

In [810]:
filtered_transcript['text_lemm_list'] = filtered_transcript['text_lemmatized'].apply(lambda x: x.split())
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_lemmatized,text_lemma_list,text_lemm_list
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...","[élection, bureau, constitution, incompatibili..."
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,décision bureau principe bureau explication se...,"[décision, bureau, principe, bureau, explicati...","[décision, bureau, principe, bureau, explicati..."
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,session été session automne position octobre f...,"[session, été, session, automne, position, oct...","[session, été, session, automne, position, oct..."
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,révision protection population protection juin...,"[révision, protection, population, protection,...","[révision, protection, population, protection,..."
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...","[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...","[modèle, affaire, entreprise, numérique, colle..."
7646,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...","[doigt, problème, recours, collecte, donnée, s..."
7647,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...","[suite, attentat, vie, aide, victime, infracti..."
7648,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...","[victime, infraction, soutien, face, traumatis..."


In [811]:
docs = filtered_transcript['text_lemm_list'].to_list()

dictionary = corpora.Dictionary(docs)

DT_matrix = [dictionary.doc2bow(doc) for doc in docs]

Lda_object = gensim.models.ldamodel.LdaModel

In [812]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word = dictionary)

lda_model_1.print_topics(num_topics=10, num_words=5)

[(0,
  '0.018*"sanction" + 0.009*"guerre" + 0.009*"organe" + 0.008*"contrôle" + 0.008*"sécurité"'),
 (1,
  '0.013*"système" + 0.011*"objectif" + 0.010*"cadre" + 0.009*"point" + 0.008*"coût"'),
 (2,
  '0.018*"locataire" + 0.015*"loyer" + 0.014*"femme" + 0.010*"violence" + 0.009*"bailleur"'),
 (3,
  '0.017*"procédure" + 0.016*"assurance" + 0.011*"prime" + 0.010*"raison" + 0.008*"manière"'),
 (4,
  '0.009*"cadre" + 0.008*"point" + 0.007*"recherche" + 0.007*"budget" + 0.007*"moyen"'),
 (5,
  '0.025*"prix" + 0.015*"énergie" + 0.013*"patient" + 0.013*"coût" + 0.011*"santé"'),
 (6,
  '0.016*"produit" + 0.008*"raison" + 0.007*"contre-projet" + 0.007*"travail" + 0.007*"accord"'),
 (7,
  '0.022*"enfant" + 0.009*"décision" + 0.009*"parent" + 0.007*"rente" + 0.007*"procédure"'),
 (8,
  '0.026*"entreprise" + 0.018*"travail" + 0.009*"crédit" + 0.008*"aide" + 0.007*"milliard"'),
 (9,
  '0.013*"formation" + 0.012*"travail" + 0.010*"programme" + 0.009*"cadre" + 0.008*"soutien"')]

### top2vec

In [199]:
docs = filtered_transcript["text_lemmatized"].tolist()
len(docs)

324

In [196]:
topic_model = Top2Vec(
    docs,
    #embedding_model="universal-sentence-encoder-multilingual",
    speed="deep-learn",
)

2023-05-12 16:25:30,943 - top2vec - INFO - Pre-processing documents for training
2023-05-12 16:25:31,080 - top2vec - INFO - Creating joint document/word embedding
2023-05-12 16:25:38,074 - top2vec - INFO - Creating lower dimension embedding of documents
2023-05-12 16:25:39,207 - top2vec - INFO - Finding dense areas of documents
2023-05-12 16:25:39,213 - top2vec - INFO - Finding topics


In [201]:
model.get_topics()

{0: [('de', 0.13273781851509409),
  ('la', 0.10522084530715155),
  ('le', 0.07682557991849537),
  ('des', 0.06496992795926745),
  ('et', 0.06409332138178707),
  ('les', 0.061870529632359866),
  ('en', 0.055698036565708625),
  ('que', 0.055082912584038426),
  ('est', 0.05186673777975805),
  ('une', 0.042562060578284655)],
 1: [('de', 0.13792948109435965),
  ('la', 0.07491411465697119),
  ('et', 0.06830473202530185),
  ('les', 0.06791204410365591),
  ('le', 0.06573522710988462),
  ('pour', 0.06565243157193269),
  ('avs', 0.06497054756109226),
  ('des', 0.0634781534130419),
  ('est', 0.05652146656288921),
  ('une', 0.05409859315631972)]}