In [658]:
# Data processing
import os
import pandas as pd
import numpy as np
import ssl
import string
from collections import Counter
from transformers.pipelines import pipeline
import altair as alt
from tqdm import tqdm
import random

# Text preprocessiong
import nltk
from nltk.corpus import wordnet as wn
import spacy
import spacy_fastlang
from top2vec import Top2Vec
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap.umap_ import UMAP

In [4]:
# disable ssl check (to be able to download nltk packages)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [5]:
# download stanza package

import stanza
stanza.download(lang="multilingual")
stanza.download(lang="de")
stanza.download(lang="fr")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 106MB/s]                                                                               
2023-05-22 09:45:25 INFO: Downloading default packages for language: multilingual (multilingual) ...
2023-05-22 09:45:25 INFO: File exists: /Users/cyrille/stanza_resources/multilingual/default.zip
2023-05-22 09:45:25 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 46.2MB/s]                                                                              
2023-05-22 09:45:25 INFO: Downloading default packages for language: de (German) ...
2023-05-22 09:45:27 INFO: File exists: /Users/cyrille/stanza_resources/de/default.zip
2023-05-22 09:45:30 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.

In [6]:
# download nltk packages

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cyrille/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [903]:
transcript_df = pd.DataFrame()

directory = 'data/transcripts'
filepaths_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and file.endswith('.csv'):
        filepaths_list.append(file)

# sort transcripts chronologically 
filepaths_list.sort()
print(len(filepaths_list))

for filepath in tqdm(filepaths_list[97:117]):
    with open(filepath, encoding='utf-8') as file:
        session_df = pd.read_csv(file).drop(columns='Unnamed: 0')
        transcript_df = pd.concat([transcript_df, session_df])

transcript_df = transcript_df.reset_index(drop=True)
transcript_df

117


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 25.20it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE
2,253998,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): Ge...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:10,2019-12-02T14:52:30,P-F,
3,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE
4,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29559,319686,60670,4238,"Sehr geschätzter Herr Nationalrat Egger, Sie h...",N,20230504,5120,BR-F,99.0,2023-05-04T17:04:38,2023-05-04T17:06:20,BR-F,DE
29560,319809,60674,4268,La questione della mediatizzazione dei process...,N,20230504,5120,Mit-F,1.0,2023-05-04T17:06:39,2023-05-04T17:11:27,Mit-F,IT
29561,319699,60674,1122,"Frau Kollegin Gysin, Sie wollen eine Priorisie...",N,20230504,5120,Mit-M,1.0,2023-05-04T17:11:29,2023-05-04T17:11:46,Mit-M,DE
29562,319807,60674,4268,"Collega Fluri, sono molto consapevole del prob...",N,20230504,5120,Mit-F,1.0,2023-05-04T17:11:46,2023-05-04T17:12:44,Mit-F,IT


In [910]:
# only keep french texts
filtered_transcript = transcript_df.loc[transcript_df['LanguageOfText'] == 'FR']
# only keep texts longer than 300 char
filtered_transcript['text_length'] = filtered_transcript['Text'].apply(lambda x: len(x))
filtered_transcript = filtered_transcript[filtered_transcript['text_length'] > 300]
# reset index
filtered_transcript = filtered_transcript.reset_index(drop=True)
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345


In [1302]:
nlp_sent = spacy.load('xx_sent_ud_sm')
nlp = spacy.load("fr_core_news_md")
nlp.add_pipe("language_detector")
lemmatizer = nlp.get_pipe("lemmatizer")

additional_stopwords = {'-t', 'avez', 'être', 'aujourd', 'hui'}
specific_stopwords = {
    'accord', 'alinéa', 'an', 'année', 'article', 'avis', 'cadre', 'canton', 'cas', 
    'collègue', 'commission', 'conseil', 'débat', 'décision', 'discussion', 'disposition', 'domaine', 'droit', 
    'fédéral', 'franc', 'groupe', 'initiative', 'législature', 'loi', 'majorité', 'matière', 'mesure', 'milliard', 'million', 'minorité', 
    'monsieur', 'motion', 'parlementaire', 'pays', 'postulat', 'politique', 'position', 'président', 'proposition', 
    'projet', 'question', 'rapport', 'rapporteur', 'réponse', 'session', 'situation', 'suisse', 'voix'
}
removed_stopwords = {'hui'}

# add stopwords 
nlp.Defaults.stop_words |= additional_stopwords
nlp.Defaults.stop_words |= specific_stopwords

# remove stopwords
#nlp.Defaults.stop_words -= removed_stopwords

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

558
{'quelques', 'antérieure', 'celui-ci', 'tend', 'très', 'avaient', 'chez', 'déja', 'tel', 'j’', 'elle', 'cinquantième', 'semblable', 'neanmoins', 'suffit', 'tant', 'dès', 'était', 'auraient', 'groupe', 'où', 'treize', 'situation', 'au', 'sera', 'jusque', 'y', 'suivante', 'faisaient', 'rendre', 'suffisante', 'avez', 'reste', 'personne', 'quoique', 'moi-même', 'concernant', 'dire', 'deuxièmement', 'les', 'eux', 'douzième', 'es', 'suivant', "d'", 'près', 'avec', 'egalement', 'celle-ci', 'moi', 'nouveau', 'certaines', 'importe', 'celle-là', 'depuis', 'pendant', 'attendu', 'votre', 'ha', 'leur', 'déjà', 'cinquième', 'lui-meme', 'projet', 'souvent', 'quatorze', 'abord', 'lès', 'selon', 'parlementaire', 'specifique', 'également', 'douze', 'malgré', 'environ', 'avons', 'duquel', 'quiconque', 'afin', 'quarante', 'specifiques', 'son', 'plutôt', 'touchant', 'auront', 'canton', 'aie', 'tellement', 'nombreuses', 'matière', 'uns', 'après', 'ah', 'cinquante', 'être', 'cent', 'l’', 'autre', 'compri



In [1303]:
example_row = 288
text_id = filtered_transcript.iloc[example_row]['ID']
text = filtered_transcript.iloc[example_row]['Text']

print(text_id)
print('length:', len(text))
print(text)
print('---')

entity_dict = {}

for ent in nlp(text).ents:
    entity_dict[ent.text] = ent.label_
    
print(entity_dict)
print('---')

lemma_list = [
    token.lemma_ for sent in nlp_sent(text).sents 
    if (doc := nlp(sent.text))._.language == 'fr' 
    for token in doc 
    if not any([
        token.is_stop, 
        token.is_punct, 
        token.is_space, 
        token.ent_type_, 
        not token.is_alpha, 
        token.lemma_ in nlp.Defaults.stop_words, 
        token.pos_ != 'NOUN',
    ])
]

#lemma_list.sort()
print(lemma_list)

255705
length: 733
Je m'exprimerai une seule fois et de manière brève pour vous dire que le Conseil fédéral est très sensible à cette problématique. La contribution de solidarité doit revenir, dans son intégralité, aux victimes de mesures de coercition à des fins d'assistance et de placements extrafamiliaux. De l'avis du Conseil fédéral, il est incohérent, d'une part, que l'Etat accorde un dédommagement en signe de reconnaissance des torts causés et, d'autre part, qu'une prestation sociale soit réduite en conséquence.
Afin d'exclure aussi vite que possible toute prise en compte de la contribution de solidarité dans le calcul de la prestation complémentaire, le Conseil fédéral soutient l'initiative de la commission et vous invite à l'adopter.

---
{'Conseil fédéral': 'ORG', 'Etat': 'LOC'}
---
['fois', 'manière', 'problématique', 'contribution', 'solidarité', 'intégralité', 'victime', 'coercition', 'fin', 'assistance', 'placement', 'part', 'dédommagement', 'signe', 'reconnaissance', 'tor

In [1304]:
# remove stopwords, punctuation and then lemmatize
tqdm.pandas()
processed_transcript = filtered_transcript.copy()
processed_transcript['text_lemmatized'] = processed_transcript['Text'].progress_apply(
    lambda x: ' '.join([
        token.lemma_ for sent in nlp_sent(x).sents 
        if (doc := nlp(sent.text))._.language == 'fr' 
        for token in doc 
        if not any([
            token.is_stop, 
            token.is_punct, 
            token.is_space, 
            token.ent_type_, 
            not token.is_alpha, 
            token.lemma_ in nlp.Defaults.stop_words, 
            token.pos_ != 'NOUN',
        ])
    ])
)
# Take a look at the data
processed_transcript

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7321/7321 [20:07<00:00,  6.06it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774,été automne octobre fois objet création regist...
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...


In [1305]:
processed_transcript['text_lemma_list'] = processed_transcript['text_lemmatized'].apply(lambda x: x.split())
processed_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili..."
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic..."
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774,été automne octobre fois objet création regist...,"[été, automne, octobre, fois, objet, création,..."
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...,"[révision, protection, population, protection,..."
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle..."
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s..."
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti..."
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis..."


In [1343]:
processed_transcript = processed_transcript[processed_transcript['text_lemma_list'].map(len) >= 10]
processed_transcript = processed_transcript.reset_index().rename(columns={'index': 'transcript_idx'})
processed_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili..."
1,1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic..."
2,2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774,été automne octobre fois objet création regist...,"[été, automne, octobre, fois, objet, création,..."
3,3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...,"[révision, protection, population, protection,..."
4,4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle..."
6927,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s..."
6928,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti..."
6929,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis..."


In [1344]:
processed_transcript.iloc[288]['text_lemmatized']

'réflexion décennie police manifestation sécurité bien sécurité force ordre tierce force ordre police lieu place force ordre besoin nécessité bien sécurité commerce manifestation société aveu échec phrase développement auteure monopole puissance attribut privatisation tâche sécurité fondement légitimité police constat constat faute moyen commune société tâche responsabilité moyen manifestation manifestation manifestation constat moyen organisateur manifestation chose exigence dérive espace sécurité risque mission tierce constat constat échec échec chef département solution solution convention niveau sécurité faute compétence ordre tâche sécurité responsabilité appui demande gestion police lumière local porte local police mandat manifestation règle direction stade police fameuse police bavure faute club rôle manifestation police police quotidien contrôle lumière verrouillage porte agent antan publicité entreprise gestion sécurité quotidien gestion sécurité manifestation député gauche co

### Subgroups

In [1308]:
with open('data/persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file).drop(columns='Unnamed: 0')[['PersonNumber', 'LastName', 'FirstName', 'GenderAsString', 'PartyAbbreviation']]
    
persons_df

Unnamed: 0,PersonNumber,LastName,FirstName,GenderAsString,PartyAbbreviation
0,9,Baumann,Ruedi,m,VERT-E-S
1,12,Beerli,Christine,f,PLR
2,14,Bezzola,Duri,m,PLR
3,15,Binder,Max,m,UDC
4,21,Blocher,Christoph,m,UDC
...,...,...,...,...,...
705,4329,Ruch,Daniel,m,PLR
706,4330,Berthoud,Alexandre,m,PLR
707,4331,Jost,Marc,m,PEV
708,4332,Crevoisier Crelier,Mathilde,f,PSS


In [1345]:
transcript_by_person = processed_transcript.reset_index().merge(persons_df, on='PersonNumber', how='left').set_index('index')
transcript_by_person.index.name = None
transcript_by_person

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,LastName,FirstName,GenderAsString,PartyAbbreviation
0,0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,...,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...",Piller Carrard,Valérie,f,PSS
1,1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,...,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic...",Piller Carrard,Valérie,f,PSS
2,2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,...,2019-12-02T17:49:26,*,FR,1774,été automne octobre fois objet création regist...,"[été, automne, octobre, fois, objet, création,...",Buffat,Michaël,m,UDC
3,3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,...,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...,"[révision, protection, population, protection,...",Addor,Jean-Luc,m,UDC
4,4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,...,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...",Bourgeois,Jacques,m,PLR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...",Fivaz,Fabien,m,VERT-E-S
6927,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...",Baume-Schneider,Elisabeth,f,PSS
6928,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...",de Quattro,Jacqueline,f,PLR
6929,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...",Baume-Schneider,Elisabeth,f,PSS


In [1346]:
# number of transcripts by party
transcript_by_person.groupby('PartyAbbreviation').count()['Text']

PartyAbbreviation
EàG           35
Lega           1
M-E          699
PLR         1146
PSS         2453
PdT           25
UDC         1392
VERT-E-S     990
pvl          190
Name: Text, dtype: int64

In [1347]:
# number of transcripts by gender
transcript_by_person.groupby('GenderAsString').count()['Text']

GenderAsString
f    1832
m    5099
Name: Text, dtype: int64

In [1348]:
transcript_by_party = transcript_by_person.groupby('PartyAbbreviation').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_party.index.name = None
transcript_by_party = transcript_by_party.drop(['Al', 'Lega', 'PLD', '-'], errors='ignore')
transcript_by_party

Unnamed: 0,text_lemmatized
EàG,conseiller reprise plupart étude partie étude ...
M-E,région montagne général construction terme aff...
PLR,séance février juin aménagement territoire éne...
PSS,élection bureau constitution incompatibilité b...
PdT,reprise août novembre divergence contribution ...
UDC,été automne octobre fois objet création regist...
VERT-E-S,budget avenir réflexion terme budget marge com...
pvl,fois budget excédent compte dette défi taux en...


In [1349]:
transcript_by_gender = transcript_by_person.groupby('GenderAsString').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_gender.index.name = None
transcript_by_gender

Unnamed: 0,text_lemmatized
f,élection bureau constitution incompatibilité b...
m,été automne octobre fois objet création regist...


### TF-IDF 

In [1350]:
group_transcript = processed_transcript.copy()

In [1351]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(group_transcript['text_lemmatized'].to_list())
len(vectorizer.get_feature_names_out())

9472

In [1352]:
tfidf_df = pd.DataFrame(vectors.toarray(), index=group_transcript.index, columns=vectorizer.get_feature_names_out())
tfidf_df.index.name = None
tfidf_df

Unnamed: 0,aaa,abaissement,abandon,abat,abattage,abattement,abattoir,abcès,abeille,aberration,...,évolué,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,île,îlot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.057783,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
6927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
6928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
6929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [1353]:
tfidf_df.sort_values(axis='columns', by=tfidf_df.iloc[1].name, ascending=False)

Unnamed: 0,incompatibilité,bureau,élection,conseiller,mandat,renonciation,député,communiqué,échéance,secrétariat,...,endettement,endiguement,endroit,endk,endocrinien,endocrinologie,endocrinologue,endommagement,endométriose,îlot
0,0.147933,0.654135,0.538953,0.119834,0.159937,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.581066,0.428230,0.282261,0.235348,0.209406,0.178759,0.159945,0.15606,0.152267,0.150737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.240077,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6927,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6928,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6929,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1354]:
tfidf_score_df = tfidf_df.stack().reset_index()
tfidf_score_df = tfidf_score_df.rename(columns={'ID': 'transcript','level_1': 'term', 0:'score', 'level_0': 'idx'})
tfidf_score_df

Unnamed: 0,idx,term,score
0,0,aaa,0.0
1,0,abaissement,0.0
2,0,abandon,0.0
3,0,abat,0.0
4,0,abattage,0.0
...,...,...,...
65650427,6930,événementiel,0.0
65650428,6930,événementielle,0.0
65650429,6930,évêché,0.0
65650430,6930,île,0.0


In [1355]:
top_tfidf = tfidf_score_df.sort_values(by=['idx','score'], ascending=[True,False]).groupby(['idx']).head(100)
top_tfidf

Unnamed: 0,idx,term,score
1171,0,bureau,0.654135
9290,0,élection,0.538953
2009,0,constitution,0.359689
5220,0,mandat,0.159937
4444,0,incompatibilité,0.147933
...,...,...,...
65641016,6930,acceptation,0.000000
65641017,6930,accepte,0.000000
65641018,6930,acception,0.000000
65641019,6930,accessibilité,0.000000


In [1356]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
123798,13,assurance,0.113554
190102,20,assurance,0.053066
199574,21,assurance,0.043315
246934,26,assurance,0.077339
853142,90,assurance,0.112978
...,...,...,...
64988054,6861,assurance,0.287693
65006998,6863,assurance,0.106897
65016470,6864,assurance,0.062936
65622678,6928,assurance,0.044757


In [1357]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]
top_tfidf[top_tfidf['idx'] == 838]

Unnamed: 0,idx,term,score
7942088,838,infrastructure,0.474338
7943916,838,pirate,0.339535
7938227,838,attaque,0.261323
7944647,838,ransomware,0.236851
7941822,838,hôpital,0.232537
...,...,...,...
7937547,838,ablation,0.000000
7937548,838,abnégation,0.000000
7937549,838,abolit,0.000000
7937550,838,abolition,0.000000


In [1358]:
top_10_tfidf = top_tfidf.groupby('idx').head(10)
top_10_tfidf

Unnamed: 0,idx,term,score
1171,0,bureau,0.654135
9290,0,élection,0.538953
2009,0,constitution,0.359689
5220,0,mandat,0.159937
4444,0,incompatibilité,0.147933
...,...,...,...
65649719,6930,tribunal,0.190104
65649972,6930,victimisation,0.132986
65646918,6930,optique,0.089033
65644645,6930,fondement,0.088211


In [1359]:
top_10_tfidf.query('idx > 6870 & idx < 6879 & score > 0')

Unnamed: 0,idx,term,score
65090285,6871,spécialiste,0.340641
65091557,6871,évacuation,0.263522
65090130,6871,site,0.247201
65089590,6871,risque,0.242521
65088650,6871,population,0.240336
...,...,...,...
65152820,6878,impulsion,0.176430
65155592,6878,recherche,0.171673
65154264,6878,objectif,0.167703
65151300,6878,démarche,0.142633


In [1360]:
top_10_tfidf = top_10_tfidf.query('score > 0')
top_10_tfidf

Unnamed: 0,idx,term,score
1171,0,bureau,0.654135
9290,0,élection,0.538953
2009,0,constitution,0.359689
5220,0,mandat,0.159937
4444,0,incompatibilité,0.147933
...,...,...,...
65649719,6930,tribunal,0.190104
65649972,6930,victimisation,0.132986
65646918,6930,optique,0.089033
65644645,6930,fondement,0.088211


In [1361]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_10_tfidf.query('idx > -1 & idx < 10 & score > 0')
#top_tfidf_plusRand = top_10_tfidf.copy()
top_tfidf_plusRand['score'] = top_tfidf_plusRand['score'] + np.random.rand(top_tfidf_plusRand.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'idx:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("score", order="descending")],
    groupby = ["idx"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'score:Q'
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + text).properties(width = 1000)

In [838]:
top_10_tfidf.to_csv('top_tfidf.csv', encoding='utf-8')
top_10_tfidf

Unnamed: 0,idx,term,score
1579,0,bureau,0.642947
9859,0,élection,0.528889
2434,0,constitution,0.351921
5705,0,mandat,0.157655
4918,0,incompatibilité,0.143407
...,...,...,...
76828223,7649,tribunal,0.191068
76828483,7649,victimisation,0.132253
76825365,7649,optique,0.089030
76823050,7649,fondement,0.088222


In [1362]:
top_terms = top_10_tfidf.groupby('idx')['term'].apply(list)
top_terms.name = 'top_terms'
top_terms = pd.DataFrame(top_terms)
top_terms.index.name = None
top_terms

Unnamed: 0,top_terms
0,"[bureau, élection, constitution, mandat, incom..."
1,"[incompatibilité, bureau, élection, conseiller..."
2,"[lobbyiste, mandant, lobbyisme, transparence, ..."
3,"[abri, divergence, protection, rénovation, rem..."
4,"[bâtiment, zone, révision, territoire, aménage..."
...,...
6926,"[algorithme, collecte, solution, publicité, op..."
6927,"[plateforme, publicité, donnée, intermédiaire,..."
6928,"[victime, acte, terrorisme, citoyen, violence,..."
6929,"[victime, infraction, étranger, indemnisation,..."


In [1363]:
top_terms.loc[6870]

top_terms    [déminage, sein, renchérissement, montant, rés...
Name: 6870, dtype: object

In [1364]:
top_terms.loc[sorted(random.sample(list(processed_transcript.index), 500))]

Unnamed: 0,top_terms
3,"[abri, divergence, protection, rénovation, rem..."
7,"[zone, superficie, terre, biodiversité, animal..."
17,"[budget, personnel, supplément, augmentation, ..."
28,"[vin, stock, vigneron, promotion, vendange, ai..."
38,"[formation, ville, recherche, sacrifice, mine,..."
...,...
6861,"[affiliation, maladie, risque, cotisation, pro..."
6865,"[étude, masque, hasard, confiance, chirurgien,..."
6875,"[encapsulation, munition, sondage, conglomérat..."
6880,"[agression, conflit, crime, code, victime, che..."


### BERTopic

#### Semi-supervised

In [1185]:
with open('sample_cat.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None
    sample_cat = sample_cat[['cat']]
    
sample_cat

Unnamed: 0,cat
120,économie
126,santé
127,santé
324,travail
563,santé
...,...
7138,économie
7197,santé
7274,immigration
7278,travail


In [1182]:
category_names = list(sample_cat['cat'].unique())
category_names

['économie',
 'santé',
 'travail',
 'écologie',
 'société',
 'sécurité',
 'formation',
 'politique',
 'recherche',
 'agriculture',
 'finance',
 'sport',
 'média',
 'immigration',
 'justice',
 'armée',
 'extérieur',
 'transport',
 'retraite',
 'énergie',
 'culture',
 'immobilier']

In [1202]:
category_dict = dict()
for i in category_names:
    idx = category_names.index(i)
    category_dict[i] = idx
    
category_dict

{'économie': 0,
 'santé': 1,
 'travail': 2,
 'écologie': 3,
 'société': 4,
 'sécurité': 5,
 'formation': 6,
 'politique': 7,
 'recherche': 8,
 'agriculture': 9,
 'finance': 10,
 'sport': 11,
 'média': 12,
 'immigration': 13,
 'justice': 14,
 'armée': 15,
 'extérieur': 16,
 'transport': 17,
 'retraite': 18,
 'énergie': 19,
 'culture': 20,
 'immobilier': 21}

In [1206]:
sample_cat = sample_cat.replace(category_dict)
sample_cat

Unnamed: 0,cat
120,0
126,1
127,1
324,2
563,1
...,...
7138,0
7197,1
7274,13
7278,2


In [1209]:
supervised_transcript = processed_transcript.merge(sample_cat, left_index=True, right_index=True, how='left')
supervised_transcript['cat'] = supervised_transcript['cat'].fillna(-1)
supervised_transcript = supervised_transcript.astype({'cat':'int'})
supervised_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,cat
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...",-1
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic...",-1
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774,session été session automne octobre fois objet...,"[session, été, session, automne, octobre, fois...",-1
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...,"[révision, protection, population, protection,...",-1
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...",-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...",-1
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...",-1
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...",-1
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...",-1


In [1210]:
y = supervised_transcript['cat'].to_list()
y

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1

In [1212]:
# Initiate BERTopic
test_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = test_model.fit_transform(docs, y=y)
test_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [01:29<00:00,  2.57it/s]
2023-05-31 20:52:02,070 - BERTopic - Transformed documents to Embeddings
2023-05-31 20:52:16,499 - BERTopic - Reduced dimensionality
2023-05-31 20:52:18,536 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,3252,-1_entreprise_raison_travail_manière
1,0,301,0_peine_procédure_code_crime
2,1,168,1_chômage_salaire_travail_travailleur
3,2,142,2_vaccin_pandémie_vaccination_épidémie
4,3,134,3_émission_gaz_serre_climat
...,...,...,...
93,92,11,92_entreprise_rigueur_affaire_chiffre
94,93,11,93_législature_programme_stratégie_pauvreté
95,94,11,94_naturalisation_nationalité_parent_génération
96,95,10,95_amortissement_compte_supplément_crédit


In [1213]:
new_topics = test_model.reduce_outliers(docs, topics)
test_model.update_topics(docs, topics=new_topics)
test_model.get_topic_info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.52it/s]


Unnamed: 0,Topic,Count,Name
0,-1,1,-1_tre_lingue_jau_discur
1,0,404,0_peine_procédure_code_juge
2,1,244,1_chômage_travail_salaire_travailleur
3,2,187,2_vaccin_pandémie_épidémie_vaccination
4,3,166,3_émission_gaz_climat_serre
...,...,...,...
93,92,65,92_entreprise_affaire_rigueur_chiffre
94,93,47,93_législature_programme_stratégie_message
95,94,18,94_naturalisation_nationalité_parent_génération
96,95,28,95_dette_endettement_amortissement_compte


In [1229]:
test_model.get_topic(60)

[('solution', 0.09677835543136502),
 ('rigueur', 0.03574394611154022),
 ('problème', 0.021230163538695623),
 ('compétence', 0.02032276557531324),
 ('acceptabilité', 0.019431546625913617),
 ('chose', 0.019346144216684034),
 ('bail', 0.017948352413834988),
 ('compromis', 0.017648671639580445),
 ('dossier', 0.017087540159614324),
 ('manoeuvre', 0.016391057593906477)]

In [1228]:
test_model.get_document_info(docs).loc[2449]

Document                   discussion air base action mois contour crise ...
Topic                                                                     60
Name                                 60_solution_rigueur_problème_compétence
Top_n_words                solution - rigueur - problème - compétence - a...
Probability                                                         0.012841
Representative_document                                                False
Name: 2449, dtype: object

#### Training

In [1365]:
# Initiate UMAP
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=100,
)

# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french')

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs)
topic_model.get_topic_info()

2023-06-02 12:13:58,254 - BERTopic - Transformed documents to Embeddings
2023-06-02 12:14:07,817 - BERTopic - Reduced dimensionality
2023-06-02 12:14:09,484 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,2706,-1_raison_travail_entreprise_manière
1,0,553,0_peine_procédure_code_infraction
2,1,324,1_vaccin_pandémie_crise_épidémie
3,2,156,2_crédit_budget_dépense_montant
4,3,125,3_rente_retraite_pilier_réforme
...,...,...,...
86,85,10,85_sol_zone_surface_assolement
87,86,10,86_prix_électricité_énergie_ménage
88,87,10,87_médicament_catégorie_prix_importation
89,88,10,88_betterave_génie_sucre_culture


In [1366]:
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,683,0_peine_procédure_code_infraction
1,1,391,1_pandémie_vaccin_crise_épidémie
2,2,237,2_crédit_budget_dépense_endettement
3,3,180,3_rente_retraite_pilier_réforme
4,4,167,4_énergie_électricité_installation_approvision...
...,...,...,...
85,85,20,85_surface_sol_zone_estivage
86,86,20,86_prix_énergie_électricité_essence
87,87,24,87_médicament_catégorie_prix_approvisionnement
88,88,23,88_moratoire_betterave_génie_culture


In [1373]:
topic_model.get_topic(78)

[('invalidité', 0.046877426510602706),
 ('analyse', 0.024900095224211487),
 ('application', 0.022913060522485984),
 ('phase', 0.022687438477080404),
 ('évaluation', 0.02179249747727556),
 ('chose', 0.020738647342092385),
 ('assurance', 0.02069154148845213),
 ('texte', 0.018778410348467237),
 ('temps', 0.017749522813316636),
 ('base', 0.017234644302304977)]

In [1368]:
topic_model.get_document_info(docs, df=processed_transcript)

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,LanguageOfText,text_length,text_lemmatized,text_lemma_list,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,...,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...",élection bureau constitution incompatibilité b...,5,5_vote_élection_référendum_votation,vote - élection - référendum - votation - démo...,0.039292,False
1,1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,...,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic...",bureau principe bureau explication service con...,5,5_vote_élection_référendum_votation,vote - élection - référendum - votation - démo...,0.046814,False
2,2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,...,FR,1774,été automne octobre fois objet création regist...,"[été, automne, octobre, fois, objet, création,...",été automne octobre fois objet création regist...,21,21_transparence_campagne_parti_élection,transparence - campagne - parti - élection - d...,0.022231,False
3,3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,...,FR,4569,révision protection population protection juin...,"[révision, protection, population, protection,...",révision protection population protection juin...,12,12_divergence_version_unanimité_recours,divergence - version - unanimité - recours - é...,0.277976,False
4,4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,...,FR,4506,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...",séance février juin aménagement territoire éne...,8,8_animal_élevage_viande_exploitation,animal - élevage - viande - exploitation - cor...,0.306375,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,FR,3702,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...",modèle affaire entreprise numérique collecte d...,9,9_concurrence_marché_prix_cartel,concurrence - marché - prix - cartel - consomm...,0.001879,False
6927,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,FR,3153,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...",doigt problème recours collecte donnée surveil...,30,30_assurance_assureur_intermédiaire_assuré,assurance - assureur - intermédiaire - assuré ...,0.000935,False
6928,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,FR,2810,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...",suite attentat vie aide victime infraction ind...,0,0_peine_procédure_code_infraction,peine - procédure - code - infraction - juge -...,0.124043,False
6929,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,FR,2345,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...",victime infraction soutien face traumatisme in...,0,0_peine_procédure_code_infraction,peine - procédure - code - infraction - juge -...,0.024125,False


In [1369]:
topic_model.get_topic_info().to_csv('topics.csv', encoding='utf-8')

In [1374]:
with open('topics_cat.csv', encoding='utf-8') as file:
    topics_df = pd.read_csv(file).drop(columns='Unnamed: 0')

topics_df

Unnamed: 0,Topic,Count,Name,cat
0,0,683,0_peine_procédure_code_infraction,justice
1,1,391,1_pandémie_vaccin_crise_épidémie,santé
2,2,237,2_crédit_budget_dépense_endettement,finance
3,3,180,3_rente_retraite_pilier_réforme,retraite
4,4,167,4_énergie_électricité_installation_approvision...,énergie
...,...,...,...,...
85,85,20,85_surface_sol_zone_estivage,agriculture
86,86,20,86_prix_énergie_électricité_essence,énergie
87,87,24,87_médicament_catégorie_prix_approvisionnement,santé
88,88,23,88_moratoire_betterave_génie_culture,agriculture


In [1375]:
topic_groups = list(topics_df.groupby('cat')['Topic'].apply(list))
topic_groups

[[8, 20, 39, 59, 75, 85, 88, 89],
 [26, 27, 31, 36],
 [29, 38, 83],
 [25, 44, 63],
 [2, 77, 81, 84],
 [23, 64],
 [15, 45],
 [0, 43],
 [14],
 [42],
 [5, 7, 12, 21, 32, 37],
 [16, 74],
 [3],
 [1, 18, 24, 30, 41, 49, 50, 53, 54, 57, 58, 61, 67, 69, 72, 87],
 [10, 11, 22, 48, 73, 76],
 [40],
 [33, 35, 52],
 [17, 55, 65, 80],
 [19, 28, 68, 82],
 [6, 13, 34],
 [9, 46, 47, 51, 56, 60, 62, 70, 71, 79],
 [4, 86]]

In [1376]:
topic_model.merge_topics(docs, topic_groups)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1126,0_santé_assurance_coût_soin
1,1,772,1_transparence_vote_divergence_surveillance
2,2,728,2_procédure_peine_code_juge
3,3,507,3_entreprise_prix_marché_produit
4,4,469,4_enfant_femme_parent_accueil
5,5,391,5_animal_agriculture_production_vin
6,6,331,6_crédit_budget_dépense_impôt
7,7,314,7_biodiversité_émission_eau_produit
8,8,290,8_armée_guerre_service_matériel
9,9,216,9_travail_chômage_travailleur_convention


In [1377]:
sample = []
sample_dict = dict()

for i in sorted(random.sample(list(processed_transcript.index), 100)):
    doc_topic = topic_model.get_document_info(docs).loc[i]
    
    topic_idx = doc_topic['Topic']
    topic_score = round(doc_topic['Probability'], 3)
    topic_words = doc_topic['Top_n_words']
    transcript_words = top_terms.loc[i]['top_terms']

    sample_dict[i] = {
        'topic_idx': topic_idx,
        'topic_score': topic_score,
        'topic_words': topic_words,
        'transcript_words': transcript_words
    }
    
    if topic_idx >= 0:
        sample.append('TRANSCRIPT IDX: ' + str(i))
        sample.append('TOPIC: ' + str(topic_idx))
        sample.append(topic_words)
        sample.append('\nTOP TERMS')
        sample.append(str(transcript_words))
        sample.append('score: ' + str(topic_score))

        if topic_score < 0.01:
            sample.append('----- LOW SCORE -----')
            
        sample.append('\n///////////////////////////////////////\n')
    else:
        sample.append('===========================================')
        sample.append('IDX: ' + str(i))
        sample.append(str(transcript_words))
        sample.append('===========================================\n')
        
print(len(sample))
sample = '\n'.join(sample)
sample_df = pd.DataFrame.from_dict(sample_dict, orient='index')
sample_df

701


Unnamed: 0,topic_idx,topic_score,topic_words,transcript_words
19,6,0.154,crédit - budget - dépense - impôt - montant - ...,"[budget, dépense, augmentation, crédit, coupe,..."
26,10,0.091,sanction - développement - coopération - neutr...,"[prévention, budget, produit, problème, aide, ..."
266,0,0.539,santé - assurance - coût - soin - prime - mala...,"[charge, assurance, prix, modèle, critère, pri..."
305,10,0.020,sanction - développement - coopération - neutr...,"[amende, banque, sanction, déduction, autorité..."
344,9,0.186,travail - chômage - travailleur - convention -...,"[statistique, chômage, placement, office, comp..."
...,...,...,...,...
6796,19,0.036,loyer - logement - locataire - bail - cautionn...,"[rénovation, congé, logement, immeuble, étude,..."
6798,5,0.325,animal - agriculture - production - vin - sucr...,"[agroforesterie, arbre, bois, production, syst..."
6806,7,0.115,biodiversité - émission - eau - produit - obje...,"[environnement, économie, déchet, matériau, pr..."
6829,5,0.042,animal - agriculture - production - vin - sucr...,"[bois, pouce, piste, profession, importation, ..."


In [1378]:
sample_df.to_csv('sample.csv', encoding='utf-8')

with open('sample.txt', 'w', encoding='utf-8') as file:
    file.write(sample)

In [1381]:
topic_model.get_document_info(docs, df=transcript_by_person)

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,LastName,FirstName,GenderAsString,PartyAbbreviation,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,...,Piller Carrard,Valérie,f,PSS,élection bureau constitution incompatibilité b...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.091612,False
1,1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,...,Piller Carrard,Valérie,f,PSS,bureau principe bureau explication service con...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.102823,False
2,2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,...,Buffat,Michaël,m,UDC,été automne octobre fois objet création regist...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.045494,False
3,3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,...,Addor,Jean-Luc,m,UDC,révision protection population protection juin...,1,1_transparence_vote_divergence_surveillance,transparence - vote - divergence - surveillanc...,0.320561,False
4,4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,...,Bourgeois,Jacques,m,PLR,séance février juin aménagement territoire éne...,5,5_animal_agriculture_production_vin,animal - agriculture - production - vin - sucr...,0.435514,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,Fivaz,Fabien,m,VERT-E-S,modèle affaire entreprise numérique collecte d...,3,3_entreprise_prix_marché_produit,entreprise - prix - marché - produit - consomm...,0.009871,False
6927,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,Baume-Schneider,Elisabeth,f,PSS,doigt problème recours collecte donnée surveil...,0,0_santé_assurance_coût_soin,santé - assurance - coût - soin - prime - mala...,0.005182,False
6928,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,de Quattro,Jacqueline,f,PLR,suite attentat vie aide victime infraction ind...,2,2_procédure_peine_code_juge,procédure - peine - code - juge - infraction -...,0.137881,False
6929,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,Baume-Schneider,Elisabeth,f,PSS,victime infraction soutien face traumatisme in...,2,2_procédure_peine_code_juge,procédure - peine - code - juge - infraction -...,0.032463,False


### Most frequent words

In [381]:
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(filtered_transcript['text_lemmatized'].to_list())
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
count_df

Unnamed: 0,00,0122,0202,0210,0219,0229,024,0244,026,0276,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,être,île,îlot
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7648,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [394]:
example_row = 1000
count_df.sort_values(axis='columns', by=example_row, ascending=False).iloc[example_row-5:example_row+5, 0:20]

Unnamed: 0,co2,objectif,émission,véhicule,automobiliste,progrès,principe,parc,dispositif,mise,voiture,emploi,heure,domaine,habitant,décarbonée,niveau,réponse,choix,polluant
995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
996,13,6,17,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0
997,9,7,6,4,0,0,0,2,0,0,4,0,0,0,0,0,0,0,0,0
998,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000,5,5,5,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1
1001,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0
1002,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1004,10,10,11,6,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0


### LDA gensim

In [810]:
filtered_transcript['text_lemm_list'] = filtered_transcript['text_lemmatized'].apply(lambda x: x.split())
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_lemmatized,text_lemma_list,text_lemm_list
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...","[élection, bureau, constitution, incompatibili..."
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,décision bureau principe bureau explication se...,"[décision, bureau, principe, bureau, explicati...","[décision, bureau, principe, bureau, explicati..."
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,session été session automne position octobre f...,"[session, été, session, automne, position, oct...","[session, été, session, automne, position, oct..."
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,révision protection population protection juin...,"[révision, protection, population, protection,...","[révision, protection, population, protection,..."
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...","[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...","[modèle, affaire, entreprise, numérique, colle..."
7646,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...","[doigt, problème, recours, collecte, donnée, s..."
7647,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...","[suite, attentat, vie, aide, victime, infracti..."
7648,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...","[victime, infraction, soutien, face, traumatis..."


In [811]:
docs = filtered_transcript['text_lemm_list'].to_list()

dictionary = corpora.Dictionary(docs)

DT_matrix = [dictionary.doc2bow(doc) for doc in docs]

Lda_object = gensim.models.ldamodel.LdaModel

In [812]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word = dictionary)

lda_model_1.print_topics(num_topics=10, num_words=5)

[(0,
  '0.018*"sanction" + 0.009*"guerre" + 0.009*"organe" + 0.008*"contrôle" + 0.008*"sécurité"'),
 (1,
  '0.013*"système" + 0.011*"objectif" + 0.010*"cadre" + 0.009*"point" + 0.008*"coût"'),
 (2,
  '0.018*"locataire" + 0.015*"loyer" + 0.014*"femme" + 0.010*"violence" + 0.009*"bailleur"'),
 (3,
  '0.017*"procédure" + 0.016*"assurance" + 0.011*"prime" + 0.010*"raison" + 0.008*"manière"'),
 (4,
  '0.009*"cadre" + 0.008*"point" + 0.007*"recherche" + 0.007*"budget" + 0.007*"moyen"'),
 (5,
  '0.025*"prix" + 0.015*"énergie" + 0.013*"patient" + 0.013*"coût" + 0.011*"santé"'),
 (6,
  '0.016*"produit" + 0.008*"raison" + 0.007*"contre-projet" + 0.007*"travail" + 0.007*"accord"'),
 (7,
  '0.022*"enfant" + 0.009*"décision" + 0.009*"parent" + 0.007*"rente" + 0.007*"procédure"'),
 (8,
  '0.026*"entreprise" + 0.018*"travail" + 0.009*"crédit" + 0.008*"aide" + 0.007*"milliard"'),
 (9,
  '0.013*"formation" + 0.012*"travail" + 0.010*"programme" + 0.009*"cadre" + 0.008*"soutien"')]

### top2vec

In [199]:
docs = filtered_transcript["text_lemmatized"].tolist()
len(docs)

324

In [196]:
topic_model = Top2Vec(
    docs,
    #embedding_model="universal-sentence-encoder-multilingual",
    speed="deep-learn",
)

2023-05-12 16:25:30,943 - top2vec - INFO - Pre-processing documents for training
2023-05-12 16:25:31,080 - top2vec - INFO - Creating joint document/word embedding
2023-05-12 16:25:38,074 - top2vec - INFO - Creating lower dimension embedding of documents
2023-05-12 16:25:39,207 - top2vec - INFO - Finding dense areas of documents
2023-05-12 16:25:39,213 - top2vec - INFO - Finding topics


In [201]:
model.get_topics()

{0: [('de', 0.13273781851509409),
  ('la', 0.10522084530715155),
  ('le', 0.07682557991849537),
  ('des', 0.06496992795926745),
  ('et', 0.06409332138178707),
  ('les', 0.061870529632359866),
  ('en', 0.055698036565708625),
  ('que', 0.055082912584038426),
  ('est', 0.05186673777975805),
  ('une', 0.042562060578284655)],
 1: [('de', 0.13792948109435965),
  ('la', 0.07491411465697119),
  ('et', 0.06830473202530185),
  ('les', 0.06791204410365591),
  ('le', 0.06573522710988462),
  ('pour', 0.06565243157193269),
  ('avs', 0.06497054756109226),
  ('des', 0.0634781534130419),
  ('est', 0.05652146656288921),
  ('une', 0.05409859315631972)]}