In [658]:
# Data processing
import os
import pandas as pd
import numpy as np
import ssl
import string
from collections import Counter
from transformers.pipelines import pipeline
import altair as alt
from tqdm import tqdm
import random

# Text preprocessiong
import nltk
from nltk.corpus import wordnet as wn
import spacy
import spacy_fastlang
from top2vec import Top2Vec
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap.umap_ import UMAP

In [4]:
# disable ssl check (to be able to download nltk packages)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [5]:
# download stanza package

import stanza
stanza.download(lang="multilingual")
stanza.download(lang="de")
stanza.download(lang="fr")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 106MB/s]                                                                               
2023-05-22 09:45:25 INFO: Downloading default packages for language: multilingual (multilingual) ...
2023-05-22 09:45:25 INFO: File exists: /Users/cyrille/stanza_resources/multilingual/default.zip
2023-05-22 09:45:25 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 46.2MB/s]                                                                              
2023-05-22 09:45:25 INFO: Downloading default packages for language: de (German) ...
2023-05-22 09:45:27 INFO: File exists: /Users/cyrille/stanza_resources/de/default.zip
2023-05-22 09:45:30 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.

In [6]:
# download nltk packages

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cyrille/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [903]:
transcript_df = pd.DataFrame()

directory = 'data/transcripts'
filepaths_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and file.endswith('.csv'):
        filepaths_list.append(file)

# sort transcripts chronologically 
filepaths_list.sort()
print(len(filepaths_list))

for filepath in tqdm(filepaths_list[97:117]):
    with open(filepath, encoding='utf-8') as file:
        session_df = pd.read_csv(file).drop(columns='Unnamed: 0')
        transcript_df = pd.concat([transcript_df, session_df])

transcript_df = transcript_df.reset_index(drop=True)
transcript_df

117


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 25.20it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE
2,253998,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): Ge...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:10,2019-12-02T14:52:30,P-F,
3,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE
4,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29559,319686,60670,4238,"Sehr geschätzter Herr Nationalrat Egger, Sie h...",N,20230504,5120,BR-F,99.0,2023-05-04T17:04:38,2023-05-04T17:06:20,BR-F,DE
29560,319809,60674,4268,La questione della mediatizzazione dei process...,N,20230504,5120,Mit-F,1.0,2023-05-04T17:06:39,2023-05-04T17:11:27,Mit-F,IT
29561,319699,60674,1122,"Frau Kollegin Gysin, Sie wollen eine Priorisie...",N,20230504,5120,Mit-M,1.0,2023-05-04T17:11:29,2023-05-04T17:11:46,Mit-M,DE
29562,319807,60674,4268,"Collega Fluri, sono molto consapevole del prob...",N,20230504,5120,Mit-F,1.0,2023-05-04T17:11:46,2023-05-04T17:12:44,Mit-F,IT


In [910]:
# only keep french texts
filtered_transcript = transcript_df.loc[transcript_df['LanguageOfText'] == 'FR']
# only keep texts longer than 300 char
filtered_transcript['text_length'] = filtered_transcript['Text'].apply(lambda x: len(x))
filtered_transcript = filtered_transcript[filtered_transcript['text_length'] > 300]
# reset index
filtered_transcript = filtered_transcript.reset_index(drop=True)
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345


In [911]:
nlp_sent = spacy.load('xx_sent_ud_sm')
nlp = spacy.load("fr_core_news_md")
nlp.add_pipe("language_detector")
lemmatizer = nlp.get_pipe("lemmatizer")

additional_stopwords = {'-t', 'avez', 'être', 'aujourd', 'hui'}
specific_stopwords = {
    'accord', 'alinéa', 'an', 'année', 'article', 'avis', 'cadre', 'canton', 'cas', 
    'collègue', 'commission', 'conseil', 'décision', 'disposition', 'domaine', 'droit', 
    'fédéral', 'franc', 'groupe', 'initiative', 'loi', 'majorité', 'matière', 'mesure', 'milliard', 'million', 'minorité', 
    'monsieur', 'motion', 'parlementaire', 'pays', 'postulat', 'politique', 'position', 'président', 'proposition', 
    'projet', 'question', 'rapport', 'réponse', 'situation', 'suisse', 'voix'
}
removed_stopwords = {'hui'}

# add stopwords 
nlp.Defaults.stop_words |= additional_stopwords
nlp.Defaults.stop_words |= specific_stopwords

# remove stopwords
#nlp.Defaults.stop_words -= removed_stopwords

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

554
{'da', 'quelques', 'antérieure', 'celui-ci', 'tend', 'très', 'avaient', 'chez', 'déja', 'tel', 'j’', 'elle', 'cinquantième', 'semblable', 'neanmoins', 'suffit', 'tant', 'dès', 'était', 'auraient', 'groupe', 'où', 'treize', 'situation', 'au', 'sera', 'jusque', 'y', 'suivante', 'faisaient', 'rendre', 'suffisante', 'avez', 'reste', 'personne', 'quoique', 'moi-même', 'concernant', 'dire', 'deuxièmement', 'les', 'eux', 'douzième', 'es', 'suivant', "d'", 'près', 'avec', 'egalement', 'celle-ci', 'moi', 'nouveau', 'certaines', 'importe', 'celle-là', 'depuis', 'pendant', 'attendu', 'votre', 'ha', 'leur', 'déjà', 'cinquième', 'lui-meme', 'projet', 'souvent', 'quatorze', 'abord', 'lès', 'selon', 'parlementaire', 'specifique', 'également', 'douze', 'malgré', 'environ', 'avons', 'duquel', 'quiconque', 'afin', 'quarante', 'specifiques', 'son', 'plutôt', 'touchant', 'auront', 'canton', 'aie', 'tellement', 'nombreuses', 'matière', 'uns', 'après', 'ah', 'cinquante', 'être', 'cent', 'l’', 'autre', '



In [1022]:
example_row = 399
text_id = filtered_transcript.iloc[example_row]['ID']
text = filtered_transcript.iloc[example_row]['Text']

print(text_id)
print('length:', len(text))
print(text)
print('---')

entity_dict = {}

for ent in nlp(text).ents:
    entity_dict[ent.text] = ent.label_
    
print(entity_dict)
print('---')

lemma_list = [
    token.lemma_ for sent in nlp_sent(text).sents 
    if (doc := nlp(sent.text))._.language == 'fr' 
    for token in doc 
    if not any([
        token.is_stop, 
        token.is_punct, 
        token.is_space, 
        token.ent_type_, 
        not token.is_alpha, 
        token.lemma_ in nlp.Defaults.stop_words, 
        token.pos_ != 'NOUN',
    ])
]

#lemma_list.sort()
print(lemma_list)

258146
length: 4390
Je ne sais pas si vous le saviez, parfois vous pouvez acheter un objet, comme un téléphone mobile, avec une garantie de deux ans, mais en fait ce ne sont pas tous les composants de l'objet qui sont sous garantie, mais seulement une partie. Il est donc de fait légal aujourd'hui d'exclure de la garantie certains composants. On pourrait imaginer que si votre téléphone mobile était totalement modulaire, ce ne serait pas un problème. Finalement, si la batterie, par exemple, fait défaut, vous pourriez dire que ce n'est pas grave qu'elle ne soit pas sous garantie et vous en achetez une autre.
Le problème, c'est que pour beaucoup de produits, certains composants sont essentiels, c'est-à-dire que sans le composant essentiel, vous ne pouvez pas utiliser l'appareil. On vous vend un appareil avec une garantie de deux ans, mais de facto, si certains de ses composants font défaut, votre garantie n'a pas d'effet. Cette pratique n'est pas acceptable.
Vous connaissez peut-être l'exp

In [916]:
# remove stopwords, punctuation and then lemmatize
tqdm.pandas()
processed_transcript = filtered_transcript.copy()
processed_transcript['text_lemmatized'] = processed_transcript['Text'].progress_apply(
    lambda x: ' '.join([
        token.lemma_ for sent in nlp_sent(x).sents 
        if (doc := nlp(sent.text))._.language == 'fr' 
        for token in doc 
        if not any([
            token.is_stop, 
            token.is_punct, 
            token.is_space, 
            token.ent_type_, 
            token.is_digit, 
            token.lemma_ in nlp.Defaults.stop_words, 
            token.pos_ != 'NOUN',
        ])
    ])
)
# Take a look at the data
processed_transcript

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7321/7321 [18:43<00:00,  6.52it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774,session été session automne octobre fois objet...
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...


In [917]:
processed_transcript['text_lemma_list'] = processed_transcript['text_lemmatized'].apply(lambda x: x.split())
processed_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili..."
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic..."
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,1774,session été session automne octobre fois objet...,"[session, été, session, automne, octobre, fois..."
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,4569,révision protection population protection juin...,"[révision, protection, population, protection,..."
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,4506,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle..."
7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s..."
7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti..."
7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis..."


### Subgroups

In [918]:
with open('data/persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file).drop(columns='Unnamed: 0')[['PersonNumber', 'LastName', 'FirstName', 'GenderAsString', 'PartyAbbreviation']]
    
persons_df

Unnamed: 0,PersonNumber,LastName,FirstName,GenderAsString,PartyAbbreviation
0,9,Baumann,Ruedi,m,VERT-E-S
1,12,Beerli,Christine,f,PLR
2,14,Bezzola,Duri,m,PLR
3,15,Binder,Max,m,UDC
4,21,Blocher,Christoph,m,UDC
...,...,...,...,...,...
705,4329,Ruch,Daniel,m,PLR
706,4330,Berthoud,Alexandre,m,PLR
707,4331,Jost,Marc,m,PEV
708,4332,Crevoisier Crelier,Mathilde,f,PSS


In [919]:
transcript_by_person = processed_transcript.merge(persons_df, on='PersonNumber')
transcript_by_person

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,LastName,FirstName,GenderAsString,PartyAbbreviation
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,1299,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...",Piller Carrard,Valérie,f,PSS
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,1586,bureau principe bureau explication service con...,"[bureau, principe, bureau, explication, servic...",Piller Carrard,Valérie,f,PSS
2,254289,47824,4109,"Monsieur Nantermod, vous avez cité des exemple...",N,20191203,5101,Mit-F,1.0,2019-12-03T12:21:48,2019-12-03T12:22:21,Mit-F,FR,584,cité exemple intervention région pédiatre pati...,"[cité, exemple, intervention, région, pédiatre...",Piller Carrard,Valérie,f,PSS
3,254291,47826,4109,"L'initiative parlementaire 19.432, ""Garantie d...",N,20191203,5101,Mit-F,1.0,2019-12-03T12:29:12,2019-12-03T12:32:47,*,FR,3443,soutien membre rattachement collaborateur plan...,"[soutien, membre, rattachement, collaborateur,...",Piller Carrard,Valérie,f,PSS
4,255101,47955,4109,"Madame la conseillère fédérale, en ce qui conc...",N,20191209,5101,Mit-F,1.0,2019-12-09T17:49:20,2019-12-09T17:49:51,Mit-F,FR,516,conseiller hypothèse avion combat région atten...,"[conseiller, hypothèse, avion, combat, région,...",Piller Carrard,Valérie,f,PSS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,317568,60144,4330,La Commission des affaires juridiques de notre...,N,20230315,5118,Mit-M,1.0,2023-03-15T16:49:36,2023-03-15T16:51:21,*,FR,1563,février examen titre juin conseiller minimum v...,"[février, examen, titre, juin, conseiller, min...",Berthoud,Alexandre,m,PLR
7317,317570,60144,4330,"Merci pour votre question, cher collègue Hurni...",N,20230315,5118,Mit-M,1.0,2023-03-15T16:51:52,2023-03-15T16:52:09,*,FR,340,semaine vacance charge travail collaborateur c...,"[semaine, vacance, charge, travail, collaborat...",Berthoud,Alexandre,m,PLR
7318,317581,60149,4330,La Commission des affaires juridiques du Conse...,N,20230315,5118,Mit-M,1.0,2023-03-15T17:31:38,2023-03-15T17:33:28,*,FR,1796,février examen septembre objectif travail plat...,"[février, examen, septembre, objectif, travail...",Berthoud,Alexandre,m,PLR
7319,316351,59958,4332,L'Union européenne a récemment adopté le règle...,S,20230309,5118,Mit-F,2.0,2023-03-09T10:09:03,2023-03-09T10:10:02,Mit-F,FR,850,règlement déforestation importation produit dé...,"[règlement, déforestation, importation, produi...",Crevoisier Crelier,Mathilde,f,PSS


In [920]:
# number of transcripts by party
transcript_by_person.groupby('PartyAbbreviation').count()['Text']

PartyAbbreviation
EàG           39
Lega           1
M-E          748
PLD            1
PLR         1210
PSS         2565
PdT           25
UDC         1488
VERT-E-S    1041
pvl          203
Name: Text, dtype: int64

In [921]:
# number of transcripts by gender
transcript_by_person.groupby('GenderAsString').count()['Text']

GenderAsString
f    1940
m    5381
Name: Text, dtype: int64

In [922]:
transcript_by_party = transcript_by_person.groupby('PartyAbbreviation').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_party.index.name = None
transcript_by_party = transcript_by_party.drop(['Al', 'Lega', 'PLD', '-'], errors='ignore')
transcript_by_party

Unnamed: 0,text_lemmatized
EàG,conseiller reprise plupart étude partie étude ...
M-E,région montagne général construction terme aff...
PLR,séance février juin aménagement territoire éne...
PSS,élection bureau constitution incompatibilité b...
PdT,reprise août novembre divergence contribution ...
UDC,session été session automne octobre fois objet...
VERT-E-S,budget avenir réflexion terme budget marge com...
pvl,fois budget excédent compte dette défi taux en...


In [923]:
transcript_by_gender = transcript_by_person.groupby('GenderAsString').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_gender.index.name = None
transcript_by_gender

Unnamed: 0,text_lemmatized
f,élection bureau constitution incompatibilité b...
m,session été session automne octobre fois objet...


### Most frequent words

In [381]:
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(filtered_transcript['text_lemmatized'].to_list())
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
count_df

Unnamed: 0,00,0122,0202,0210,0219,0229,024,0244,026,0276,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,être,île,îlot
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7648,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [394]:
example_row = 1000
count_df.sort_values(axis='columns', by=example_row, ascending=False).iloc[example_row-5:example_row+5, 0:20]

Unnamed: 0,co2,objectif,émission,véhicule,automobiliste,progrès,principe,parc,dispositif,mise,voiture,emploi,heure,domaine,habitant,décarbonée,niveau,réponse,choix,polluant
995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
996,13,6,17,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0
997,9,7,6,4,0,0,0,2,0,0,4,0,0,0,0,0,0,0,0,0
998,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000,5,5,5,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1
1001,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0
1002,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1004,10,10,11,6,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0


### TF-IDF 

In [924]:
group_transcript = processed_transcript.copy()

In [925]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(group_transcript['text_lemmatized'].to_list())
len(vectorizer.get_feature_names_out())

10033

In [926]:
tfidf_df = pd.DataFrame(vectors.toarray(), index=group_transcript.index, columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,00,0122,0202,0210,0219,0229,024,0244,026,0276,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,être,île,îlot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05724,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.117083,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
7317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
7318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
7319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0


In [927]:
tfidf_df.sort_values(axis='columns', by=tfidf_df.iloc[1].name, ascending=False)

Unnamed: 0,incompatibilité,bureau,élection,conseiller,mandat,renonciation,député,communiqué,échéance,secrétariat,...,démolition,démon,démondialisation,démonstration,démonstrative,démontage,démotivante,démotive,démultiplication,îlot
0,0.143604,0.642485,0.528452,0.116085,0.157173,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.577063,0.430297,0.283140,0.233240,0.210530,0.179249,0.160024,0.156676,0.152903,0.151382,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.233193,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.083783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7317,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7318,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7319,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [928]:
tfidf_score_df = tfidf_df.stack().reset_index()
tfidf_score_df = tfidf_score_df.rename(columns={'ID': 'transcript','level_1': 'term', 0:'score', 'level_0': 'idx'})
tfidf_score_df

Unnamed: 0,idx,term,score
0,0,00,0.0
1,0,0122,0.0
2,0,0202,0.0
3,0,0210,0.0
4,0,0219,0.0
...,...,...,...
73451588,7320,événementielle,0.0
73451589,7320,évêché,0.0
73451590,7320,être,0.0
73451591,7320,île,0.0


In [929]:
top_tfidf = tfidf_score_df.sort_values(by=['idx','score'], ascending=[True,False]).groupby(['idx']).head(100)
top_tfidf

Unnamed: 0,idx,term,score
1578,0,bureau,0.642485
9849,0,élection,0.528452
2431,0,constitution,0.352953
5700,0,mandat,0.157173
4914,0,incompatibilité,0.143604
...,...,...,...
73441616,7320,16h,0.000000
73441617,7320,16i,0.000000
73441618,7320,16k,0.000000
73441619,7320,17,0.000000


In [930]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
151554,15,assurance,0.113757
221785,22,assurance,0.053272
231818,23,assurance,0.043413
281983,28,assurance,0.077408
964227,96,assurance,0.105430
...,...,...,...
72740309,7250,assurance,0.276965
72760375,7252,assurance,0.107372
72770408,7253,assurance,0.061960
73422553,7318,assurance,0.045004


In [931]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]
top_tfidf[top_tfidf['idx'] == 838]

Unnamed: 0,idx,term,score
8412499,838,importateur,0.372346
8413808,838,ménage,0.342661
8413193,838,logement,0.241838
8410066,838,consommateur,0.201829
8417113,838,utilité,0.182991
...,...,...,...
8412280,838,heure,0.036330
8416332,838,soutien,0.036138
8412966,838,jour,0.035101
8410337,838,coût,0.034525


In [932]:
top_10_tfidf = top_tfidf.groupby('idx').head(10)
top_10_tfidf

Unnamed: 0,idx,term,score
1578,0,bureau,0.642485
9849,0,élection,0.528452
2431,0,constitution,0.352953
5700,0,mandat,0.157173
4914,0,incompatibilité,0.143604
...,...,...,...
73450867,7320,tribunal,0.190576
73451127,7320,victimisation,0.132626
73448011,7320,optique,0.089065
73445699,7320,fondement,0.088250


In [933]:
top_10_tfidf.query('idx > 0 & idx < 5 & score > 0')
top_10_tfidf = top_10_tfidf.query('score > 0')
top_10_tfidf

Unnamed: 0,idx,term,score
1578,0,bureau,0.642485
9849,0,élection,0.528452
2431,0,constitution,0.352953
5700,0,mandat,0.157173
4914,0,incompatibilité,0.143604
...,...,...,...
73450867,7320,tribunal,0.190576
73451127,7320,victimisation,0.132626
73448011,7320,optique,0.089065
73445699,7320,fondement,0.088250


In [934]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_10_tfidf.query('idx > -1 & idx < 10 & score > 0')
#top_tfidf_plusRand = top_10_tfidf.copy()
top_tfidf_plusRand['score'] = top_tfidf_plusRand['score'] + np.random.rand(top_tfidf_plusRand.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'idx:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("score", order="descending")],
    groupby = ["idx"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'score:Q'
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + text).properties(width = 1000)

In [838]:
top_10_tfidf.to_csv('top_tfidf.csv', encoding='utf-8')
top_10_tfidf

Unnamed: 0,idx,term,score
1579,0,bureau,0.642947
9859,0,élection,0.528889
2434,0,constitution,0.351921
5705,0,mandat,0.157655
4918,0,incompatibilité,0.143407
...,...,...,...
76828223,7649,tribunal,0.191068
76828483,7649,victimisation,0.132253
76825365,7649,optique,0.089030
76823050,7649,fondement,0.088222


In [935]:
top_terms = top_10_tfidf.groupby('idx')['term'].apply(list)
top_terms.name = 'top_terms'
top_terms = pd.DataFrame(top_terms)
top_terms.index.name = None
top_terms

Unnamed: 0,top_terms
0,"[bureau, élection, constitution, mandat, incom..."
1,"[incompatibilité, bureau, élection, conseiller..."
2,"[lobbyiste, mandant, lobbyisme, session, trans..."
3,"[abri, protection, divergence, rénovation, rem..."
4,"[bâtiment, zone, révision, territoire, aménage..."
...,...
7316,"[algorithme, collecte, solution, publicité, op..."
7317,"[plateforme, publicité, donnée, intermédiaire,..."
7318,"[victime, acte, terrorisme, citoyen, violence,..."
7319,"[victime, infraction, étranger, indemnisation,..."


### BERTopic

In [1024]:
# Initiate UMAP
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=100,
)

# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french')

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2938,-1_raison_entreprise_manière_travail
1,0,311,0_vaccin_pandémie_épidémie_crise
2,1,221,1_peine_crime_procédure_code
3,2,216,2_chômage_salaire_travail_travailleur
4,3,161,3_locataire_loyer_bail_bailleur
...,...,...,...
95,94,12,94_mariage_couple_père_enfant
96,95,12,95_convention_imposition_double_protocole
97,96,11,96_augmentation_crédit_capital_arrêté
98,97,11,97_naturalisation_nationalité_génération_parent


In [1025]:
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1,-1_tre_lingue_jau_discur
1,0,416,0_vaccin_pandémie_crise_épidémie
2,1,312,1_peine_code_procédure_infraction
3,2,279,2_chômage_salaire_travail_travailleur
4,3,223,3_locataire_loyer_bail_bailleur
...,...,...,...
95,94,18,94_mariage_couple_père_enfant
96,95,42,95_convention_ratification_imposition_double
97,96,28,96_capital_banque_augmentation_corruption
98,97,16,97_naturalisation_nationalité_génération_parent


In [1036]:
topic_model.get_topic(98)

[('registre', 0.09806241864881673),
 ('acte', 0.04771608481224721),
 ('notariat', 0.040219508391041714),
 ('papier', 0.03704532683819753),
 ('document', 0.035055530699335376),
 ('notaire', 0.032267525218241035),
 ('soupçon', 0.03097775114708983),
 ('blanchiment', 0.027817506100239527),
 ('émolument', 0.027223639531988447),
 ('numérisation', 0.024020326387802538)]

In [1027]:
topic_model.get_document_info(docs).loc[5723]

Document                   hypothèse modalité calcul prime assurance mala...
Topic                                                                     19
Name                                             19_prime_coût_maladie_santé
Top_n_words                prime - coût - maladie - santé - assurance - f...
Probability                                                         0.074441
Representative_document                                                False
Name: 5723, dtype: object

In [1058]:
sample = []

for i in sorted(random.sample(list(processed_transcript.index), 20)):
    doc_topic = topic_model.get_document_info(docs).loc[i]
    topic_idx = doc_topic['Topic']
    topic_score = doc_topic['Probability']
    topic_words = doc_topic['Top_n_words']
    
    if topic_idx >= 0:
        sample.append('TRANSCRIPT IDX: ' + str(i))
        sample.append('TOPIC: ' + str(topic_idx))
        sample.append(topic_words)
        sample.append('\nTOP TERMS')
        sample.append(str(top_terms.loc[i]['top_terms']))
        sample.append('score: ' + str(topic_score))

        if topic_score < 0.01:
            sample.append('----- LOW SCORE -----')
            
        sample.append('\n///////////////////////////////////////\n')
    else:
        sample.append('===========================================')
        sample.append('IDX: ' + str(i))
        sample.append(str(top_terms.loc[i]['top_terms']))
        sample.append('===========================================\n')
        
print(len(sample))
sample = '\n'.join(sample)

142


In [1059]:
with open('sample.txt', 'w') as file:
    file.write(sample)

In [1031]:
topic_model.get_topic_info().to_csv('topics.csv', encoding='utf-8')

In [1037]:
with open('topics.csv', encoding='utf-8') as file:
    topics_df = pd.read_csv(file).drop(columns='Unnamed: 0')

topics_df

Unnamed: 0,Topic,Count,Name,cat
0,-1,1,-1_tre_lingue_jau_discur,
1,0,416,0_vaccin_pandémie_crise_épidémie,santé
2,1,312,1_peine_code_procédure_infraction,justice
3,2,279,2_chômage_salaire_travail_travailleur,travail
4,3,223,3_locataire_loyer_bail_bailleur,immobilier
...,...,...,...,...
95,94,18,94_mariage_couple_père_enfant,société
96,95,42,95_convention_ratification_imposition_double,économie
97,96,28,96_capital_banque_augmentation_corruption,économie
98,97,16,97_naturalisation_nationalité_génération_parent,société


In [1038]:
topic_groups = list(topics_df.groupby('cat')['Topic'].apply(list))
topic_groups

[[21, 26, 41, 55, 57, 60],
 [8, 17, 34, 58],
 [28, 35],
 [36, 45, 89],
 [18, 38, 76],
 [6],
 [15, 53],
 [3, 20, 70],
 [1, 47, 83, 98],
 [31],
 [11, 14, 23, 29, 32, 43, 44, 54, 56, 63, 66],
 [22],
 [4],
 [0, 16, 19, 40, 42, 51, 52, 59, 65, 67, 77, 78, 88, 92],
 [10, 12, 33, 48, 73, 84, 94, 97],
 [30],
 [7, 37],
 [39, 62],
 [2],
 [13, 25, 27, 46, 75],
 [9, 24, 49, 50, 64, 68, 69, 80, 85, 95, 96],
 [5, 90, 91]]

In [1039]:
topic_model.merge_topics(docs, topic_groups)

In [1040]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1,-1_tre_lingue_jau_discur
1,0,1104,0_santé_assurance_coût_soin
2,1,997,1_divergence_vote_transparence_solution
3,2,619,2_entreprise_prix_impôt_marché
4,3,440,3_procédure_peine_code_juge
5,4,430,4_enfant_femme_parent_accueil
6,5,368,5_émission_biodiversité_produit_eau
7,6,350,6_armée_guerre_service_matériel
8,7,341,7_animal_agriculture_production_vin
9,8,338,8_loyer_locataire_bail_logement


### LDA gensim

In [810]:
filtered_transcript['text_lemm_list'] = filtered_transcript['text_lemmatized'].apply(lambda x: x.split())
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_lemmatized,text_lemma_list,text_lemm_list
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...","[élection, bureau, constitution, incompatibili..."
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,décision bureau principe bureau explication se...,"[décision, bureau, principe, bureau, explicati...","[décision, bureau, principe, bureau, explicati..."
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,session été session automne position octobre f...,"[session, été, session, automne, position, oct...","[session, été, session, automne, position, oct..."
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,révision protection population protection juin...,"[révision, protection, population, protection,...","[révision, protection, population, protection,..."
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...","[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...","[modèle, affaire, entreprise, numérique, colle..."
7646,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...","[doigt, problème, recours, collecte, donnée, s..."
7647,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...","[suite, attentat, vie, aide, victime, infracti..."
7648,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...","[victime, infraction, soutien, face, traumatis..."


In [811]:
docs = filtered_transcript['text_lemm_list'].to_list()

dictionary = corpora.Dictionary(docs)

DT_matrix = [dictionary.doc2bow(doc) for doc in docs]

Lda_object = gensim.models.ldamodel.LdaModel

In [812]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word = dictionary)

lda_model_1.print_topics(num_topics=10, num_words=5)

[(0,
  '0.018*"sanction" + 0.009*"guerre" + 0.009*"organe" + 0.008*"contrôle" + 0.008*"sécurité"'),
 (1,
  '0.013*"système" + 0.011*"objectif" + 0.010*"cadre" + 0.009*"point" + 0.008*"coût"'),
 (2,
  '0.018*"locataire" + 0.015*"loyer" + 0.014*"femme" + 0.010*"violence" + 0.009*"bailleur"'),
 (3,
  '0.017*"procédure" + 0.016*"assurance" + 0.011*"prime" + 0.010*"raison" + 0.008*"manière"'),
 (4,
  '0.009*"cadre" + 0.008*"point" + 0.007*"recherche" + 0.007*"budget" + 0.007*"moyen"'),
 (5,
  '0.025*"prix" + 0.015*"énergie" + 0.013*"patient" + 0.013*"coût" + 0.011*"santé"'),
 (6,
  '0.016*"produit" + 0.008*"raison" + 0.007*"contre-projet" + 0.007*"travail" + 0.007*"accord"'),
 (7,
  '0.022*"enfant" + 0.009*"décision" + 0.009*"parent" + 0.007*"rente" + 0.007*"procédure"'),
 (8,
  '0.026*"entreprise" + 0.018*"travail" + 0.009*"crédit" + 0.008*"aide" + 0.007*"milliard"'),
 (9,
  '0.013*"formation" + 0.012*"travail" + 0.010*"programme" + 0.009*"cadre" + 0.008*"soutien"')]

### top2vec

In [199]:
docs = filtered_transcript["text_lemmatized"].tolist()
len(docs)

324

In [196]:
topic_model = Top2Vec(
    docs,
    #embedding_model="universal-sentence-encoder-multilingual",
    speed="deep-learn",
)

2023-05-12 16:25:30,943 - top2vec - INFO - Pre-processing documents for training
2023-05-12 16:25:31,080 - top2vec - INFO - Creating joint document/word embedding
2023-05-12 16:25:38,074 - top2vec - INFO - Creating lower dimension embedding of documents
2023-05-12 16:25:39,207 - top2vec - INFO - Finding dense areas of documents
2023-05-12 16:25:39,213 - top2vec - INFO - Finding topics


In [201]:
model.get_topics()

{0: [('de', 0.13273781851509409),
  ('la', 0.10522084530715155),
  ('le', 0.07682557991849537),
  ('des', 0.06496992795926745),
  ('et', 0.06409332138178707),
  ('les', 0.061870529632359866),
  ('en', 0.055698036565708625),
  ('que', 0.055082912584038426),
  ('est', 0.05186673777975805),
  ('une', 0.042562060578284655)],
 1: [('de', 0.13792948109435965),
  ('la', 0.07491411465697119),
  ('et', 0.06830473202530185),
  ('les', 0.06791204410365591),
  ('le', 0.06573522710988462),
  ('pour', 0.06565243157193269),
  ('avs', 0.06497054756109226),
  ('des', 0.0634781534130419),
  ('est', 0.05652146656288921),
  ('une', 0.05409859315631972)]}