In [3]:
# Data processing
import os
import pandas as pd
import numpy as np
import ssl
import string
from collections import Counter
from transformers.pipelines import pipeline
import altair as alt
from tqdm import tqdm
import random

# Text preprocessiong
import nltk
from nltk.corpus import wordnet as wn
import spacy
import spacy_fastlang
from top2vec import Top2Vec
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap.umap_ import UMAP

In [4]:
# disable ssl check (to be able to download nltk packages)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [5]:
# download stanza package

import stanza
stanza.download(lang="multilingual")
stanza.download(lang="de")
stanza.download(lang="fr")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 76.8MB/s]                                                                              
2023-06-05 10:16:03 INFO: Downloading default packages for language: multilingual (multilingual) ...
2023-06-05 10:16:03 INFO: File exists: /Users/cyrille/stanza_resources/multilingual/default.zip
2023-06-05 10:16:03 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 56.7MB/s]                                                                              
2023-06-05 10:16:03 INFO: Downloading default packages for language: de (German) ...
2023-06-05 10:16:05 INFO: File exists: /Users/cyrille/stanza_resources/de/default.zip
2023-06-05 10:16:09 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.

In [6]:
# download nltk packages

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cyrille/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Lemmatize transcripts

In [757]:
transcript_df = pd.DataFrame()

directory = 'data/transcripts'
filepaths_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and file.endswith('.csv'):
        filepaths_list.append(file)

# sort transcripts chronologically 
filepaths_list.sort()
print(len(filepaths_list))

for filepath in tqdm(filepaths_list[78:]):
    with open(filepath, encoding='utf-8') as file:
        session_df = pd.read_csv(file).drop(columns='Unnamed: 0')
        transcript_df = pd.concat([transcript_df, session_df])

transcript_df = transcript_df.reset_index(drop=True)
transcript_df

117


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:01<00:00, 27.80it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,191147,36015,214,"Präsident (Stamm Luzi, Alterspräsident): Frau ...",N,20151130,5001,Mit-M,1.0,2015-11-30T14:31:12,2015-11-30T14:54:17,Mit-M,
1,191153,36015,4186,"""Un politico guarda alle prossime elezioni. Un...",N,20151130,5001,Mit-F,1.0,2015-11-30T14:54:17,2015-11-30T15:03:07,Mit-F,IT
2,191155,36015,214,"Präsident (Stamm Luzi, Alterspräsident): Ich b...",N,20151130,5001,Mit-M,1.0,2015-11-30T15:03:07,2015-11-30T15:03:46,Mit-M,DE
3,191152,36016,214,"Präsident (Stamm Luzi, Alterspräsident): Dem A...",N,20151130,5001,Mit-M,1.0,2015-11-30T15:03:46,2015-11-30T15:05:12,Mit-M,DE
4,191157,36016,519,Zur Konstituierung des Rates: Sie haben den Be...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:05:12,2015-11-30T15:07:12,B,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61687,319686,60670,4238,"Sehr geschätzter Herr Nationalrat Egger, Sie h...",N,20230504,5120,BR-F,99.0,2023-05-04T17:04:38,2023-05-04T17:06:20,BR-F,DE
61688,319809,60674,4268,La questione della mediatizzazione dei process...,N,20230504,5120,Mit-F,1.0,2023-05-04T17:06:39,2023-05-04T17:11:27,Mit-F,IT
61689,319699,60674,1122,"Frau Kollegin Gysin, Sie wollen eine Priorisie...",N,20230504,5120,Mit-M,1.0,2023-05-04T17:11:29,2023-05-04T17:11:46,Mit-M,DE
61690,319807,60674,4268,"Collega Fluri, sono molto consapevole del prob...",N,20230504,5120,Mit-F,1.0,2023-05-04T17:11:46,2023-05-04T17:12:44,Mit-F,IT


In [758]:
selected_language = 'FR'

# only keep french texts
filtered_transcript = transcript_df.loc[transcript_df['LanguageOfText'] == selected_language]
# only keep texts longer than 300 char
filtered_transcript['text_length'] = filtered_transcript['Text'].apply(lambda x: len(x))
filtered_transcript = filtered_transcript[filtered_transcript['text_length'] > 300]
# reset index
filtered_transcript = filtered_transcript.reset_index(drop=True)
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length
0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159
1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643
2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925
3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090
4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13932,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702
13933,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153
13934,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810
13935,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345


In [777]:
filtered_transcript.loc[filtered_transcript['ID'] == 316834]

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length
13668,316834,60089,1161,"L'Albanie, le pays dont on parle maintenant, e...",S,20230314,5118,BPR-M,99.0,2023-03-14T08:21:49,2023-03-14T08:23:30,BPR-M,FR,1746


In [94]:
nlp_sent = spacy.load('xx_sent_ud_sm')
nlp = spacy.load(selected_language.lower() + "_core_news_md")
nlp.add_pipe("language_detector")
lemmatizer = nlp.get_pipe("lemmatizer")

additional_stopwords = {
    'FR': {'-t', 'avez', 'être', 'aujourd', 'hui'},
    'DE': set(),
}
removed_stopwords = {
    'FR': {'hui'},
    'DE': set(),
}
specific_stopwords = {
    'FR': {
        'accord', 'alinéa', 'an', 'année', 'article', 'avis', 'cadre', 'canton', 'cas', 
        'collègue', 'commission', 'conseil', 'débat', 'décision', 'discussion', 'disposition', 'domaine', 'droit', 
        'fédéral', 'franc', 'groupe', 'initiative', 'législature', 'loi', 'majorité', 'matière', 'mesure', 'milliard', 'million', 'minorité', 
        'monsieur', 'motion', 'parlementaire', 'pays', 'postulat', 'politique', 'position', 'président', 'proposition', 
        'projet', 'question', 'rapport', 'rapporteur', 'réponse', 'session', 'situation', 'suisse', 'voix'
    },
    'DE': {
        'Vereinbarung', 'Absatz', 'Jahr', 'Artikel', 'Stellungnahme', 'Rahmen', 'Kanton', 'Fall', 
        'Kollege', 'Kommission', 'Rat', 'Debatte', 'Entscheidung', 'Diskussion', 'Bestimmung', 'Bereich', 'Recht', 
        'föderal', 'Franken', 'Gruppe', 'Initiative', 'Legislaturperiode', 'Gesetz', 'Mehrheit', 'Materie', 'Massnahme', 'Milliarde', 'Million', 'Minderheit', 
        'Herr', 'Frau', 'Dame' 'Motion', 'Parlamentarier', 'Land', 'Postulat', 'Politik', 'Position', 'Präsident', 'Präsidentin', 'Vorschlag', 
        'Projekt', 'Frage', 'Bericht', 'Berichterstatter', 'Antwort', 'Sitzung', 'Lage', 'Schweiz', 'Stimme',
        'Bundesrat', 'Nationalrat', 'Bundeskanzler', 'Urne'
    },
}

if selected_language == 'DE':    
    # add stopwords 
    nlp.Defaults.stop_words |= additional_stopwords[selected_language]
    nlp.Defaults.stop_words |= specific_stopwords[selected_language]
    
    # remove stopwords
    nlp.Defaults.stop_words -= additional_stopwords['FR']
    nlp.Defaults.stop_words -= specific_stopwords['FR']
elif selected_language == 'FR':
    # add stopwords 
    nlp.Defaults.stop_words |= additional_stopwords[selected_language]
    nlp.Defaults.stop_words |= specific_stopwords[selected_language]
    
    # remove stopwords
    nlp.Defaults.stop_words -= additional_stopwords['DE']
    nlp.Defaults.stop_words -= specific_stopwords['DE']

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

558
{'moi-même', 'jusque', 'ouste', 'déja', 'une', 'puisque', 'quelques', 'trente', 'président', 'facon', 'pays', 'toi', 'dixième', 'lesquelles', 'lès', 'auxquels', 'dont', 'au', 'est', 'jusqu', 'o', 'exactement', 'vôtres', 'celui-là', 'chaque', 'quelconque', 'etc', 'alinéa', 'je', 'desquels', 'faisant', 'vu', 'tellement', "m'", 'egalement', 'toi-meme', 'pendant', 'cent', 'ait', 'cependant', "j'", 'n’', 'ou', 'préalable', 'les', 'surtout', 'directement', 'matière', 'réponse', 'autre', 'partant', 'dit', 'mêmes', 'dehors', 'canton', 'auxquelles', 'seuls', 'ont', 'cinquante', 'seul', "s'", 'celle-ci', 'celle-la', 'votres', 'minorité', 'un', 'dire', 'ouverte', 'donc', 'tente', 'antérieur', 'cinquième', 'autrui', "c'", 'lequel', 'car', 'aujourd', 'enfin', 'douze', 'deja', 'ouvert', 'droit', 'quelque', 'ni', 'soi', 'entre', 'seule', 'ceux', 'antérieure', 'importe', 'ayant', 'tres', 'soi-meme', 'na', 'd’', 'neanmoins', 'suffisante', 'pourrait', 'pourquoi', 'revoila', 'seules', 'parle', 'rend'



In [786]:
example_row = 13668
text_id = filtered_transcript.iloc[example_row]['ID']
text = filtered_transcript.iloc[example_row]['Text']

print(text_id)
print('length:', len(text))
print(text)
print('---')

entity_dict = {}

for ent in nlp(text).ents:
    entity_dict[ent.text] = ent.label_
    
print(entity_dict)
print('---')

lemma_list = [
    token.lemma_ for sent in nlp_sent(text).sents 
    if (doc := nlp(sent.text))._.language == selected_language.lower() 
    for token in doc 
    if not any([
        token.is_stop, 
        token.is_punct, 
        token.is_space, 
        token.ent_type_, 
        not token.is_alpha, 
        token.lemma_ in nlp.Defaults.stop_words, 
        token.pos_ not in ['NOUN', 'ADJ'],
    ])
]

#lemma_list.sort()
print(lemma_list)

316834
length: 1746
L'Albanie, le pays dont on parle maintenant, est en fait le dernier pays des Balkans occidentaux avec lequel nous n'avons pas conclu de convention sur la sécurité sociale. En encourageant la coopération économique et la coopération en matière de migration, nous contribuons aussi à stabiliser la situation dans les Balkans occidentaux, et cet accord facilite, cela a été rappelé par le rapporteur de la commission, le retour des ressortissants albanais dans leur pays d'origine; il contribue aussi à faciliter les échanges économiques entre nos deux Etats.
Je ne reviens pas sur les détails de l'accord, ils correspondent à ce qu'on fait en général dans ce type de cas; le rapporteur de la commission a été très complet. Il y a juste un élément que j'aimerais encore apporter. Monsieur Müller, vous avez parlé des coûts; effectivement, on les estime à 2,5 millions de francs. Vous avez aussi rappelé la répartition. Il faut voir que ce sont des coûts liés au versement des rentes 

In [787]:
# remove stopwords, punctuation and then lemmatize
tqdm.pandas()
processed_transcript = filtered_transcript.copy()
processed_transcript['text_lemmatized'] = processed_transcript['Text'].progress_apply(
    lambda x: ' '.join([
        token.lemma_ for sent in nlp_sent(x).sents 
        if (doc := nlp(sent.text))._.language == selected_language.lower() 
        for token in doc 
        if not any([
            token.is_stop, 
            token.is_punct, 
            token.is_space, 
            token.ent_type_, 
            not token.is_alpha, 
            token.lemma_ in nlp.Defaults.stop_words, 
            token.pos_ not in ['NOUN', 'ADJ'],
        ])
    ])
)
# Take a look at the data
processed_transcript

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13937/13937 [30:32<00:00,  7.60it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized
0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...
1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,légal constitutionnel pertinent principe inter...
2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate blanc soupçon fusil cli...
3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...
4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,calme sujet dernier intervention législation s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13932,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire grand entreprise numérique coll...
13933,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème important recours collecte donn...
13934,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat terroriste vie aide victime inf...
13935,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,clair victime infraction soutien nécessaire fa...


In [788]:
processed_transcript['text_lemma_list'] = processed_transcript['text_lemmatized'].apply(lambda x: x.split())
processed_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"[élection, constitution, incompatibilité, déro..."
1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,légal constitutionnel pertinent principe inter...,"[légal, constitutionnel, pertinent, principe, ..."
2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate blanc soupçon fusil cli...,"[carte, visite, diplomate, blanc, soupçon, fus..."
3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"[dossier, bout, possibilité, début, carte, vis..."
4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,calme sujet dernier intervention législation s...,"[calme, sujet, dernier, intervention, législat..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13932,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire grand entreprise numérique coll...,"[modèle, affaire, grand, entreprise, numérique..."
13933,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème important recours collecte donn...,"[doigt, problème, important, recours, collecte..."
13934,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat terroriste vie aide victime inf...,"[suite, attentat, terroriste, vie, aide, victi..."
13935,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,clair victime infraction soutien nécessaire fa...,"[clair, victime, infraction, soutien, nécessai..."


In [789]:
processed_transcript = processed_transcript[processed_transcript['text_lemma_list'].map(len) >= 10]
processed_transcript = processed_transcript.reset_index().rename(columns={'index': 'transcript_idx'})
processed_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"[élection, constitution, incompatibilité, déro..."
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,légal constitutionnel pertinent principe inter...,"[légal, constitutionnel, pertinent, principe, ..."
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate blanc soupçon fusil cli...,"[carte, visite, diplomate, blanc, soupçon, fus..."
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"[dossier, bout, possibilité, début, carte, vis..."
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,calme sujet dernier intervention législation s...,"[calme, sujet, dernier, intervention, législat..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13531,13932,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire grand entreprise numérique coll...,"[modèle, affaire, grand, entreprise, numérique..."
13532,13933,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème important recours collecte donn...,"[doigt, problème, important, recours, collecte..."
13533,13934,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat terroriste vie aide victime inf...,"[suite, attentat, terroriste, vie, aide, victi..."
13534,13935,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,clair victime infraction soutien nécessaire fa...,"[clair, victime, infraction, soutien, nécessai..."


In [852]:
processed_transcript.iloc[5661]['Text']

"Madame la vice-présidente du Conseil fédéral, merci de votre réponse. Vous avez mentionné qu'il s'agissait d'un incident et qu'une analyse était en cours. Ne pensez-vous pas qu'il serait utile de reprendre l'examen du dossier, après que l'analyse aura été faite, afin de voir si des modifications devraient être apportées pour éviter de tels incidents à l'avenir?\n"

In [791]:
processed_transcript.to_csv('data/lemmatized/transcripts_lemmatized_fr_50_51_v2.csv', encoding='utf-8')

### Load lemmatized transcript

In [124]:
selected_language = 'fr'
selected_sessions = [50, 51]
processed_transcript = pd.DataFrame()

for session in selected_sessions:
    with open('data/lemmatized/transcripts_lemmatized_' + selected_language + '_' + str(session) + '.csv', encoding='utf-8') as file:
        transcript = pd.read_csv(file).drop(columns='Unnamed: 0')
    
    processed_transcript = pd.concat([processed_transcript, transcript])
   
processed_transcript = processed_transcript.reset_index(drop=True)
processed_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"['élection', 'constitution', 'incompatibilité'..."
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,principe explication service conseiller mandat...,"['principe', 'explication', 'service', 'consei..."
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate soupçon fusil client ba...,"['carte', 'visite', 'diplomate', 'soupçon', 'f..."
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"['dossier', 'bout', 'possibilité', 'début', 'c..."
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,sujet intervention législation heure exemple c...,"['sujet', 'intervention', 'législation', 'heur..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire entreprise numérique collecte d...,"['modèle', 'affaire', 'entreprise', 'numérique..."
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème recours collecte donnée surveil...,"['doigt', 'problème', 'recours', 'collecte', '..."
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat vie aide victime infraction ind...,"['suite', 'attentat', 'vie', 'aide', 'victime'..."
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,victime infraction soutien face traumatisme in...,"['victime', 'infraction', 'soutien', 'face', '..."


### Transcripts by person, party and gender

In [792]:
with open('data/persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file).drop(columns='Unnamed: 0')[['PersonNumber', 'LastName', 'FirstName', 'GenderAsString', 'PartyAbbreviation']]
    
persons_df

Unnamed: 0,PersonNumber,LastName,FirstName,GenderAsString,PartyAbbreviation
0,9,Baumann,Ruedi,m,VERT-E-S
1,12,Beerli,Christine,f,PLR
2,14,Bezzola,Duri,m,PLR
3,15,Binder,Max,m,UDC
4,21,Blocher,Christoph,m,UDC
...,...,...,...,...,...
705,4329,Ruch,Daniel,m,PLR
706,4330,Berthoud,Alexandre,m,PLR
707,4331,Jost,Marc,m,PEV
708,4332,Crevoisier Crelier,Mathilde,f,PSS


In [793]:
transcript_by_person = processed_transcript.reset_index().merge(persons_df, on='PersonNumber', how='left').set_index('index')
transcript_by_person.index.name = None
transcript_by_person

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,LastName,FirstName,GenderAsString,PartyAbbreviation
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"[élection, constitution, incompatibilité, déro...",Maire,Jacques-André,m,PSS
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T15:23:08,B,FR,1643,légal constitutionnel pertinent principe inter...,"[légal, constitutionnel, pertinent, principe, ...",Maire,Jacques-André,m,PSS
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,...,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate blanc soupçon fusil cli...,"[carte, visite, diplomate, blanc, soupçon, fus...",Nidegger,Yves,m,UDC
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,...,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"[dossier, bout, possibilité, début, carte, vis...",Burkhalter,Didier,m,PLR
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,...,2015-11-30T18:32:48,BR-M,FR,5020,calme sujet dernier intervention législation s...,"[calme, sujet, dernier, intervention, législat...",Burkhalter,Didier,m,PLR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13531,13932,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire grand entreprise numérique coll...,"[modèle, affaire, grand, entreprise, numérique...",Fivaz,Fabien,m,VERT-E-S
13532,13933,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème important recours collecte donn...,"[doigt, problème, important, recours, collecte...",Baume-Schneider,Elisabeth,f,PSS
13533,13934,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat terroriste vie aide victime inf...,"[suite, attentat, terroriste, vie, aide, victi...",de Quattro,Jacqueline,f,PLR
13534,13935,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,2023-05-04T16:56:37,BR-F,FR,2345,clair victime infraction soutien nécessaire fa...,"[clair, victime, infraction, soutien, nécessai...",Baume-Schneider,Elisabeth,f,PSS


In [794]:
# number of transcripts by party
transcript_by_person.groupby('PartyAbbreviation').count()['Text']

PartyAbbreviation
-            100
CSPO           8
EàG           38
Lega           2
M-E          951
MCG           47
PDC          504
PLR         2516
PSS         5000
PdT           61
UDC         2447
VERT-E-S    1578
pvl          284
Name: Text, dtype: int64

In [795]:
# number of transcripts by gender
transcript_by_person.groupby('GenderAsString').count()['Text']

GenderAsString
f     3321
m    10215
Name: Text, dtype: int64

In [796]:
transcript_by_party = transcript_by_person.groupby('PartyAbbreviation').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_party.index.name = None
transcript_by_party = transcript_by_party.drop(['Al', 'Lega', 'PLD', '-'], errors='ignore')
transcript_by_party

Unnamed: 0,text_lemmatized
CSPO,juin arrêté vien essentiel rejet populaire arr...
EàG,conseiller reprise plupart étude international...
M-E,bloc important cause objet cours répétition ab...
MCG,armée fort menace actuel prévisible améliorati...
PDC,thème essentiel avenir planète ressource maniè...
PLR,dossier bout possibilité début carte visite go...
PSS,élection constitution incompatibilité déroulem...
PdT,reprise membre sujet pointilleux respect rente...
UDC,carte visite diplomate blanc soupçon fusil cli...
VERT-E-S,peuple salle problème manière ressource contra...


In [797]:
transcript_by_gender = transcript_by_person.groupby('GenderAsString').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_gender.index.name = None
transcript_by_gender

Unnamed: 0,text_lemmatized
f,peuple salle problème manière ressource contra...
m,élection constitution incompatibilité déroulem...


### TF-IDF 

In [798]:
group_transcript = processed_transcript.copy()

In [799]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(group_transcript['text_lemmatized'].to_list())
len(vectorizer.get_feature_names_out())

16038

In [800]:
tfidf_df = pd.DataFrame(vectors.toarray(), index=group_transcript.index, columns=vectorizer.get_feature_names_out())
tfidf_df.index.name = None
tfidf_df

Unnamed: 0,aaa,ab,abaissement,abandon,abandonniste,abandonné,abat,abattage,abattement,abattoir,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,évêque,île,îlot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [801]:
tfidf_df.sort_values(axis='columns', by=tfidf_df.iloc[1].name, ascending=False)

Unnamed: 0,incompatibilité,élection,interprétatif,vraisemblance,mandat,renonciation,incompatible,national,communiqué,député,...,enverrion,envi,enviable,envie,envieux,environment,environmental,environnant,environnement,îlot
0,0.196255,0.706330,0.000000,0.000000,0.101251,0.000000,0.000000,0.079651,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.592882,0.284507,0.241232,0.205063,0.203918,0.185579,0.172586,0.160416,0.158746,0.158489,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13531,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.090225,0.0,0.0,0.0,0.0,0.0,0.0
13532,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
13533,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
13534,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [802]:
tfidf_score_df = tfidf_df.stack().reset_index()
tfidf_score_df = tfidf_score_df.rename(columns={'ID': 'transcript','level_1': 'term', 0:'score', 'level_0': 'idx'})
tfidf_score_df

Unnamed: 0,idx,term,score
0,0,aaa,0.0
1,0,ab,0.0
2,0,abaissement,0.0
3,0,abandon,0.0
4,0,abandonniste,0.0
...,...,...,...
217090363,13535,événementielle,0.0
217090364,13535,évêché,0.0
217090365,13535,évêque,0.0
217090366,13535,île,0.0


In [803]:
top_tfidf = tfidf_score_df.sort_values(by=['idx','score'], ascending=[True,False]).groupby(['idx']).head(100)
top_tfidf

Unnamed: 0,idx,term,score
15739,0,élection,0.706330
3295,0,constitution,0.478501
8964,0,marquant,0.211856
7397,0,incompatibilité,0.196255
4888,0,déroulement,0.172701
...,...,...,...
217074371,13535,abroutissement,0.000000
217074372,13535,abrupt,0.000000
217074373,13535,abréviation,0.000000
217074374,13535,abscons,0.000000


In [804]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
337906,21,assurance,0.030655
353944,22,assurance,0.063676
883198,55,assurance,0.208093
915274,57,assurance,0.127274
1171882,73,assurance,0.273244
...,...,...,...
215952778,13465,assurance,0.231422
215984854,13467,assurance,0.101706
216000892,13468,assurance,0.059333
217043362,13533,assurance,0.041380


In [805]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]
top_tfidf[top_tfidf['idx'] == 838]

Unnamed: 0,idx,term,score
13442870,838,compte,0.298094
13447176,838,impôt,0.169321
13451238,838,progression,0.169258
13448414,838,livraison,0.160276
13445372,838,excédent,0.159801
...,...,...,...
13452003,838,recommandation,0.059616
13449776,838,négatif,0.059291
13450777,838,plein,0.059157
13441874,838,bénéfice,0.058408


In [806]:
top_10_tfidf = top_tfidf.groupby('idx').head(10)
top_10_tfidf

Unnamed: 0,idx,term,score
15739,0,élection,0.706330
3295,0,constitution,0.478501
8964,0,marquant,0.211856
7397,0,incompatibilité,0.196255
4888,0,déroulement,0.172701
...,...,...,...
217089137,13535,tribunal,0.184952
217089605,13535,victimisation,0.130225
217080413,13535,fondement,0.083068
217085666,13535,procès,0.081936


In [807]:
top_10_tfidf.query('idx > 6870 & idx < 6879 & score > 0')

Unnamed: 0,idx,term,score
110212329,6871,version,0.431810
110212542,6871,votant,0.309776
110207830,6871,peuple,0.274133
110213053,6871,étau,0.219122
110209222,6871,realpolitik,0.210842
...,...,...,...
110318399,6878,maximum,0.205515
110324722,6878,vis,0.190221
110317749,6878,large,0.181767
110320683,6878,problème,0.174858


In [808]:
top_10_tfidf = top_10_tfidf.query('score > 0')
top_10_tfidf

Unnamed: 0,idx,term,score
15739,0,élection,0.706330
3295,0,constitution,0.478501
8964,0,marquant,0.211856
7397,0,incompatibilité,0.196255
4888,0,déroulement,0.172701
...,...,...,...
217089137,13535,tribunal,0.184952
217089605,13535,victimisation,0.130225
217080413,13535,fondement,0.083068
217085666,13535,procès,0.081936


In [809]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_10_tfidf.query('idx > -1 & idx < 10 & score > 0')
#top_tfidf_plusRand = top_10_tfidf.copy()
top_tfidf_plusRand['score'] = top_tfidf_plusRand['score'] + np.random.rand(top_tfidf_plusRand.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'idx:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("score", order="descending")],
    groupby = ["idx"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'score:Q'
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + text).properties(width = 1000)

In [838]:
top_10_tfidf.to_csv('top_tfidf.csv', encoding='utf-8')
top_10_tfidf

Unnamed: 0,idx,term,score
1579,0,bureau,0.642947
9859,0,élection,0.528889
2434,0,constitution,0.351921
5705,0,mandat,0.157655
4918,0,incompatibilité,0.143407
...,...,...,...
76828223,7649,tribunal,0.191068
76828483,7649,victimisation,0.132253
76825365,7649,optique,0.089030
76823050,7649,fondement,0.088222


In [810]:
top_terms = top_10_tfidf.groupby('idx')['term'].apply(list)
top_terms.name = 'top_terms'
top_terms = pd.DataFrame(top_terms)
top_terms.index.name = None
top_terms

Unnamed: 0,top_terms
0,"[élection, constitution, marquant, incompatibi..."
1,"[incompatibilité, élection, interprétatif, vra..."
2,"[potentat, carte, visite, blocage, fusil, dipl..."
3,"[notion, nécessité, détournement, reconnaissab..."
4,"[confiscation, entraide, pénal, restitution, p..."
...,...
13531,"[algorithme, fallacieux, publicitaire, collect..."
13532,"[plateforme, publicité, donnée, algorithme, in..."
13533,"[victime, acte, terrorisme, citoyen, violence,..."
13534,"[victime, infraction, étranger, indemnisation,..."


In [296]:
top_terms.loc[12884]

top_terms    [financement, équivalence, auteure, inquiétude...
Name: 12884, dtype: object

### BERTopic

#### Manual annotation

In [403]:
sample_top_terms = top_terms.loc[sorted(random.sample(list(top_terms.index), 1500))]
sample_top_terms.to_csv('sample_top_terms.csv', encoding='utf-8')
sample_top_terms

Unnamed: 0,top_terms
17,"[coût, délai, responsabilité, transition, temp..."
21,"[médecin, patient, assureur, confiance, symétr..."
34,"[homme, service, obligation, armée, optimisati..."
44,"[dépense, budget, personnel, exploitation, eff..."
57,"[exclusion, potentielle, cotisation, contact, ..."
...,...
13107,"[asile, centre, mineur, accompagnement, accuei..."
13113,"[mineur, plateforme, média, parent, prévention..."
13121,"[local, extérieur, armurier, sécurité, degré, ..."
13133,"[visa, collaborateur, taliban, militant, femme..."


In [653]:
text_id = 5365
print(top_terms.loc[text_id]['top_terms'], '\n')
print(processed_transcript.loc[text_id]['Text'])

['échange', 'message', 'mobilité', 'culture', 'vision', 'élément', 'consultation', 'volonté', 'soutien', 'création'] 

Comme vous l'avez rappelé, Madame Marchand-Balet, les échanges entre les régions linguistiques constituent un élément central de la politique linguistique et culturelle de la Confédération. Ils sont aussi d'une très grande importance pour les cantons. Ils contribuent à assurer la compréhension entre les communautés, à garantir le respect des autres langues et cultures de notre pays, et, par là même, à maintenir la cohésion nationale.
Vous demandez, par voie de motion, que le crédit destiné au soutien des échanges linguistiques dans le cadre de l'enveloppe 2016-2020 soit augmenté. Nous partageons avec vous la volonté de renforcer ces échanges. Il s'est passé pas mal de choses depuis le dépôt de votre motion. Un des éléments a été la collaboration avec les cantons, qui a abouti à la mise en place en 2017 de Movetia, l'agence nationale pour la  promotion des échanges et d

In [735]:
with open('sample_1500.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None

sample_cat = sample_cat[['cat', 'transcript_words']]
sample_cat

Unnamed: 0,cat,transcript_words
1,politique,"['incompatibilité', 'élection', 'vraisemblance..."
2,extérieur,"['potentat', 'carte', 'visite', 'blocage', 'fu..."
25,armée-sécurité,"['armée', 'effectif', 'commandement', 'deva', ..."
32,armée-sécurité,"['armée', 'arrêté', 'deva', 'référendum', 'pla..."
46,extérieur,"['aide', 'développement', 'profit', 'taux', 'b..."
...,...,...
13121,armée-sécurité,"['local', 'extérieur', 'armurier', 'sécurité',..."
13124,société,"['égalité', 'sanction', 'travailleuse', 'analy..."
13129,immigration,"['contingent', 'réinstallation', 'femme', 'réf..."
13143,justice,"['corruption', 'infraction', 'blanchiment', 'c..."


In [749]:
sample_checked = sample_cat.merge(processed_transcript, left_index=True, right_index=True, how='left')[['cat', 'transcript_words', 'transcript_idx', 'IdSession', 'ID']]
sample_checked

Unnamed: 0,cat,transcript_words,transcript_idx,IdSession,ID
1,politique,"['incompatibilité', 'élection', 'vraisemblance...",1,5001,191158
2,extérieur,"['potentat', 'carte', 'visite', 'blocage', 'fu...",2,5001,191267
25,armée-sécurité,"['armée', 'effectif', 'commandement', 'deva', ...",26,5001,191603
32,armée-sécurité,"['armée', 'arrêté', 'deva', 'référendum', 'pla...",34,5001,191682
46,extérieur,"['aide', 'développement', 'profit', 'taux', 'b...",49,5001,191929
...,...,...,...,...,...
13121,armée-sécurité,"['local', 'extérieur', 'armurier', 'sécurité',...",7293,5120,319779
13124,société,"['égalité', 'sanction', 'travailleuse', 'analy...",7296,5120,319626
13129,immigration,"['contingent', 'réinstallation', 'femme', 'réf...",7301,5120,319782
13143,justice,"['corruption', 'infraction', 'blanchiment', 'c...",7315,5120,319685


In [750]:
sample_checked.to_csv('sample_1500_checked.csv', encoding='utf-8')

#### Semi-supervised

In [872]:
with open('sample_1500_checked.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None

sample_cat = sample_cat[['cat']]
sample_cat

Unnamed: 0,cat
1,politique
2,extérieur
25,armée-sécurité
32,armée-sécurité
46,extérieur
...,...
13121,armée-sécurité
13124,société
13129,immigration
13143,justice


In [854]:
category_names = sorted(sample_cat['cat'].unique())
category_names

['agriculture-faune',
 'armée-sécurité',
 'culture-média',
 'emploi',
 'extérieur',
 'finance',
 'formation-recherche',
 'immigration',
 'justice',
 'logement-territoire',
 'manifestation-loisirs',
 'politique',
 'retraite-rentes',
 'santé',
 'société',
 'transport',
 'télécommunications',
 'écologie',
 'économie',
 'énergie']

In [855]:
category_dict = dict()
for i in category_names:
    idx = category_names.index(i)
    category_dict[i] = idx
    
category_dict

{'agriculture-faune': 0,
 'armée-sécurité': 1,
 'culture-média': 2,
 'emploi': 3,
 'extérieur': 4,
 'finance': 5,
 'formation-recherche': 6,
 'immigration': 7,
 'justice': 8,
 'logement-territoire': 9,
 'manifestation-loisirs': 10,
 'politique': 11,
 'retraite-rentes': 12,
 'santé': 13,
 'société': 14,
 'transport': 15,
 'télécommunications': 16,
 'écologie': 17,
 'économie': 18,
 'énergie': 19}

In [856]:
sample_cat = sample_cat.replace(category_dict)
sample_cat

Unnamed: 0,cat
1,11
2,4
25,1
32,1
46,4
...,...
13121,1
13124,14
13129,7
13143,8


In [857]:
supervised_transcript = processed_transcript.merge(sample_cat, left_index=True, right_index=True, how='left')
supervised_transcript['cat'] = supervised_transcript['cat'].fillna(-1)
supervised_transcript = supervised_transcript.astype({'cat':'int'})
supervised_transcript

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_length,text_lemmatized,text_lemma_list,cat
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:07:12,2015-11-30T15:08:50,B,FR,1159,élection constitution incompatibilité déroulem...,"[élection, constitution, incompatibilité, déro...",-1
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,2015-11-30T15:21:04,2015-11-30T15:23:08,B,FR,1643,légal constitutionnel pertinent principe inter...,"[légal, constitutionnel, pertinent, principe, ...",11
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,2015-11-30T17:58:22,2015-11-30T18:02:40,Mit-M,FR,3925,carte visite diplomate blanc soupçon fusil cli...,"[carte, visite, diplomate, blanc, soupçon, fus...",4
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,2015-11-30T18:07:35,2015-11-30T18:12:11,BR-M,FR,4090,dossier bout possibilité début carte visite go...,"[dossier, bout, possibilité, début, carte, vis...",-1
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,2015-11-30T18:27:03,2015-11-30T18:32:48,BR-M,FR,5020,calme sujet dernier intervention législation s...,"[calme, sujet, dernier, intervention, législat...",-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13531,13932,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,3702,modèle affaire grand entreprise numérique coll...,"[modèle, affaire, grand, entreprise, numérique...",-1
13532,13933,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,3153,doigt problème important recours collecte donn...,"[doigt, problème, important, recours, collecte...",-1
13533,13934,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,2810,suite attentat terroriste vie aide victime inf...,"[suite, attentat, terroriste, vie, aide, victi...",-1
13534,13935,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,2345,clair victime infraction soutien nécessaire fa...,"[clair, victime, infraction, soutien, nécessai...",-1


In [858]:
y = supervised_transcript['cat'].to_list()
len(y)

13536

In [863]:
# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs, y=y)
topic_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 423/423 [02:56<00:00,  2.40it/s]
2023-06-15 11:44:32,815 - BERTopic - Transformed documents to Embeddings
2023-06-15 11:44:44,105 - BERTopic - Reduced dimensionality
2023-06-15 11:44:52,267 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5653,-1_important_raison_assurance_travail
1,0,300,0_enfant_congé_accueil_allocation
2,1,266,1_chômage_travailleur_salaire_travail
3,2,241,2_pénal_peine_procédure_code
4,3,235,3_tir_avocat_argent_jeu
...,...,...,...
131,130,11,130_robot_robotisation_emploi_opportunité
132,131,11,131_anglais_langue_arbitral_arbitrage
133,132,10,132_consommateur_contrat_garantie_révocation
134,133,10,133_innovation_fourchette_encouragement_flexib...


In [864]:
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)
topic_model.get_topic_info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00,  1.11it/s]


Unnamed: 0,Topic,Count,Name
0,0,424,0_enfant_congé_parent_accueil
1,1,379,1_chômage_travailleur_travail_salaire
2,2,322,2_pénal_peine_crime_code
3,3,363,3_tir_avocat_secret_victime
4,4,293,4_transport_ferroviaire_trafic_route
...,...,...,...
130,130,11,130_robot_robotisation_emploi_opportunité
131,131,23,131_anglais_arbitrage_langue_arbitral
132,132,39,132_consommateur_contrat_garantie_révocation
133,133,27,133_innovation_encouragement_fourchette_cti


In [865]:
topic_model.get_topic(44)

[('biodiversité', 0.10496376500272445),
 ('surface', 0.061274959763468285),
 ('espèce', 0.03498644941873163),
 ('biotope', 0.026410682267408895),
 ('paysage', 0.024237627999942567),
 ('sol', 0.022524481161171787),
 ('terre', 0.02081688507890815),
 ('agricole', 0.01900327831968392),
 ('agriculture', 0.016853466228465445),
 ('naturel', 0.016752346333744828)]

In [866]:
topic_model.get_document_info(docs).loc[12892]

Document                   tour bienvenue conseiller qualité conseiller o...
Topic                                                                     71
Name                                 71_conciliation_juge_audience_procédure
Top_n_words                conciliation - juge - audience - procédure - j...
Probability                                                         0.022217
Representative_document                                                False
Name: 12892, dtype: object

#### Supervised

In [687]:
with open('sample_1500.csv', encoding='utf-8') as file:
    sample_cat = pd.read_csv(file).set_index('Unnamed: 0')
    sample_cat.index.name = None

sample_cat = sample_cat[['cat']]
sample_cat

Unnamed: 0,cat
1,politique
2,extérieur
25,armée-sécurité
32,armée-sécurité
46,extérieur
...,...
13121,armée-sécurité
13124,société
13129,immigration
13143,justice


In [688]:
category_names = sorted(sample_cat['cat'].unique())
category_names

['agriculture-faune',
 'armée-sécurité',
 'culture-média',
 'emploi',
 'extérieur',
 'finance',
 'formation-recherche',
 'immigration',
 'justice',
 'logement-territoire',
 'manifestation-loisirs',
 'politique',
 'retraite-rentes',
 'santé',
 'société',
 'transport',
 'télécommunications',
 'écologie',
 'économie',
 'énergie']

In [689]:
category_dict = dict()
for i in category_names:
    idx = category_names.index(i)
    category_dict[i] = idx
    
category_dict

{'agriculture-faune': 0,
 'armée-sécurité': 1,
 'culture-média': 2,
 'emploi': 3,
 'extérieur': 4,
 'finance': 5,
 'formation-recherche': 6,
 'immigration': 7,
 'justice': 8,
 'logement-territoire': 9,
 'manifestation-loisirs': 10,
 'politique': 11,
 'retraite-rentes': 12,
 'santé': 13,
 'société': 14,
 'transport': 15,
 'télécommunications': 16,
 'écologie': 17,
 'économie': 18,
 'énergie': 19}

In [707]:
y = sample_cat.replace(category_dict)['cat'].to_list()
len(y)

1500

In [708]:
X = processed_transcript.loc[sample_cat.index]['text_lemmatized'].to_list()
len(X)

1500

In [709]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [710]:
print(len(X_test), len(X_train))

375 1125


In [724]:
# Initiate BERTopic
test_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
test_model.fit(X_train, y=y_train)

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:13<00:00,  2.75it/s]
2023-06-13 16:06:36,056 - BERTopic - Transformed documents to Embeddings
2023-06-13 16:06:37,831 - BERTopic - Reduced dimensionality
2023-06-13 16:06:37,886 - BERTopic - Clustered reduced embeddings


<bertopic._bertopic.BERTopic at 0x32a4f5100>

In [725]:
predic_topics, predic_probs = test_model.transform(documents=X_test)

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:04<00:00,  2.69it/s]
2023-06-13 16:08:48,279 - BERTopic - Reduced dimensionality
2023-06-13 16:08:48,306 - BERTopic - Calculated probabilities with HDBSCAN
2023-06-13 16:08:48,306 - BERTopic - Predicted clusters


In [726]:
pd.DataFrame([predic_topics, y_test])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,365,366,367,368,369,370,371,372,373,374
0,9,-1,-1,8,-1,5,0,1,14,-1,...,11,4,0,13,-1,-1,1,-1,6,11
1,19,6,3,3,2,8,4,12,7,7,...,10,1,5,9,14,10,13,14,18,2


#### Reduce outliers

In [369]:
# Initiate BERTopic
test_model = BERTopic(calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
test_docs = processed_transcript['text_lemmatized'].to_list()
test_topics, test_probabilities = test_model.fit_transform(test_docs)
test_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 411/411 [02:45<00:00,  2.48it/s]
2023-06-08 12:17:07,114 - BERTopic - Transformed documents to Embeddings
2023-06-08 12:17:10,459 - BERTopic - Reduced dimensionality
2023-06-08 12:17:19,287 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5584,-1_raison_travail_entreprise_manière
1,0,316,0_peine_code_crime_procédure
2,1,310,1_vaccin_pandémie_coronavirus_épidémie
3,2,306,2_chômage_travailleur_salaire_travail
4,3,268,3_formation_recherche_innovation_école
...,...,...,...
138,137,11,137_pétrole_énergie_prix_carburant
139,138,11,138_installation_solaire_bâtiment_énergie
140,139,11,139_robot_robotisation_emploi_opportunité
141,140,10,140_guerre_paix_conflit_arme


In [357]:
# outliers: probabilities

new_topics = test_model.reduce_outliers(test_docs, test_topics, probabilities=test_probabilities, strategy="probabilities")
test_model.update_topics(test_docs, topics=new_topics)
outliers_probs = test_model.get_topic_info()
outliers_probs

Unnamed: 0,Topic,Count,Name
0,0,537,0_peine_juge_infraction_code
1,1,335,1_vaccin_pandémie_coronavirus_crise
2,2,375,2_chômage_travailleur_travail_salaire
3,3,330,3_formation_école_innovation_recherche
4,4,259,4_animal_loup_élevage_viande
...,...,...,...
134,134,50,134_crise_budget_endettement_dépense
135,135,42,135_couple_femme_imposition_rente
136,136,59,136_enfant_détention_famille_mineur
137,137,46,137_convention_ratification_accident_intégration


In [359]:
# outliers: distributions

new_topics = test_model.reduce_outliers(test_docs, test_topics, strategy="distributions")
test_model.update_topics(test_docs, topics=new_topics)
outliers_distrib = test_model.get_topic_info()
outliers_distrib

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.63it/s]


Unnamed: 0,Topic,Count,Name
0,0,446,0_peine_infraction_code_crime
1,1,420,1_pandémie_vaccin_crise_coronavirus
2,2,391,2_chômage_travailleur_travail_salaire
3,3,404,3_formation_recherche_école_innovation
4,4,286,4_animal_loup_élevage_cheval
...,...,...,...
128,128,40,128_sexe_genre_identité_discernement
129,129,50,129_tarif_partenaire_forfait_structure
130,130,30,130_expertise_expert_médecin_assurance
131,131,45,131_traité_ratification_circulation_extension


In [370]:
# outliers: c-tf-idf

new_topics = test_model.reduce_outliers(test_docs, test_topics, strategy="c-tf-idf", threshold=0.15)
test_model.update_topics(test_docs, topics=new_topics)
outliers_ctfidf = test_model.get_topic_info()
outliers_ctfidf

Unnamed: 0,Topic,Count,Name
0,-1,1137,-1_raison_hymne_conseiller_chose
1,0,389,0_peine_infraction_juge_code
2,1,365,1_pandémie_vaccin_crise_épidémie
3,2,372,2_chômage_travailleur_salaire_travail
4,3,374,3_formation_recherche_école_innovation
...,...,...,...
138,137,22,137_énergie_carburant_prix_pétrole
139,138,33,138_installation_bâtiment_énergie_chauffage
140,139,11,139_robot_robotisation_emploi_opportunité
141,140,32,140_paix_guerre_conflit_neutralité


In [371]:
outliers_ctfidf.to_csv('test_topics.csv', encoding='utf-8')

In [385]:
test_model.get_topic(11)

[('divergence', 0.0671863110618691),
 ('version', 0.024658409576647826),
 ('unanimité', 0.012802318008709158),
 ('élimination', 0.012514145726758753),
 ('formulation', 0.011469777543849716),
 ('abstention', 0.011201342683248204),
 ('prestation', 0.010995381866594139),
 ('matin', 0.009459244165364304),
 ('montant', 0.00888220063535126),
 ('objet', 0.008716841969784131)]

In [386]:
with open('test_topics.csv', encoding='utf-8') as file:
    test_topics_df = pd.read_csv(file).drop(columns='Unnamed: 0')

test_topics_df

Unnamed: 0,Topic,Count,Name,cat
0,-1,1137,-1_raison_hymne_conseiller_chose,
1,0,389,0_peine_infraction_juge_code,justice
2,1,365,1_pandémie_vaccin_crise_épidémie,santé
3,2,372,2_chômage_travailleur_salaire_travail,travail
4,3,374,3_formation_recherche_école_innovation,formation
...,...,...,...,...
138,137,22,137_énergie_carburant_prix_pétrole,énergie
139,138,33,138_installation_bâtiment_énergie_chauffage,énergie
140,139,11,139_robot_robotisation_emploi_opportunité,travail
141,140,32,140_paix_guerre_conflit_neutralité,armée


In [387]:
test_topic_groups = list(test_topics_df.groupby('cat')['Topic'].apply(list))
test_topic_groups

[[13, 38, 41, 47, 49, 59, 66, 70, 89, 109, 111, 125],
 [14, 25, 28, 34, 50, 103, 123, 140],
 [11, 118, 120, 122],
 [75, 126, 141],
 [52, 65, 130],
 [19, 57, 63, 102],
 [8, 10, 99, 104, 107, 110, 128],
 [3, 56],
 [15, 33, 73, 82, 85],
 [0, 58, 129, 132],
 [16, 31, 64, 116],
 [23, 81],
 [18],
 [5, 12, 29, 32, 40, 86, 98, 105, 131],
 [4],
 [1,
  22,
  26,
  37,
  39,
  46,
  48,
  54,
  67,
  68,
  72,
  76,
  79,
  92,
  93,
  95,
  106,
  127,
  135],
 [7, 24, 27, 43, 53, 74, 88, 96, 97, 100, 115, 121],
 [30, 51],
 [6, 62, 94, 113, 119],
 [2, 78, 83, 84, 139],
 [21, 35, 45, 61, 77, 87, 90, 91, 136],
 [9, 20, 36, 42, 44, 55, 69, 71, 80, 101, 108, 112, 114, 117, 124, 133, 134],
 [17, 60, 137, 138]]

In [388]:
test_model.merge_topics(test_docs, test_topic_groups)
test_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1137,-1_raison_manière_travail_problème
1,0,1622,0_assurance_santé_soin_coût
2,1,1107,1_entreprise_marché_prix_banque
3,2,1041,2_surveillance_vote_transparence_travail
4,3,849,3_enfant_femme_congé_parent
5,4,713,4_armée_arme_service_guerre
6,5,707,5_budget_impôt_dépense_imposition
7,6,704,6_animal_agriculture_production_produit
8,7,513,7_peine_procédure_juge_code
9,8,495,8_émission_eau_biodiversité_produit


In [389]:
test_model.get_topic_info().to_csv('test_topics_to_rename.csv', encoding='utf-8')

In [397]:
with open('test_topics_renamed.csv', encoding='utf-8') as file:
    test_topics_renamed = pd.read_csv(file).drop(columns='Unnamed: 0').fillna('unknown')
    
test_topics_renamed

Unnamed: 0,Topic,Count,Name,new_name
0,-1,1137,-1_raison_manière_travail_problème,unknown
1,0,1622,0_assurance_santé_soin_coût,santé
2,1,1107,1_entreprise_marché_prix_banque,économie
3,2,1041,2_surveillance_vote_transparence_travail,politique
4,3,849,3_enfant_femme_congé_parent,société
5,4,713,4_armée_arme_service_guerre,armée
6,5,707,5_budget_impôt_dépense_imposition,finance
7,6,704,6_animal_agriculture_production_produit,agriculture
8,7,513,7_peine_procédure_juge_code,justice
9,8,495,8_émission_eau_biodiversité_produit,écologie


In [395]:
test_topics_name_dict = dict()
test_topics_name_nbr_dict = dict()

for idx, row in test_topics_renamed.iterrows():
    count = idx
    if count < 10:
        count = '0' + str(count)
    else:
        count = str(count)
        
    test_topics_name_nbr_dict[row['Name']] = count + '_' + row['new_name']
    test_topics_name_dict[row['Name']] = row['new_name']
    
test_topics_name_dict

{'0_assurance_santé_soin_coût': 'santé',
 '1_entreprise_marché_prix_banque': 'économie',
 '2_surveillance_vote_transparence_travail': 'politique',
 '3_enfant_femme_congé_parent': 'société',
 '4_armée_arme_service_guerre': 'armée',
 '5_budget_impôt_dépense_imposition': 'finance',
 '6_animal_agriculture_production_produit': 'agriculture',
 '7_peine_procédure_juge_code': 'justice',
 '8_émission_eau_biodiversité_produit': 'écologie',
 '9_travail_chômage_travailleur_salaire': 'travail',
 '10_coopération_développement_sanction_aide': 'extérieur',
 '11_formation_école_recherche_langue': 'formation',
 '12_asile_réfugié_immigration_étranger': 'immigration',
 '13_transport_véhicule_trafic_route': 'transport',
 '14_loyer_locataire_logement_bail': 'logement',
 '15_divergence_numéro_version_donnée': 'autre',
 '16_rente_pilier_retraite_réforme': 'retraite',
 '17_énergie_électricité_installation_approvisionnement': 'énergie',
 '18_renseignement_terrorisme_échange_organisation': 'sécurité',
 '19_média

In [396]:
sample = []
sample_dict = dict()

for i in sorted(random.sample(list(processed_transcript.index), 100)):
    doc_topic = test_model.get_document_info(test_docs).loc[i]
    
    topic_idx = doc_topic['Topic']
    topic_score = round(doc_topic['Probability'], 3)
    topic_words = doc_topic['Top_n_words']
    topic_name = test_topics_renamed.loc[topic_idx]['new_name']
    transcript_words = top_terms.loc[i]['top_terms']
    
    sample_dict[i] = {
        'topic_idx': topic_idx,
        'topic_score': topic_score,
        'topic_name': topic_name,
        'topic_words': topic_words,
        'transcript_words': transcript_words
    }
    
    if topic_idx >= 0:
        sample.append('TRANSCRIPT IDX: ' + str(i))
        sample.append('TOPIC: ' + str(topic_idx))
        sample.append(topic_words)
        sample.append('\nTOP TERMS')
        sample.append(str(transcript_words))
        sample.append('score: ' + str(topic_score))

        if topic_score < 0.01:
            sample.append('----- LOW SCORE -----')
            
        sample.append('\n///////////////////////////////////////\n')
    else:
        sample.append('===========================================')
        sample.append('IDX: ' + str(i))
        sample.append(str(transcript_words))
        sample.append('===========================================\n')
        
print(len(sample))
sample = '\n'.join(sample)
sample_df = pd.DataFrame.from_dict(sample_dict, orient='index')
sample_df

KeyError: -1

#### Training

In [1516]:
loaded_model = BERTopic.load("my_model")
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1126,0_santé_assurance_coût_soin
1,1,772,1_transparence_vote_divergence_surveillance
2,2,728,2_procédure_peine_code_juge
3,3,507,3_entreprise_prix_marché_produit
4,4,469,4_enfant_femme_parent_accueil
5,5,391,5_animal_agriculture_production_vin
6,6,331,6_crédit_budget_dépense_impôt
7,7,314,7_biodiversité_émission_eau_produit
8,8,290,8_armée_guerre_service_matériel
9,9,216,9_travail_chômage_travailleur_convention


In [812]:
# Initiate UMAP
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=100,
)

# Initiate BERTopic
topic_model = BERTopic(calculate_probabilities=True, language='french', verbose=True)

# Run BERTopic model
docs = processed_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 423/423 [02:52<00:00,  2.45it/s]
2023-06-13 20:22:16,746 - BERTopic - Transformed documents to Embeddings
2023-06-13 20:22:20,500 - BERTopic - Reduced dimensionality
2023-06-13 20:22:28,487 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5445,-1_assurance_important_raison_travail
1,0,324,0_enfant_congé_accueil_parent
2,1,260,1_chômage_travailleur_salaire_travail
3,2,260,2_transport_ferroviaire_trafic_route
4,3,252,3_impôt_fiscal_imposition_taxe
...,...,...,...
139,138,11,138_animal_stabulation_étable_dégât
140,139,10,139_infirmier_patient_médecin_soin
141,140,10,140_statistique_besoin_donnée_information
142,141,10,141_révocation_affaire_propriétaire_rigueur


In [813]:
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)
topic_model.get_topic_info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.38it/s]


Unnamed: 0,Topic,Count,Name
0,0,433,0_enfant_congé_parent_accueil
1,1,347,1_chômage_travailleur_salaire_travail
2,2,309,2_transport_ferroviaire_trafic_route
3,3,335,3_impôt_fiscal_imposition_réforme
4,4,291,4_énergie_électricité_renouvelable_installation
...,...,...,...
138,138,19,138_animal_sauvage_stabulation_dégât
139,139,32,139_médecin_patient_soin_assureur
140,140,31,140_statistique_donnée_relevé_besoin
141,141,39,141_rigueur_affaire_entreprise_chiffre


In [879]:
topic_model.get_topic(3)

[('tir', 0.00846541063258302),
 ('avocat', 0.0076609413279951095),
 ('secret', 0.00671368934882835),
 ('victime', 0.006683933034929796),
 ('argent', 0.00653484670519462),
 ('public', 0.006525234363538079),
 ('acte', 0.006030307666135343),
 ('prestation', 0.005899940706038273),
 ('société', 0.005604223867971448),
 ('professionnel', 0.005517573137263772)]

In [878]:
topic_model.get_document_info(docs, df=processed_transcript)
topic_model.get_document_info(docs).query('Topic == 12')['Document']

135      séance octobre opposition objet léger modifica...
146      séance soir divergence vote soutien divergence...
147      entrée vigueur douane besoin rafraîchir lacune...
199      matin divergence solution compromis dépliant c...
530      chambre mercredi mars heure dernier divergence...
                               ...                        
13098    commentaire su procédure élimination divergenc...
13160    bref ordre jour temps septembre dizaine diverg...
13229    individuel freezing content exemple prêt compr...
13282    seuil entrée compromis temps partenaire social...
13313    procédure élimination divergence issue positif...
Name: Document, Length: 261, dtype: object

In [871]:
topic_model.get_topic_info().to_csv('topics_50_51_v3.csv', encoding='utf-8')

In [880]:
with open('topics_50_51_cat_v3.csv', encoding='utf-8') as file:
    topics_df = pd.read_csv(file).drop(columns='Unnamed: 0')

topics_df

Unnamed: 0,Topic,Count,Name,cat
0,0,424,0_enfant_congé_parent_accueil,société
1,1,379,1_chômage_travailleur_travail_salaire,emploi
2,2,322,2_pénal_peine_crime_code,justice
3,3,363,3_tir_avocat_secret_victime,justice
4,4,293,4_transport_ferroviaire_trafic_route,transport
...,...,...,...,...
130,130,11,130_robot_robotisation_emploi_opportunité,emploi
131,131,23,131_anglais_arbitrage_langue_arbitral,politique
132,132,39,132_consommateur_contrat_garantie_révocation,économie
133,133,27,133_innovation_encouragement_fourchette_cti,formation


In [881]:
test_merge = topics_df.merge(topic_model.get_topic_info(), left_index=True, right_index=True)
test_merge.loc[test_merge['Count_x'] != test_merge['Count_y']]

Unnamed: 0,Topic_x,Count_x,Name_x,cat,Topic_y,Count_y,Name_y


In [882]:
topic_groups = list(topics_df.groupby('cat')['Topic'].apply(list))
topic_groups

[[7, 41, 50, 67, 69, 70, 76, 85, 95, 101, 105, 119, 122, 127],
 [10, 20, 21, 22, 45, 48, 63, 66, 98],
 [15, 33, 62, 100],
 [1, 97, 130],
 [31, 47, 49, 53, 57, 61, 77, 79, 114],
 [8, 13, 37, 80, 99, 120],
 [26, 35, 58, 84, 91, 126, 133],
 [23, 72, 102, 103, 112],
 [2, 3, 64, 71, 88, 118],
 [19, 43, 52],
 [18, 113],
 [6, 11, 12, 17, 54, 104, 115, 129, 131],
 [9, 86],
 [24,
  28,
  30,
  34,
  36,
  40,
  42,
  51,
  55,
  56,
  59,
  78,
  87,
  89,
  90,
  117,
  121,
  125,
  128,
  134],
 [0, 14, 29, 65, 82, 94, 96, 111, 124],
 [4, 81],
 [46, 60, 93, 108, 116],
 [16, 32, 38, 44, 68, 75, 107],
 [25, 27, 39, 73, 74, 83, 92, 106, 109, 110, 123, 132],
 [5]]

In [883]:
topic_model.merge_topics(docs, topic_groups)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1636,0_assurance_santé_coût_soin
1,1,1467,1_divergence_vote_surveillance_transparence
2,2,994,2_enfant_femme_congé_parent
3,3,993,3_armée_militaire_service_arme
4,4,932,4_pénal_procédure_peine_juge
5,5,910,5_budget_impôt_fiscal_dépense
6,6,882,6_animal_agricole_alimentaire_agriculture
7,7,876,7_international_coopération_développement_aide
8,8,815,8_entreprise_prix_marché_échange
9,9,574,9_émission_climatique_produit_objectif


In [811]:
topic_model.save("my_model")

In [884]:
topic_model.get_topic_info().to_csv('topics_to_rename.csv', encoding='utf-8')

In [885]:
with open('topics_renamed_4.csv', encoding='utf-8') as file:
    topics_renamed = pd.read_csv(file).drop(columns='Unnamed: 0')
    
topics_renamed

Unnamed: 0,Topic,Count,Name,new_name
0,0,1636,0_assurance_santé_coût_soin,santé
1,1,1467,1_divergence_vote_surveillance_transparence,politique
2,2,994,2_enfant_femme_congé_parent,société
3,3,993,3_armée_militaire_service_arme,armée
4,4,932,4_pénal_procédure_peine_juge,justice
5,5,910,5_budget_impôt_fiscal_dépense,finance
6,6,882,6_animal_agricole_alimentaire_agriculture,agriculture
7,7,876,7_international_coopération_développement_aide,extérieur
8,8,815,8_entreprise_prix_marché_échange,économie
9,9,574,9_émission_climatique_produit_objectif,écologie


In [886]:
topics_name_dict = dict()
topics_name_nbr_dict = dict()

for idx, row in topics_renamed.iterrows():
    count = idx
    if count < 10:
        count = '0' + str(count)
    else:
        count = str(count)
        
    topics_name_nbr_dict[row['Name']] = count + '_' + row['new_name']
    topics_name_dict[row['Name']] = row['new_name']
    
topics_name_dict

{'0_assurance_santé_coût_soin': 'santé',
 '1_divergence_vote_surveillance_transparence': 'politique',
 '2_enfant_femme_congé_parent': 'société',
 '3_armée_militaire_service_arme': 'armée',
 '4_pénal_procédure_peine_juge': 'justice',
 '5_budget_impôt_fiscal_dépense': 'finance',
 '6_animal_agricole_alimentaire_agriculture': 'agriculture',
 '7_international_coopération_développement_aide': 'extérieur',
 '8_entreprise_prix_marché_échange': 'économie',
 '9_émission_climatique_produit_objectif': 'écologie',
 '10_formation_recherche_école_professionnel': 'formation',
 '11_travail_chômage_travailleur_salaire': 'emploi',
 '12_asile_réfugié_immigration_frontière': 'immigration',
 '13_rente_pilier_retraite_réforme': 'retraites',
 '14_média_culturel_télévision_culture': 'culture',
 '15_loyer_locataire_logement_bail': 'logement',
 '16_transport_trafic_ferroviaire_route': 'transport',
 '17_énergie_électricité_installation_renouvelable': 'énergie',
 '18_donnée_numérique_postal_service': 'télécom',
 '

In [887]:
sample = []
sample_dict = dict()

for i in tqdm(sorted(random.sample(list(processed_transcript.index), 100))):
    doc_topic = topic_model.get_document_info(docs).loc[i]
    
    topic_idx = doc_topic['Topic']
    topic_score = round(doc_topic['Probability'], 3)
    topic_words = doc_topic['Top_n_words']
    topic_name = topics_renamed.loc[topic_idx]['new_name']
    transcript_words = top_terms.loc[i]['top_terms']
    
    sample_dict[i] = {
        'topic_idx': topic_idx,
        'topic_score': topic_score,
        'topic_name': topic_name,
        'transcript_words': transcript_words,
        'topic_words': topic_words,
    }
    
    if topic_idx >= 0:
        sample.append('TRANSCRIPT IDX: ' + str(i))
        sample.append('TOPIC: ' + str(topic_idx))
        sample.append(topic_name)
        sample.append('\nTOP TERMS')
        sample.append(str(transcript_words))
        sample.append('score: ' + str(topic_score))

        if topic_score < 0.01:
            sample.append('----- LOW SCORE -----')
            
        sample.append('\n///////////////////////////////////////\n')
    else:
        sample.append('===========================================')
        sample.append('IDX: ' + str(i))
        sample.append(str(transcript_words))
        sample.append('===========================================\n')
        
print(len(sample))
sample = '\n'.join(sample)
sample_df = pd.DataFrame.from_dict(sample_dict, orient='index')
sample_df

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 43.66it/s]

705





Unnamed: 0,topic_idx,topic_score,topic_name,transcript_words,topic_words
83,0,0.695,santé,"[ordonnance, électronique, patient, médicament...",assurance - santé - coût - soin - maladie - pr...
106,2,0.381,société,"[femme, homme, immigration, objectif, masse, p...",enfant - femme - congé - parent - égalité - fa...
133,6,0.381,agriculture,"[chasse, lynx, loup, plan, espèce, gestion, lé...",animal - agricole - alimentaire - agriculture ...
516,12,1.000,immigration,"[centre, asile, départ, procédure, cadencé, ex...",asile - réfugié - immigration - frontière - mi...
649,1,1.000,politique,"[ristourne, licite, signalement, livre, autori...",divergence - vote - surveillance - transparenc...
...,...,...,...,...,...
12539,0,0.388,santé,"[épidémie, abattoir, porcine, mauvais, peste, ...",assurance - santé - coût - soin - maladie - pr...
12651,13,0.127,retraites,"[conversion, taux, rendement, rente, compensat...",rente - pilier - retraite - réforme - social -...
12857,7,0.071,extérieur,"[sanction, ring, russe, application, faille, r...",international - coopération - développement - ...
12976,6,0.111,agriculture,"[alimentaire, agricole, denrée, production, du...",animal - agricole - alimentaire - agriculture ...


In [888]:
sample_df.to_csv('sample.csv', encoding='utf-8')

with open('sample.txt', 'w', encoding='utf-8') as file:
    file.write(sample)

In [287]:
topic_by_person = topic_model.get_document_info(docs, df=transcript_by_person)
# add legislature column
topic_by_person['legislature'] = topic_by_person['IdSession'].apply(lambda x: int(str(x)[:2]))
topic_by_person

Unnamed: 0,transcript_idx,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,...,FirstName,GenderAsString,PartyAbbreviation,Document,Topic,Name,Top_n_words,Probability,Representative_document,legislature
0,0,191151,36016,4018,Vous avez reçu le rapport du Conseil fédéral s...,N,20151130,5001,Mit-M,1.0,...,Jacques-André,m,PSS,élection constitution incompatibilité déroulem...,1,1_vote_transparence_surveillance_membre,vote - transparence - surveillance - membre - ...,0.229817,False,50
1,1,191158,36016,4018,Pour adopter les décisions et formuler les pro...,N,20151130,5001,Mit-M,1.0,...,Jacques-André,m,PSS,principe explication service conseiller mandat...,1,1_vote_transparence_surveillance_membre,vote - transparence - surveillance - membre - ...,0.131285,False,50
2,2,191267,36022,3898,Vous vous souvenez que le groupe UDC recommand...,N,20151130,5001,Mit-M,1.0,...,Yves,m,UDC,carte visite diplomate soupçon fusil client ba...,14,14_loyer_locataire_logement_bail,loyer - locataire - logement - bail - bailleur...,0.134175,False,50
3,3,191199,36022,1116,"Dans ce dossier, nous sommes maintenant à bout...",N,20151130,5001,BR-M,99.0,...,Didier,m,PLR,dossier bout possibilité début carte visite go...,14,14_loyer_locataire_logement_bail,loyer - locataire - logement - bail - bailleur...,0.098286,False,50
4,4,191226,36022,1116,On peut rester relativement calme sur ce sujet...,N,20151130,5001,BR-M,99.0,...,Didier,m,PLR,sujet intervention législation heure exemple c...,6,6_procédure_peine_juge_code,procédure - peine - juge - code - tribunal - i...,0.089176,False,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,7316,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,...,Fabien,m,VERT-E-S,modèle affaire entreprise numérique collecte d...,2,2_entreprise_prix_marché_banque,entreprise - prix - marché - banque - produit ...,0.122128,False,51
13145,7317,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,...,Elisabeth,f,PSS,doigt problème recours collecte donnée surveil...,10,10_donnée_violence_sécurité_renseignement,donnée - violence - sécurité - renseignement -...,0.021083,False,51
13146,7318,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,...,Jacqueline,f,PLR,suite attentat vie aide victime infraction ind...,6,6_procédure_peine_juge_code,procédure - peine - juge - code - tribunal - i...,0.623754,False,51
13147,7319,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,...,Elisabeth,f,PSS,victime infraction soutien face traumatisme in...,6,6_procédure_peine_juge_code,procédure - peine - juge - code - tribunal - i...,0.099029,False,51


In [398]:
party = 'PartyAbbreviation'
legislature = 'legislature'
gender = 'GenderAsString'

ratio_by_col = True
group = legislature

# filter legislature
#filtered_topic_by_person = topic_by_person.query('legislature == 51')
filtered_topic_by_person = topic_by_person.copy()

topic_by_group = filtered_topic_by_person.groupby([group, 'Name']).count()[['Text']].stack().unstack(level=1)
topic_by_group = topic_by_group.droplevel(level=1)

# set index and column row name to None
topic_by_group.index.name = None
topic_by_group.columns.name = None

# set all NaN to 0
topic_by_group = topic_by_group.fillna(0)

# rename and sort topics
topic_by_group = topic_by_group.rename(columns=topics_name_dict)
topic_by_group = topic_by_group.reindex(sorted(topic_by_group.columns), axis=1)

# transpose df
topic_by_group = topic_by_group.T

# drop parties not in last legislature
topic_by_group = topic_by_group.drop(columns=['Lega', '-', 'MCG', 'CSPO'], errors='ignore')

# drop minor parties
topic_by_group = topic_by_group.drop(columns=['EàG', 'PdT'], errors='ignore')
    
if ratio_by_col:
    topic_by_group = round(topic_by_group.loc[:,:].div(topic_by_group.sum(axis=0), axis=1), 2)
else:
    topic_by_group = round(topic_by_group.loc[:,:].div(topic_by_group.sum(axis=1), axis=0), 2)

def make_pretty(styler):
    max_value = topic_by_group.values.max()
    styler.format(precision=2)
    styler.background_gradient(axis=None, vmin=0, vmax=max_value, cmap="YlGnBu")
    return styler

topic_by_group = topic_by_group.reindex(sorted(topic_by_group.columns), axis=1)
topic_by_group.style.pipe(make_pretty)

Unnamed: 0,50,51
agriculture,0.07,0.06
armée,0.06,0.04
autre,0.03,0.02
communication,0.01,0.01
culture,0.01,0.02
extérieur,0.05,0.04
finance,0.06,0.05
formation,0.03,0.04
immigration,0.05,0.03
justice,0.04,0.06


### Most frequent words

In [381]:
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(filtered_transcript['text_lemmatized'].to_list())
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
count_df

Unnamed: 0,00,0122,0202,0210,0219,0229,024,0244,026,0276,...,évoqué,évènement,évènementiel,événement,événementiel,événementielle,évêché,être,île,îlot
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7648,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [394]:
example_row = 1000
count_df.sort_values(axis='columns', by=example_row, ascending=False).iloc[example_row-5:example_row+5, 0:20]

Unnamed: 0,co2,objectif,émission,véhicule,automobiliste,progrès,principe,parc,dispositif,mise,voiture,emploi,heure,domaine,habitant,décarbonée,niveau,réponse,choix,polluant
995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
996,13,6,17,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0
997,9,7,6,4,0,0,0,2,0,0,4,0,0,0,0,0,0,0,0,0
998,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
999,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000,5,5,5,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1
1001,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0
1002,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1004,10,10,11,6,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0


### LDA gensim

In [810]:
filtered_transcript['text_lemm_list'] = filtered_transcript['text_lemmatized'].apply(lambda x: x.split())
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_lemmatized,text_lemma_list,text_lemm_list
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,élection bureau constitution incompatibilité b...,"[élection, bureau, constitution, incompatibili...","[élection, bureau, constitution, incompatibili..."
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,décision bureau principe bureau explication se...,"[décision, bureau, principe, bureau, explicati...","[décision, bureau, principe, bureau, explicati..."
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,session été session automne position octobre f...,"[session, été, session, automne, position, oct...","[session, été, session, automne, position, oct..."
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,révision protection population protection juin...,"[révision, protection, population, protection,...","[révision, protection, population, protection,..."
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,séance février juin aménagement territoire éne...,"[séance, février, juin, aménagement, territoir...","[séance, février, juin, aménagement, territoir..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7645,319791,60671,4260,Le modèle d'affaires dominant des grandes entr...,N,20230504,5120,Mit-M,1.0,2023-05-04T16:41:10,2023-05-04T16:44:54,Mit-M,FR,modèle affaire entreprise numérique collecte d...,"[modèle, affaire, entreprise, numérique, colle...","[modèle, affaire, entreprise, numérique, colle..."
7646,319748,60671,4238,Le postulat met le doigt sur le problème impor...,N,20230504,5120,BR-F,99.0,2023-05-04T16:44:58,2023-05-04T16:47:57,BR-F,FR,doigt problème recours collecte donnée surveil...,"[doigt, problème, recours, collecte, donnée, s...","[doigt, problème, recours, collecte, donnée, s..."
7647,319796,60673,4257,Suite à l'attentat terroriste de Louxor de 199...,N,20230504,5120,Mit-F,1.0,2023-05-04T16:48:32,2023-05-04T16:52:03,Mit-F,FR,suite attentat vie aide victime infraction ind...,"[suite, attentat, vie, aide, victime, infracti...","[suite, attentat, vie, aide, victime, infracti..."
7648,319720,60673,4238,Il est clair et juste que les victimes d'infra...,N,20230504,5120,BR-F,99.0,2023-05-04T16:54:20,2023-05-04T16:56:37,BR-F,FR,victime infraction soutien face traumatisme in...,"[victime, infraction, soutien, face, traumatis...","[victime, infraction, soutien, face, traumatis..."


In [811]:
docs = filtered_transcript['text_lemm_list'].to_list()

dictionary = corpora.Dictionary(docs)

DT_matrix = [dictionary.doc2bow(doc) for doc in docs]

Lda_object = gensim.models.ldamodel.LdaModel

In [812]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word = dictionary)

lda_model_1.print_topics(num_topics=10, num_words=5)

[(0,
  '0.018*"sanction" + 0.009*"guerre" + 0.009*"organe" + 0.008*"contrôle" + 0.008*"sécurité"'),
 (1,
  '0.013*"système" + 0.011*"objectif" + 0.010*"cadre" + 0.009*"point" + 0.008*"coût"'),
 (2,
  '0.018*"locataire" + 0.015*"loyer" + 0.014*"femme" + 0.010*"violence" + 0.009*"bailleur"'),
 (3,
  '0.017*"procédure" + 0.016*"assurance" + 0.011*"prime" + 0.010*"raison" + 0.008*"manière"'),
 (4,
  '0.009*"cadre" + 0.008*"point" + 0.007*"recherche" + 0.007*"budget" + 0.007*"moyen"'),
 (5,
  '0.025*"prix" + 0.015*"énergie" + 0.013*"patient" + 0.013*"coût" + 0.011*"santé"'),
 (6,
  '0.016*"produit" + 0.008*"raison" + 0.007*"contre-projet" + 0.007*"travail" + 0.007*"accord"'),
 (7,
  '0.022*"enfant" + 0.009*"décision" + 0.009*"parent" + 0.007*"rente" + 0.007*"procédure"'),
 (8,
  '0.026*"entreprise" + 0.018*"travail" + 0.009*"crédit" + 0.008*"aide" + 0.007*"milliard"'),
 (9,
  '0.013*"formation" + 0.012*"travail" + 0.010*"programme" + 0.009*"cadre" + 0.008*"soutien"')]

### top2vec

In [199]:
docs = filtered_transcript["text_lemmatized"].tolist()
len(docs)

324

In [196]:
topic_model = Top2Vec(
    docs,
    #embedding_model="universal-sentence-encoder-multilingual",
    speed="deep-learn",
)

2023-05-12 16:25:30,943 - top2vec - INFO - Pre-processing documents for training
2023-05-12 16:25:31,080 - top2vec - INFO - Creating joint document/word embedding
2023-05-12 16:25:38,074 - top2vec - INFO - Creating lower dimension embedding of documents
2023-05-12 16:25:39,207 - top2vec - INFO - Finding dense areas of documents
2023-05-12 16:25:39,213 - top2vec - INFO - Finding topics


In [201]:
model.get_topics()

{0: [('de', 0.13273781851509409),
  ('la', 0.10522084530715155),
  ('le', 0.07682557991849537),
  ('des', 0.06496992795926745),
  ('et', 0.06409332138178707),
  ('les', 0.061870529632359866),
  ('en', 0.055698036565708625),
  ('que', 0.055082912584038426),
  ('est', 0.05186673777975805),
  ('une', 0.042562060578284655)],
 1: [('de', 0.13792948109435965),
  ('la', 0.07491411465697119),
  ('et', 0.06830473202530185),
  ('les', 0.06791204410365591),
  ('le', 0.06573522710988462),
  ('pour', 0.06565243157193269),
  ('avs', 0.06497054756109226),
  ('des', 0.0634781534130419),
  ('est', 0.05652146656288921),
  ('une', 0.05409859315631972)]}