In [62]:
# Data processing
import os
import pandas as pd
import numpy as np
import ssl
import string
from collections import Counter
from transformers.pipelines import pipeline
import altair as alt
from tqdm import tqdm

# Text preprocessiong
import nltk
import spacy
import spacy_fastlang
from top2vec import Top2Vec
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap.umap_ import UMAP

In [4]:
# disable ssl check (to be able to download nltk packages)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [5]:
# download stanza package

import stanza
stanza.download(lang="multilingual")
stanza.download(lang="de")
stanza.download(lang="fr")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 106MB/s]                                                                               
2023-05-22 09:45:25 INFO: Downloading default packages for language: multilingual (multilingual) ...
2023-05-22 09:45:25 INFO: File exists: /Users/cyrille/stanza_resources/multilingual/default.zip
2023-05-22 09:45:25 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 46.2MB/s]                                                                              
2023-05-22 09:45:25 INFO: Downloading default packages for language: de (German) ...
2023-05-22 09:45:27 INFO: File exists: /Users/cyrille/stanza_resources/de/default.zip
2023-05-22 09:45:30 INFO: Finished downloading models and saved to /Users/cyrille/stanza_resources.
Downloading https://raw.

In [6]:
# download nltk packages

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cyrille/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [159]:
transcript_df = pd.DataFrame()

directory = 'data/transcripts'
filepaths_list = []
transcripts_list = []

# iterate over files in that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file) and file.endswith('.csv'):
        filepaths_list.append(file)

# sort transcripts chronologically 
filepaths_list.sort()
print(len(filepaths_list))

for filepath in tqdm(filepaths_list[97:100]):
    with open(filepath, encoding='utf-8') as file:
        session_df = pd.read_csv(file).drop(columns='Unnamed: 0')
        transcript_df = pd.concat([transcript_df, session_df])

transcript_df = transcript_df.reset_index(drop=True)
transcript_df

117


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 35.91it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,253990,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): He...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:32:29,2019-12-02T14:33:42,P-F,DE
1,253996,47803,4290,"Sehr geehrte Frau Präsidentin, sehr geehrter H...",N,20191202,5101,Mit-M,1.0,2019-12-02T14:44:33,2019-12-02T14:52:10,Mit-M,DE
2,253998,47803,806,"Präsidentin (Graf Maya, Alterspräsidentin): Ge...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:10,2019-12-02T14:52:30,P-F,
3,254000,47804,806,"Präsidentin (Graf Maya, Alterspräsidentin): De...",N,20191202,5101,Mit-F,1.0,2019-12-02T14:52:30,2019-12-02T14:54:24,P-F,DE
4,254011,47804,1139,Sie haben den Bericht des Bundesrates zu den N...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:54:24,2019-12-02T14:57:46,B,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3210,260973,48951,4204,Der Kommissionssprecher hat es gesagt: Die Lös...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:38:33,2020-05-06T17:40:28,Mit-M,DE
3211,261022,48951,3879,Je ne vais répéter en français ce qui vient d'...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:40:28,2020-05-06T17:43:59,Mit-M,FR
3212,261018,48951,4240,Je serai très bref. Je soutiendrai évidemment ...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:43:59,2020-05-06T17:45:01,Mit-M,FR
3213,260978,48951,146,In Bezug auf den Tourismus gab es in diesen dr...,S,20200506,5103,BR-M,99.0,2020-05-06T17:45:01,2020-05-06T17:48:48,BR-M,DE


In [160]:
filtered_transcript = transcript_df.loc[transcript_df['LanguageOfText'] == 'FR'].reset_index(drop=True)
filtered_transcript

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,260916,48948,1108,Nous vivons cet après-midi un nouvel épisode d...,S,20200506,5103,VPBR-M,99.0,2020-05-06T15:24:31,2020-05-06T15:28:47,VPBR-M,FR
788,261019,48949,4240,Emotions et pesée d'intérêts: il y a la politi...,S,20200506,5103,Mit-M,2.0,2020-05-06T16:19:19,2020-05-06T16:22:20,Mit-M,FR
789,260946,48949,1108,"Tout d'abord, le Conseil fédéral, il faut bien...",S,20200506,5103,VPBR-M,99.0,2020-05-06T16:22:32,2020-05-06T16:31:04,VPBR-M,FR
790,261022,48951,3879,Je ne vais répéter en français ce qui vient d'...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:40:28,2020-05-06T17:43:59,Mit-M,FR


In [162]:
filtered_transcript[filtered_transcript.Text.str.contains('das')]

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
13,254303,47821,3828,"Avec cette motion, la Commission de l'économie...",N,20191203,5101,BR-M,99.0,2019-12-03T11:14:29,2019-12-03T11:24:35,BR-M,FR
35,254650,47851,1279,"Ce matin, le bureau a malheureusement décidé d...",N,20191205,5101,Mit-M,1.0,2019-12-05T09:58:24,2019-12-05T10:00:12,Mit-M,FR
270,257043,48223,1279,"Rarement, un dossier aura été aussi mal emmanc...",N,20191220,5101,Mit-M,1.0,2019-12-20T09:17:22,2019-12-20T09:23:42,Mit-M,FR
271,254175,47816,1150,"Tout d'abord, je pense que l'on devrait réguli...",S,20191203,5101,Mit-M,2.0,2019-12-03T09:13:08,2019-12-03T09:20:14,*,FR
308,256389,48155,1150,C'est le troisième débat que l'on mène dans ce...,S,20191218,5101,Mit-M,2.0,2019-12-18T09:37:46,2019-12-18T09:53:39,Mit-M,FR
378,257783,48499,1161,Différentes questions sont abordées dans ce bl...,N,20200304,5102,BR-M,99.0,2020-03-04T10:28:51,2020-03-04T10:36:56,BR-M,FR
432,258343,48544,1161,"Madame la présidente, je réponds de manière gr...",N,20200309,5102,BR-M,99.0,2020-03-09T14:49:27,2020-03-09T15:02:32,BR-M,FR
531,259493,48756,3828,Plusieurs interventions parlementaires ont été...,N,20200312,5102,BR-M,99.0,2020-03-12T11:04:19,2020-03-12T11:11:43,BR-M,FR
534,259495,48761,3828,Permettez-moi tout d'abord de résumer brièveme...,N,20200312,5102,BR-M,99.0,2020-03-12T11:57:56,2020-03-12T12:03:09,BR-M,FR
554,258249,48527,4240,"Conformément aux décisions de la commission, i...",S,20200305,5102,Mit-M,2.0,2020-03-05T09:03:05,2020-03-05T09:12:56,*,FR


In [188]:
nlp_sent = spacy.load('xx_sent_ud_sm')
nlp = spacy.load("fr_core_news_md")
nlp.add_pipe("language_detector")
lemmatizer = nlp.get_pipe("lemmatizer")

additional_stopwords = {'-t', 'avez', 'être', 'aujourd', 'hui'}
specific_stopwords = {
    'alinéa', 'article', 'cas', 'commission', 'conseil', 'droit', 'fédéral', 'groupe' 'loi', 'majorité', 'minorité', 
    'motion', 'parlementaire', 'postulat', 'politique', 'proposition', 'projet', 'question', 'suisse',
}
removed_stopwords = {'hui'}

# add stopwords 
nlp.Defaults.stop_words |= additional_stopwords
nlp.Defaults.stop_words |= specific_stopwords

# remove stopwords
#nlp.Defaults.stop_words -= removed_stopwords

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

529
{'da', 'quelques', 'antérieure', 'celui-ci', 'tend', 'très', 'avaient', 'chez', 'déja', 'tel', 'j’', 'elle', 'cinquantième', 'semblable', 'neanmoins', 'suffit', 'tant', 'dès', 'était', 'auraient', 'où', 'treize', 'au', 'sera', 'jusque', 'y', 'suivante', 'faisaient', 'rendre', 'suffisante', 'avez', 'reste', 'personne', 'quoique', 'moi-même', 'concernant', 'dire', 'deuxièmement', 'les', 'eux', 'douzième', 'es', 'suivant', "d'", 'près', 'avec', 'egalement', 'celle-ci', 'moi', 'nouveau', 'certaines', 'importe', 'celle-là', 'depuis', 'pendant', 'attendu', 'votre', 'ha', 'leur', 'déjà', 'cinquième', 'lui-meme', 'projet', 'souvent', 'quatorze', 'abord', 'lès', 'selon', 'parlementaire', 'specifique', 'également', 'douze', 'malgré', 'environ', 'avons', 'duquel', 'quiconque', 'afin', 'quarante', 'specifiques', 'son', 'plutôt', 'touchant', 'auront', 'aie', 'tellement', 'nombreuses', 'uns', 'après', 'ah', 'cinquante', 'être', 'cent', 'l’', 'autre', 'compris', 'seules', 'pourquoi', 'toute', 'et



In [250]:
example_row = 130
text_id = filtered_transcript.iloc[example_row]['ID']
text = filtered_transcript.iloc[example_row]['Text']

print(text_id)
print(text)
print('---')

entity_dict = {}

for ent in nlp(text).ents:
    entity_dict[ent.text] = ent.label_
    
print(entity_dict)
print('---')

lemma_list = [
    token.lemma_ for sent in nlp_sent(text).sents 
    if (doc := nlp(sent.text))._.language == 'fr' 
    for token in doc 
    if not any([
        token.is_stop, 
        token.is_punct, 
        token.is_space, 
        token.ent_type_, 
        token.is_digit, 
        token.lemma_ in nlp.Defaults.stop_words, 
        token.pos_ != 'NOUN',
    ])
]

#lemma_list.sort()
print(lemma_list)

255445
Monsieur Maitre, cela prendrait un peu de temps, et Mme la présidente ne serait certainement pas contente, si on énumérait les résultats des études menées, par exemple, en Espagne. Mais ce qu'elles montrent, c'est en effet une consommation mieux maîtrisée. On ne peut pas constater à ce stade de véritable augmentation ou diminution de la consommation, mais on constate une maîtrise plus importante de la part des consommateurs, et un sentiment de sécurité plus important par rapport à la qualité de ce qu'ils consomment, à la quantité consommée et à la quantité de THC qui est inhalée. Je pense que c'est déjà, d'un point de vue de santé publique, très important.

---
{'Monsieur Maitre': 'PER', 'Mme': 'PER', 'Espagne': 'LOC'}
---
['temps', 'président', 'résultat', 'étude', 'exemple', 'consommation', 'stade', 'augmentation', 'diminution', 'consommation', 'maîtrise', 'part', 'consommateur', 'sentiment', 'sécurité', 'rapport', 'qualité', 'consomment', 'quantité', 'quantité', 'point', 'vue

In [190]:
# remove stopwords, punctuation and then lemmatize
tqdm.pandas()
filtered_transcript['text_lemmatized'] = filtered_transcript['Text'].progress_apply(
    lambda x: ' '.join([
        token.lemma_ for sent in nlp_sent(x).sents 
        if (doc := nlp(sent.text))._.language == 'fr' 
        for token in doc 
        if not any([
            token.is_stop, 
            token.is_punct, 
            token.is_space, 
            token.ent_type_, 
            token.is_digit, 
            token.lemma_ in nlp.Defaults.stop_words, 
            token.pos_ != 'NOUN',
        ])
    ])
)
# Take a look at the data
filtered_transcript

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 792/792 [01:49<00:00,  7.22it/s]


Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_lemmatized
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,rapport élection rapport bureau constitution i...
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,décision rapport bureau disposition principe b...
2,254046,47810,4156,"Lors de la session d'été 2019, notre conseil a...",N,20191202,5101,Mit-M,1.0,2019-12-02T17:47:43,2019-12-02T17:49:26,*,FR,session été voix matière initiative session au...
3,254155,47813,4154,Nous parlons ici d'une révision totale de la l...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:24:17,2019-12-03T08:29:34,*,FR,révision protection population protection juin...
4,254159,47817,3872,Lors de ses séances des 18 et 19 février et de...,N,20191203,5101,Mit-M,1.0,2019-12-03T08:34:23,2019-12-03T08:38:30,*,FR,séance février juin aménagement territoire éne...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,260916,48948,1108,Nous vivons cet après-midi un nouvel épisode d...,S,20200506,5103,VPBR-M,99.0,2020-05-06T15:24:31,2020-05-06T15:28:47,VPBR-M,FR,après-midi épisode saga loyer situation plan m...
788,261019,48949,4240,Emotions et pesée d'intérêts: il y a la politi...,S,20200506,5103,Mit-M,2.0,2020-05-06T16:19:19,2020-05-06T16:22:20,Mit-M,FR,emotion pesée intérêt éthique morale réalité t...
789,260946,48949,1108,"Tout d'abord, le Conseil fédéral, il faut bien...",S,20200506,5103,VPBR-M,99.0,2020-05-06T16:22:32,2020-05-06T16:31:04,VPBR-M,FR,réglementation trouble ordre situation urgence...
790,261022,48951,3879,Je ne vais répéter en français ce qui vient d'...,S,20200506,5103,Mit-M,2.0,2020-05-06T17:40:28,2020-05-06T17:43:59,Mit-M,FR,collègue mission argent habitude cogestion rep...


### Subgroups

In [191]:
with open('data/persons.csv', encoding='utf-8') as file:
    persons_df = pd.read_csv(file).drop(columns='Unnamed: 0')[['PersonNumber', 'LastName', 'FirstName', 'GenderAsString', 'PartyAbbreviation']]
    
persons_df

Unnamed: 0,PersonNumber,LastName,FirstName,GenderAsString,PartyAbbreviation
0,9,Baumann,Ruedi,m,VERT-E-S
1,12,Beerli,Christine,f,PLR
2,14,Bezzola,Duri,m,PLR
3,15,Binder,Max,m,UDC
4,21,Blocher,Christoph,m,UDC
...,...,...,...,...,...
705,4329,Ruch,Daniel,m,PLR
706,4330,Berthoud,Alexandre,m,PLR
707,4331,Jost,Marc,m,PEV
708,4332,Crevoisier Crelier,Mathilde,f,PSS


In [192]:
transcript_by_person = filtered_transcript.merge(persons_df, on='PersonNumber')
transcript_by_person

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText,text_lemmatized,LastName,FirstName,GenderAsString,PartyAbbreviation
0,254005,47804,4109,Vous avez reçu le rapport du Conseil fédéral s...,N,20191202,5101,Mit-F,1.0,2019-12-02T14:57:46,2019-12-02T14:59:13,B,FR,rapport élection rapport bureau constitution i...,Piller Carrard,Valérie,f,PSS
1,254007,47804,4109,Pour adopter les décisions et formuler les pro...,N,20191202,5101,Mit-F,1.0,2019-12-02T15:11:40,2019-12-02T15:13:17,B,FR,décision rapport bureau disposition principe b...,Piller Carrard,Valérie,f,PSS
2,254289,47824,4109,"Monsieur Nantermod, vous avez cité des exemple...",N,20191203,5101,Mit-F,1.0,2019-12-03T12:21:48,2019-12-03T12:22:21,Mit-F,FR,Monsieur cité exemple intervention région pédi...,Piller Carrard,Valérie,f,PSS
3,254291,47826,4109,"L'initiative parlementaire 19.432, ""Garantie d...",N,20191203,5101,Mit-F,1.0,2019-12-03T12:29:12,2019-12-03T12:32:47,*,FR,initiative soutien membre rattachement collabo...,Piller Carrard,Valérie,f,PSS
4,255101,47955,4109,"Madame la conseillère fédérale, en ce qui conc...",N,20191209,5101,Mit-F,1.0,2019-12-09T17:49:20,2019-12-09T17:49:51,Mit-F,FR,conseiller hypothèse avion combat région atten...,Piller Carrard,Valérie,f,PSS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,261009,48946,4276,Les restrictions et recommandations politiques...,N,20200506,5103,Mit-M,1.0,2020-05-06T14:43:42,2020-05-06T14:47:18,Mit-M,FR,restriction recommandation effondrement demand...,Matter,Michel,m,pvl
788,260749,48914,4057,"Monsieur le conseiller fédéral, vous avez reno...",N,20200505,5103,Mit-M,1.0,2020-05-05T12:42:36,2020-05-05T12:43:16,Mit-M,FR,Monsieur conseiller appel party citation courr...,Wermuth,Cédric,m,PSS
789,260898,48921,4301,"Monsieur le conseiller fédéral, j'ai une quest...",N,20200505,5103,Mit-F,1.0,2020-05-05T20:00:31,2020-05-05T20:01:22,Mit-F,FR,Monsieur conseiller reprise plupart étude part...,Prezioso Batou,Stefania,f,EàG
790,260962,48927,4307,"Le dépôt de la motion 20.3163, ""Soutien aux en...",N,20200505,5103,Mit-F,1.0,2020-05-05T22:01:07,2020-05-05T22:05:43,*,FR,dépôt entreprise formateur opposition avril so...,Python,Valentine,f,VERT-E-S


In [193]:
transcript_by_party = transcript_by_person.groupby('PartyAbbreviation').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_party.index.name = None
transcript_by_party

Unnamed: 0,text_lemmatized
EàG,Monsieur conseiller reprise plupart étude part...
M-E,Monsieur région montagne général canton constr...
PLR,séance février juin aménagement territoire éne...
PSS,rapport élection rapport bureau constitution i...
PdT,reprise août novembre divergence décision cont...
UDC,session été voix matière initiative session au...
VERT-E-S,budget collègue avenir réflexion terme budget ...
pvl,fois budget excédent année compte milliard fra...


In [194]:
transcript_by_gender = transcript_by_person.groupby('GenderAsString').agg({'text_lemmatized': lambda x: ' '.join(x)})
transcript_by_gender.index.name = None
transcript_by_gender

Unnamed: 0,text_lemmatized
f,rapport élection rapport bureau constitution i...
m,session été voix matière initiative session au...


### Most frequent words

In [237]:
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(filtered_transcript['text_lemmatized'].to_list())
count_array = count_matrix.toarray()
count_df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
count_df

Unnamed: 0,00,000,001,0122,02,032,043,069,071,073,...,événement,événementiel,être,île,îlot,ôter,öffentlichen,über,übergangsphase,übrighaben
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
549,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
551,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [255]:
count_df.sort_values(axis='columns', by=example_row, ascending=False).iloc[example_row-5:example_row+5]

Unnamed: 0,question,politique,faire,volonté,ce,ici,for,appeler,grève,fois,...,dévastateur,détériorer,détérioration,détruire,détriment,détresse,détourner,détournement,détonateur,übrighaben
195,1,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200,4,3,3,2,2,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
201,0,0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
202,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203,0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
204,0,0,6,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF 

In [225]:
group_transcript = transcript_by_gender

In [226]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(group_transcript['text_lemmatized'].to_list())
len(vectorizer.get_feature_names_out())

4035

In [227]:
tfidf_df = pd.DataFrame(vectors.toarray(), index=group_transcript.index, columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,00,0202,0210,0219,0329,0332,083,104a,10a,11,...,éventualité,évidence,évocation,évolution,évènement,événement,événementiel,être,île,îlot
f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.007545,0.002121,0.01509,0.0,0.013581,0.0,0.003018,0.0,0.007545
m,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000926,...,0.003243,0.00857,0.0,0.037245,0.002316,0.005274,0.000463,0.003626,0.000463,0.002637


In [228]:
tfidf_df.sort_values(axis='columns', by=tfidf_df.iloc[1].name, ascending=False)

Unnamed: 0,franc,mesure,entreprise,pays,prestation,initiative,domaine,million,situation,année,...,minceur,captif,cardiologie,migratoire,carreau,profonde,mercenariat,démocrate,signataire,glacier
f,0.090543,0.182594,0.244465,0.215793,0.082997,0.262573,0.076961,0.051307,0.120723,0.11016,...,0.002121,0.002121,0.002121,0.002121,0.002121,0.002121,0.002121,0.006363,0.004242,0.002121
m,0.238301,0.233027,0.168755,0.167766,0.161174,0.157549,0.152275,0.150298,0.149638,0.148979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [229]:
tfidf_score_df = tfidf_df.stack().reset_index()
tfidf_score_df = tfidf_score_df.rename(columns={'ID': 'transcript','level_1': 'term', 0:'score', 'level_0': 'idx'})
tfidf_score_df

Unnamed: 0,idx,term,score
0,f,00,0.000000
1,f,0202,0.000000
2,f,0210,0.000000
3,f,0219,0.000000
4,f,0329,0.000000
...,...,...,...
8065,m,événement,0.005274
8066,m,événementiel,0.000463
8067,m,être,0.003626
8068,m,île,0.000463


In [230]:
tfidf_score_df[tfidf_score_df['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
349,f,assurance,0.081488
4384,m,assurance,0.094595


In [231]:
top_tfidf = tfidf_score_df.sort_values(by=['idx','score'], ascending=[True,False]).groupby(['idx']).head(100)
top_tfidf

Unnamed: 0,idx,term,score
1976,f,initiative,0.262573
1416,f,entreprise,0.244465
2664,f,pays,0.215793
2299,f,mesure,0.182594
3721,f,travail,0.167504
...,...,...,...
7509,m,soutien,0.047133
4123,m,abstention,0.046803
5626,m,financement,0.046803
4408,m,augmentation,0.046474


In [232]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
349,f,assurance,0.081488
4384,m,assurance,0.094595


In [233]:
top_tfidf = top_tfidf.loc[top_tfidf['idx'] != 'PLD']
top_tfidf = top_tfidf.loc[top_tfidf['idx'] != '-']
top_tfidf

Unnamed: 0,idx,term,score
1976,f,initiative,0.262573
1416,f,entreprise,0.244465
2664,f,pays,0.215793
2299,f,mesure,0.182594
3721,f,travail,0.167504
...,...,...,...
7509,m,soutien,0.047133
4123,m,abstention,0.046803
5626,m,financement,0.046803
4408,m,augmentation,0.046474


In [234]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
349,f,assurance,0.081488
4384,m,assurance,0.094595


In [235]:
#top_tfidf = top_tfidf.sort_values(by=['score']).drop_duplicates(subset=['term'], keep='first')
#top_tfidf = top_tfidf.sort_values(by=['idx', 'score'])
top_tfidf

Unnamed: 0,idx,term,score
1976,f,initiative,0.262573
1416,f,entreprise,0.244465
2664,f,pays,0.215793
2299,f,mesure,0.182594
3721,f,travail,0.167504
...,...,...,...
7509,m,soutien,0.047133
4123,m,abstention,0.046803
5626,m,financement,0.046803
4408,m,augmentation,0.046474


In [236]:
top_tfidf[top_tfidf['term'].str.contains("assurance")]

Unnamed: 0,idx,term,score
349,f,assurance,0.081488
4384,m,assurance,0.094595


In [237]:
top_10_tfidf = top_tfidf.groupby('idx').head(15)
top_10_tfidf

Unnamed: 0,idx,term,score
1976,f,initiative,0.262573
1416,f,entreprise,0.244465
2664,f,pays,0.215793
2299,f,mesure,0.182594
3721,f,travail,0.167504
2881,f,projet,0.140341
936,f,contre,0.138832
3036,f,raison,0.125251
3872,f,voix,0.123741
3048,f,rapport,0.122232


In [238]:
list(top_tfidf[top_tfidf.duplicated(subset=['term'])].sort_values(by=['term'])['term'])

['abstention',
 'accord',
 'activité',
 'affaire',
 'aide',
 'an',
 'année',
 'assurance',
 'besoin',
 'cadre',
 'canton',
 'compte',
 'condition',
 'conseiller',
 'conséquence',
 'contre',
 'crise',
 'demande',
 'discussion',
 'disposition',
 'domaine',
 'débat',
 'décision',
 'enfant',
 'entreprise',
 'exemple',
 'fois',
 'franc',
 'groupe',
 'initiative',
 'lieu',
 'manière',
 'marché',
 'matière',
 'mesure',
 'million',
 'modification',
 'moyen',
 'niveau',
 'nombre',
 'objectif',
 'objet',
 'part',
 'partie',
 'pays',
 'place',
 'point',
 'population',
 'position',
 'prestation',
 'principe',
 'prix',
 'problème',
 'projet',
 'protection',
 'raison',
 'rapport',
 'risque',
 'santé',
 'secteur',
 'sens',
 'service',
 'situation',
 'solution',
 'soutien',
 'suite',
 'système',
 'sécurité',
 'temps',
 'terme',
 'travail',
 'voix',
 'économie',
 'étranger']

In [239]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_10_tfidf.copy()
top_tfidf_plusRand['score'] = top_tfidf_plusRand['score'] + np.random.rand(top_10_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'idx:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("score", order="descending")],
    groupby = ["idx"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'score:Q'
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + text).properties(width = 1000)

### BERTopic

In [240]:
# Initiate UMAP
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=100,
)

# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french')

# Run BERTopic model
docs = filtered_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs)

In [241]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,299,-1_mesure_domaine_initiative_entreprise
1,0,69,0_prestation_loyer_an_solution
2,1,61,1_franc_million_budget_crédit
3,2,55,2_chômage_travail_an_assurance
4,3,37,3_délégation_surveillance_rapport_vote
5,4,37,4_animal_chat_lait_zone
6,5,34,5_guerre_matériel_arme_accord
7,6,32,6_crise_virus_mesure_situation
8,7,25,7_canton_recours_assureur_compétence
9,8,23,8_avion_aviation_combat_vol


In [242]:
topic_model.get_topic(0)

[('prestation', 0.038223329128215036),
 ('loyer', 0.0287337414921464),
 ('an', 0.02163601613006281),
 ('solution', 0.020814919543610425),
 ('plafond', 0.02058426156411089),
 ('frontière', 0.02003625681522701),
 ('situation', 0.01985418417556765),
 ('locataire', 0.019673520046564003),
 ('raison', 0.018763156723561113),
 ('société', 0.01790633638524981)]

In [249]:
topic_model.get_document_info(docs).iloc[130]

Document                   temps président résultat étude exemple consomm...
Topic                                                                      9
Name                                        9_prix_produit_marché_entreprise
Top_n_words                prix - produit - marché - entreprise - brevet ...
Probability                                                         0.542781
Representative_document                                                False
Name: 130, dtype: object

### LDA gensim

In [231]:
filtered_transcript['text_lemm_list'] = filtered_transcript['text_lemmatized'].apply(lambda x: x.split())
filtered_transcript

Unnamed: 0,ID,Text,text_lemmatized,text_lemm_list
5,191151,Vous avez reçu le rapport du Conseil fédéral s...,recevoir rapport élection national rapport bur...,"[recevoir, rapport, élection, national, rappor..."
9,191158,Pour adopter les décisions et formuler les pro...,adopter décision formuler proposition figurer ...,"[adopter, décision, formuler, proposition, fig..."
18,191267,Vous vous souvenez que le groupe UDC recommand...,souvenir groupe UDC recommander entrer matière...,"[souvenir, groupe, UDC, recommander, entrer, m..."
21,191199,"Dans ce dossier, nous sommes maintenant à bout...",dossier bout nouvellement élu possibilité marq...,"[dossier, bout, nouvellement, élu, possibilité..."
28,191226,On peut rester relativement calme sur ce sujet...,rester calme sujet contrairement passer dernie...,"[rester, calme, sujet, contrairement, passer, ..."
...,...,...,...,...
1426,193850,"Madame Bruderer Wyss, je peux en effet confirm...",monsieur Bruderer Wyss confirmer figurer répon...,"[monsieur, Bruderer, Wyss, confirmer, figurer,..."
1428,193899,Le but de mon interpellation n'est pas d'ajout...,but interpellation ajouter ligne liste long in...,"[but, interpellation, ajouter, ligne, liste, l..."
1430,193929,"En préambule, j'aimerais saluer l'interpellati...",préambule aimer saluer interpellation collègue...,"[préambule, aimer, saluer, interpellation, col..."
1432,193855,L'apprentissage des langues est une question q...,apprentissage langue question revenir régulièr...,"[apprentissage, langue, question, revenir, rég..."


In [237]:
docs = filtered_transcript['text_lemm_list'].to_list()

dictionary = corpora.Dictionary(docs)

DT_matrix = [dictionary.doc2bow(doc) for doc in docs]

Lda_object = gensim.models.ldamodel.LdaModel

In [238]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word = dictionary)

lda_model_1.print_topics(num_topics=10, num_words=5)

[(0,
  '0.008*"proposition" + 0.007*"franc" + 0.006*"million" + 0.006*"budget" + 0.005*"agir"'),
 (1,
  '0.007*"etat" + 0.006*"franc" + 0.005*"million" + 0.004*"année" + 0.004*"contre"'),
 (2,
  '0.008*"initiative" + 0.007*"franc" + 0.006*"agir" + 0.006*"contre" + 0.005*"question"'),
 (3,
  '0.008*"initiative" + 0.006*"canton" + 0.006*"franc" + 0.005*"proposer" + 0.005*"falloir"'),
 (4,
  '0.009*"initiative" + 0.006*"question" + 0.005*"falloir" + 0.005*"franc" + 0.005*"canton"'),
 (5,
  '0.011*"franc" + 0.010*"million" + 0.007*"budget" + 0.007*"canton" + 0.006*"proposition"'),
 (6,
  '0.007*"initiative" + 0.005*"franc" + 0.005*"rente" + 0.004*"question" + 0.004*"etat"'),
 (7,
  '0.008*"canton" + 0.005*"faire" + 0.005*"etat" + 0.005*"proposer" + 0.004*"proposition"'),
 (8,
  '0.006*"faire" + 0.005*"falloir" + 0.005*"question" + 0.005*"initiative" + 0.005*"franc"'),
 (9,
  '0.006*"faire" + 0.006*"question" + 0.005*"non" + 0.005*"initiative" + 0.004*"agir"')]

### top2vec

In [199]:
docs = filtered_transcript["text_lemmatized"].tolist()
len(docs)

324

In [196]:
topic_model = Top2Vec(
    docs,
    #embedding_model="universal-sentence-encoder-multilingual",
    speed="deep-learn",
)

2023-05-12 16:25:30,943 - top2vec - INFO - Pre-processing documents for training
2023-05-12 16:25:31,080 - top2vec - INFO - Creating joint document/word embedding
2023-05-12 16:25:38,074 - top2vec - INFO - Creating lower dimension embedding of documents
2023-05-12 16:25:39,207 - top2vec - INFO - Finding dense areas of documents
2023-05-12 16:25:39,213 - top2vec - INFO - Finding topics


In [201]:
model.get_topics()

{0: [('de', 0.13273781851509409),
  ('la', 0.10522084530715155),
  ('le', 0.07682557991849537),
  ('des', 0.06496992795926745),
  ('et', 0.06409332138178707),
  ('les', 0.061870529632359866),
  ('en', 0.055698036565708625),
  ('que', 0.055082912584038426),
  ('est', 0.05186673777975805),
  ('une', 0.042562060578284655)],
 1: [('de', 0.13792948109435965),
  ('la', 0.07491411465697119),
  ('et', 0.06830473202530185),
  ('les', 0.06791204410365591),
  ('le', 0.06573522710988462),
  ('pour', 0.06565243157193269),
  ('avs', 0.06497054756109226),
  ('des', 0.0634781534130419),
  ('est', 0.05652146656288921),
  ('une', 0.05409859315631972)]}