In [98]:
# Data processing
import pandas as pd
import numpy as np
import ssl
import string
from collections import Counter
from transformers.pipelines import pipeline
import altair as alt
from tqdm import tqdm

# Text preprocessiong
import nltk
import spacy
from top2vec import Top2Vec
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap.umap_ import UMAP

In [14]:
# disable ssl check (to be able to download nltk packages)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [15]:
# download nltk packages

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cyrille/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cyrille/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [92]:
with open('data/transcripts/transcript_5002.csv', encoding='utf-8') as file:
    transcript_df = pd.read_csv(file).drop(columns='Unnamed: 0')
    
transcript_df

Unnamed: 0,ID,IdSubject,PersonNumber,Text,MeetingCouncilAbbreviation,MeetingDate,IdSession,SpeakerFunction,CouncilId,Start,End,Function,LanguageOfText
0,194253,36573,1279,Le Bureau a constaté que l'élection de Madame ...,N,20160229,5002,Mit-M,1.0,2016-02-29T14:35:43,2016-02-29T14:36:56,B,FR
1,194263,36575,4102,Das Bundesgesetz über die Ladenöffnungszeiten ...,N,20160229,5002,Mit-F,1.0,2016-02-29T14:39:25,2016-02-29T14:47:57,*,DE
2,194260,36575,1119,Nous traitons la loi sur les heures d'ouvertur...,N,20160229,5002,Mit-M,1.0,2016-02-29T14:47:57,2016-02-29T14:50:48,*,FR
3,194256,36575,4115,"Dans le canton de Vaud, mon canton, ce n'est p...",N,20160229,5002,Mit-M,1.0,2016-02-29T14:50:48,2016-02-29T14:51:15,Mit-M,FR
4,194258,36575,1119,Je crois que nous sommes aujourd'hui confronté...,N,20160229,5002,Mit-M,1.0,2016-02-29T14:51:15,2016-02-29T14:51:42,*,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1986,197453,36979,3921,Mit dieser Petition verlangt die Aktion der Ch...,S,20160317,5002,Mit-M,2.0,2016-03-17T10:46:17,2016-03-17T10:48:48,*,DE
1987,197456,36979,172,Ich nehme die beabsichtigte Abschreibung der P...,S,20160317,5002,Mit-M,2.0,2016-03-17T10:48:48,2016-03-17T10:54:27,Mit-M,DE
1988,197438,36983,321,"Es ist schon so und üblich, dass solche Petiti...",S,20160317,5002,Mit-M,2.0,2016-03-17T10:55:34,2016-03-17T11:02:45,*,DE
1989,197861,37004,4112,Nach Artikel 3 Absatz 3 der Verordnung der Bun...,S,20160318,5002,Mit-M,2.0,2016-03-18T08:18:02,2016-03-18T08:20:41,*,DE


In [93]:
filtered_transcript = transcript_df.loc[transcript_df['LanguageOfText'] == 'FR'][['ID', 'Text']]
filtered_transcript

Unnamed: 0,ID,Text
0,194253,Le Bureau a constaté que l'élection de Madame ...
2,194260,Nous traitons la loi sur les heures d'ouvertur...
3,194256,"Dans le canton de Vaud, mon canton, ce n'est p..."
4,194258,Je crois que nous sommes aujourd'hui confronté...
7,194268,Nous sommes en train de nous prononcer sur le ...
...,...,...
1974,197349,C'est le dernier rapport que j'ai à vous prése...
1976,197433,A mon tour de remercier le Conseil fédéral pou...
1977,197360,"Vous savez, en Suisse, le chef d'orchestre, c'..."
1978,197535,Il y a donc quelques progrès. Il y a aussi des...


In [115]:
nlp = spacy.load("fr_core_news_sm")
lemmatizer = nlp.get_pipe("lemmatizer")
additional_stopwords = {'-t', 'avez', 'conseil', 'commission', 'fédéral', 'suisse', 'être', 'loi', 'projet', 'postulat'}
removed_stopwords = {'hui'}

# add stopwords 
nlp.Defaults.stop_words |= additional_stopwords

# remove stopwords
nlp.Defaults.stop_words -= removed_stopwords

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

515
{'toi-même', 'chez', 'tres', 'ceci', 'moi-meme', 'parler', 'sienne', 'certaines', 'etre', 'par', 'peux', 'semblent', 'diverse', 'avant', 'dix', 'cet', 'différente', 'a', 'quiconque', 'première', 'préalable', 'chacune', 'cinq', 'conseil', "t'", 'car', 'aux', 'premièrement', 'revoici', 'soi-meme', 'voici', 'revoila', 'plutot', 'sur', 'maintenant', 'quand', 'vu', 'toutes', "quelqu'un", 'autres', 'celles', 'dit', 'outre', 'parmi', 'reste', 'suivante', 'donc', 'mes', 'aura', 'quinze', 'la', 'suit', 'quelconque', 'votre', 'miennes', 'tiennes', 'spécifique', 'relative', 'lui-meme', 'l’', 'font', 'peuvent', 'neanmoins', 'desquels', 'proche', 'dix-huit', 'ceux-ci', 'très', 'sauf', 'également', 'certes', 'da', 'lui', 'vé', 'ho', 'semble', 'stop', 'entre', 'comme', 'chaque', "c'", 'septième', 'telles', 'eh', "d'", 'pas', 'vingt', 'siens', 'rendre', 'debout', 'dejà', 'lui-même', 'suivants', 'hi', 'pourrais', 'antérieure', "j'", 'avez', 'etc', 'o', 'specifiques', 'tenant', 'abord', 'ayant', 'ba

In [169]:
example_row = 113
text_id = filtered_transcript.iloc[example_row]['ID']
text = filtered_transcript.iloc[example_row]['Text']

print(text_id)
print(text)
print('---')

entity_dict = {}

for ent in nlp(text).ents:
    entity_dict[ent.text] = ent.label_
    
print(entity_dict)
print('---')

lemma_list = [token.lemma_ for token in nlp(text) if not any([token.is_stop, token.is_punct, token.is_space, token.ent_type_, token.lemma_ in nlp.Defaults.stop_words])]
lemma_list.sort()
print(lemma_list)

195434
Le projet Innosuisse concerne l'Agence suisse pour l'encouragement de l'innovation, qui sera un établissement fédéral de droit public, qui vise à remplacer l'actuelle Commission pour la technologie et l'innovation, connue sous le nom de CTI. C'est pour cela, Monsieur Tuena, qu'on doit élaborer une nouvelle loi.
Ce projet est le bienvenu car de nombreuses critiques sont émises contre la CTI depuis longtemps. Pour résumer, on peut dire que la CTI est considérée trop souvent comme une boîte noire où sévit une bande de petits copains.
Du côté administratif, on peut relever quelques exemples de flops, comme la possibilité de déposer des demandes en ligne avec l'outil "Merlin" en 2010 - cela s'est soldé par un fiasco et le retrait de l'outil. La nouvelle possibilité offerte en 2015 est visiblement de nouveau un fiasco puisque la demande en ligne ne correspond pas à la demande papier. On peut se demander s'il y a un pilote dans l'avion.
Ce projet de loi vise à définir le mode d'organis

In [None]:
# remove stopwords, punctuation and then lemmatize
tqdm.pandas()
filtered_transcript['text_lemmatized'] = filtered_transcript['Text'].progress_apply(
    lambda x: ' '.join([token.lemma_ for token in nlp(x) if not any(
        [token.is_stop, token.is_punct, token.is_space, token.ent_type_, token.lemma_ in nlp.Defaults.stop_words]
    )])
)
# Take a look at the data
filtered_transcript

 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 459/476 [00:31<00:00, 21.70it/s]

### TF-IDF 

In [171]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(filtered_transcript['text_lemmatized'].to_list())
len(vectorizer.get_feature_names_out())

135

In [137]:
tfidf_df = pd.DataFrame(vectors.toarray(), index=filtered_transcript['ID'], columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0_level_0,000,06,07,080,085,09,090,10,100,1000,...,être,île,îlot,österreich,über,überhaupt,überlassen,überschaubarkeit,überzeugend,überzeugung
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
194253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041200,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020929,0.0,0.0,...,0.019328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.028058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
tfidf_df.sort_values(axis='columns', by=tfidf_df.iloc[example_row].name, ascending=False).columns[:10]

Index(['innovation', 'cti', 'chocolat', 'innosuisse', 'prospérité', 'fiasco',
       'recherche', 'demande', 'tâche', 'tuena'],
      dtype='object')

In [121]:
tfidf_df = tfidf_df.stack().reset_index()
tfidf_df = tfidf_df.rename(columns={'ID': 'transcript','level_1': 'term', 0:'score',})
tfidf_df

Unnamed: 0,transcript,term,score
0,194253,000,0.0
1,194253,06,0.0
2,194253,07,0.0
3,194253,080,0.0
4,194253,085,0.0
...,...,...,...
3834175,197393,überhaupt,0.0
3834176,197393,überlassen,0.0
3834177,197393,überschaubarkeit,0.0
3834178,197393,überzeugend,0.0


In [122]:
top_tfidf = tfidf_df.sort_values(by=['transcript','score'], ascending=[True,False]).groupby(['transcript']).head(10)
top_tfidf

Unnamed: 0,transcript,term,score
3445,194253,glauser,0.531630
7553,194253,vaud,0.284722
1448,194253,champvent,0.212652
7825,194253,zufferey,0.212652
4927,194253,no,0.199334
...,...,...,...
2921590,197737,prolonger,0.155425
2918878,197737,explicitement,0.153057
2916947,197737,autoriser,0.136925
2918396,197737,délai,0.132883


In [130]:
top_tfidf[top_tfidf['term'].str.contains("économ")]

Unnamed: 0,transcript,term,score
120650,194347,économie,0.247454
257585,194566,économie,0.156743
281750,194620,économie,0.135085
3286265,195436,économie,0.149083
3407091,196012,économique,0.170575
1320846,196216,économique,0.11712
1336956,196233,économique,0.22689
2287446,196993,économique,0.225265
3697070,197016,économie,0.157479
3809841,197433,économique,0.144997


In [123]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['score'] = top_tfidf_plusRand['score'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'transcript:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("score", order="descending")],
    groupby = ["transcript"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'score:Q'
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + text).properties(width = 1000)

### BERTopic

In [23]:
# Initiate UMAP
umap_model = UMAP(
    n_neighbors=15, 
    n_components=10, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=100,
)

# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, calculate_probabilities=True, language='french')

# Run BERTopic model
docs = filtered_transcript['text_lemmatized'].to_list()
topics, probabilities = topic_model.fit_transform(docs)

In [24]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,34,-1_million_franc_di_contre
1,0,226,0_franc_canton_initiative_budget
2,1,33,1_rente_av_avs_pilier
3,2,31,2_patient_médecin_maladie_cas


In [25]:
topic_model.get_topic(0)

[('franc', 0.02500756352486594),
 ('canton', 0.023069515769689065),
 ('initiative', 0.022595361411483605),
 ('budget', 0.022499083990328513),
 ('question', 0.021669492238124807),
 ('million', 0.021185845276831254),
 ('agir', 0.02065738713971583),
 ('faire', 0.020476128908096167),
 ('proposition', 0.0203601949393429),
 ('proposer', 0.018801284358347114)]

In [26]:
topic_model.get_document_info(docs).iloc[100]

Document                   lire message sujet cas bien perdre temps compl...
Topic                                                                      0
Name                                        0_franc_canton_initiative_budget
Top_n_words                franc - canton - initiative - budget - questio...
Probability                                                         0.401092
Representative_document                                                False
Name: 100, dtype: object

### LDA gensim

In [231]:
filtered_transcript['text_lemm_list'] = filtered_transcript['text_lemmatized'].apply(lambda x: x.split())
filtered_transcript

Unnamed: 0,ID,Text,text_lemmatized,text_lemm_list
5,191151,Vous avez reçu le rapport du Conseil fédéral s...,recevoir rapport élection national rapport bur...,"[recevoir, rapport, élection, national, rappor..."
9,191158,Pour adopter les décisions et formuler les pro...,adopter décision formuler proposition figurer ...,"[adopter, décision, formuler, proposition, fig..."
18,191267,Vous vous souvenez que le groupe UDC recommand...,souvenir groupe UDC recommander entrer matière...,"[souvenir, groupe, UDC, recommander, entrer, m..."
21,191199,"Dans ce dossier, nous sommes maintenant à bout...",dossier bout nouvellement élu possibilité marq...,"[dossier, bout, nouvellement, élu, possibilité..."
28,191226,On peut rester relativement calme sur ce sujet...,rester calme sujet contrairement passer dernie...,"[rester, calme, sujet, contrairement, passer, ..."
...,...,...,...,...
1426,193850,"Madame Bruderer Wyss, je peux en effet confirm...",monsieur Bruderer Wyss confirmer figurer répon...,"[monsieur, Bruderer, Wyss, confirmer, figurer,..."
1428,193899,Le but de mon interpellation n'est pas d'ajout...,but interpellation ajouter ligne liste long in...,"[but, interpellation, ajouter, ligne, liste, l..."
1430,193929,"En préambule, j'aimerais saluer l'interpellati...",préambule aimer saluer interpellation collègue...,"[préambule, aimer, saluer, interpellation, col..."
1432,193855,L'apprentissage des langues est une question q...,apprentissage langue question revenir régulièr...,"[apprentissage, langue, question, revenir, rég..."


In [237]:
docs = filtered_transcript['text_lemm_list'].to_list()

dictionary = corpora.Dictionary(docs)

DT_matrix = [dictionary.doc2bow(doc) for doc in docs]

Lda_object = gensim.models.ldamodel.LdaModel

In [238]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word = dictionary)

lda_model_1.print_topics(num_topics=10, num_words=5)

[(0,
  '0.008*"proposition" + 0.007*"franc" + 0.006*"million" + 0.006*"budget" + 0.005*"agir"'),
 (1,
  '0.007*"etat" + 0.006*"franc" + 0.005*"million" + 0.004*"année" + 0.004*"contre"'),
 (2,
  '0.008*"initiative" + 0.007*"franc" + 0.006*"agir" + 0.006*"contre" + 0.005*"question"'),
 (3,
  '0.008*"initiative" + 0.006*"canton" + 0.006*"franc" + 0.005*"proposer" + 0.005*"falloir"'),
 (4,
  '0.009*"initiative" + 0.006*"question" + 0.005*"falloir" + 0.005*"franc" + 0.005*"canton"'),
 (5,
  '0.011*"franc" + 0.010*"million" + 0.007*"budget" + 0.007*"canton" + 0.006*"proposition"'),
 (6,
  '0.007*"initiative" + 0.005*"franc" + 0.005*"rente" + 0.004*"question" + 0.004*"etat"'),
 (7,
  '0.008*"canton" + 0.005*"faire" + 0.005*"etat" + 0.005*"proposer" + 0.004*"proposition"'),
 (8,
  '0.006*"faire" + 0.005*"falloir" + 0.005*"question" + 0.005*"initiative" + 0.005*"franc"'),
 (9,
  '0.006*"faire" + 0.006*"question" + 0.005*"non" + 0.005*"initiative" + 0.004*"agir"')]

### top2vec

In [199]:
docs = filtered_transcript["text_lemmatized"].tolist()
len(docs)

324

In [196]:
topic_model = Top2Vec(
    docs,
    #embedding_model="universal-sentence-encoder-multilingual",
    speed="deep-learn",
)

2023-05-12 16:25:30,943 - top2vec - INFO - Pre-processing documents for training
2023-05-12 16:25:31,080 - top2vec - INFO - Creating joint document/word embedding
2023-05-12 16:25:38,074 - top2vec - INFO - Creating lower dimension embedding of documents
2023-05-12 16:25:39,207 - top2vec - INFO - Finding dense areas of documents
2023-05-12 16:25:39,213 - top2vec - INFO - Finding topics


In [201]:
model.get_topics()

{0: [('de', 0.13273781851509409),
  ('la', 0.10522084530715155),
  ('le', 0.07682557991849537),
  ('des', 0.06496992795926745),
  ('et', 0.06409332138178707),
  ('les', 0.061870529632359866),
  ('en', 0.055698036565708625),
  ('que', 0.055082912584038426),
  ('est', 0.05186673777975805),
  ('une', 0.042562060578284655)],
 1: [('de', 0.13792948109435965),
  ('la', 0.07491411465697119),
  ('et', 0.06830473202530185),
  ('les', 0.06791204410365591),
  ('le', 0.06573522710988462),
  ('pour', 0.06565243157193269),
  ('avs', 0.06497054756109226),
  ('des', 0.0634781534130419),
  ('est', 0.05652146656288921),
  ('une', 0.05409859315631972)]}