In [42]:
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords

from gensim.models import LdaMulticore, TfidfModel, CoherenceModel
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases

import time # to know how long training took
import multiprocessing # to speed things up by parallelizing

In [16]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    return ' '.join(tokens)

df = pd.read_csv('./csv_chunks_en_filtered.csv')

df['preprocess'] = df['chunk'].apply(preprocess)

df

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sylcherry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ID_file,leg,class,language,surname,name,year_birth,gender,group,position,length,chunk,preprocess
0,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,2,609,"Yes, quickly, the questions look at the situat...",yes quickly question look situation regard pro...
1,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Testolin,Renzo,1968,M,UV,4,3873,"First of all, I would like to thank the Counci...",first would like thank councillor give we oppo...
2,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,6,557,"Thank you Mr President Testolin, today you sur...",thank mr president testolin today surprise wel...
3,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Lavy,Erik,1995,M,LEGA VDA,2,5261,We have already addressed this issue in the la...,already address issue last council fact bring ...
4,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Bertschy,Luigi Giovanni,1965,M,UV,4,4884,"Thank you, colleague Lavy, also for how you ha...",thank colleague lavy also illustrate initiativ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,48017,XVI,CREDITO,en,Lavy,Erik,1995,M,LEGA VDA,6,2952,"President Testolin, I am quite shocked, also i...",president testolin quite shocked also light di...
224,48017,XVI,CREDITO,en,Testolin,Renzo,1968,M,UV,8,103,Just to say that populism is allowed in reruns...,say populism allow rerun make peace brain home
225,48018,XVI,"ENTI LOCALI, Comuni",en,Brunod,Dennis,1978,M,RV,2,1801,"At the end of October 2024, we learned from pr...",end october 2024 learn press information obser...
226,48018,XVI,"ENTI LOCALI, Comuni",en,Testolin,Renzo,1968,M,UV,4,2187,English: Allow me to start off with a bit of a...,english allow start bit joke think council off...


In [61]:
instances = df['preprocess'].apply(str.split).tolist()
print("creating dictionary", flush=True)
phrases = Phrases(instances, min_count=5, threshold=1)
instances_colloc = phrases[instances]

dictionary = Dictionary(instances_colloc)
dictionary.filter_extremes(no_below=5, no_above=0.2)

#replace words by their numerical IDs and their frequency
print("translating corpus to IDs", flush=True)
ldacorpus = [dictionary.doc2bow(text) for text in instances]
# learn TFIDF values from corpus
print("tf-idf transformation", flush=True)
tfidfmodel = TfidfModel(ldacorpus)
# transform raw frequencies into TFIDF
model_corpus = tfidfmodel[ldacorpus]

num_topics = 6

# find chunksize to make about 200 updates
num_passes = 10
chunk_size = len(model_corpus) * num_passes/200
print(chunk_size)

start = time.time()
print("fitting model", flush=True)
model = LdaMulticore(num_topics=num_topics, # number of topics
                     corpus=model_corpus, # what to train on
                     id2word=dictionary, # mapping from IDs to words
                     workers=min(10, multiprocessing.cpu_count()-1), # choose 10 cores, or whatever computer has
                     passes=num_passes, # make this many passes over data
                     chunksize=chunk_size, # update after this many instances
                     alpha=0.5,
                     random_state=42
                    )

print("done in {}".format(time.time()-start), flush=True)

topic_corpus = model[model_corpus]

model.print_topics()

topic_sep = re.compile("0\.[0-9]{3}\*")
model_topics = [(topic_no, re.sub(topic_sep, '', model_topic).split(' + '))
                for topic_no, model_topic in
                model.print_topics(num_topics=num_topics, num_words=5)]

descriptors = []
for i, m in model_topics:
    print(i+1, ", ".join(m[:5]))
    descriptors.append(", ".join(m[:2]).replace('"', ''))

df

creating dictionary


translating corpus to IDs
tf-idf transformation
11.4
fitting model
done in 0.7828454971313477
1 "satisfied", "page", "vote", "briefly", "source"
2 "join", "heating", "withdraw", "gender", "arbitrary"
3 "broadcast", "price", "break", "seriously", "senator"
4 "mountain", "municipality", "health", "president", "territory"
5 "ministry", "pleased", "group", "president", "hear"
6 "mountain", "correspondence", "underline", "intervene", "home"


Unnamed: 0,ID_file,leg,class,language,surname,name,year_birth,gender,group,position,length,chunk,preprocess,row_topics,dominant_topic
0,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,2,609,"Yes, quickly, the questions look at the situat...",yes quickly question look situation regard pro...,"[0.024*""plan"" + 0.024*""palafent"" + 0.024*""proj...",0
1,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Testolin,Renzo,1968,M,UV,4,3873,"First of all, I would like to thank the Counci...",first would like thank councillor give we oppo...,"[0.005*""possibly"" + 0.005*""popular"" + 0.005*""p...",0
2,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,6,557,"Thank you Mr President Testolin, today you sur...",thank mr president testolin today surprise wel...,"[0.024*""serious"" + 0.024*""satisfied"" + 0.024*""...",0
3,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Lavy,Erik,1995,M,LEGA VDA,2,5261,We have already addressed this issue in the la...,already address issue last council fact bring ...,"[0.005*""political"" + 0.005*""personally"" + 0.00...",0
4,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Bertschy,Luigi Giovanni,1965,M,UV,4,4884,"Thank you, colleague Lavy, also for how you ha...",thank colleague lavy also illustrate initiativ...,"[0.005*""preclude"" + 0.005*""point"" + 0.005*""pro...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,48017,XVI,CREDITO,en,Lavy,Erik,1995,M,LEGA VDA,6,2952,"President Testolin, I am quite shocked, also i...",president testolin quite shocked also light di...,"[0.007*""perhaps"" + 0.007*""people"" + 0.007*""pla...",0
224,48017,XVI,CREDITO,en,Testolin,Renzo,1968,M,UV,8,103,Just to say that populism is allowed in reruns...,say populism allow rerun make peace brain home,"[0.125*""populism"" + 0.125*""make"" + 0.125*""reru...",0
225,48018,XVI,"ENTI LOCALI, Comuni",en,Brunod,Dennis,1978,M,RV,2,1801,"At the end of October 2024, we learned from pr...",end october 2024 learn press information obser...,"[0.009*""public"" + 0.009*""quality"" + 0.009*""pre...",0
226,48018,XVI,"ENTI LOCALI, Comuni",en,Testolin,Renzo,1968,M,UV,4,2187,English: Allow me to start off with a bit of a...,english allow start bit joke think council off...,"[0.008*""questioner"" + 0.008*""question"" + 0.008...",0


In [None]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# Supponiamo che 'df' sia il tuo DataFrame
topics_per_row = []

# Itera su ogni riga del DataFrame
for idx, row in df.iterrows():
    # Prendi la riga e creala come una lista di parole (token)
    text = str(row['preprocess']).split()

    # Crea un dizionario per la riga
    dictionary = Dictionary([text])
    
    # Crea una rappresentazione BOW (bag of words) per la riga
    bow = dictionary.doc2bow(text)

    # Trasformazione tf-idf (opzionale ma consigliato per pesare le parole)
    tfidf_model = TfidfModel([bow])
    tfidf_bow = tfidf_model[bow]

    # Allena un modello LDA sulla singola riga
    lda_model = LdaModel([tfidf_bow], num_topics=3, id2word=dictionary, passes=10, random_state=42)

    # Estrai i 6 topic (ogni topic è una stringa)
    topics = lda_model.print_topics(num_topics=3, num_words=5)
    
    # Memorizza i topic per ogni riga
    row_topics = [topic[1] for topic in topics]  # Prendi solo la stringa del topic
    topics_per_row.append(row_topics)

# Aggiungi i topic come nuove colonne al DataFrame
df['row_topics'] = topics_per_row

# Se vuoi vedere anche i topic dominanti per ogni riga
df['dominant_topic'] = [max(row, key=lambda x: x[1])[0] for row in topics_per_row]

df

  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Unnamed: 0,ID_file,leg,class,language,surname,name,year_birth,gender,group,position,length,chunk,preprocess,row_topics,dominant_topic
0,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,2,609,"Yes, quickly, the questions look at the situat...",yes quickly question look situation regard pro...,"[0.024*""plan"" + 0.024*""palafent"" + 0.024*""proj...",0
1,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Testolin,Renzo,1968,M,UV,4,3873,"First of all, I would like to thank the Counci...",first would like thank councillor give we oppo...,"[0.005*""possibly"" + 0.005*""popular"" + 0.005*""p...",0
2,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,6,557,"Thank you Mr President Testolin, today you sur...",thank mr president testolin today surprise wel...,"[0.024*""serious"" + 0.024*""satisfied"" + 0.024*""...",0
3,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Lavy,Erik,1995,M,LEGA VDA,2,5261,We have already addressed this issue in the la...,already address issue last council fact bring ...,"[0.005*""political"" + 0.005*""personally"" + 0.00...",0
4,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Bertschy,Luigi Giovanni,1965,M,UV,4,4884,"Thank you, colleague Lavy, also for how you ha...",thank colleague lavy also illustrate initiativ...,"[0.005*""preclude"" + 0.005*""point"" + 0.005*""pro...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,48017,XVI,CREDITO,en,Lavy,Erik,1995,M,LEGA VDA,6,2952,"President Testolin, I am quite shocked, also i...",president testolin quite shocked also light di...,"[0.007*""perhaps"" + 0.007*""people"" + 0.007*""pla...",0
224,48017,XVI,CREDITO,en,Testolin,Renzo,1968,M,UV,8,103,Just to say that populism is allowed in reruns...,say populism allow rerun make peace brain home,"[0.125*""populism"" + 0.125*""make"" + 0.125*""reru...",0
225,48018,XVI,"ENTI LOCALI, Comuni",en,Brunod,Dennis,1978,M,RV,2,1801,"At the end of October 2024, we learned from pr...",end october 2024 learn press information obser...,"[0.009*""public"" + 0.009*""quality"" + 0.009*""pre...",0
226,48018,XVI,"ENTI LOCALI, Comuni",en,Testolin,Renzo,1968,M,UV,4,2187,English: Allow me to start off with a bit of a...,english allow start bit joke think council off...,"[0.008*""questioner"" + 0.008*""question"" + 0.008...",0
