In [1]:
import pandas as pd
import re
import stanza
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load data
file_path = '../data/pre-processed_rs.csv'
minutes_rs = pd.read_csv(file_path)
print(f'There are {len(minutes_rs)} rows in the minutes dataset.')

There are 167392 rows in the minutes dataset.


In [3]:
# Remove speeches with word count less than 50
minutes_rs = minutes_rs[minutes_rs['speech'].apply(lambda x: len(x.split()) >= 50)]
print(f'There are {len(minutes_rs)} rows in the minutes dataset.')

There are 97026 rows in the minutes dataset.


In [None]:
# Procedural phrases to remove
phrases_to_remove = [
    "Poštovane dame i gospodo narodni poslanici",
    "Poštovana predsednice Narodne skupštine",
    "Zahvaljujem gospodi ministar",
    "Dame i gospodo narodni poslanici,",
    "Da li još neko od predsednika želi reč?",
    "Hvala, gospođo predsednice.",
    "Zahvaljujem.",
    "Poštovani predsedavajući",
    "Poštovano predsedništvo, poštovana gospodo ministar",
    "Koliko imamo vremena, izvinjavam se, koja poslanička grupa?"
                    ]


# Create a single regular expression pattern to match all phrases
pattern = re.compile("|".join([re.escape(phrase) for phrase in phrases_to_remove]), re.IGNORECASE)

# Function to remove specified phrases using regex
def remove_phrases(text, pattern):
    return pattern.sub("", text)

minutes_rs['speech'] = minutes_rs['speech'].apply(lambda x: remove_phrases(str(x), pattern))

In [None]:
# Download the Serbian model for Stanza
stanza.download('sr')

# Initialize the Stanza pipeline for Serbian
nlp = stanza.Pipeline('sr')


In [None]:
# Load stopwords
stopwords_file = '../data/serbian.txt'
with open(stopwords_file, 'r', encoding='utf-8') as f:
    serbian_stopwords  = set([line.strip() for line in f])

In [None]:
# Function to preprocess text
def preprocess_text(text, stopwords_list, nlp_pipeline):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Process text with Stanza
    doc = nlp_pipeline(text)
    
    # Extract lemmas and remove stopwords
    tokens = [word.lemma for sentence in doc.sentences for word in sentence.words if word.lemma.lower() not in stopwords_list]
    
    return tokens

In [None]:
def process_chunk(chunk_file, stopwords_list, nlp_pipeline):
    df = pd.read_csv(chunk_file)
    df['processed_speech'] = df['speech'].apply(lambda x: preprocess_text(x, stopwords_list, nlp_pipeline))
    return df

In [None]:
# As the tokenization and lemmatization is time-consuming, it's processed in chunks and saved to disk to be combined later

#import os
#chunk_size = 100
#output_dir = '/Users/busraalbayrak/Desktop/chunks'

#if not os.path.exists(output_dir):
#    os.makedirs(output_dir)

#for i, start in enumerate(range(0, len(minutes_rs), chunk_size)):
#    end = start + chunk_size
#    chunk = minutes_rs[start:end]
#    chunk.to_csv(f'{output_dir}/chunk_{i}.csv', index=False)

In [None]:
# processed_dir = '/Users/busraalbayrak/Desktop/processed'
# Process the first 10 chunks
# for i in range(10):
#     chunk_file = f'{output_dir}/chunk_{i}.csv'
#     processed_chunk = process_chunk(chunk_file, serbian_stopwords, nlp)
#     processed_chunk.to_csv(f'{processed_dir}/processed_chunk_{i}.csv', index=False)

#      ...    

# for i in range(960,970):
#     chunk_file = f'{output_dir}/chunk_{i}.csv'
#     processed_chunk = process_chunk(chunk_file, serbian_stopwords, nlp)
#     processed_chunk.to_csv(f'{processed_dir}/processed_chunk_{i}.csv', index=False)

In [None]:
## It's combined and saved as a single file
## processed_dir = '../../../Desktop/processed'
# combined_df = pd.read_csv(f'{processed_dir}/processed_chunk_0.csv')

## Sequentially read and append the remaining chunks
# for i in range(1, 970):
#     processed_chunk_file = f'{processed_dir}/processed_chunk_{i}.csv'
#     processed_chunk = pd.read_csv(processed_chunk_file)
#     combined_df = pd.concat([combined_df, processed_chunk], ignore_index=True)

# combined_df.to_csv(f'{processed_dir}/minutes_rs.csv', index=False)  

In [2]:
processed_dir = '../../../Desktop/processed'
minutes_rs = pd.read_csv(f'{processed_dir}/minutes_rs.csv')  

In [3]:
# Function to clean and split processed_speech
def clean_and_split(speech):
    cleaned_speech = speech.replace("'", " ").replace("[", "").replace("]", "").replace(",", "").strip()
    return cleaned_speech.split()

# Apply the function to the DataFrame
minutes_rs['processed_speech'] = minutes_rs['processed_speech'].apply(clean_and_split)


In [4]:
# Check documnet frequency of tokens
all_tokens =  [token for sublist in minutes_rs['processed_speech'] for token in sublist]
token_counts = Counter(all_tokens)


In [30]:
# Exclude tokens with common names of MPs and non-informative tokens
tokens_to_exclude = ['prof','dr', 'mi', 'on', 'do', 'li', 'jer','svoj', 'po', 'za', 'od', 'ovaj', 'da', 'vi', 'od', 'sa', 'sebe', 'na', 'ne', 'ja', 'moći', 'sav', 'iz',
                     'što', 'ili', 'gospodina', 'gospodin', 'ali', 'ako', 'samo', 'morati', 'koliko', 'ko', 'tako', 'zašto', 'zato što', 'pa', 'hteti', 'imati',
                     'onda', 'kakav', 'njihov', 'nemati', 'hvala', 'već', 'moj', 'Aleksandra', 'od' 'Dragan', 'Branko', 'Ivan', 'Aleksandar', 'kazati', 'kada', 'dan',
                     'Blagoja', 'Marija','Inđija', 'Kovač', 'Pastor','Čanak','Elvir', 'Morava', 'Goran', 'Balint', 'Maja', 'Ivana', 'Tomica', 'Verica', 'ni', 'Srbija',
                     'Miloš', 'Miroslav', 'Miroslava', 'Vladan', 'Izvolita' 'Čedomir', 'Bojan', 'Izvolit', 'Jerkov', 'Nataša', 'Jovan', 'Vesna', 'Gordana', 'Šešelj'
                    'Marko', 'Dragan', 'Rade', 'Šešelj', 'Izvolita', 'Vladimir', 'Nenad', 'tekst', 'Vjerica', 'deo', 'Šutanovac', 'poljoprivredni', 'dakle', 'videti',
                     'Zoran', 'Boško', 'Saša', 'Dušan', 'Marinik', 'Ana', 'Nikola', 'Zdravko', 'Tomislav', 'Radoslav', 'Milan', 'Đorđe', 'Stefan', 'reč', 'Nogo']
# Filter tokens with document frequency below 10
min_doc_freq = 10
tokens_to_keep = {token for token, count in token_counts.items() if count >= min_doc_freq and len(token) > 1 and not (token.endswith('ić') or token in tokens_to_exclude)}

In [31]:
#print first 3 tokens to keep
print(list(tokens_to_keep)[:3])

['davasti', 'monah', 'mikrofilmovanje']


In [32]:
# Filter tokens in each speech
minutes_rs['processed_speech'] = minutes_rs['processed_speech'].apply(lambda tokens: [token for token in tokens if token in tokens_to_keep])


In [33]:
lda_input = minutes_rs['processed_speech'].to_list()

In [34]:
vectorizer = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(lda_input)



In [35]:
#n_topics = 10
n_topics = 20
#n_topics = 50
lda = LatentDirichletAllocation(n_components=n_topics, 
                                learning_method='online', 
                                max_iter=10, 
                                random_state=0,
                                batch_size=1024,
                                n_jobs=1 
                               )

In [36]:
# Fit the model
lda.fit(X)

In [38]:
# Display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
no_top_words = 20
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

Topic 0:
javan odbor izveštaj medij građanin komisija izbor agencija državni rad pitanje institucija organ skupština izborni savet politički narodni telo član
Topic 1:
grad opština Beograd godina građanin nov čovek milion velik raditi stan pitanje Vojvodina živeti izgradnja centar evro Niš mesto problem
Topic 2:
obrazovanje škola kultura godina fakultet student visok nacionalan deca zakon sistem mlad trebati nauka kulturan jezik sport znanje ministar velik
Topic 3:
zakon izmena predlog javan postupak dopuna odnosno pravni rešenje rok oblast podatak način godina zaštita uslov posao lice nov primena
Topic 4:
lokalan samouprava budžet sredstvo zakon javan poreski porez uprava prihod republika godina trebati dinar Vojvodina plata građanin budžetski sistem državni
Topic 5:
sredina životan projekt razvoj saobraćaj infrastruktura godina zaštita izgradnja republika energija velik koridor železnica autoput nov energetski voda projekat pruga
Topic 6:
narodni skupština poslanik predlog poslovnik 