In [1]:
import re
import pandas as pd
import string
from stemming.porter2 import stem
import numpy as np


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy
from collections import defaultdict


import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel



import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
from collections import Counter
from math import log


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
np.random.seed(101)

In [3]:


class TextExtractor:
    def __init__(self, path, documents):
        self.path = path
        self.documents = documents
        self.data = self.dataset_load()
        

    def dataset_load(self):
        docs = pd.read_csv(self.path)        
        

        filtered_data = docs[docs['parliamentary_period'].isin(self.documents)]

        # Convert 'speech' column to strings
        filtered_data['speech'] = filtered_data['speech'].astype(str)

        return filtered_data[['parliamentary_period', 'speech']]
    





In [4]:
speeches = TextExtractor('tell_all_cleaned.csv', ['period 12','period 13', 'period 14', 'period 15','period 16', 'period 17'])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['speech'] = filtered_data['speech'].astype(str)


In [5]:
speeches.data.head(10)

Unnamed: 0,parliamentary_period,speech
645448,period 12,κυριες @sw κυριοι βουλευτες κηρυσσεται @sw ενα...
645449,period 12,@sw @sw @sw τιμη @sw ανακοινωσω @sw σωμα @sw ε...
645450,period 12,παρακαλειται @sw προσωρινος γραμματεας γεωργιο...
645451,period 12,@sw γραπτες δηλωσεις @sw γενικης γραμματεως @s...
645452,period 12,παρακαλω @sw κυριες @sw @sw κυριους βουλευτες ...
645453,period 12,κυριες @sw κυριοι συναδελφοι παρακαλω @sw σωμα...
645454,period 12,@sw @sw
645455,period 12,@sw βουλη παρεσχε @sw ζητηθεισα εξουσιοδοτηση....
645456,period 12,@sw @sw
645457,period 12,κυριες @sw κυριοι συναδελφοι αρχιζει @sw συνεδ...


In [6]:

class DataCleaner:
    def __init__(self, speeches):
        self.speeches = speeches
        self.remove_sw()
        self.remove_numbers()
        self.remove_punctuation()
        self.remove_short_speeches()
        self.lemmatize()
        


        
        self.num_of_docs = self.doc_count()
        self.documents =  self.compile_documents()
        
        self.corpus= self.word_doc_count()
        self.set_of_words = self.create_word_set()

        self.word_doc_frequency = self.document_frequency()
        self.idf_dict = self.calculate_idf()

        self.tokenize()
        self.remove_common_words()

        self.remove_party_references()

        


    def remove_sw(self):
        self.speeches['speech'] = self.speeches['speech'].str.replace(r'@sw', '')
        return self.speeches
    
    
    
    def remove_numbers (self):
        self.speeches['speech'] = self.speeches['speech'].str.replace(r'\b[0-9]+\b', '', regex=True)
        return self.speeches

        
    def remove_punctuation(self):
        self.speeches['speech'] = self.speeches['speech'].str.replace(r'\.', " ", regex=True)
        return self.speeches

        
    def remove_short_speeches(self, threshold = 10000):#change??
        self.speeches= self.speeches[self.speeches['speech'].apply(lambda x: len(str(x)) > threshold)]
        return self.speeches


    def lemmatize(self): #maybe I should stemm
        nlp = spacy.load("el_core_news_sm")
        self.speeches['speech'] = self.speeches['speech'].apply(lambda x: " ".join([word.lemma_ for word in nlp(x)]))


        return self.speeches


    def tokenize(self):
        self.speeches['speech'] = self.speeches['speech'].apply(lambda x: x.split())


        return self.speeches
        

    def remove_common_words(self):
        parliamentary_stopwords = set(word for word in self.idf_dict.keys() if self.idf_dict[word] == 0.0)
        self.speeches['speech'] = self.speeches['speech'].apply(lambda words: [word for word in words if word not in parliamentary_stopwords])

        return self.speeches

    def remove_party_references(self):
        self.speeches['speech'] = self.speeches['speech'].apply(lambda words: [word for word in words if "@" not in word])

        return self.speeches['speech']
    
    def doc_count(self): # counts the number of documents of the corpus
        corpus_len= len(speeches.documents)
        return corpus_len

    
    def compile_documents(self):#change!
        docs = self.speeches.groupby('parliamentary_period')['speech'].agg(lambda x: ' '.join(x)).reset_index()
        doc_dict={period: all_speeches.split() for (period, all_speeches) in zip(docs['parliamentary_period'],docs['speech'])}
        
        
        return doc_dict


    def word_doc_count(self): # creates a nested dictionary that contains the occurrence each document in each text (only the words that appear in the particular text)
        word_doc_dict={}
        for document, words in self.documents.items():
            word_doc_dict[document] = Counter(words)
        return word_doc_dict
    

    def create_word_set(self): # creates a set of all the words of the corpus
        word_set = {word for doc in self.documents.values() for word in doc}  
        return word_set

    def document_frequency(self):
        doc_freq =  {word: 0  for word in self.set_of_words}
        for document, nested_tf_dictionary in self.corpus.items():
            for word in self.set_of_words:
                if word in nested_tf_dictionary:
                    doc_freq[word]+=1

        return doc_freq


    def calculate_idf(self): # calculates inverse document frequency
        idf= {}
        for word, occurance in self.word_doc_frequency.items():
            idf_score = log(self.num_of_docs / occurance)
            idf[word]= idf_score
            
        return idf



In [7]:
preprocessed_data = DataCleaner(speeches.data)
docs = preprocessed_data.speeches


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.speeches['speech'] = self.speeches['speech'].apply(lambda x: " ".join([word.lemma_ for word in nlp(x)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.speeches['speech'] = self.speeches['speech'].apply(lambda x: x.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.speeches['spe

In [8]:
docs.head

<bound method NDFrame.head of         parliamentary_period  \
645452             period 12   
645492             period 12   
645498             period 12   
645506             period 12   
645541             period 12   
...                      ...   
1236395            period 17   
1236397            period 17   
1236543            period 17   
1249766            period 16   
1249768            period 16   

                                                    speech  
645452   [παρακαλω, κυρια, κυριος, βουλευτης, υψωσω, δε...  
645492   [κυρια, κυριος, συναδελφος, ελληνιδα, ελληνος,...  
645498   [κυρια, κυριος, συναδελφος, εκλογος, 16ης, σεπ...  
645506   [ευχαριστως, κυριες, προεδρος, δυο, μηνος, φορ...  
645541   [κυριες, προεδρε, κυρια, κυριος, συναδελφος, ’...  
...                                                    ...  
1236395  [κυρια, κυριος, βουλευτης, αξιοτιμα, μελη, νομ...  
1236397  [σημειωτεον, ευρεια, ποινικοποιηση, δωροδοκια,...  
1236543  [ευχαριστως, κυριες, προεδρ

In [9]:
aggregated_df = docs.groupby('parliamentary_period')['speech'].agg(lambda x: [item for sublist in x for item in sublist]).reset_index()

# Displaying the resulting DataFrame
print(aggregated_df)

list_of_docs= aggregated_df['speech'].to_list()

  parliamentary_period                                             speech
0            period 12  [παρακαλω, κυρια, κυριος, βουλευτης, υψωσω, δε...
1            period 13  [παρακαλουνται, βουλευτης, κυριος, τσετιν, μαν...
2            period 15  [κυρια, κυριος, συναδελφος, μιλησω, περιστροφε...
3            period 16  [κυρια, κυριος, βουλευτος, κυρια, κυριος, συνα...
4            period 17  [κυρια, κυριος, βουλευτα, ευχαριστως, καρδια, ...


In [10]:
print(len(list_of_docs))

5


In [11]:
word_frequency = gensim.corpora.Dictionary(list_of_docs)
print(word_frequency)

Dictionary<95180 unique tokens: ['05βς', '0900α', '0o€', '100ου', '1032β']...>


In [12]:
word_frequency.filter_extremes(no_below=1, no_above=0.5, keep_n=30000)


In [13]:
print(word_frequency)

Dictionary<30000 unique tokens: ['05βς', '0900α', '0o€', '100ου', '1032β']...>


In [14]:
bow= [word_frequency.doc2bow(doc) for doc in list_of_docs]

In [15]:
print(bow)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 4), (12, 2), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 6), (71, 2), (72, 4), (73, 2), (74, 5), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 2), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 1), (93, 1), (94, 3), (95, 1), (96, 1), (97, 1), (98, 1), (99, 4), (100, 2), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

In [16]:
lda_model = gensim.models.LdaMulticore(bow,num_topics=10, id2word= word_frequency, passes=50)

In [17]:

for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} Word composition:{topic}\n")

Topic: 0 Word composition:0.000*"καρχιμακη" + 0.000*"καρασμανης" + 0.000*"πελλη" + 0.000*"ανελ" + 0.000*"αριθ" + 0.000*"εψηφισε" + 0.000*"μανουση" + 0.000*"οσεκα" + 0.000*"βαρουφακη" + 0.000*"ελτε"

Topic: 1 Word composition:0.007*"καρχιμακη" + 0.004*"πελλη" + 0.004*"καρασμανης" + 0.003*"μανουση" + 0.002*"αριθ" + 0.002*"μεταλλιο" + 0.002*"αλογοσκουφης" + 0.002*"κρινιω" + 0.002*"παυλιδης" + 0.001*"σημ"

Topic: 2 Word composition:0.006*"ανελ" + 0.005*"εψηφισε" + 0.002*"προσφυγικο" + 0.002*"κατρουγκαλου" + 0.002*"μακεδονικος" + 0.002*"ελτε" + 0.002*"controls" + 0.002*"οσεκα" + 0.001*"κοινσεπς" + 0.001*"brain"

Topic: 3 Word composition:0.000*"καρχιμακη" + 0.000*"πελλη" + 0.000*"καρασμανης" + 0.000*"ανελ" + 0.000*"σημ" + 0.000*"μανουση" + 0.000*"αριθ" + 0.000*"μεταλλιο" + 0.000*"εψηφισε" + 0.000*"αλογοσκουφης"

Topic: 4 Word composition:0.003*"καρχιμακη" + 0.002*"πελλη" + 0.002*"καρασμανης" + 0.002*"στρατακης" + 0.002*"παπουτση" + 0.002*"φραγκισκος" + 0.002*"αιβαλιωτης" + 0.002*"πλευρης" +

In [18]:

coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=word_frequency, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  nan


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


In [19]:
preprocessed_data.speeches.to_csv('output.csv')

In [20]:
res = preprocessed_data.speeches['speech'].apply(lambda words: [word for word in words if not re.search(r'\d', word)]) #suppose to delete items of the list that have digits but doesn't

In [21]:
print(res)

645452     [παρακαλω, κυρια, κυριος, βουλευτης, υψωσω, δε...
645492     [κυρια, κυριος, συναδελφος, ελληνιδα, ελληνος,...
645498     [κυρια, κυριος, συναδελφος, εκλογος, σεπτεμβρη...
645506     [ευχαριστως, κυριες, προεδρος, δυο, μηνος, φορ...
645541     [κυριες, προεδρε, κυρια, κυριος, συναδελφος, ’...
                                 ...                        
1236395    [κυρια, κυριος, βουλευτης, αξιοτιμα, μελη, νομ...
1236397    [σημειωτεον, ευρεια, ποινικοποιηση, δωροδοκια,...
1236543    [ευχαριστως, κυριες, προεδρος, συγκινητικο, κλ...
1249766    [ευχαριστως, κυριες, προεδρε, κυρια, κυριος, σ...
1249768    [ευχαριστως, κυριες, προεδρος, ειλικρινα, εκφρ...
Name: speech, Length: 2176, dtype: object


In [22]:
preprocessed_data.speeches.to_csv('outputdigit.csv')

In [23]:


        
        
     


     
    



    
    def remove_whitespace(self):
        self.speeches['speech'] = self.speeches['speech'].apply(lambda text: text.split())

        return self.speeches


    

        


    






In [24]:
#tf_related_code
    
    def find_max_occur(self): # creates a dictionary which contains the highest occurrence for each text
        self.max_occur = {text: max(words.values()) for text, words in self.corpus.items()}
        return self.max_occur
        
    
    
    def calculate_tf(self): # calculates term frequency for words in documents
        tf= {} 
        for document, words in self.corpus.items():
            tf[document] = {}
            for word in words.keys():
                term_frequency = self.corpus[document][word]
                max_occur = self.max_occur_dict[document]
                tf[document][word] = term_frequency / int(max_occur)


        return tf
    


def calculate_tf_idf(self): # calculates tf-idf scores and stores them in a dictionary
        tf_idf= {}
        for word in self.set_of_words:
            for document in self.documents.keys():
                tf_idf[document][word] = self.tf_dict[document][word]* self.idf_dict[word]
        return tf_idf
    
    


    

#sanity_check_documents(dictionary{period:all speeches})
period_text_dict = clean_speeches.documents

# Specify the file path
output_file_path = 'output.txt'

# Open the file in write mode with utf-8 encoding
with open(output_file_path, 'w', encoding='utf-8') as file:
    # Iterate over the dictionary
    for period, text in period_text_dict.items():
        # Write period identifier to the file
        file.write(f"{period}:\n")

        # Write the text to the file
        file.write(f"    {text}\n\n")

#sanity_check_documents(dictionary{period treated as document:{word:frequency})
output_file_path = 'outputnested.txt'

# Open the file in write mode with utf-8 encoding
with open(output_file_path, 'w', encoding='utf-8') as file:
    for document, word_counts in clean_speeches.corpus.items():
        file.write(f"{document}:\n")
        for word, count in word_counts.items():
            file.write(f"    {word}: {count}\n")
        file.write('\n')
#sanity_check_documents(dictionary{period treated as document:{word:tf_score}})

output_file_path = 'output_tf.txt'
with open(output_file_path, 'w', encoding='utf-8') as file:
        for document, word_tf in clean_speeches.tf_dict.items():
            file.write(f"{document}:\n")
            for word, tf_score in word_tf.items():
                file.write(f"    {word}: {tf_score}\n")
            file.write('\n')
#sanity_check_documents(dictionary{word:{num of docs it appears}})

output_file_path = 'output_doc_freq.txt'
with open(output_file_path, 'w', encoding='utf-8') as file:
    for word, frequency in clean_speeches.word_doc_frequency.items():
        file.write(f"{word}: {frequency}\n")
output_file_path_idf = 'output_idf.txt'
with open(output_file_path_idf, 'w', encoding='utf-8') as file:
    for word, idf_score in clean_speeches.idf_dict.items():
        file.write(f"{word}: {idf_score}\n")
   

    


IndentationError: unexpected indent (3918817672.py, line 3)