In [3]:
import pandas as pd
from gensim import corpora, models
from ast import literal_eval

# Load the CSV file into a DataFrame
df = pd.read_csv('tokenized_output.csv')

# Convert the 'tokens' column from string to list of tokens
df['tokens'] = df['tokens'].apply(literal_eval)

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(df['tokens'])

# Filter out tokens that appear in less than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(tokens) for tokens in df['tokens']]

# Check if corpus is not empty
if not corpus:
    print("Corpus is empty. Please check your data and filtering criteria.")
else:
    # Train the LDA model
    lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

    # Print the topics and their corresponding words without weights
    for idx, topic in lda_model.print_topics(-1):
        words = topic.split('+')
        topic_words = [word.split('*')[1].replace('"', '').strip() for word in words]
        print(f'Topic: {idx}')
        print(f'Words: {topic_words}')
        print()

    # Save the LDA model
    lda_model.save('lda_model')


Topic: 0
Words: ['loc', 'respinge', 'următor', 'reprezentant', 'sistem', 'Newsro', 'ac', 'urma']

Topic: 1
Words: ['loc', 'reprezentant', 'urma', 'ac', 'Newsro', 'sistem', 'următor', 'respinge']

Topic: 2
Words: ['sistem', 'respinge', 'urma', 'ac', 'următor', 'reprezentant', 'Newsro', 'loc']

Topic: 3
Words: ['sistem', 'loc', 'ac', 'Newsro', 'următor', 'urma', 'reprezentant', 'respinge']

Topic: 4
Words: ['respinge', 'Newsro', 'ac', 'următor', 'reprezentant', 'urma', 'loc', 'sistem']



In [10]:
from transformers import AutoModel, AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
model = AutoModel.from_pretrained("readerbench/RoBERT-base")
inputs = tokenizer("exemplu de propoziție", return_tensors="pt")
outputs = model(**inputs)
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1799, -0.0813,  0.8690,  ..., -0.2820,  0.0934,  0.7427],
         [ 0.5068,  0.4884,  0.1603,  ...,  0.3275,  0.6655, -0.3906],
         [ 0.6516,  0.4548, -0.2504,  ...,  0.1670,  0.4344,  0.1669],
         ...,
         [ 0.2018, -0.9343, -0.4237,  ...,  1.2236, -0.0991, -0.1604],
         [-0.6006, -0.3131,  0.0780,  ...,  0.5276, -0.4769,  0.3721],
         [ 0.0914, -0.1258,  0.0552,  ...,  0.1965, -0.1086,  0.2170]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-9.8939e-01, -3.9414e-01,  3.7290e-01,  9.7404e-01, -1.6963e-01,
          5.5492e-01,  1.1029e-01, -2.8783e-01,  3.9374e-01,  1.7747e-01,
          3.0243e-01,  6.1234e-02, -4.5448e-01, -1.2506e-01,  1.5203e-01,
         -3.6685e-01, -6.6735e-01,  7.7938e-01, -9.8936e-01,  4.1359e-01,
          3.3446e-01,  2.6447e-01, -2.0789e-01,  4.5235e-01, -4.2164e-01,
         -7.6997e-01, -3.4082e-01, -9.6521e-01, -8.3696e-01, -2.721

In [11]:
from transformers import AutoModel, AutoTokenizer
import torch

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
model = AutoModel.from_pretrained("readerbench/RoBERT-base")

# Example text data
text_data = [
    "exemplu de propoziție 1",
    "exemplu de propoziție 2",
    "exemplu de propoziție 3"
]

# List to store embeddings
embeddings = []

# Iterate through each sentence, obtain embeddings, and store
for text in text_data:
    # Tokenize and encode the text
    inputs = tokenizer(text, return_tensors="pt")
    
    # Pass the input through the RoBERTa model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the last hidden state (last layer's output)
    last_hidden_state = outputs.last_hidden_state
    
    # Take the mean of the embeddings for each token in the sentence
    sentence_embedding = torch.mean(last_hidden_state, dim=1).squeeze().numpy()
    
    embeddings.append(sentence_embedding)


In [12]:
import numpy as np

# Combine embeddings to represent each document
document_embeddings = np.stack(embeddings)


In [15]:
import pandas as pd
from gensim import corpora, models
from ast import literal_eval
from gensim.models import TfidfModel

# Load the CSV file into a DataFrame
df = pd.read_csv('tokenized_output.csv')

# Convert the 'tokens' column from string to list of tokens
df['tokens'] = df['tokens'].apply(literal_eval)

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(df['tokens'])

# Filter out tokens that appear in less than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(tokens) for tokens in df['tokens']]

# Check if corpus is not empty
if not corpus:
    print("Corpus is empty. Please check your data and filtering criteria.")
else:
    # Create the TF-IDF model
    tfidf = TfidfModel(corpus)
    
    # Transform the corpus to TF-IDF representation
    tfidf_corpus = tfidf[corpus]
    
    # Train the LDA model using TF-IDF
    lda_model = models.LdaModel(tfidf_corpus, num_topics=5, id2word=dictionary, passes=15)

    # Print the topics and their corresponding words without weights
    for idx, topic in lda_model.print_topics(-1):
        words = topic.split('+')
        topic_words = [word.split('*')[1].replace('"', '').strip() for word in words]
        print(f'Topic: {idx}')
        print(f'Words: {topic_words}')
        print()

    # Save the LDA model
    lda_model.save('lda_model')


Topic: 0
Words: ['loc', 'sistem', 'respinge', 'Newsro', 'reprezentant', 'următor', 'ac', 'urma']

Topic: 1
Words: ['ac', 'loc', 'reprezentant', 'urma', 'următor', 'Newsro', 'sistem', 'respinge']

Topic: 2
Words: ['sistem', 'respinge', 'reprezentant', 'următor', 'ac', 'urma', 'Newsro', 'loc']

Topic: 3
Words: ['loc', 'reprezentant', 'următor', 'sistem', 'respinge', 'Newsro', 'ac', 'urma']

Topic: 4
Words: ['respinge', 'Newsro', 'următor', 'urma', 'ac', 'sistem', 'reprezentant', 'loc']



In [24]:
import spacy
import gensim
from gensim import corpora, models
from spacy.lang.ro.stop_words import STOP_WORDS

# Load the spaCy model for Romanian
nlp = spacy.load("ro_core_news_lg")

# Define your text data (list of strings)
texts = pd.read_csv('tokenized_output.csv')

# Tokenize, remove stopwords, and get document vectors
processed_texts = []
for text in texts['tokens']:
    doc = nlp(text)
    processed_text = [token.text.lower() for token in doc if token.text.lower() not in STOP_WORDS and token.is_alpha]
    processed_texts.append(processed_text)

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_texts)

# Create corpus of document vectors
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the topics and their corresponding words without weights
for idx, topic in lda_model.print_topics(10):
    words = topic.split('+')
    topic_words = [word.split('*')[1].replace('"', '').strip() for word in words]
    print(f'Topic: {idx}')
    print(f'Words: {topic_words}')
    print()

# Save the LDA model
lda_model.save('lda_model_spacy')


Topic: 0
Words: ['staţie', 'loc', 'metrou', 'minister', 'medic', 'persoană', 'vedere', 'panou', 'monta', 'punctaj']

Topic: 1
Words: ['locomotivă', 'tren', 'abur', 'brașov', 'fum', 'regal', 'loc', 'an', 'veni', 'transmite']

Topic: 2
Words: ['rinkevics', 'preşedinte', 'edgars', 'funcţie', 'letonia', 'ministru', 'alegere', 'ţară', 'declara', 'extern']

Topic: 3
Words: ['spune', 'economic', 'operator', 'mână', 'sursă', 'coleg', 'pune', 'întâmpla', 'încuraja', 'horia']

Topic: 4
Words: ['leu', 'guvern', 'sindicat', 'majorare', 'ofertă', 'salariu', 'brut', 'an', 'grevă', 'profesor']

<spacy.lang.ro.Romanian object at 0x76e0941c49d0>


In [26]:
import spacy
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from spacy.lang.ro.stop_words import STOP_WORDS

# Load the spaCy model for Romanian
nlp = spacy.load("ro_core_news_lg")

# Define your text data (list of strings)
texts = pd.read_csv('tokenized_output.csv')

# Tokenize, remove stopwords, and get document vectors
processed_texts = []
for text in texts['tokens']:
    doc = nlp(text)
    processed_text = [token.text.lower() for token in doc if token.text.lower() not in STOP_WORDS and token.is_alpha]
    processed_texts.append(processed_text)

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_texts)

# Create corpus of document vectors
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Calculate Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

# Calculate Perplexity
perplexity = lda_model.log_perplexity(corpus)
print(f'Perplexity: {perplexity}')

# Print the topics and their corresponding words without weights
for idx, topic in lda_model.print_topics(10):
    words = topic.split('+')
    topic_words = [word.split('*')[1].replace('"', '').strip() for word in words]
    print(f'Topic: {idx}')
    print(f'Words: {topic_words}')
    print()

# Save the LDA model
lda_model.save('lda_model_spacy')


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Coherence Score: 0.6826622617113627
Perplexity: -6.624298077033246
Topic: 0
Words: ['spune', 'operator', 'economic', 'sursă', 'pune', 'coleg', 'horia', 'constantinescu', 'încet', 'obișnuit']

Topic: 1
Words: ['staţie', 'metrou', 'proiect', 'locomotivă', 'panou', 'persoană', 'monta', 'tren', 'element', 'informare']

Topic: 2
Words: ['loc', 'persoană', 'tate', 'minister', 'medic', 'bucureşti', 'punctaj', 'sănătaţe', 'medicină', 'familie']

Topic: 3
Words: ['rinkevics', 'preşedinte', 'edgars', 'letonia', 'funcţie', 'ţară', 'alegere', 'declara', 'deveni', 'extern']

Topic: 4
Words: ['leu', 'guvern', 'sindicat', 'majorare', 'ofertă', 'brut', 'an', 'educație', 'didactic', 'salariu']



In [27]:
import spacy
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from spacy.lang.ro.stop_words import STOP_WORDS

# Load the spaCy model for Romanian
nlp = spacy.load("ro_core_news_lg")

# Load the LDA model
lda_model = models.LdaModel.load('lda_model_spacy')

# Define function for input-output pipeline
def lda_pipeline(input_text):
    # Tokenize, remove stopwords, and get document vectors
    doc = nlp(input_text)
    processed_text = [token.text.lower() for token in doc if token.text.lower() not in STOP_WORDS and token.is_alpha]
    
    # Create corpus of document vectors
    text_corpus = [dictionary.doc2bow(processed_text)]
    
    # Get the topic distribution for the input text
    topic_distribution = lda_model[text_corpus][0]
    
    return topic_distribution

# Test the pipeline function
input_text = "Your input text goes here..."
output = lda_pipeline(input_text)
print(output)


[(0, 0.100090675), (1, 0.10005186), (2, 0.10005265), (3, 0.10008407), (4, 0.5997208)]


In [28]:
import spacy
import gensim
from gensim import corpora, models
from spacy.lang.ro.stop_words import STOP_WORDS
import pandas as pd

def text_pipeline(data_file, num_topics=5, passes=15):
    # Load the spaCy model for Romanian
    nlp = spacy.load("ro_core_news_lg")

    # Load your text data from CSV
    texts = pd.read_csv(data_file)

    # Tokenize, remove stopwords, and get document vectors
    def process_text(text):
        doc = nlp(text)
        processed_text = [token.text.lower() for token in doc if token.text.lower() not in STOP_WORDS and token.is_alpha]
        return processed_text

    processed_texts = [process_text(text) for text in texts['tokens']]

    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(processed_texts)

    # Create corpus of document vectors
    corpus = [dictionary.doc2bow(text) for text in processed_texts]

    # Train the LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

    # Print the topics and their corresponding words without weights
    for idx, topic in lda_model.print_topics(10):
        words = topic.split('+')
        topic_words = [word.split('*')[1].replace('"', '').strip() for word in words]
        print(f'Topic: {idx}')
        print(f'Words: {topic_words}')
        print()

    # Save the LDA model
    lda_model.save('lda_model_spacy')

# Example Usage:
text_pipeline('tokenized_output.csv')


Topic: 0
Words: ['preşedinte', 'rinkevics', 'salariu', 'declara', 'edgars', 'lege', 'ministru', 'miercuri', 'fapt', 'trebui']

Topic: 1
Words: ['staţie', 'metrou', 'persoană', 'proiect', 'panou', 'monta', 'majorare', 'guvern', 'informare', 'an']

Topic: 2
Words: ['leu', 'guvern', 'sindicat', 'majorare', 'an', 'ofertă', 'brut', 'educație', 'didactic', 'grevă']

Topic: 3
Words: ['loc', 'minister', 'medic', 'locomotivă', 'sănătaţe', 'punctaj', 'medicină', 'familie', 'tren', 'universitate']

Topic: 4
Words: ['persoană', 'staţie', 'metrou', 'tate', 'panou', 'bucureşti', 'monta', 'element', 'proiect', 'informare']



In [2]:
import spacy
import gensim
from gensim import corpora, models
from spacy.lang.ro.stop_words import STOP_WORDS
import pandas as pd

def text_pipeline(data_file, num_topics=5, passes=15):
    # Load the spaCy model for Romanian
    nlp = spacy.load("ro_core_news_lg")

    # Load your text data from CSV
    texts = pd.read_csv(data_file)

    # Tokenize, remove stopwords, and get document vectors
    def process_text(text):
        doc = nlp(text)
        processed_text = [token.text.lower() for token in doc if token.text.lower() not in STOP_WORDS and token.is_alpha]
        return processed_text

    processed_texts = [process_text(text) for text in texts['tokens']]

    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(processed_texts)

    # Create corpus of document vectors
    corpus = [dictionary.doc2bow(text) for text in processed_texts]

    # Train the LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

    # Print the topics and their corresponding words without weights
    for idx, topic in lda_model.print_topics(10):
        words = topic.split('+')
        topic_words = [word.split('*')[1].replace('"', '').strip() for word in words]
        print(f'Topic: {idx}')
        print(f'Words: {topic_words}')
        print()

    # Save the LDA model
    lda_model.save('lda_model_spacy')

# Example Usage:
text_pipeline('cool.csv')


Topic: 0
Words: ['leu', 'sindicat', 'guvern', 'salariu', 'ofertă', 'an', 'brut', 'grevă', 'salarial', 'profesor']

Topic: 1
Words: ['leu', 'sindicat', 'guvern', 'an', 'loc', 'ofertă', 'spune', 'grevă', 'majorare', 'didactic']

Topic: 2
Words: ['staţie', 'metrou', 'educație', 'persoană', 'monta', 'panou', 'majorare', 'brut', 'proiect', 'informare']

Topic: 3
Words: ['preşedinte', 'rinkevics', 'salariu', 'declara', 'edgars', 'miercuri', 'fapt', 'lege', 'bode', 'trebui']

Topic: 4
Words: ['leu', 'guvern', 'staţie', 'ofertă', 'didactic', 'metrou', 'an', 'sindicat', 'panou', 'proiect']

