In [1]:
## Split the main CSV into one CSV file per category

import pandas as pd
import os

# Read the main CSV file
df = pd.read_csv("cool.csv")

# Columns written to every per-category file
columns_to_keep = [
    '_id', 'category', 'content',
    'cleaned_tokens_ner', 'cleaned_tokens_stopwords', 'tokens'
]

# Unique category values (this still includes NaN when some rows have no
# category); kept as a module-level name in case other cells rely on it.
categories = df['category'].unique()

# Create a directory for outputs
output_dir = "category_outputs"
os.makedirs(output_dir, exist_ok=True)

# Rows without a category can never satisfy `df['category'] == value`
# (NaN != NaN), so they are not exported. Report how many rows are left out
# instead of claiming that no data exists for them.
missing_category_rows = int(df['category'].isna().sum())
if missing_category_rows:
    print(f"Skipping {missing_category_rows} rows with a missing category")

# One pass over the frame: groupby drops NaN keys by default, and sort=False
# keeps the groups in order of first appearance (same order as unique()).
for category, category_df in df.groupby('category', sort=False):
    # Path separators inside a category name would point into a
    # non-existent subdirectory, so replace them in the filename only.
    safe_name = str(category).replace("/", "_").replace("\\", "_")
    output_filename = os.path.join(output_dir, f"{safe_name}.csv")

    # Save the category data to a new CSV file
    category_df[columns_to_keep].to_csv(output_filename, index=False)
    print(f"Saved {category} data to {output_filename}")


Saved Politică data to category_outputs/Politică.csv
No data found for nan, skipping...
Saved Actualitate data to category_outputs/Actualitate.csv
Saved SUA data to category_outputs/SUA.csv
Saved Economie data to category_outputs/Economie.csv
Saved Educație data to category_outputs/Educație.csv
Saved Justiție data to category_outputs/Justiție.csv
Saved Știri data to category_outputs/Știri.csv
Saved Sănătate data to category_outputs/Sănătate.csv
Saved Externe data to category_outputs/Externe.csv
Saved Evenimente data to category_outputs/Evenimente.csv
Saved Social data to category_outputs/Social.csv
Saved Rusia data to category_outputs/Rusia.csv
Saved Bacalaureat 2023 data to category_outputs/Bacalaureat 2023.csv
Saved Meteo data to category_outputs/Meteo.csv
Saved Sci-tech data to category_outputs/Sci-tech.csv
Saved Pacient în România data to category_outputs/Pacient în România.csv
Saved Animale data to category_outputs/Animale.csv
Saved Evaluare Națională 2023 data to category_outputs

In [10]:
import spacy
import gensim
from gensim import corpora, models
from spacy.lang.ro.stop_words import STOP_WORDS
import pandas as pd

def text_pipeline(data_file, num_topics=10, passes=15, num_words=10,
                  random_state=None, text_column='tokens',
                  model_path='lda_model_spacy'):
    """Train an LDA topic model on one text column of a CSV file.

    The texts are tokenized with the Romanian spaCy model
    ``ro_core_news_sm``; lower-cased alphabetic tokens that are not in
    spaCy's Romanian stop-word list are kept. A gensim dictionary and
    bag-of-words corpus are built from them, an ``LdaModel`` is trained, the
    top words of every topic are printed, and the model is saved to disk.

    Args:
        data_file: Path of the CSV file to read with ``pd.read_csv``.
        num_topics: Number of LDA topics to train.
        passes: Number of training passes over the corpus.
        num_words: Number of top words printed per topic.
        random_state: Seed forwarded to ``LdaModel``; set it to get
            reproducible topics. ``None`` keeps gensim's default behaviour.
        text_column: Name of the CSV column holding the texts.
        model_path: File path passed to ``LdaModel.save``.

    Returns:
        None. Topics are printed and the model is written to ``model_path``.

    Raises:
        ValueError: If no tokens remain after filtering, so no model can be
            trained.
    """
    # Load the spaCy model for Romanian
    nlp = spacy.load("ro_core_news_sm")

    # Load the text data from CSV
    texts = pd.read_csv(data_file)

    # Empty cells are read as float NaN, which spaCy cannot process:
    # drop them and make sure every remaining value is a string.
    raw_documents = texts[text_column].dropna().astype(str)

    def _keep_tokens(doc):
        # Lower-cased alphabetic tokens that are not Romanian stop words
        return [
            token.text.lower()
            for token in doc
            if token.is_alpha and token.text.lower() not in STOP_WORDS
        ]

    # nlp.pipe streams the documents in batches instead of one call per text
    processed_texts = [_keep_tokens(doc) for doc in nlp.pipe(raw_documents)]

    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(processed_texts)
    if len(dictionary) == 0:
        raise ValueError(
            f"No usable tokens found in column {text_column!r} of {data_file!r}"
        )

    # Create corpus of bag-of-words document vectors
    corpus = [dictionary.doc2bow(text) for text in processed_texts]

    # Train the LDA model
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Print every trained topic with its top words (without weights).
    # num_topics=-1 returns all topics in index order; formatted=False yields
    # (word, probability) pairs, so no string parsing is needed.
    all_topics = lda_model.show_topics(
        num_topics=-1, num_words=num_words, formatted=False
    )
    for idx, word_probs in all_topics:
        topic_words = [word for word, _ in word_probs]
        print(f'Topic: {idx}')
        print(f'Words: {topic_words}')
        print()

    # Save the LDA model
    lda_model.save(model_path)

# Example usage: run the topic pipeline on the "Actualitate" category file.
# NOTE(review): the first cell writes its files under category_outputs/;
# confirm that this relative path points at the intended CSV.
text_pipeline(data_file='Actualitate.csv')


Topic: 0
Words: ['clădire', 'cutremur', 'seismic', 'risc', 'concurenă', 'judeu', 'oră', 'cca', 'cod', 'magnitudine']

Topic: 1
Words: ['spune', 'an', 'spital', 'caz', 'leu', 'ban', 'românia', 'trebui', 'lua', 'număr']

Topic: 2
Words: ['aeroport', 'companie', 'astronaut', 'damen', 'esa', 'zbor', 'aeronavă', 'mangalia', 'air', 'chatgpt']

Topic: 3
Words: ['oră', 'loc', 'număr', 'afla', 'incendiu', 'pompier', 'român', 'isu', 'mesaj', 'apă']

Topic: 4
Words: ['trebui', 'spune', 'putea', 'leu', 'sistem', 'proiect', 'program', 'public', 'energie', 'guvern']

Topic: 5
Words: ['an', 'cookie', 'loc', 'social', 'permite', 'editor', 'afla', 'zonă', 'direct', 'accident']

Topic: 6
Words: ['biserică', 'sfânt', 'ortodox', 'lumină', 'bisericii', 'înviere', 'slujbă', 'calendar', 'domn', 'mormânt']

Topic: 7
Words: ['grad', 'lege', 'zonă', 'parte', 'român', 'perioadă', 'minister', 'spune', 'lucrare', 'exista']

Topic: 8
Words: ['românia', 'ucraina', 'european', 'stat', 'spune', 'rusia', 'rus', 'război

In [None]:
pd_