In [None]:
import re
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from bertopic import BERTopic
import demoji
import spacy
import unidecode
from umap import UMAP
from bertopic import BERTopic
import gensim
import gensim.corpora as corpora
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel



In [None]:
# Load the spaCy pipeline "es_core_news_sm"; `nlp` is what lemmatize_text calls to obtain token lemmas.
nlp = spacy.load("es_core_news_sm")

In [None]:
# Fetch the NLTK resources needed for stop words and tokenisation, then load
# the Spanish stop-word list.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)
stop_words = stopwords.words('spanish')

In [None]:
# Spanish stop words handed to the CountVectorizer that builds topic representations.
spanish_stop_words = stopwords.words('spanish')

In [None]:
# Read the source CSV into `df`.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run
# on another machine as written; consider a configurable data directory.
df = pd.read_csv(r"C:\Users\CARG\Desktop\Codigos_tesis_bertopic\medio_año_2022_politica.csv") 

In [None]:
# Preview the first row to check which columns were loaded.
df.head(1)

**Filtro por mes**

In [None]:
# Parse 'date_source' into datetimes so the .dt accessor can be used on it.
df['date_source'] = pd.to_datetime(df['date_source'])

In [None]:
# Keep only the rows whose date falls in month 6 (June); .copy() gives an
# independent frame so new columns can be added without touching `df`.
df_prueba = df.loc[df['date_source'].dt.month == 6].copy()


In [None]:
# Count the rows in df_prueba.
num_filas = len(df_prueba)

# Print the number of rows
print("El dataframe df_prueba tiene", num_filas, "filas.")

In [None]:
# Preview the first rows of the filtered frame.
df_prueba.head()

In [None]:
# Tweet normalisation: URLs, hashtags, mentions, emojis, accents, stop words,
# punctuation and digits.

# One-time demoji setup. This used to run inside clean_text, i.e. once per row.
# NOTE(review): recent demoji releases bundle the emoji data and deprecate this
# call — remove it if the installed version emits a deprecation warning.
demoji.download_codes()

# Tokens are compared AFTER unidecode has stripped accents, so the stop-word
# list has to be accent-stripped as well; otherwise entries such as "más",
# "también" or "está" can never match. A frozenset gives O(1) membership tests.
_STOP_WORDS_ASCII = frozenset(unidecode.unidecode(w.lower()) for w in stop_words)

# \S+ stops at the end of the link; the previous `.*` swallowed the rest of the line.
_URL_RE = re.compile(r'https?://\S+')
# \w covers accented letters and underscores, so "#años" or "@user_name" are
# removed whole instead of leaving fragments behind.
_TAG_OR_MENTION_RE = re.compile(r'[#@]\w+')
# ASCII punctuation including the apostrophe (curly quotes and ellipsis are
# already transliterated to ASCII by unidecode at this point).
_PUNCT_RE = re.compile(r'[%s]' % re.escape('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'))
_DIGITS_RE = re.compile(r'\d+')
_SPACES_RE = re.compile(r'\s+')


def clean_text(x):
    """Normalise one tweet for lemmatisation / bag-of-words use.

    Steps, in order: lowercase; drop URLs, hashtags and @mentions; strip
    emojis; transliterate to ASCII (removes accents); tokenize and drop
    Spanish stop words; remove punctuation and digits; collapse whitespace.

    Parameters
    ----------
    x : any
        The raw tweet. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as missing text.

    Returns
    -------
    str
        The cleaned text with single spaces and no leading/trailing
        whitespace; ``''`` for missing input.
    """
    # Missing values would otherwise be turned into the literal token "nan".
    if x is None or (isinstance(x, float) and x != x):
        return ''

    x = str(x).lower()
    # URLs first, so a '#' or '@' inside a link is not treated as a tag.
    x = _URL_RE.sub(' ', x)
    x = _TAG_OR_MENTION_RE.sub(' ', x)

    # remove emojis
    x = demoji.replace(x, '')

    # remove accents
    x = unidecode.unidecode(x)

    tokens = word_tokenize(x)
    x = ' '.join(w for w in tokens if w not in _STOP_WORDS_ASCII)
    x = _PUNCT_RE.sub(' ', x)
    x = _DIGITS_RE.sub(' ', x)
    # \s+ also covers newlines, so one pass replaces the two previous ones.
    x = _SPACES_RE.sub(' ', x)
    return x.strip()

In [None]:
def lemmatize_text(text):
    """Return `text` with each token replaced by its lemma, joined by single spaces.

    The text is run through the module-level spaCy pipeline `nlp`.
    """
    lemmas = []
    for tok in nlp(text):
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)


In [None]:
# Add a 'clean_text' column holding the normalised version of each tweet, then preview.
df_prueba['clean_text'] = df_prueba['text'].map(clean_text)
df_prueba.head()

In [None]:
# Lemmatize the 'clean_text' column of df_prueba into a new 'lemmatized_text' column.
df_prueba['lemmatized_text'] = df_prueba['clean_text'].map(lemmatize_text)

In [None]:
# Plain Python lists for BERTopic: one timestamp and one raw (uncleaned) text per tweet.
timestamps = df_prueba['date_source'].tolist()
tweets = df_prueba['text'].tolist()

In [None]:
# One sentence-embedding model for the whole notebook. Previously this variable
# held "all-MiniLM-L6-v2" while BERTopic was given the multilingual checkpoint
# by name, so any embeddings computed from `embedding_model` came from a
# different model than the one the topics were fitted with.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Step 2 - dimensionality reduction; random_state fixed for reproducible runs.
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Step 3 - clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - bag-of-words over 1- to 3-grams with Spanish stop words removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=spanish_stop_words)
# Step 5 - class-based TF-IDF weighting.
ctfidf_model = ClassTfidfTransformer()

topic_model = BERTopic(
    verbose=False,
    embedding_model=embedding_model,    # Step 1 - Embed documents (same object reused elsewhere)
    language="Spanish",                 # per BERTopic docs, not used when embedding_model is given
    umap_model=umap_model,              # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
    nr_topics="auto",                   # Step 6 - Automatically reduce the number of topics
    n_gram_range=(1, 3),                # per BERTopic docs, not used at fit time when vectorizer_model is given
    calculate_probabilities=False
)

# The model is fitted on the raw tweets; the lemmatized text is only used to
# build the per-topic documents below.
topics, probs = topic_model.fit_transform(tweets)
filtered_text = df_prueba['lemmatized_text']

# One row per tweet with its assigned topic, then one concatenated document per topic.
documents = pd.DataFrame({"Document": filtered_text,
                          "ID": range(len(filtered_text)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
# NOTE(review): _preprocess_text is a private BERTopic method and may change between versions.
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
# NOTE(review): cleaned_docs, vectorizer and analyzer are not used in the cells
# visible here — presumably groundwork for a coherence computation; confirm.
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()


In [None]:
# Display the fitted model's topic overview table.
topic_model.get_topic_info()

In [None]:
topic_info = topic_model.get_topic_info()
# Spanish-labelled view of the topic table. Constructing a DataFrame with
# `columns=[...]` names that do not exist in `topic_info` only yields all-NaN
# columns, so the existing columns are renamed instead.
# The result gets its own name rather than overwriting `df` (the raw data).
# NOTE(review): assumes get_topic_info() exposes 'Topic', 'Name' and 'Count' — confirm for the installed BERTopic version.
topic_summary = (
    topic_info
    .rename(columns={'Topic': 'Tema', 'Name': 'Palabras clave', 'Count': 'Frecuencia'})
    [['Tema', 'Palabras clave', 'Frecuencia']]
)


In [None]:
# Persist the topic overview table to "resultados.csv" in the working directory.
topic_info.to_csv("resultados.csv")

In [None]:
# Render BERTopic's topic visualisation for the fitted model.
topic_model.visualize_topics()

In [None]:
# Embed every document once so the visualisations can reuse the vectors.
sentence_model = embedding_model
docs = tweets
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Reduce the embeddings to 2-D once and reuse them for every document plot:
# visualize_documents expects `reduced_embeddings` to be two-dimensional
# (it was 5-D before). random_state makes the layout reproducible.
# The earlier extra call with `embeddings=embeddings` was dropped: only the last
# expression of a cell is rendered, so its figure was computed and discarded.
reduced_embeddings = UMAP(
    n_neighbors=10, n_components=2, min_dist=0.1, metric='cosine', random_state=42
).fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

**Reducción de outliers**

In [None]:
# Reassign outlier documents (topic -1) to their closest topic using c-TF-IDF similarity.
new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf")
# update_topics rebuilds the topic representations; unless the vectorizer and
# c-TF-IDF models are passed again it falls back to default ones, which would
# drop the Spanish stop-word filtering configured for the original fit.
topic_model.update_topics(
    docs,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
)

In [None]:
# Plot the documents again from the precomputed reduced embeddings, with
# per-document hover text and annotations switched off.
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

In [None]:
# Display the model's current topic overview table.
topic_model.get_topic_info()

In [None]:
# Query the model again before exporting: `topic_info` was captured earlier in
# the notebook, so writing it here would just duplicate "resultados.csv"
# instead of saving the model's current topic table.
topic_model.get_topic_info().to_csv("resultados_reducidos.csv")

**Temas a lo largo del tiempo**

In [None]:
# Compute topic representations over time from the tweets and their timestamps,
# with nr_bins=40 and both global and evolutionary tuning enabled.
topics_over_time = topic_model.topics_over_time(tweets, timestamps, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=40)

In [None]:
# Plot how topics evolve over time, limited to top_n_topics=40.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=40)