In [1]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from datasets import load_dataset_builder 
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from datetime import datetime
from dateutil.parser import parse
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from  tp_function import  clean_all, Cleaning_text, load_dataset_function, limpieza_entidades, limpieza_keywords, limpieza_texto
from  tp_function import  train_function, threshold_function, tabla_frecuencia  #(*)



In [22]:
from opensearch_data_model import Topic, TopicKeyword, os_client

In [51]:
Topic.init()



In [4]:
#Levanto el   modelo mergeado de los dias: 
merged_model_1y2 = BERTopic.load(r"C:\Users\amunoz\Desktop\NLP_2024\TP"+ "\merge_modelo_1y2")


### Juntamos un dia más.

In [2]:
date_choice = '2024-07-21'
df_3 = load_dataset_function(date_choice)
df_3.to_parquet(r"C:\Users\amunoz\Desktop\NLP_2024\TP"+"\df_3.parquet")
df_3.head(1)

Registros para la fecha 2024-07-21 -> 8488 de un total de 11247


Unnamed: 0,asset_id,title_ch,Asset Destination,media,impact,start_time_utc,start_time_local,entities_curated,entities,predicted_at_entities,entities_raw_transformers,entities_transformers,title,text,keywords,predicted_at_keywords,truncated_text,title_and_text,prediction_delay_predictions,prediction_delay
3421,115319417,El lado B - Por Martina Funes,http://losandes.com.ar/espectaculo/el-lado-b-p...,Diario Los Andes,8271,2024-07-21 03:00:00,2024-07-21,[],"[Rosa Montero, Voy, Elena Ferrante, Martina Fu...",2024-07-21 04:30:05.235974,"[{'entities': [{'end': 29, 'entity_group': 'PE...","[Martina Funes, Elena Ferrante, The Guardian, ...",El lado B - Por Martina Funes,"Algo imperceptible se quebró, no tenía sentido...","[amistades extintas, el cariño, mis amigas, af...",2024-07-21 04:31:39.569320,"Algo imperceptible se quebró, no tenía sentido...",El lado B - Por Martina Funes\nAlgo impercepti...,0.026204,1.527658


In [3]:
df_3[["entities" ,  "keywords", "text"]].head(2)

Unnamed: 0,entities,keywords,text
3421,"[Rosa Montero, Voy, Elena Ferrante, Martina Fu...","[amistades extintas, el cariño, mis amigas, af...","Algo imperceptible se quebró, no tenía sentido..."
9010,"[Mendoza, Va, Cámara Insurtech Argentina, La C...","[distintos clientes, la caja, conciencia asegu...","La Cámara Insurtech Argentina, que promueve la..."


In [5]:
batch_news = 500

df_train_3 = df_3.sample(n=int(batch_news)).copy()
print(f" Cantidad de noticias de df_3 para entrenar el modelo es {len(df_train_3)}")

 Cantidad de noticias de df_3 para entrenar el modelo es 500


### StopWords

In [6]:
SPANISH_STOPWORDS = list(pd.read_csv('stop_words_spanish.csv' )['stopwords'].values)
SPANISH_STOPWORDS_PARTICULAR = list(pd.read_csv('stop_words_particular.csv' )['stopwords'].values)
#SPANISH_STOPWORDS_PARTICULAR. Falta  Completar a mano

### Entidades y KeyWords con una pequeña limpieza.

In [7]:
enti_train_3 = limpieza_entidades(df_train_3, SPANISH_STOPWORDS, SPANISH_STOPWORDS_PARTICULAR)
print(len(enti_train_3))
enti_train_3[:5]

3528


['presidente', 'ignacio torres', 'leo', 'utdt', 'slokar']

In [8]:
key_train_3 = limpieza_keywords(df_train_3, SPANISH_STOPWORDS, SPANISH_STOPWORDS_PARTICULAR)
print(len(key_train_3))
key_train_3[:2]


10036


['la federación', 'presidente']

### Armo el vocabulario 

In [9]:
vocab_3 =  list(set(key_train_3 + enti_train_3))

### TF-IDF

In [11]:
from typing import Callable, Iterable, Literal, Mapping   

In [12]:
data_train_3 = list(df_train_3['text']) 
tfidf_vectorizer_3 = TfidfVectorizer(
        tokenizer=None,
        max_df=0.9,
        min_df=0.1,
        ngram_range=(1, 3),
        vocabulary=vocab_3,
)
tfidf_vectorizer_3.fit(data_train_3)

### Limpio el texto de las noticias.

In [13]:
proc_data_text_3 =  limpieza_texto(df_train_3, SPANISH_STOPWORDS, SPANISH_STOPWORDS_PARTICULAR)

100%|██████████| 500/500 [00:02<00:00, 209.33it/s]


### Inicializamos el modelo para los corpus 3.  

In [14]:
#Pasos Comunes. 

#Step 1 - Extract embeddings
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

In [15]:
# Step 4 - Tokenize topics
vectorizer_model_3= tfidf_vectorizer_3



topic_model_3 = BERTopic(
  embedding_model=embedding_model,              
  umap_model=umap_model,                        
  hdbscan_model=hdbscan_model,                  
  vectorizer_model=vectorizer_model_3,         
  ctfidf_model=ctfidf_model,                    
  verbose=True,
  #calculate_probabilities=True
)

### Entrenamiento y guardado.

In [16]:
topics_3, probs_3 = train_function(topic_model_3, proc_data_text_3)

2024-09-30 11:51:07,345 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2024-09-30 11:52:03,788 - BERTopic - Embedding - Completed ✓
2024-09-30 11:52:03,788 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-30 11:52:19,781 - BERTopic - Dimensionality - Completed ✓
2024-09-30 11:52:19,781 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-30 11:52:19,900 - BERTopic - Cluster - Completed ✓
2024-09-30 11:52:20,221 - BERTopic - Representation - Extracting topics from clusters using representation models.
  idf = np.log((avg_nr_samples / df)+1)
2024-09-30 11:52:24,456 - BERTopic - Representation - Completed ✓


In [17]:
topic_model_3.save(r"C:\Users\amunoz\Desktop\NLP_2024\TP"+ "\modelo_3")



In [18]:
#Levanto el modelo guardado.
topic_model_3 = BERTopic.load(r"C:\Users\amunoz\Desktop\NLP_2024\TP"+ "\modelo_3")

### Mergeo ambos modelos

In [19]:
merged_model_1y2 = BERTopic.load(r"C:\Users\amunoz\Desktop\NLP_2024\TP"+ "\merge_modelo_1y2")

In [20]:
merged_model_1y2y3 = BERTopic.merge_models([merged_model_1y2, topic_model_3])


In [29]:
merged_model_1y2y3.save(r"C:\Users\amunoz\Desktop\NLP_2024\TP"+ "\merge_modelo_1y2y3")



In [30]:
#Levanto el modelo guardado.
topic_model_1y2y3 = BERTopic.load(r"C:\Users\amunoz\Desktop\NLP_2024\TP"+ "\merge_modelo_1y2y3")

### Embedding para el tercer corpus de noticias y  matriz de similitud entre noticias y tópicos.

In [25]:
embeds_3 =  embedding_model.encode(proc_data_text_3)

In [40]:
embeds_1y2 = np.load("embedding1y2.npy")
data_train_1y2 = np.load("data_train_1y2.npy")
data_train_1y2 =list(data_train_1y2 )
data_train_1y2

['Por: Por Pablo Ibánez\n\nVictoria Villarruel entra y sale del planeta Milei: cuando le conviene, está; cuando algo le hace ruido, se desmarca. El lunes invocó una gripe fatal para no participar del Pacto de Mayo en Tucumán, pero unas horas más tarde estuvo apta para el Tedeum y el desfile militar, donde terminó trepada, junto al presidente, de un tanque de guerra. “Victoria se curó rápido” lanzó, mordaz, un entornista de Javier Milei, muy temprano el martes. La intermitencia es el método V-V: habita el fenómeno libertario y es, al mismo tiempo, otra cosa.\n\nKarina Milei es su némesis. Más por visceral que por táctica, la hermanísima facilita la dualidad de la vice. En vez de abrazarla, la expulsa. Unas semanas atrás, en una charla mano a mano, Villarruel se ofreció a colaborar y participar del armado de La Libertad Avanza (LLA) en la provincia de Buenos Aires. Karina desechó el ofrecimiento con una excusa pueril: “No es un armado político, es la herramienta electoral, nada más que e

In [44]:
data_train_3 = list(df_train_3['text']) 
len(data_train_3 + data_train_1y2)

1500

In [46]:
# Junto los embeddings
embeds_12y3 = embeds_1y2 + embeds_3

#Junto los documentos.
data_train_m_12y3 = data_train_1y2 + data_train_3



print(f"En el modelo mergeado hay un total de {len(data_train_m_12y3)} documentos. Es la union de los documentos de los dias anteriores.")

topics_m, probs_m = merged_model_1y2y3.transform(data_train_m_12y3) 

# Junto los documentos.
#df_train_m =pd.concat([df_train_1, df_train_2])

#data_train_1y2= list(df_train_m['text'])

#print(f"En el modelo mergeado hay un total de {len(data_train_1y2)} documentos. Es la union de los documentos de los dias anteriores.")

#topics_m, probs_m = merged_model_1y2.transform(data_train_1y2)

En el modelo mergeado hay un total de 1500 documentos. Es la union de los documentos de los dias anteriores.


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

2024-09-30 13:42:38,061 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [47]:
sim_matrix_12y3= cosine_similarity(
    merged_model_1y2y3.topic_embeddings_,
    embeds_12y3
)

In [53]:
def get_topic_name(keywords):
    return ', '.join([k for k, s in keywords[:4]])

In [48]:
df_m = pd.DataFrame((zip(topics_m,probs_m)), columns = ['topicos mergeados','probabilidades mergeadas'])
num_topics_m = len(merged_model_1y2y3.get_topics().keys())
threshold_m = []

for topico in range(0,num_topics_m):
    if topico >-1:
        threshold_m.append(np.mean(df_m[df_m["topicos mergeados"] == topico]["probabilidades mergeadas"]))

    


print(len(threshold_m))  
print(threshold_m)
 

5
[0.44148326, 0.38433474, 0.48194903, 0.36367488, nan]


### Borro lo guardado para el ultimo modelo mergeado. 

In [49]:
def delete_index_opensearch(index_name: str) -> bool:
    try:
        delete_query = {
                         "query": {"match_all" : {}}

        }
        response = os_client.delete_by_query(index = index_name , body=delete_query)
        return True

    except Exception as e:
        print(f"Ha ocurrido un error: {e}")
        return False
 

delete_index_opensearch("topic")



True

### Guardo en opensearch el nuevo modelo mergeado

In [54]:
for topic in merged_model_1y2y3.get_topics().keys():
    if topic > -1:
        print(topic)
        keywords = merged_model_1y2y3.topic_representations_[topic]
        topic_keywords = [TopicKeyword(name=k, score=s) for k, s in keywords]


        best_doc_index = sim_matrix_12y3[topic].argmax()

        best_doc =data_train_m_12y3[best_doc_index]

        topic_doc = Topic(
            vector = list(merged_model_1y2y3.topic_embeddings_[topic]),
            similarity_threshold = threshold_m[topic],       
            created_at = datetime.now(),
            to_date = parse('2024-07-15'),    # Incompleto:  Fechas? 
            from_date = parse('2024-07-16'),
            index = topic,
            keywords = topic_keywords,
            name = get_topic_name(keywords),
            best_doc = best_doc
        )

        print(topic_doc.save())

0
created
1
created
2
created
3
created


In [55]:
topic_model_1y2y3.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,127,-1_máxima_temperaturas_pronóstico_casos,"[máxima, temperaturas, pronóstico, casos, espe...",
1,0,1309,0_argentina_américa_selección_copa américa,"[argentina, américa, selección, copa américa, ...",
2,1,39,1_trump_presidente_atentado_donald trump,"[trump, presidente, atentado, donald trump, do...",
3,2,14,4_brecha_ccl_mep_suba,"[brecha, ccl, mep, suba, cierre, tasa, precio,...",
4,3,11,5_misión_nasa_astronautas_armstrong,"[misión, nasa, astronautas, armstrong, marte, ...",
