# Modelamiento no supervisado basado en tópicos

In [1]:
import sys

# Put the parent directory first on the import path so the local `utils`
# package can be imported from this notebook.
sys.path.insert(0, '..')

from utils.preprocesamiento import StemmerTokenizer

# Shared tokenizer instance used by the cells below.
# NOTE(review): the flags presumably disable stemming and strip punctuation;
# confirm against utils.preprocesamiento.
tokenizador = StemmerTokenizer(
    stem=False,
    rmv_punctuation=True,
)

In [2]:
from utils.cargar import df_caso
from utils.preprocesamiento import process_df

caso = 'adela'

# Columns removed after preprocessing; the cells below only read
# 'comment' (the text) and 'sel' (the stratification target).
unused_columns = [
    'user_id', 'team_id', 'gender', 'df', 'title', 'opt_left',
    'opt_right', 'max_num', 'phase', 'time', 'curso',
]

# Load the case, run the project preprocessing on the 'comment'/'sel'
# columns (verbose, so it reports what it filters), then drop the extras.
df = (
    process_df(df_caso(caso), 'comment', 'sel', verbose=True)
    .drop(columns=unused_columns)
)

79 rows found with non string elements for column comment (1.02%)
Deleting 1059 columns for which max target value is over 7 (13.72%)
6579 available rows after processing


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows, stratified on 'sel', with a fixed seed.
# Only the frame itself is split: passing df['sel'] as a second array just to
# discard it into `_` produced the same partition while rebinding `_`, which
# IPython otherwise uses for the last cell output.
df_train, df_test = train_test_split(df, test_size=.05, stratify=df['sel'], random_state=0)

In [4]:
# Run the shared tokenizer over every comment of each split.
tokenized_corpus = list(map(tokenizador, df_train['comment']))
tokenized_test = list(map(tokenizador, df_test['comment']))

In [5]:
import gensim
from gensim import corpora

# Vocabulary built from the training documents only.
dictionary = corpora.Dictionary(tokenized_corpus)

# Bag-of-words encoding of each training document.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_corpus))

## Número óptimo de tópicos

In [6]:
from gensim.models import CoherenceModel

# Candidate topic counts to evaluate (inclusive range).
min_topics = 2
max_topics = 10
step_size = 1

# Best c_v coherence seen so far and the topic count that produced it.
# -inf (rather than -1) guarantees any real score replaces the initial value.
best_coherence_score = float('-inf')
best_num_topics = -1

# Coherence for every candidate, kept so the full curve can be inspected.
coherence_by_num_topics = {}

for num_topics in range(min_topics, max_topics+1, step_size):
    # Fixed seed: LDA initialisation is random, so without it the selected
    # topic count can change from one run of the notebook to the next.
    lda_model = gensim.models.LdaModel(
        doc_term_matrix,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=0,
    )

    # Score the fitted model with c_v coherence on the training texts.
    coherence_model = CoherenceModel(model=lda_model, texts=tokenized_corpus, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_by_num_topics[num_topics] = coherence_score

    # Keep the candidate with the highest coherence (NaN never compares greater).
    if coherence_score > best_coherence_score:
        best_coherence_score = coherence_score
        best_num_topics = num_topics

# Fail here with a clear message instead of passing -1 topics downstream.
if best_num_topics == -1:
    raise ValueError("No candidate number of topics produced a valid coherence score")

# Print the best number of topics
print(f"Best number of topics: {best_num_topics}")

Best number of topics: 6


In [7]:
%%time
lda_model = gensim.models.LdaModel(doc_term_matrix, num_topics=best_num_topics, id2word=dictionary, passes=10)

CPU times: user 11.6 s, sys: 3.99 ms, total: 11.6 s
Wall time: 11.6 s


In [8]:
# Print every topic of the fitted model.
# The count comes from the model itself rather than from `num_topics`, which
# is only a leftover loop variable of the topic-count search.
topics = lda_model.print_topics(num_topics=lda_model.num_topics)
for topic in topics:
    print(topic)

(0, '0.038*"tradiciones" + 0.030*"pueblo" + 0.022*"alimento" + 0.015*"importante" + 0.015*"llegar" + 0.014*"si" + 0.014*"acuerdo" + 0.013*"originario" + 0.012*"producir" + 0.011*"personas"')
(1, '0.047*"niños" + 0.035*"salud" + 0.026*"ancianos" + 0.014*"adultos" + 0.013*"beneficiar" + 0.013*"mayores" + 0.008*"Adela" + 0.007*"agua" + 0.007*"ende" + 0.006*")"')
(2, '0.035*"recurso" + 0.035*"agua" + 0.017*"escasez" + 0.015*"preservar" + 0.015*"importante" + 0.014*"natural" + 0.013*"priorizar" + 0.012*"puede" + 0.012*"vitamina" + 0.012*"salud"')
(3, '0.017*"(" + 0.017*")" + 0.011*"vida" + 0.010*"calidad" + 0.009*"pena" + 0.008*"vale" + 0.007*"tradición" + 0.006*"resto" + 0.006*"justificación" + 0.006*"mejora"')
(4, '0.031*"agua" + 0.020*"alimento" + 0.013*"personas" + 0.012*"ser" + 0.011*"vitamina" + 0.010*"fruta" + 0.010*"puede" + 0.009*"D" + 0.009*"producción" + 0.009*"podría"')
(5, '0.031*"vitamina" + 0.022*"D" + 0.020*"postura" + 0.014*"grupo" + 0.013*"opinión" + 0.011*"compañeros" + 0

In [9]:
# Show the first held-out document as plain text.
print(' '.join(tokenized_test[0]) + '\n')

# Bag-of-words encoding of that single document, wrapped as a one-document
# corpus. It gets its own name so the training `doc_term_matrix` is not
# overwritten and stays available to the cells that follow.
test_doc_term_matrix = [dictionary.doc2bow(tokenized_test[0])]

# Get the topic probabilities for the new document
topic_probs = lda_model.get_document_topics(test_doc_term_matrix)[0]

# Print the topic probabilities
for topic, prob in topic_probs:
    print(f"Topic {topic}: {prob}")

Las creencias importantes personas debería priorizarse mantener tradiciones además podría producirse después alimento si administra mal

Topic 0: 0.7103570103645325
Topic 1: 0.010430044494569302
Topic 2: 0.010511009953916073
Topic 3: 0.01042014081031084
Topic 4: 0.24783048033714294
Topic 5: 0.010451314970850945


## Visualización

In [10]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Build the training bag-of-words corpus explicitly so the visualisation
# always reflects the documents the model was fitted on, regardless of what
# `doc_term_matrix` currently holds.
train_bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_corpus]

vis_data = gensimvis.prepare(lda_model, train_bow_corpus, dictionary)
pyLDAvis.display(vis_data)