# Modelamiento no supervisado basado en tópicos

In [1]:
import sys
sys.path.insert(0, '..')
from utils.preprocesamiento import StemmerTokenizer

# Tokenizer callable reused by the later cells of this notebook.
# Configured with stem=False and rmv_punctuation=True (presumably: no stemming,
# punctuation stripped -- see utils.preprocesamiento.StemmerTokenizer to confirm).
tokenizador = StemmerTokenizer(
    rmv_punctuation=True,
    stem=False,
)

In [2]:
from utils.cargar import df_caso
from utils.preprocesamiento import process_df, procesar_adela

# Name of the case study; also used below to build file names for splits and models.
caso = 'adela'

# Load the raw data for the case and run the case-specific preprocessing.
df = procesar_adela(df_caso(caso))

# Keep only the rows whose left-hand option is the one analysed in this notebook.
es_opcion_estudiada = df['opt_left'] == 'Producir el alimento contra déficit vitamínico'
df = df[es_opcion_estudiada]

# Project helper applied to the text column ('comment') and the target column ('sel');
# verbose=True makes it report what it removed (see the cell output).
df = process_df(df, 'comment', 'sel', verbose=True)

# Columns that are not used by the topic model or the correlation analysis.
columnas_descartadas = [
    'user_id', 'team_id', 'gender', 'df', 'title',
    'opt_left', 'opt_right', 'max_num', 'phase', 'time', 'curso',
]
df = df.drop(columns=columnas_descartadas)

26 rows found with non string elements for column comment (0.40%)
Deleting 685 columns for which max target value is over 7 (10.58%)
5761 available rows after processing


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Stratified 95/5 train/test split over the positional row indices of `df`.
# The split is cached as two one-column CSV files so that every run of the
# notebook works on the same partition.
train_path = '../utils/splits/train_indices_{}.csv'.format(caso)
test_path = '../utils/splits/test_indices_{}.csv'.format(caso)

try:
    train_indices = pd.read_csv(train_path, header=None)[0].tolist()
    test_indices = pd.read_csv(test_path, header=None)[0].tolist()

    # A valid cached split uses every row position of `df` exactly once.
    expected_indices = list(range(len(df)))
    if sorted(train_indices + test_indices) != expected_indices:
        # Files written with `to_csv(index=False)` and pandas' default header
        # start with a header line "0", which `header=None` reads back as an
        # extra row index 0 in BOTH lists (train/test leakage). Drop it.
        train_indices, test_indices = train_indices[1:], test_indices[1:]
        if sorted(train_indices + test_indices) != expected_indices:
            raise ValueError(
                "La partición guardada en '{}' y '{}' no corresponde a las {} filas de df; "
                "elimine esos archivos para regenerarla".format(train_path, test_path, len(df))
            )
    print("Partición train-test cargada")
except FileNotFoundError:
    # No complete cached split: compute BOTH halves together so they always
    # come from the same partition, then persist them.
    train_indices, test_indices = train_test_split(
        range(len(df)), test_size=.05, stratify=df['sel'], random_state=0
    )
    # header=False keeps the files header-less, matching the header=None reader above.
    pd.Series(train_indices).to_csv(train_path, index=False, header=False)
    pd.Series(test_indices).to_csv(test_path, index=False, header=False)
    print("Partición train-test guardada")

# The indices are positional, hence .iloc.
df_train = df.iloc[train_indices]
df_test = df.iloc[test_indices]

Partición train-test guardada


In [4]:
# Apply the tokenizer to every comment of each split; one token list per document.
tokenized_corpus = list(map(tokenizador, df_train['comment']))
tokenized_test = list(map(tokenizador, df_test['comment']))

In [5]:
import gensim
from gensim import corpora

# Vocabulary built from the training documents only.
dictionary = corpora.Dictionary(tokenized_corpus)

# Bag-of-words representation of every training document, in corpus order.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_corpus))

Usamos el número de tópicos que obtuvo la mejor métrica de coherencia (4 tópicos).

In [6]:
%%time
# Reuse a previously saved LDA model when one exists; otherwise fit a new one
# on the training bag-of-words corpus.
modelo_path = 'modelos/LDA-{}.gensim'.format(caso)
try:
    lda_model = gensim.models.LdaModel.load(modelo_path)
    print("Cargando modelo pre-ajustado: '{}'".format(modelo_path))
except FileNotFoundError:
    # Announce the fit before it starts: training takes several seconds.
    print("Ajustando modelo")
    # random_state pins the otherwise random initialisation, so the topics
    # (and every number derived from them below) are reproducible across runs.
    lda_model = gensim.models.LdaModel(
        doc_term_matrix,
        num_topics=4,
        id2word=dictionary,
        passes=10,
        random_state=0,
    )

CPU times: user 18.8 s, sys: 8.24 ms, total: 18.8 s
Wall time: 18.8 s


In [7]:
# Print every topic of the model as a (topic_id, "weight*word + ...") pair.
# Asking for exactly `lda_model.num_topics` topics (instead of a hard-coded 5)
# guarantees that `topics` has one entry per model topic, which the
# correlation section relies on when it names the topic columns via len(topics).
topics = lda_model.print_topics(num_topics=lda_model.num_topics)
for topic in topics:
    print(topic)

(0, '0.023*"acuerdo" + 0.021*"pueblo" + 0.021*"llegar" + 0.021*"alimento" + 0.020*"tradiciones" + 0.014*"originario" + 0.012*"ambas" + 0.012*"producir" + 0.011*"partes" + 0.011*"puede"')
(1, '0.022*"pueblo" + 0.021*"tradiciones" + 0.018*"alimento" + 0.017*"pueblos" + 0.014*"llevar" + 0.013*"si" + 0.013*"fruto" + 0.013*"acuerdo" + 0.013*"llegar" + 0.012*"originarios"')
(2, '0.037*"tradiciones" + 0.029*"salud" + 0.028*"importante" + 0.024*"personas" + 0.020*"alimento" + 0.013*"niños" + 0.011*"producir" + 0.010*"población" + 0.010*"resguardar" + 0.009*"bien"')
(3, '0.024*"pueblo" + 0.023*"tradiciones" + 0.017*"alimento" + 0.016*"fruta" + 0.014*"producción" + 0.011*"vitamina" + 0.010*"puede" + 0.010*"originario" + 0.010*"si" + 0.009*"ser"')


In [8]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Build the pyLDAvis payload from the fitted model, the training corpus and its vocabulary.
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
# Last expression of the cell: renders the interactive visualisation inline.
pyLDAvis.display(vis_data)

In [9]:
# Show the topic mixture the model assigns to one held-out document.
print(' '.join(tokenized_test[0]) + '\n')

# Bag-of-words vector of the first test document. It gets its own name on
# purpose: reusing `doc_term_matrix` here would overwrite the training corpus
# and silently break any later re-run of the fitting / pyLDAvis cells.
test_doc_bow = dictionary.doc2bow(tokenized_test[0])

# Topic probabilities for that single document: a list of (topic_id, probability) pairs.
topic_probs = lda_model.get_document_topics(test_doc_bow)

# Print the topic probabilities
for topic, prob in topic_probs:
    print(f"Topic {topic}: {prob}")

Se debe priorizar respetar gente pueblo insistir cantidad gente beneficiaria fruta llegar acuerdo beneficie bandos problema

Topic 0: 0.4372919499874115
Topic 1: 0.5311282873153687
Topic 2: 0.0159648098051548
Topic 3: 0.015614909119904041


## Correlaciones

In [10]:
from TM_utils import get_lda_embeddings
import pandas as pd

# Topic representation of every test comment, computed by the project helper
# (presumably one row per document and one column per topic -- TODO confirm in TM_utils).
arr_test = get_lda_embeddings(lda_model, tokenizador, dictionary, df_test, 'comment')

# One column per printed topic, named Topic1..TopicN.
topic_columns = [f'Topic{k}' for k in range(1, len(topics) + 1)]
df_topics = pd.DataFrame(arr_test, columns=topic_columns)

# Attach the target by position (.values drops df_test's index so rows line up).
df_topics['sel'] = df_test['sel'].values

In [11]:
from scipy.stats import pearsonr


target_col_name = 'sel'

# Pearson correlation coefficient between each topic column and the target,
# keyed as "<column>_<target>"; the p-value returned by pearsonr is discarded.
feature_target_corr = {
    f'{col}_{target_col_name}': pearsonr(df_topics[col], df_topics[target_col_name])[0]
    for col in df_topics
    if col != target_col_name
}

print("Feature-Target Correlations")
print(feature_target_corr)

Feature-Target Correlations
{'Topic1_sel': 0.05321337858326182, 'Topic2_sel': 0.13664789339665095, 'Topic3_sel': -0.390518196540105, 'Topic4_sel': 0.23675825936757508}


### Guardar Modelo

In [12]:
import os

# Persist the fitted model only if no saved copy exists yet.
# The check is done on the file itself rather than by loading the model:
# loading would replace the in-memory `lda_model` as a side effect and is
# needlessly expensive for a mere existence test.
modelo_path = 'modelos/LDA-{}.gensim'.format(caso)
if os.path.exists(modelo_path):
    print("Modelo guardado anteriormente")
else:
    # Make sure the target directory exists so that save() cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(modelo_path), exist_ok=True)
    lda_model.save(modelo_path)
    print("Modelo guardado")

Modelo guardado
