# Modelamiento no supervisado en base a tópicos

In [1]:
import sys
sys.path.insert(0, '..')
from utils.preprocesamiento import StemmerTokenizer

tokenizador = StemmerTokenizer(stem=False,rmv_punctuation=True)

In [2]:
from utils.cargar import df_caso
from utils.preprocesamiento import process_df

caso = 'alicia'
df = df_caso(caso)

df = process_df(df,'comment','sel',verbose=True)

df = df.drop(columns=['user_id','team_id','gender','df','title','opt_left','opt_right','max_num','phase','time','curso'])

80 rows found with non string elements for column comment (0.65%)
Deleting 2326 columns for which max target value is over 7 (18.76%)
9991 available rows after processing


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

try:
    train_indices = pd.read_csv('../utils/splits/train_indices_{}'.format(caso), header=None)[0].tolist()
    test_indices = pd.read_csv('../utils/splits/test_indices_{}'.format(caso), header=None)[0].tolist()
    print("Partición train-test cargada")
except FileNotFoundError:
    train_indices, test_indices = train_test_split(range(len(df)), test_size=.2, stratify=df['sel'], random_state=0)
    pd.Series(train_indices).to_csv('../utils/splits/train_indices_{}'.format(caso), index=False)
    pd.Series(test_indices).to_csv('../utils/splits/test_indices_{}'.format(caso), index=False)
    print("Partición train-test guardada")

df_train = df.iloc[train_indices]
df_test = df.iloc[test_indices]

Partición train-test cargada


In [4]:
tokenized_corpus = [tokenizador(document) for document in df_train['comment']]
tokenized_test = [tokenizador(document) for document in df_test['comment']]

In [5]:
import gensim
from gensim import corpora

# Create a dictionary from the tokenized corpus
dictionary = corpora.Dictionary(tokenized_corpus)

# Convert the tokenized corpus into a document-term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_corpus]

Usamos el número de tópicos con mejor métrica de coherencia

In [6]:
num_topics = 6

In [7]:
%%time
lda_model = gensim.models.LdaModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=10)
print("Ajustando modelo")

Ajustando modelo
CPU times: user 19.2 s, sys: 1.74 ms, total: 19.2 s
Wall time: 19.2 s


In [8]:
# Print the generated topics
topics = lda_model.print_topics(num_topics=num_topics)
for topic in topics:
    print(topic)

(0, '0.013*"cliente" + 0.010*"ser" + 0.010*"q" + 0.009*"ambas" + 0.009*"empresa" + 0.007*"partes" + 0.006*"equilibrio" + 0.006*"siempre" + 0.005*"valores" + 0.005*"grupal"')
(1, '0.029*"contexto" + 0.024*"acuerdos" + 0.019*"condiciones" + 0.019*"Alicia" + 0.018*"cumplir" + 0.017*"proyecto" + 0.015*"debe" + 0.015*"pandemia" + 0.012*"mundial" + 0.011*"situación"')
(2, '0.020*"usuario" + 0.020*"proyecto" + 0.012*"ser" + 0.011*"puede" + 0.011*"producto" + 0.009*"Alicia" + 0.009*"debe" + 0.006*"empresa" + 0.006*"necesidades" + 0.005*"podría"')
(3, '0.051*"proyecto" + 0.021*"entregar" + 0.018*"criterios" + 0.016*"si" + 0.016*"plazos" + 0.014*"importante" + 0.014*"priorizar" + 0.014*"plazo" + 0.014*"técnicos" + 0.014*"cumplir"')
(4, '0.030*"reputación" + 0.025*"ser" + 0.021*"transparencia" + 0.021*"proyecto" + 0.021*"si" + 0.020*"usuarios" + 0.013*"transparente" + 0.012*"Alicia" + 0.012*"mantener" + 0.011*"puede"')
(5, '0.028*"grupo" + 0.018*"postura" + 0.012*"opinión" + 0.009*"mas" + 0.009*"

In [9]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
pyLDAvis.display(vis_data)

In [10]:
print(' '.join(tokenized_test[0]) + '\n')

# Convert the tokenized document into a document-term matrix
doc_term_matrix = [dictionary.doc2bow(tokenized_test[0])]

# Get the topic probabilities for the new document
topic_probs = lda_model.get_document_topics(doc_term_matrix)[0]

# Print the topic probabilities
for topic, prob in topic_probs:
    print(f"Topic {topic}: {prob}")

entrega producto defectuoso cumple ética ingeniero

Topic 0: 0.023854821920394897
Topic 1: 0.023861605674028397
Topic 2: 0.8804067969322205
Topic 3: 0.024066323414444923
Topic 4: 0.023963524028658867
Topic 5: 0.023846914991736412


## Correlaciones

In [11]:
from TM_utils import get_lda_embeddings
import pandas as pd

arr_test = get_lda_embeddings(lda_model,tokenizador,dictionary,df_test,'comment')
df_topics = pd.DataFrame(arr_test, columns=['Topic{}'.format(i+1) for i in range(len(topics))])

df_topics['sel'] = df_test['sel'].values

In [12]:
from scipy.stats import pearsonr


target_col_name = 'sel'
feature_target_corr = {}
for col in df_topics:
    if target_col_name != col:
        feature_target_corr[col + '_' + target_col_name] = \
            pearsonr(df_topics[col], df_topics[target_col_name])[0]

print("Feature-Target Correlations")
print(feature_target_corr)

Feature-Target Correlations
{'Topic1_sel': -0.010770520969338601, 'Topic2_sel': 0.5055982724100002, 'Topic3_sel': 0.01608580950641765, 'Topic4_sel': -0.08014511939762727, 'Topic5_sel': -0.3471921474264582, 'Topic6_sel': 0.00966320586724342}


In [13]:
def get_correlations(lda_model,tokenizer,dictionary,df_test,num_topics,target_col_name='sel',verbose=True):
    arr_test = get_lda_embeddings(lda_model,tokenizer,dictionary,df_test,'comment')
    topics = lda_model.print_topics(num_topics=num_topics)
    df_topics = pd.DataFrame(arr_test, columns=['Topic{}'.format(i+1) for i in range(len(topics))])

    df_topics[target_col_name] = df_test[target_col_name].values

    feature_target_corr = {}
    for col in df_topics:
        if target_col_name != col:
            feature_target_corr[col + '_' + target_col_name] = \
                pearsonr(df_topics[col], df_topics[target_col_name])[0]

    if verbose:
        print("Feature-Target Correlations")
        print(feature_target_corr)
    
    return feature_target_corr

### Guardar Modelo

In [14]:
lda_model.save('modelos/LDA-{}_{}.gensim'.format(caso,num_topics))
print("Modelo guardado")

Modelo guardado


## Variando el número de tópicos

In [15]:
models = {}

for K in [10,25]:
    models[K] = gensim.models.LdaModel(doc_term_matrix, num_topics=K, id2word=dictionary, passes=10)
    get_correlations(models[K],tokenizador,dictionary,df_test,K)
    models[K].save('modelos/LDA-{}_{}.gensim'.format(caso,K))
    print("Modelo guardado con {} tópicos".format(K))

  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words


Feature-Target Correlations
{'Topic1_sel': 0.06072849140149168, 'Topic2_sel': 0.05273881719261213, 'Topic3_sel': 0.02453652720420097, 'Topic4_sel': 0.046700591050576376, 'Topic5_sel': 0.0001709713279483653, 'Topic6_sel': 0.025989959777958482, 'Topic7_sel': -0.08859714699575916, 'Topic8_sel': 0.03518552365527622, 'Topic9_sel': 0.021799813847575168, 'Topic10_sel': 0.04131723888662164}
Modelo guardado con 10 tópicos


  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words


Feature-Target Correlations
{'Topic1_sel': 0.09588356724210816, 'Topic2_sel': 0.09588356724210816, 'Topic3_sel': 0.09588356724210816, 'Topic4_sel': 0.09588356724210816, 'Topic5_sel': 0.09588356724210816, 'Topic6_sel': 0.09588356724210816, 'Topic7_sel': -0.09588356712258123, 'Topic8_sel': 0.09588356724210816, 'Topic9_sel': 0.09588356724210816, 'Topic10_sel': 0.09588356724210816, 'Topic11_sel': 0.09588356724210816, 'Topic12_sel': 0.09588356724210816, 'Topic13_sel': 0.09588356724210816, 'Topic14_sel': 0.09588356724210816, 'Topic15_sel': 0.09588356724210816, 'Topic16_sel': 0.09588356724210816, 'Topic17_sel': 0.09588356724210816, 'Topic18_sel': 0.09588356724210816, 'Topic19_sel': 0.09588356724210816, 'Topic20_sel': 0.09588356724210816, 'Topic21_sel': 0.09588356724210816, 'Topic22_sel': 0.09588356724210816, 'Topic23_sel': 0.09588356724210816, 'Topic24_sel': 0.09588356724210816, 'Topic25_sel': 0.09588356724210816}
Modelo guardado con 25 tópicos
