# TERCER NOTEBOOK - COSINE SIMILARITY MATRIX PARA MEJORES MODELOS ENCONTRADOS EN TOPIC COHERENCE

**Contenido**:

1. Diseño del Algoritmo (Importar Paquetes y Funciones)
2. Cosine Similarity Matrix (Data Frame con Lemmatization)
3. Cosine Similarity Matrix (Data Frame con Stemming)
2. Cosine Similarity Matrix (Top words de los Tópicos)

## I. DISEÑO DEL ALGORITMO

#### Importar paquetes

In [1]:
import re, pickle, datetime, spacy, numpy as np, pandas as pd
from pprint import pprint
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from collections import Counter

# Gensim
import gensim, gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.matutils import softcossim 
import gensim.downloader as api

# Plotting tools
import pyLDAvis, pyLDAvis.sklearn, pyLDAvis.gensim  
import matplotlib.pyplot as plt

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

# Download the FastText model
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.simplefilter('ignore')
pd.options.mode.chained_assignment = None  # default='warn' ---> Not showing warnings about iloc

# Stop Words
with open('../spanish4.0.txt','r') as f:
    stop_words = f.read().splitlines()
    
# NLP from Spacy
nlp = spacy.load('es_core_news_md', disable=['parser', 'ner'])

In [2]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

#### Estructura del Algoritmo 

In [3]:
# COSINE SIMILARITY MATRIX
def create_soft_cossim_matrix(sentences):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(len_array, len_array)
    cossim_mat = pd.DataFrame([[round(softcossim(sentences[i],sentences[j], similarity_matrix) ,2) for i, j in zip(x,y)] for y, x in zip(xx, yy)])
    return cossim_mat

In [4]:
# TOPIC MODELING (LDA)
def texts_id2word_corpus(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PROPN']):  
    # Tokenization
    texts = [gensim.utils.simple_preprocess(str(sentence), deacc=True) for sentence in texts]
    # Remove stopwords
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    # lemmatization
    texts_out = []
    for sent in texts:
        doc = nlp(u" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # Remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out] 
    # Create Dictionary
    id2word = corpora.Dictionary(texts_out)
    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts_out]
    return texts_out,id2word,corpus

def coherence_values_mallet(dictionary, corpus, texts, start=2, limit=31, step=2):
    coherence_values = []
    model_list = []
    mallet_path = 'C:/Users/alexa/mallet-2.0.8/bin/mallet.bat' 
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, 
                                                 corpus=corpus, 
                                                 num_topics=num_topics, 
                                                 id2word=dictionary, 
                                                 random_seed = 1996)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
def coherence_values_gensim(dictionary, corpus, texts, start=2, limit=31, step=2):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary, 
                                                num_topics=num_topics, 
                                                random_state=1996, 
                                                update_every=1, 
                                                chunksize=100, 
                                                passes=10, 
                                                alpha='auto', 
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

def coherence_values_sklearn(data_lemmatized, start=2, limit=31, step=2):
    data_lemmatized2 = [' '.join(sent) for sent in data_lemmatized]
    cv = CountVectorizer(stop_words = stop_words,max_df=0.95, min_df=2)
    data_cv = cv.fit_transform(data_lemmatized2) # document term-matrix (dtm)
    coherence_values = []
    model_list = [] 
    for num_topics in range(start, limit, step):
        model = LatentDirichletAllocation(n_components=num_topics,          
                                          max_iter=10,               
                                          learning_method='online',   
                                          random_state=1996,           
                                          batch_size=128,            
                                          n_jobs = -1)
        model.fit(data_cv)
        model_list.append(model)
        coherencemodel = metric_coherence_gensim(measure='c_v', 
                                                 top_n=20, 
                                                 topic_word_distrib=model.components_, 
                                                 dtm=data_cv, 
                                                 vocab= np.array([x for x in cv.vocabulary_.keys()]),
                                                 texts = data_lemmatized)
        coherencemodel = np.array(coherencemodel)
        coherence_values.append(np.mean(coherencemodel))
    return model_list, coherence_values , cv

def plot_coherence(cv1,cv2,cv3, start=2,limit=31,step=2, save=False, figsize = (12,6)):
    x = range(start, limit, step) 
    plt.rcParams.update({'figure.figsize':figsize})
    plt.plot(x, cv1 , label = 'LDA Mallet')
    plt.plot(x, cv2 , label = 'LDA Gensim')
    plt.plot(x, cv3 , label = 'LDA Sklearn')
    plt.title('LDA Coherence')
    plt.xlabel("Topics")
    plt.ylabel("Coherence score")
    plt.legend()
    plt.grid(color='gray', linestyle='-', linewidth=1, alpha = 0.1)
    if save:
        plt.savefig('plot coherence.jpg', dpi= 200, quality=95)
    plt.show()

def coherence_summary(cv_mallet, cv_gensim, cv_sklearn, start=2,limit=31,step=2):
    num_topics = list(range(start,limit,step))
    d = {'Nro Tópicos': num_topics , 'LDA Mallet': cv_mallet, 'LDA Gensim': cv_gensim, 'LDA Sklearn': cv_sklearn}
    return pd.DataFrame(d)
    
def top15words_sklearn(lda_model, cv):
    for index,topic in enumerate(lda_model.components_):
        print(f'TOPIC {index}')
        print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
        print('\n')

In [5]:
# Lemmatization
def texts_only(sentence, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PROPN']):
    # Tokenization
    texts = [gensim.utils.simple_preprocess(str(sentence), deacc=True)]
    # Remove stopwords
    texts = [word for word in texts if word not in stop_words]
    # lemmatization
    texts_out = []
    for word in texts:
        doc = nlp(u" ".join(word))
        for token in doc:
            if token.pos_ in allowed_postags:
                 texts_out.append(token.lemma_)
    # Remove stopwords once more after lemmatization
    texts_out = [word for word in texts_out if word not in stop_words]
    texts_out = ' '.join(texts_out)
    return texts_out

# Stemming

# Limpieza de palabras
def word_cleaner(sent):
    import string,re
    from unidecode import unidecode
    text = sent.lower() 
    text = re.sub('\d+-\d+-\d+','',text)    # Elimina fechas en formato DD-MM-YYYY y D-M-YY
    text = re.sub('\d+/\d+/\d+','',text)    # Elimina fechas en formato DD/MM/YYYY y D/M/YY
    text = re.sub('\d+:\d+', '', text)      # Elimina horas en formato HH:MM 
    text = re.sub('\S*@\S*\s?','',text)     # Elimina emails
    text = re.sub(r'\d{7,9}','', text)      # Elimina teléfonos 
    text = re.sub('\[.*?¿\]\%', '', text)   # Elimina .*?¿\
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Elimina puntuación restante, como ¡! /
    text = re.sub('[%s]' % re.escape('°'), ' ', text)
    text = re.sub('[‘’“”…«»/]', '', text)
    text = unidecode(text)                  # Elimina tildes y acentos (ñ -> n)
    text = re.sub('\n', ' ', text)          # Elimina saltos de linea
    text = re.sub(r'\s{2,}', ' ', text)     # Elimina dos o más espacios en blancos seguidos
    text = re.sub(r"^\s+", "", text, flags=re.UNICODE)      # Elimina espacios blancos al principio
    text = re.sub("^\s+|\s+$", "", text, flags=re.UNICODE)  # Elimina espacios blancos al final
    text = re.sub('\w*\d\w*', '', text)    # Elimina digitos
    #text = re.sub(r'(.)\1+', r'\1', text)  # Elimina caracteres duplicados
    return text 

def texts_only_stem(sentence, stop_words=stop_words):
    import nltk
    from nltk import SnowballStemmer
    spanishstemmer=SnowballStemmer("spanish")
    sentence = word_cleaner(sentence)
    # Tokenization
    texts = [gensim.utils.simple_preprocess(str(sentence), deacc=True)]
    # Remove stopwords
    texts = [x for i in texts for x in i if x not in stop_words]
    # Stemming
    texts_out = [spanishstemmer.stem(token) for token in texts]
    texts_out = ' '.join(texts_out)
    return texts_out

#### Importar Data Frame

In [6]:
pd.options.display.max_colwidth = 12
df_total = pd.read_pickle('../1. OGC - Análisis y limpieza BD/BD_original_reducida_cleaned.pkl')
df_total.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,se cambi...,area de ...
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambio d...,area de ...
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,se visit...,area de ...
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplazo...,area de ...
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplazo...,area de ...


In [7]:
# OBSERVACIONES
data = [row for row in df_total['Observación']]
data[:2]

['se cambia placa de tk 2 gas95 por placa que indica que es producto 93 se pintan tapas con idenficacion de 93',
 'cambio de graficas de equipos no esta contemplado en estaciones delegadas']

In [8]:
len(data)

21840

In [9]:
# ASIGNAR DATA
texts,id2word,corpus = texts_id2word_corpus(texts = data)

In [9]:
# READ PICKLE ELEMENTS FROM ABOVE
with open('model_list1.pkl', 'rb') as f:
    model_list1 = pickle.load(f)
with open('model_list2.pkl', 'rb') as f:
    model_list2 = pickle.load(f)
with open('model_list3.pkl', 'rb') as f:
    model_list3 = pickle.load(f)
    
with open('coherence_values1.pkl', 'rb') as f:
    coherence_values1 = pickle.load(f)
with open('coherence_values2.pkl', 'rb') as f:
    coherence_values2 = pickle.load(f)
with open('coherence_values3.pkl', 'rb') as f:
    coherence_values3 = pickle.load(f)

with open('countvectorizer-sklearn.pkl', 'rb') as f:
    cv = pickle.load(f)

## II. COSINE SIMILARITY MATRIX (LEMMATIZATION)

### 1. LDA MALLET K = 10

In [17]:
# TÓPICO DOMINANTE Y LEMMATIZATION EN OBS CON K = 10
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[4])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

pd.options.display.max_colwidth = 12
df_topics_lemmatization = df_topic.copy()
df_topics_lemmatization['Observación'] = df_topics_lemmatization['Observación'].apply(texts_only)
df_topics_lemmatization.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambiar ...,area de ...,4
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambiar ...,area de ...,3
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,visitar ...,area de ...,0
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaza...,area de ...,3
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaza...,area de ...,3


In [19]:
rango = range(0,10,1)
d_topics = {}

for k in rango:
    dataframe_topics = df_topics_lemmatization[df_topics_lemmatization['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

In [20]:
from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.74,0.8,0.78,0.65,0.67,0.84,0.62,0.71,0.76
1,0.74,1.0,0.75,0.77,0.66,0.68,0.83,0.64,0.74,0.78
2,0.8,0.75,1.0,0.83,0.7,0.72,0.86,0.64,0.78,0.76
3,0.78,0.77,0.83,1.0,0.7,0.69,0.87,0.67,0.79,0.79
4,0.65,0.66,0.7,0.7,1.0,0.65,0.76,0.6,0.68,0.73
5,0.67,0.68,0.72,0.69,0.65,1.0,0.77,0.6,0.73,0.76
6,0.84,0.83,0.86,0.87,0.76,0.77,1.0,0.71,0.84,0.86
7,0.62,0.64,0.64,0.67,0.6,0.6,0.71,1.0,0.72,0.75
8,0.71,0.74,0.78,0.79,0.68,0.73,0.84,0.72,1.0,0.81
9,0.76,0.78,0.76,0.79,0.73,0.76,0.86,0.75,0.81,1.0


### 2. LDA MALLET K = 12

In [21]:
# TÓPICO DOMINANTE Y LEMMATIZATION EN OBS CON K = 12
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[5])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

pd.options.display.max_colwidth = 12
df_topics_lemmatization = df_topic.copy()
df_topics_lemmatization['Observación'] = df_topics_lemmatization['Observación'].apply(texts_only)
df_topics_lemmatization.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambiar ...,area de ...,11
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambiar ...,area de ...,4
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,visitar ...,area de ...,0
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaza...,area de ...,4
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaza...,area de ...,4


In [22]:
rango = range(0,12,1)
d_topics = {}

for k in rango:
    dataframe_topics = df_topics_lemmatization[df_topics_lemmatization['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

In [23]:
from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.75,0.76,0.73,0.68,0.78,0.75,0.82,0.71,0.49,0.64,0.71
1,0.75,1.0,0.77,0.73,0.74,0.76,0.81,0.84,0.71,0.55,0.63,0.81
2,0.76,0.77,1.0,0.78,0.72,0.77,0.77,0.82,0.75,0.5,0.61,0.77
3,0.73,0.73,0.78,1.0,0.65,0.8,0.74,0.82,0.8,0.41,0.66,0.72
4,0.68,0.74,0.72,0.65,1.0,0.72,0.74,0.76,0.68,0.63,0.6,0.76
5,0.78,0.76,0.77,0.8,0.72,1.0,0.8,0.84,0.77,0.46,0.69,0.77
6,0.75,0.81,0.77,0.74,0.74,0.8,1.0,0.83,0.75,0.54,0.63,0.8
7,0.82,0.84,0.82,0.82,0.76,0.84,0.83,1.0,0.8,0.54,0.71,0.81
8,0.71,0.71,0.75,0.8,0.68,0.77,0.75,0.8,1.0,0.45,0.61,0.74
9,0.49,0.55,0.5,0.41,0.63,0.46,0.54,0.54,0.45,1.0,0.39,0.53


### 3. LDA MALLET K = 16

In [24]:
# TÓPICO DOMINANTE Y LEMMATIZATION EN OBS CON K = 16
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[7])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

pd.options.display.max_colwidth = 12
df_topics_lemmatization = df_topic.copy()
df_topics_lemmatization['Observación'] = df_topics_lemmatization['Observación'].apply(texts_only)
df_topics_lemmatization.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambiar ...,area de ...,15
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambiar ...,area de ...,5
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,visitar ...,area de ...,10
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaza...,area de ...,8
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaza...,area de ...,8


In [25]:
rango = range(0,16,1)
d_topics = {}

for k in rango:
    dataframe_topics = df_topics_lemmatization[df_topics_lemmatization['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

In [26]:
from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.56,0.74,0.67,0.42,0.68,0.64,0.68,0.73,0.63,0.73,0.73,0.62,0.58,0.74,0.68
1,0.56,1.0,0.72,0.68,0.42,0.61,0.65,0.77,0.68,0.66,0.71,0.72,0.65,0.78,0.68,0.68
2,0.74,0.72,1.0,0.81,0.5,0.79,0.78,0.82,0.77,0.77,0.86,0.83,0.79,0.71,0.86,0.83
3,0.67,0.68,0.81,1.0,0.45,0.72,0.68,0.8,0.68,0.71,0.77,0.75,0.77,0.66,0.79,0.77
4,0.42,0.42,0.5,0.45,1.0,0.43,0.41,0.46,0.42,0.44,0.49,0.46,0.42,0.46,0.51,0.47
5,0.68,0.61,0.79,0.72,0.43,1.0,0.65,0.73,0.7,0.67,0.79,0.74,0.71,0.6,0.81,0.75
6,0.64,0.65,0.78,0.68,0.41,0.65,1.0,0.69,0.69,0.63,0.73,0.73,0.61,0.65,0.73,0.71
7,0.68,0.77,0.82,0.8,0.46,0.73,0.69,1.0,0.73,0.77,0.8,0.78,0.81,0.71,0.82,0.78
8,0.73,0.68,0.77,0.68,0.42,0.7,0.69,0.73,1.0,0.62,0.77,0.75,0.66,0.64,0.76,0.71
9,0.63,0.66,0.77,0.71,0.44,0.67,0.63,0.77,0.62,1.0,0.71,0.72,0.72,0.63,0.76,0.73


### 4. LDA MALLET K = 20

In [27]:
# TÓPICO DOMINANTE Y LEMMATIZATION EN OBS CON K = 20
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[9])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

pd.options.display.max_colwidth = 12
df_topics_lemmatization = df_topic.copy()
df_topics_lemmatization['Observación'] = df_topics_lemmatization['Observación'].apply(texts_only)
df_topics_lemmatization.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambiar ...,area de ...,7
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambiar ...,area de ...,2
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,visitar ...,area de ...,9
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaza...,area de ...,2
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaza...,area de ...,2


In [28]:
rango = range(0,20,1)
d_topics = {}

for k in rango:
    dataframe_topics = df_topics_lemmatization[df_topics_lemmatization['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

In [29]:
from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.48,0.64,0.76,0.58,0.52,0.71,0.58,0.56,0.66,0.51,0.66,0.61,0.65,0.76,0.65,0.56,0.58,0.55,0.72
1,0.48,1.0,0.52,0.61,0.48,0.44,0.53,0.46,0.47,0.54,0.41,0.55,0.51,0.5,0.58,0.57,0.44,0.5,0.41,0.52
2,0.64,0.52,1.0,0.77,0.71,0.56,0.7,0.6,0.66,0.74,0.52,0.7,0.64,0.7,0.76,0.69,0.6,0.62,0.56,0.77
3,0.76,0.61,0.77,1.0,0.67,0.61,0.75,0.66,0.68,0.79,0.57,0.8,0.73,0.72,0.86,0.75,0.68,0.68,0.61,0.78
4,0.58,0.48,0.71,0.67,1.0,0.54,0.62,0.61,0.65,0.66,0.58,0.66,0.55,0.68,0.71,0.68,0.57,0.57,0.46,0.73
5,0.52,0.44,0.56,0.61,0.54,1.0,0.55,0.49,0.51,0.61,0.43,0.59,0.53,0.56,0.66,0.56,0.47,0.5,0.47,0.59
6,0.71,0.53,0.7,0.75,0.62,0.55,1.0,0.59,0.58,0.71,0.5,0.72,0.64,0.63,0.77,0.66,0.54,0.62,0.53,0.72
7,0.58,0.46,0.6,0.66,0.61,0.49,0.59,1.0,0.61,0.66,0.72,0.71,0.59,0.71,0.77,0.81,0.64,0.68,0.56,0.7
8,0.56,0.47,0.66,0.68,0.65,0.51,0.58,0.61,1.0,0.69,0.53,0.65,0.61,0.68,0.71,0.69,0.53,0.6,0.46,0.7
9,0.66,0.54,0.74,0.79,0.66,0.61,0.71,0.66,0.69,1.0,0.61,0.77,0.65,0.74,0.8,0.74,0.65,0.68,0.56,0.75


### 5. LDA MALLET K = 24

In [30]:
# TÓPICO DOMINANTE Y LEMMATIZATION EN OBS CON K = 24
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[11])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

pd.options.display.max_colwidth = 12
df_topics_lemmatization = df_topic.copy()
df_topics_lemmatization['Observación'] = df_topics_lemmatization['Observación'].apply(texts_only)
df_topics_lemmatization.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambiar ...,area de ...,19
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambiar ...,area de ...,2
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,visitar ...,area de ...,3
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaza...,area de ...,10
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaza...,area de ...,10


In [31]:
rango = range(0,24,1)
d_topics = {}

for k in rango:
    dataframe_topics = df_topics_lemmatization[df_topics_lemmatization['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

In [32]:
from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,1.0,0.79,0.79,0.85,0.82,0.76,0.65,0.68,0.85,0.76,0.81,0.64,0.84,0.52,0.81,0.55,0.72,0.85,0.82,0.8,0.7,0.68,0.73,0.66
1,0.79,1.0,0.67,0.75,0.72,0.7,0.6,0.59,0.72,0.63,0.71,0.6,0.8,0.65,0.77,0.44,0.72,0.76,0.68,0.76,0.55,0.55,0.68,0.59
2,0.79,0.67,1.0,0.72,0.71,0.68,0.49,0.6,0.73,0.68,0.76,0.61,0.71,0.49,0.67,0.46,0.62,0.73,0.73,0.73,0.69,0.59,0.67,0.56
3,0.85,0.75,0.72,1.0,0.74,0.73,0.65,0.63,0.78,0.77,0.75,0.54,0.77,0.49,0.78,0.46,0.69,0.82,0.74,0.76,0.65,0.59,0.67,0.61
4,0.82,0.72,0.71,0.74,1.0,0.71,0.54,0.6,0.74,0.74,0.76,0.61,0.74,0.57,0.73,0.46,0.7,0.76,0.75,0.71,0.6,0.58,0.66,0.54
5,0.76,0.7,0.68,0.73,0.71,1.0,0.52,0.55,0.68,0.6,0.71,0.57,0.74,0.45,0.72,0.42,0.67,0.7,0.71,0.68,0.54,0.52,0.69,0.67
6,0.65,0.6,0.49,0.65,0.54,0.52,1.0,0.46,0.55,0.49,0.55,0.44,0.64,0.4,0.64,0.32,0.56,0.7,0.57,0.58,0.43,0.47,0.53,0.44
7,0.68,0.59,0.6,0.63,0.6,0.55,0.46,1.0,0.65,0.59,0.6,0.51,0.62,0.41,0.6,0.42,0.54,0.63,0.59,0.61,0.53,0.55,0.57,0.49
8,0.85,0.72,0.73,0.78,0.74,0.68,0.55,0.65,1.0,0.67,0.8,0.56,0.76,0.49,0.79,0.49,0.67,0.8,0.74,0.75,0.63,0.65,0.69,0.64
9,0.76,0.63,0.68,0.77,0.74,0.6,0.49,0.59,0.67,1.0,0.67,0.56,0.67,0.44,0.64,0.45,0.59,0.69,0.67,0.66,0.59,0.59,0.59,0.47


### 6. LDA SKLEARN K = 30

In [33]:
# TÓPICO DOMINANTE Y LEMMATIZATION EN OBS CON K = 30
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[14])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

pd.options.display.max_colwidth = 12
df_topics_lemmatization = df_topic.copy()
df_topics_lemmatization['Observación'] = df_topics_lemmatization['Observación'].apply(texts_only)
df_topics_lemmatization.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambiar ...,area de ...,26
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambiar ...,area de ...,13
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,visitar ...,area de ...,12
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaza...,area de ...,13
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaza...,area de ...,13


In [34]:
rango = range(0,30,1)
d_topics = {}

for k in rango:
    dataframe_topics = df_topics_lemmatization[df_topics_lemmatization['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

In [35]:
from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,1.0,0.6,0.53,0.58,0.77,0.34,0.7,0.67,0.61,0.8,0.65,0.78,0.69,0.68,0.72,0.62,0.58,0.52,0.58,0.54,0.61,0.76,0.61,0.73,0.7,0.6,0.68,0.66,0.49,0.75
1,0.6,1.0,0.56,0.45,0.71,0.61,0.58,0.59,0.64,0.77,0.67,0.67,0.64,0.63,0.52,0.6,0.53,0.57,0.64,0.55,0.59,0.6,0.52,0.61,0.62,0.4,0.58,0.6,0.62,0.61
2,0.53,0.56,1.0,0.35,0.61,0.28,0.46,0.48,0.62,0.66,0.63,0.61,0.52,0.52,0.52,0.45,0.43,0.48,0.6,0.58,0.49,0.54,0.37,0.57,0.5,0.43,0.55,0.51,0.38,0.53
3,0.58,0.45,0.35,1.0,0.56,0.24,0.52,0.51,0.44,0.61,0.53,0.58,0.65,0.49,0.53,0.52,0.52,0.33,0.4,0.41,0.48,0.55,0.42,0.54,0.51,0.37,0.66,0.5,0.4,0.6
4,0.77,0.71,0.61,0.56,1.0,0.45,0.7,0.72,0.71,0.86,0.77,0.78,0.72,0.71,0.67,0.7,0.62,0.62,0.7,0.62,0.67,0.74,0.6,0.71,0.75,0.53,0.69,0.72,0.61,0.8
5,0.34,0.61,0.28,0.24,0.45,1.0,0.32,0.3,0.42,0.47,0.38,0.39,0.34,0.41,0.3,0.36,0.29,0.4,0.39,0.31,0.32,0.3,0.28,0.36,0.35,0.21,0.33,0.33,0.83,0.35
6,0.7,0.58,0.46,0.52,0.7,0.32,1.0,0.72,0.57,0.75,0.68,0.71,0.74,0.61,0.61,0.58,0.54,0.48,0.6,0.49,0.77,0.67,0.73,0.68,0.65,0.46,0.6,0.63,0.45,0.7
7,0.67,0.59,0.48,0.51,0.72,0.3,0.72,1.0,0.6,0.76,0.7,0.73,0.72,0.6,0.6,0.57,0.57,0.5,0.56,0.53,0.71,0.71,0.62,0.64,0.63,0.48,0.59,0.65,0.44,0.67
8,0.61,0.64,0.62,0.44,0.71,0.42,0.57,0.6,1.0,0.74,0.75,0.68,0.59,0.57,0.6,0.52,0.51,0.52,0.64,0.62,0.58,0.64,0.56,0.61,0.6,0.47,0.59,0.65,0.51,0.62
9,0.8,0.77,0.66,0.61,0.86,0.47,0.75,0.76,0.74,1.0,0.8,0.85,0.79,0.74,0.69,0.72,0.65,0.63,0.72,0.64,0.71,0.77,0.66,0.77,0.78,0.54,0.75,0.74,0.63,0.78


## III. COSINE SIMILARITY MATRIX (STEMMING)

### 1. LDA MALLET K = 10

In [42]:
# TÓPICO DOMINANTE Y STEMMING EN OBS CON K = 10
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[4])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

# Apply Stemming function (texts_only_stem) to the DF
df_topics_stemming = df_topic.copy()
df_topics_stemming['Observación'] = df_topics_stemming['Observación'].apply(texts_only_stem)
df_topics_stemming.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambi pl...,area de ...,5
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambi gr...,area de ...,6
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,eds cheq...,area de ...,2
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaz ...,area de ...,0
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaz ...,area de ...,0


In [43]:
rango = range(0,10,1)
d_topics = {}
for k in rango:
    dataframe_topics = df_topics_stemming[df_topics_stemming['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Texts 
texts_stem = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in df_topics_stemming['Observación']]
# Dictionary
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Corpus
corpus = [dictionary.doc2bow(text) for text in texts_stem]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.48,0.56,0.36,0.53,0.19,0.61,0.52,0.51,0.35
1,0.48,1.0,0.77,0.54,0.6,0.21,0.73,0.61,0.68,0.51
2,0.56,0.77,1.0,0.53,0.61,0.26,0.82,0.73,0.73,0.62
3,0.36,0.54,0.53,1.0,0.56,0.19,0.61,0.43,0.62,0.4
4,0.53,0.6,0.61,0.56,1.0,0.37,0.71,0.56,0.69,0.44
5,0.19,0.21,0.26,0.19,0.37,1.0,0.3,0.33,0.31,0.22
6,0.61,0.73,0.82,0.61,0.71,0.3,1.0,0.76,0.81,0.61
7,0.52,0.61,0.73,0.43,0.56,0.33,0.76,1.0,0.59,0.58
8,0.51,0.68,0.73,0.62,0.69,0.31,0.81,0.59,1.0,0.64
9,0.35,0.51,0.62,0.4,0.44,0.22,0.61,0.58,0.64,1.0


### 2. LDA MALLET K = 12

In [44]:
# TÓPICO DOMINANTE Y STEMMING EN OBS CON K = 12
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[5])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

# Apply Stemming function (texts_only_stem) to the DF
df_topics_stemming = df_topic.copy()
df_topics_stemming['Observación'] = df_topics_stemming['Observación'].apply(texts_only_stem)
df_topics_stemming.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambi pl...,area de ...,3
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambi gr...,area de ...,9
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,eds cheq...,area de ...,4
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaz ...,area de ...,11
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaz ...,area de ...,11


In [45]:
rango = range(0,12,1)
d_topics = {}
for k in rango:
    dataframe_topics = df_topics_stemming[df_topics_stemming['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Texts 
texts_stem = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in df_topics_stemming['Observación']]
# Dictionary
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Corpus
corpus = [dictionary.doc2bow(text) for text in texts_stem]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.71,0.71,0.55,0.55,0.65,0.66,0.71,0.61,0.65,0.52,0.56
1,0.71,1.0,0.86,0.6,0.79,0.76,0.72,0.8,0.68,0.74,0.21,0.67
2,0.71,0.86,1.0,0.6,0.73,0.76,0.76,0.83,0.72,0.76,0.29,0.73
3,0.55,0.6,0.6,1.0,0.48,0.54,0.56,0.6,0.53,0.59,0.24,0.49
4,0.55,0.79,0.73,0.48,1.0,0.62,0.57,0.65,0.61,0.62,0.19,0.61
5,0.65,0.76,0.76,0.54,0.62,1.0,0.65,0.72,0.61,0.72,0.21,0.6
6,0.66,0.72,0.76,0.56,0.57,0.65,1.0,0.83,0.64,0.64,0.29,0.74
7,0.71,0.8,0.83,0.6,0.65,0.72,0.83,1.0,0.68,0.68,0.32,0.78
8,0.61,0.68,0.72,0.53,0.61,0.61,0.64,0.68,1.0,0.62,0.35,0.54
9,0.65,0.74,0.76,0.59,0.62,0.72,0.64,0.68,0.62,1.0,0.11,0.67


### 3. LDA MALLET K = 16

In [46]:
# TÓPICO DOMINANTE Y STEMMING EN OBS CON K = 16
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[7])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

# Apply Stemming function (texts_only_stem) to the DF
df_topics_stemming = df_topic.copy()
df_topics_stemming['Observación'] = df_topics_stemming['Observación'].apply(texts_only_stem)
df_topics_stemming.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambi pl...,area de ...,14
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambi gr...,area de ...,6
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,eds cheq...,area de ...,2
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaz ...,area de ...,1
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaz ...,area de ...,1


In [47]:
rango = range(0,16,1)
d_topics = {}
for k in rango:
    dataframe_topics = df_topics_stemming[df_topics_stemming['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Texts 
texts_stem = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in df_topics_stemming['Observación']]
# Dictionary
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Corpus
corpus = [dictionary.doc2bow(text) for text in texts_stem]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.53,0.71,0.56,0.5,0.67,0.69,0.54,0.58,0.38,0.7,0.62,0.34,0.54,0.73,0.55
1,0.53,1.0,0.67,0.51,0.44,0.58,0.59,0.48,0.5,0.32,0.63,0.67,0.34,0.51,0.69,0.62
2,0.71,0.67,1.0,0.65,0.59,0.77,0.73,0.64,0.68,0.48,0.81,0.77,0.48,0.67,0.87,0.71
3,0.56,0.51,0.65,1.0,0.57,0.63,0.55,0.57,0.57,0.37,0.61,0.6,0.35,0.52,0.66,0.54
4,0.5,0.44,0.59,0.57,1.0,0.51,0.6,0.45,0.59,0.35,0.55,0.53,0.26,0.59,0.66,0.48
5,0.67,0.58,0.77,0.63,0.51,1.0,0.66,0.63,0.68,0.58,0.8,0.64,0.53,0.64,0.8,0.6
6,0.69,0.59,0.73,0.55,0.6,0.66,1.0,0.61,0.59,0.37,0.71,0.68,0.29,0.61,0.79,0.67
7,0.54,0.48,0.64,0.57,0.45,0.63,0.61,1.0,0.54,0.44,0.64,0.59,0.33,0.48,0.67,0.57
8,0.58,0.5,0.68,0.57,0.59,0.68,0.59,0.54,1.0,0.55,0.72,0.58,0.57,0.64,0.72,0.57
9,0.38,0.32,0.48,0.37,0.35,0.58,0.37,0.44,0.55,1.0,0.56,0.39,0.58,0.34,0.51,0.35


### 4. LDA MALLET K = 20

In [48]:
# TÓPICO DOMINANTE Y STEMMING EN OBS CON K = 20
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[9])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

# Apply Stemming function (texts_only_stem) to the DF
df_topics_stemming = df_topic.copy()
df_topics_stemming['Observación'] = df_topics_stemming['Observación'].apply(texts_only_stem)
df_topics_stemming.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambi pl...,area de ...,7
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambi gr...,area de ...,2
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,eds cheq...,area de ...,9
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaz ...,area de ...,2
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaz ...,area de ...,2


In [49]:
rango = range(0,20,1)
d_topics = {}
for k in rango:
    dataframe_topics = df_topics_stemming[df_topics_stemming['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Texts 
texts_stem = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in df_topics_stemming['Observación']]
# Dictionary
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Corpus
corpus = [dictionary.doc2bow(text) for text in texts_stem]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.74,0.56,0.73,0.4,0.37,0.61,0.69,0.64,0.74,0.27,0.33,0.61,0.17,0.74,0.49,0.57,0.37,0.63,0.65
1,0.74,1.0,0.56,0.69,0.43,0.41,0.54,0.72,0.68,0.67,0.3,0.34,0.62,0.19,0.72,0.41,0.51,0.36,0.5,0.6
2,0.56,0.56,1.0,0.63,0.37,0.4,0.62,0.52,0.58,0.62,0.23,0.38,0.46,0.23,0.71,0.47,0.45,0.42,0.44,0.62
3,0.73,0.69,0.63,1.0,0.42,0.4,0.59,0.68,0.69,0.74,0.27,0.34,0.59,0.17,0.76,0.45,0.55,0.45,0.54,0.65
4,0.4,0.43,0.37,0.42,1.0,0.4,0.32,0.44,0.5,0.48,0.24,0.25,0.53,0.14,0.5,0.31,0.33,0.29,0.32,0.44
5,0.37,0.41,0.4,0.4,0.4,1.0,0.39,0.33,0.39,0.44,0.47,0.29,0.33,0.2,0.5,0.34,0.29,0.3,0.25,0.41
6,0.61,0.54,0.62,0.59,0.32,0.39,1.0,0.6,0.58,0.58,0.23,0.41,0.4,0.27,0.72,0.71,0.47,0.51,0.4,0.61
7,0.69,0.72,0.52,0.68,0.44,0.33,0.6,1.0,0.74,0.64,0.3,0.33,0.57,0.18,0.72,0.46,0.62,0.42,0.62,0.61
8,0.64,0.68,0.58,0.69,0.5,0.39,0.58,0.74,1.0,0.71,0.28,0.39,0.62,0.19,0.74,0.43,0.58,0.48,0.49,0.63
9,0.74,0.67,0.62,0.74,0.48,0.44,0.58,0.64,0.71,1.0,0.28,0.43,0.64,0.19,0.78,0.45,0.57,0.45,0.62,0.64


### 5. LDA MALLET K = 24

In [50]:
# TÓPICO DOMINANTE Y STEMMING EN OBS CON K = 24
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[11])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

# Apply Stemming function (texts_only_stem) to the DF
df_topics_stemming = df_topic.copy()
df_topics_stemming['Observación'] = df_topics_stemming['Observación'].apply(texts_only_stem)
df_topics_stemming.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambi pl...,area de ...,21
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambi gr...,area de ...,4
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,eds cheq...,area de ...,8
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaz ...,area de ...,17
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaz ...,area de ...,17


In [51]:
rango = range(0,24,1)
d_topics = {}
for k in rango:
    dataframe_topics = df_topics_stemming[df_topics_stemming['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Texts 
texts_stem = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in df_topics_stemming['Observación']]
# Dictionary
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Corpus
corpus = [dictionary.doc2bow(text) for text in texts_stem]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,1.0,0.47,0.69,0.72,0.77,0.52,0.64,0.57,0.58,0.55,0.64,0.58,0.62,0.7,0.72,0.26,0.35,0.78,0.46,0.74,0.51,0.54,0.27,0.54
1,0.47,1.0,0.42,0.46,0.44,0.35,0.4,0.39,0.33,0.55,0.41,0.35,0.45,0.48,0.39,0.12,0.19,0.49,0.25,0.49,0.34,0.38,0.21,0.42
2,0.69,0.42,1.0,0.71,0.65,0.52,0.59,0.58,0.57,0.58,0.53,0.54,0.58,0.61,0.67,0.22,0.33,0.7,0.37,0.7,0.47,0.49,0.22,0.58
3,0.72,0.46,0.71,1.0,0.73,0.53,0.64,0.55,0.53,0.56,0.55,0.62,0.62,0.63,0.7,0.18,0.3,0.7,0.38,0.69,0.51,0.52,0.31,0.59
4,0.77,0.44,0.65,0.73,1.0,0.48,0.64,0.58,0.58,0.61,0.6,0.61,0.63,0.68,0.73,0.27,0.42,0.75,0.43,0.77,0.49,0.56,0.31,0.56
5,0.52,0.35,0.52,0.53,0.48,1.0,0.5,0.38,0.37,0.38,0.43,0.39,0.45,0.5,0.47,0.12,0.19,0.45,0.3,0.46,0.44,0.37,0.16,0.35
6,0.64,0.4,0.59,0.64,0.64,0.5,1.0,0.52,0.54,0.48,0.49,0.48,0.55,0.54,0.59,0.15,0.23,0.56,0.33,0.57,0.4,0.44,0.22,0.46
7,0.57,0.39,0.58,0.55,0.58,0.38,0.52,1.0,0.47,0.51,0.42,0.42,0.49,0.52,0.57,0.15,0.27,0.6,0.29,0.55,0.39,0.46,0.23,0.52
8,0.58,0.33,0.57,0.53,0.58,0.37,0.54,0.47,1.0,0.57,0.53,0.5,0.43,0.53,0.54,0.31,0.55,0.66,0.31,0.6,0.36,0.45,0.2,0.39
9,0.55,0.55,0.58,0.56,0.61,0.38,0.48,0.51,0.57,1.0,0.5,0.46,0.46,0.52,0.52,0.26,0.45,0.67,0.29,0.64,0.33,0.49,0.26,0.51


### 6. LDA MALLET K = 30

In [52]:
# TÓPICO DOMINANTE Y STEMMING EN OBS CON K = 30
LDAmallet_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list1[14])

df_topic_sents_keywords = format_topics_sentences(ldamodel=LDAmallet_gensim, corpus=corpus, texts=texts)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_topic = df_total.copy()
df_topic['Topico Dominante'] = df_dominant_topic['Dominant_Topic']
df_topic['Topico Dominante'] = df_topic['Topico Dominante'].astype(int)

# Apply Stemming function (texts_only_stem) to the DF
df_topics_stemming = df_topic.copy()
df_topics_stemming['Observación'] = df_topics_stemming['Observación'].apply(texts_only_stem)
df_topics_stemming.head()

Unnamed: 0,Region,Estacion,Tipo de Falla,Prioridad Cliente,Estado OT,Tecnico Asignado,Dia,Nro Dia,Mes,Hora,Rango Recep/Arribo,Rango Arribo/Cierre,Observación,Requerimiento,Topico Dominante
0,13°,Tingaro ...,Carrete ...,Normal 4...,CERRADA,Juan Aliaga,Viernes,30,AGO 19,16,Error en...,Más de 1...,cambi pl...,area de ...,4
1,8°,Combusti...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,cambi gr...,area de ...,24
2,10°,LLANOS Y...,Carrete ...,Normal 2...,PRECIERRE,Luis Alm...,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,eds cheq...,area de ...,22
3,1°,Soc. Imp...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,0 a 24 h...,Menos de 1h,remplaz ...,area de ...,24
4,3°,Inmobili...,Carrete ...,Normal (...,CERRADA,On Line,Viernes,30,AGO 19,12,Error en...,Menos de 1h,remplaz ...,area de ...,24


In [53]:
rango = range(0,30,1)
d_topics = {}
for k in rango:
    dataframe_topics = df_topics_stemming[df_topics_stemming['Topico Dominante']==k]
    d_topics[f'Topico {k}'] = [i for i in  dataframe_topics['Observación']]
    
# Texts 
texts_stem = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in df_topics_stemming['Observación']]
# Dictionary
dictionary = corpora.Dictionary([simple_preprocess(doc) for i in rango for doc in d_topics[f'Topico {i}']])
# Corpus
corpus = [dictionary.doc2bow(text) for text in texts_stem]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

sentences_x_topic = [' '.join(d_topics[f'Topico {i}']) for i in rango]
sentences = [dictionary.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,1.0,0.54,0.65,0.65,0.44,0.39,0.32,0.48,0.34,0.55,0.33,0.59,0.29,0.64,0.4,0.4,0.51,0.49,0.5,0.45,0.55,0.58,0.42,0.52,0.5,0.38,0.53,0.52,0.45,0.48
1,0.54,1.0,0.6,0.44,0.6,0.66,0.42,0.54,0.41,0.74,0.45,0.76,0.25,0.62,0.63,0.47,0.54,0.66,0.71,0.55,0.49,0.77,0.59,0.65,0.7,0.56,0.57,0.65,0.67,0.75
2,0.65,0.6,1.0,0.55,0.54,0.42,0.38,0.51,0.46,0.6,0.4,0.72,0.28,0.66,0.52,0.52,0.54,0.56,0.59,0.54,0.52,0.64,0.45,0.51,0.57,0.42,0.62,0.64,0.54,0.57
3,0.65,0.44,0.55,1.0,0.4,0.3,0.32,0.46,0.23,0.45,0.32,0.49,0.26,0.64,0.44,0.41,0.56,0.41,0.53,0.41,0.48,0.51,0.38,0.46,0.45,0.31,0.53,0.5,0.41,0.43
4,0.44,0.6,0.54,0.4,1.0,0.48,0.35,0.42,0.55,0.62,0.31,0.64,0.23,0.52,0.44,0.44,0.4,0.49,0.61,0.43,0.38,0.6,0.4,0.46,0.52,0.47,0.45,0.55,0.48,0.6
5,0.39,0.66,0.42,0.3,0.48,1.0,0.3,0.38,0.3,0.61,0.47,0.6,0.16,0.46,0.36,0.33,0.42,0.57,0.48,0.4,0.37,0.61,0.39,0.48,0.52,0.5,0.39,0.45,0.43,0.59
6,0.32,0.42,0.38,0.32,0.35,0.3,1.0,0.35,0.28,0.42,0.26,0.53,0.16,0.43,0.31,0.32,0.32,0.39,0.43,0.37,0.34,0.4,0.35,0.41,0.43,0.31,0.37,0.38,0.36,0.45
7,0.48,0.54,0.51,0.46,0.42,0.38,0.35,1.0,0.3,0.57,0.31,0.55,0.35,0.65,0.46,0.47,0.46,0.46,0.54,0.31,0.44,0.55,0.44,0.53,0.55,0.43,0.51,0.6,0.5,0.54
8,0.34,0.41,0.46,0.23,0.55,0.3,0.28,0.3,1.0,0.42,0.23,0.46,0.15,0.35,0.33,0.33,0.29,0.34,0.41,0.3,0.28,0.39,0.3,0.32,0.36,0.37,0.33,0.37,0.33,0.42
9,0.55,0.74,0.6,0.45,0.62,0.61,0.42,0.57,0.42,1.0,0.47,0.73,0.26,0.64,0.61,0.49,0.51,0.69,0.7,0.53,0.49,0.7,0.52,0.62,0.64,0.59,0.55,0.68,0.66,0.7


## IV. COSINE SIMILARITY MATRIX (ONLY TOP WORDS)

In [8]:
# Limpieza de palabras 2.0
def word_cleaner(text):
    import string,re
    from unidecode import unidecode
    text = re.sub('\w*\d\w*', '', text)    # Elimina digitos
    text = re.sub('\[.*?¿\]\%', '', text)   # Elimina .*?¿\
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Elimina puntuación restante, como ¡! /
    text = re.sub('[%s]' % re.escape('°'), ' ', text)
    text = re.sub('[‘’“”"+…«»/]', '', text)
    text = re.sub(r'\s{2,}', ' ', text)     # Elimina dos o más espacios en blancos seguidos
    text = re.sub(r"^\s+", "", text, flags=re.UNICODE)      # Elimina espacios blancos al principio
    text = re.sub("^\s+|\s+$", "", text, flags=re.UNICODE)  # Elimina espacios blancos al final
    return text 

### 1. LDA MALLET K = 10

In [100]:
optimal_model1 = model_list1[4]
rango = range(0,10,1)

In [101]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=1000, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.11,0.18,0.15,0.16,0.13,0.24,0.15,0.19,0.16
1,0.11,1.0,0.12,0.09,0.1,0.11,0.15,0.1,0.1,0.1
2,0.18,0.12,1.0,0.15,0.14,0.13,0.27,0.14,0.19,0.17
3,0.15,0.09,0.15,1.0,0.12,0.11,0.22,0.13,0.17,0.14
4,0.16,0.1,0.14,0.12,1.0,0.14,0.21,0.15,0.16,0.14
5,0.13,0.11,0.13,0.11,0.14,1.0,0.17,0.11,0.12,0.12
6,0.24,0.15,0.27,0.22,0.21,0.17,1.0,0.23,0.31,0.26
7,0.15,0.1,0.14,0.13,0.15,0.11,0.23,1.0,0.17,0.15
8,0.19,0.1,0.19,0.17,0.16,0.12,0.31,0.17,1.0,0.18
9,0.16,0.1,0.17,0.14,0.14,0.12,0.26,0.15,0.18,1.0


### 2. LDA MALLET K = 12

In [102]:
optimal_model1 = model_list1[5]
rango = range(0,12,1)

In [103]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=1000, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.18,0.17,0.2,0.13,0.15,0.11,0.15,0.15,0.14,0.25,0.16
1,0.18,1.0,0.29,0.16,0.18,0.19,0.11,0.25,0.22,0.15,0.14,0.24
2,0.17,0.29,1.0,0.16,0.2,0.17,0.11,0.21,0.22,0.14,0.15,0.22
3,0.2,0.16,0.16,1.0,0.13,0.14,0.11,0.14,0.14,0.14,0.21,0.16
4,0.13,0.18,0.2,0.13,1.0,0.16,0.11,0.17,0.16,0.13,0.12,0.18
5,0.15,0.19,0.17,0.14,0.16,1.0,0.1,0.17,0.15,0.15,0.13,0.16
6,0.11,0.11,0.11,0.11,0.11,0.1,1.0,0.1,0.11,0.09,0.12,0.11
7,0.15,0.25,0.21,0.14,0.17,0.17,0.1,1.0,0.18,0.13,0.13,0.21
8,0.15,0.22,0.22,0.14,0.16,0.15,0.11,0.18,1.0,0.14,0.13,0.2
9,0.14,0.15,0.14,0.14,0.13,0.15,0.09,0.13,0.14,1.0,0.19,0.13


### 3. LDA MALLET K = 16

In [104]:
optimal_model1 = model_list1[7]
rango = range(0,16,1)

In [105]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=1000, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.21,0.13,0.34,0.25,0.18,0.3,0.24,0.39,0.27,0.14,0.23,0.27,0.37,0.16,0.28
1,0.21,1.0,0.12,0.28,0.32,0.15,0.24,0.23,0.3,0.28,0.12,0.17,0.2,0.3,0.13,0.19
2,0.13,0.12,1.0,0.14,0.14,0.17,0.14,0.16,0.17,0.14,0.18,0.19,0.14,0.14,0.19,0.17
3,0.34,0.28,0.14,1.0,0.31,0.19,0.35,0.26,0.46,0.32,0.13,0.21,0.31,0.48,0.14,0.28
4,0.25,0.32,0.14,0.31,1.0,0.16,0.29,0.22,0.32,0.31,0.15,0.18,0.24,0.31,0.15,0.22
5,0.18,0.15,0.17,0.19,0.16,1.0,0.18,0.15,0.22,0.14,0.16,0.2,0.18,0.21,0.2,0.2
6,0.3,0.24,0.14,0.35,0.29,0.18,1.0,0.27,0.36,0.29,0.16,0.19,0.24,0.35,0.17,0.3
7,0.24,0.23,0.16,0.26,0.22,0.15,0.27,1.0,0.28,0.33,0.15,0.19,0.19,0.31,0.18,0.24
8,0.39,0.3,0.17,0.46,0.32,0.22,0.36,0.28,1.0,0.33,0.17,0.26,0.34,0.49,0.2,0.34
9,0.27,0.28,0.14,0.32,0.31,0.14,0.29,0.33,0.33,1.0,0.14,0.18,0.22,0.35,0.16,0.22


### 4. LDA MALLET K = 20

In [106]:
optimal_model1 = model_list1[9]
rango = range(0,20,1)

In [107]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=1000, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.51,0.41,0.25,0.37,0.29,0.48,0.35,0.34,0.29,0.41,0.3,0.27,0.5,0.17,0.41,0.44,0.38,0.5,0.32
1,0.51,1.0,0.39,0.21,0.38,0.28,0.47,0.35,0.34,0.3,0.41,0.3,0.26,0.51,0.15,0.4,0.42,0.36,0.53,0.29
2,0.41,0.39,1.0,0.27,0.35,0.28,0.37,0.31,0.32,0.3,0.36,0.27,0.23,0.42,0.17,0.36,0.38,0.3,0.41,0.31
3,0.25,0.21,0.27,1.0,0.24,0.24,0.16,0.23,0.25,0.29,0.17,0.2,0.17,0.18,0.25,0.21,0.2,0.21,0.19,0.28
4,0.37,0.38,0.35,0.24,1.0,0.3,0.33,0.45,0.26,0.26,0.31,0.37,0.34,0.34,0.17,0.29,0.29,0.39,0.44,0.27
5,0.29,0.28,0.28,0.24,0.3,1.0,0.25,0.29,0.23,0.22,0.24,0.27,0.24,0.26,0.17,0.24,0.23,0.25,0.31,0.24
6,0.48,0.47,0.37,0.16,0.33,0.25,1.0,0.31,0.31,0.26,0.4,0.26,0.23,0.49,0.12,0.38,0.42,0.33,0.47,0.27
7,0.35,0.35,0.31,0.23,0.45,0.29,0.31,1.0,0.24,0.24,0.29,0.36,0.33,0.32,0.16,0.26,0.27,0.37,0.41,0.27
8,0.34,0.34,0.32,0.25,0.26,0.23,0.31,0.24,1.0,0.27,0.29,0.2,0.18,0.33,0.15,0.33,0.32,0.26,0.31,0.24
9,0.29,0.3,0.3,0.29,0.26,0.22,0.26,0.24,0.27,1.0,0.23,0.17,0.17,0.28,0.16,0.25,0.34,0.23,0.31,0.24


#### 4.1. LDA MALLET K = 22

In [28]:
optimal_model1 = model_list1[10]
rango = range(0,22,1)

In [29]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=1000, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,1.0,0.32,0.35,0.34,0.25,0.35,0.34,0.33,0.37,0.35,0.2,0.34,0.37,0.35,0.33,0.32,0.27,0.25,0.3,0.38,0.35,0.34
1,0.32,1.0,0.46,0.44,0.22,0.46,0.44,0.33,0.44,0.56,0.15,0.42,0.41,0.48,0.43,0.24,0.25,0.19,0.24,0.4,0.49,0.42
2,0.35,0.46,1.0,0.57,0.26,0.59,0.54,0.38,0.55,0.51,0.21,0.53,0.56,0.63,0.55,0.26,0.3,0.23,0.28,0.5,0.62,0.49
3,0.34,0.44,0.57,1.0,0.25,0.55,0.52,0.36,0.56,0.47,0.2,0.53,0.54,0.61,0.53,0.26,0.31,0.24,0.26,0.5,0.6,0.47
4,0.25,0.22,0.26,0.25,1.0,0.23,0.23,0.23,0.27,0.25,0.23,0.23,0.26,0.25,0.24,0.26,0.28,0.27,0.24,0.26,0.28,0.22
5,0.35,0.46,0.59,0.55,0.23,1.0,0.53,0.38,0.52,0.49,0.17,0.52,0.53,0.62,0.54,0.27,0.3,0.23,0.27,0.49,0.6,0.48
6,0.34,0.44,0.54,0.52,0.23,0.53,1.0,0.37,0.52,0.48,0.19,0.51,0.51,0.56,0.53,0.25,0.28,0.21,0.26,0.49,0.56,0.49
7,0.33,0.33,0.38,0.36,0.23,0.38,0.37,1.0,0.39,0.36,0.18,0.36,0.39,0.38,0.36,0.28,0.27,0.23,0.29,0.38,0.38,0.37
8,0.37,0.44,0.55,0.56,0.27,0.52,0.52,0.39,1.0,0.47,0.22,0.53,0.55,0.57,0.53,0.29,0.32,0.27,0.29,0.51,0.59,0.47
9,0.35,0.56,0.51,0.47,0.25,0.49,0.48,0.36,0.47,1.0,0.17,0.45,0.47,0.51,0.47,0.27,0.27,0.22,0.27,0.45,0.51,0.44


### 5. LDA MALLET K = 24

In [10]:
optimal_model1 = model_list1[11]
rango = range(0,24,1)

In [13]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=920, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,1.0,0.19,0.23,0.17,0.19,0.16,0.16,0.16,0.16,0.16,0.22,0.18,0.17,0.14,0.21,0.18,0.16,0.21,0.21,0.22,0.17,0.13,0.18,0.17
1,0.19,1.0,0.25,0.33,0.3,0.32,0.32,0.21,0.23,0.23,0.27,0.36,0.32,0.27,0.33,0.34,0.32,0.24,0.35,0.32,0.29,0.25,0.33,0.33
2,0.23,0.25,1.0,0.27,0.27,0.28,0.27,0.21,0.22,0.2,0.24,0.28,0.29,0.25,0.26,0.31,0.27,0.27,0.32,0.33,0.3,0.19,0.28,0.29
3,0.17,0.33,0.27,1.0,0.35,0.61,0.63,0.39,0.23,0.41,0.31,0.55,0.56,0.43,0.43,0.54,0.58,0.28,0.56,0.39,0.39,0.44,0.62,0.61
4,0.19,0.3,0.27,0.35,1.0,0.34,0.35,0.29,0.18,0.3,0.32,0.37,0.36,0.29,0.32,0.35,0.33,0.33,0.38,0.31,0.27,0.29,0.34,0.34
5,0.16,0.32,0.28,0.61,0.34,1.0,0.62,0.38,0.22,0.41,0.32,0.54,0.55,0.44,0.43,0.56,0.58,0.25,0.56,0.39,0.39,0.44,0.62,0.6
6,0.16,0.32,0.27,0.63,0.35,0.62,1.0,0.38,0.23,0.41,0.33,0.56,0.55,0.43,0.43,0.57,0.58,0.25,0.56,0.4,0.39,0.45,0.64,0.63
7,0.16,0.21,0.21,0.39,0.29,0.38,0.38,1.0,0.12,0.44,0.34,0.38,0.39,0.3,0.32,0.39,0.38,0.3,0.39,0.26,0.24,0.34,0.38,0.39
8,0.16,0.23,0.22,0.23,0.18,0.22,0.23,0.12,1.0,0.12,0.14,0.24,0.23,0.22,0.21,0.24,0.22,0.18,0.25,0.26,0.23,0.15,0.23,0.24
9,0.16,0.23,0.2,0.41,0.3,0.41,0.41,0.44,0.12,1.0,0.33,0.41,0.41,0.31,0.34,0.42,0.4,0.3,0.42,0.28,0.28,0.36,0.41,0.42


#### 5.1. LDA MALLET K = 26

In [31]:
optimal_model1 = model_list1[12]
rango = range(0,26,1)

In [32]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=1000, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,1.0,0.61,0.38,0.64,0.43,0.5,0.51,0.29,0.64,0.57,0.52,0.54,0.62,0.46,0.55,0.44,0.49,0.63,0.28,0.57,0.53,0.5,0.48,0.56,0.51,0.59
1,0.61,1.0,0.37,0.62,0.44,0.54,0.52,0.28,0.67,0.61,0.5,0.54,0.65,0.46,0.55,0.42,0.5,0.65,0.27,0.61,0.53,0.51,0.47,0.58,0.49,0.61
2,0.38,0.37,1.0,0.39,0.37,0.39,0.38,0.3,0.41,0.37,0.36,0.36,0.37,0.38,0.39,0.35,0.44,0.38,0.29,0.38,0.38,0.37,0.4,0.38,0.46,0.39
3,0.64,0.62,0.39,1.0,0.45,0.52,0.5,0.31,0.64,0.58,0.51,0.54,0.61,0.48,0.54,0.44,0.5,0.62,0.29,0.58,0.52,0.49,0.49,0.56,0.51,0.59
4,0.43,0.44,0.37,0.45,1.0,0.39,0.42,0.34,0.44,0.41,0.42,0.43,0.41,0.47,0.42,0.44,0.39,0.42,0.3,0.43,0.4,0.38,0.53,0.4,0.45,0.44
5,0.5,0.54,0.39,0.52,0.39,1.0,0.47,0.25,0.59,0.56,0.46,0.46,0.56,0.41,0.47,0.39,0.48,0.55,0.25,0.52,0.48,0.48,0.43,0.49,0.49,0.5
6,0.51,0.52,0.38,0.5,0.42,0.47,1.0,0.26,0.52,0.51,0.45,0.5,0.52,0.43,0.51,0.42,0.46,0.5,0.23,0.52,0.51,0.49,0.46,0.51,0.47,0.52
7,0.29,0.28,0.3,0.31,0.34,0.25,0.26,1.0,0.28,0.25,0.28,0.26,0.26,0.31,0.29,0.26,0.28,0.26,0.32,0.26,0.24,0.22,0.32,0.27,0.29,0.28
8,0.64,0.67,0.41,0.64,0.44,0.59,0.52,0.28,1.0,0.67,0.5,0.55,0.69,0.46,0.55,0.43,0.53,0.7,0.3,0.63,0.53,0.53,0.48,0.58,0.54,0.61
9,0.57,0.61,0.37,0.58,0.41,0.56,0.51,0.25,0.67,1.0,0.47,0.52,0.66,0.43,0.54,0.4,0.48,0.63,0.24,0.62,0.53,0.52,0.44,0.57,0.48,0.58


### 6. LDA MALLET K = 30

In [110]:
optimal_model1 = model_list1[14]
rango = range(0,30,1)

In [111]:
# Diccionario con las top 1000 palabras de los topicos 
words_per_topic = {}
for topic in rango:
    topwords = optimal_model1.print_topics(num_words=1000, num_topics= 100)[topic][1]
    words_per_topic[f'Topico {topic}'] = word_cleaner(topwords).split(' ')

# Texts 
texts_topics = [[word for word in simple_preprocess(str(words_per_topic[f'Topico {i}']))] for i in rango]
# Dictionary
dictionary_topics = corpora.Dictionary([simple_preprocess(word) for i in rango for word in words_per_topic[f'Topico {i}']])
# Corpus
corpus = [dictionary_topics.doc2bow(text) for text in texts_topics]
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary_topics, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)


sentences_x_topic = [' '.join(words_per_topic[f'Topico {i}']) for i in rango]
sentences = [dictionary_topics.doc2bow(simple_preprocess(i)) for i in sentences_x_topic]
len(sentences)

from IPython.display import display
pd.options.display.max_columns = None
display(create_soft_cossim_matrix(sentences))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,1.0,0.46,0.66,0.65,0.53,0.66,0.59,0.59,0.37,0.32,0.66,0.33,0.5,0.66,0.65,0.56,0.65,0.63,0.46,0.66,0.65,0.51,0.67,0.43,0.62,0.61,0.59,0.55,0.59,0.57
1,0.46,1.0,0.49,0.47,0.47,0.48,0.44,0.43,0.39,0.32,0.48,0.37,0.45,0.46,0.46,0.42,0.45,0.47,0.44,0.47,0.47,0.49,0.48,0.48,0.44,0.46,0.49,0.49,0.47,0.45
2,0.66,0.49,1.0,0.67,0.53,0.68,0.6,0.61,0.38,0.34,0.67,0.35,0.51,0.67,0.66,0.56,0.67,0.64,0.49,0.67,0.67,0.52,0.69,0.44,0.63,0.63,0.6,0.55,0.6,0.59
3,0.65,0.47,0.67,1.0,0.52,0.69,0.61,0.64,0.41,0.32,0.66,0.34,0.53,0.67,0.69,0.59,0.72,0.63,0.49,0.67,0.74,0.52,0.74,0.43,0.65,0.63,0.6,0.55,0.59,0.58
4,0.53,0.47,0.53,0.52,1.0,0.54,0.5,0.49,0.37,0.32,0.55,0.36,0.46,0.54,0.52,0.48,0.52,0.54,0.43,0.54,0.52,0.52,0.54,0.45,0.5,0.51,0.52,0.51,0.52,0.51
5,0.66,0.48,0.68,0.69,0.54,1.0,0.59,0.62,0.39,0.33,0.68,0.34,0.51,0.67,0.67,0.58,0.68,0.64,0.48,0.68,0.68,0.53,0.7,0.44,0.64,0.62,0.6,0.55,0.59,0.59
6,0.59,0.44,0.6,0.61,0.5,0.59,1.0,0.6,0.36,0.29,0.57,0.33,0.49,0.58,0.6,0.57,0.6,0.56,0.45,0.59,0.62,0.5,0.6,0.41,0.6,0.58,0.56,0.51,0.56,0.55
7,0.59,0.43,0.61,0.64,0.49,0.62,0.6,1.0,0.39,0.28,0.61,0.33,0.49,0.6,0.62,0.58,0.63,0.56,0.44,0.61,0.64,0.49,0.63,0.4,0.62,0.57,0.55,0.51,0.55,0.54
8,0.37,0.39,0.38,0.41,0.37,0.39,0.36,0.39,1.0,0.3,0.4,0.31,0.38,0.37,0.38,0.36,0.39,0.38,0.37,0.37,0.41,0.38,0.43,0.36,0.37,0.37,0.38,0.39,0.37,0.37
9,0.32,0.32,0.34,0.32,0.32,0.33,0.29,0.28,0.3,1.0,0.35,0.27,0.35,0.32,0.32,0.27,0.31,0.34,0.36,0.32,0.32,0.31,0.34,0.31,0.3,0.31,0.33,0.33,0.32,0.3
