# APLICACIÓN KMEANS EN LOS 20 TÓPICOS ENCONTRADOS POR LDA

**Contenido**

Análisis de tópicos del 0 al 19 en orden ascendente.

El objetivo de este notebook es aplicar el algoritmo K-means a los tópicos encontrados en el tercer notebook, con el fin de extraer patrones de comportamiento.


<div class="alert alert-info" style="margin: 20px">
Se decide crear este algoritmo con el fin de obtener un mayor grado de detalle en las OT de la base de datos. En otras palabras, se busca identificar clusters específicos, conservando el sentido semántico de la oración, a diferencia de la anterior estructura LDA, que nos permitió tener una visión general. </div>

# I. DISEÑO DEL ALGORITMO

#### Importar paquetes

In [None]:
import collections
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pprint import pprint
import numpy as np
from string import punctuation
from time import time
import os
import pandas as pd
from IPython.display import Image
import matplotlib.pyplot as plt

# Stop words: one entry per line of 'spanish4.0.txt'.
# NOTE(review): no encoding is given, so the platform default is used; confirm the
# file's encoding and pass it explicitly if accented words must load the same on every OS.
with open('spanish4.0.txt','r') as f:
    stop_words = f.read().splitlines()

#### Estructura del algoritmo

In [None]:
# Tokenize, filter stop words and stem
def word_tokenizer(text):
    """Turn *text* into a list of stems produced by the Spanish Snowball stemmer.

    The text is split with NLTK's ``word_tokenize``; tokens found in the
    module-level ``stop_words`` list are dropped and the remaining ones are
    stemmed. Punctuation is not stripped here: the original notes state that
    the incoming data is already free of punctuation.
    """
    spanish_stemmer = SnowballStemmer('spanish')
    stems = []
    for token in word_tokenize(text):
        if token in stop_words:  # the stop-word list may be extended upstream
            continue
        stems.append(spanish_stemmer.stem(token))
    return stems

# Optimal number of clusters
def elbow_method(sentences, figsize=(8, 5), namecol=None, nro_top=None, save=False,
                 width_text=0.83, height_text=0.92):
    """Plot the K-means elbow curve for *sentences* and print the knee.

    The sentences are vectorised with TF-IDF (``word_tokenizer`` plus the
    module-level ``stop_words``). K-means is then fitted once per candidate
    ``k`` and the inertia of every fit is plotted. The knee of that curve is
    located with ``kneed.KneeLocator``, printed, and marked on the plot.

    Candidate ``k`` values depend on the number of sentences:
      * up to 30 sentences: every k from 1 to ``len(sentences)``;
      * 31 to 70 sentences: every second k from 2 to ``len(sentences)``;
      * more than 70 sentences: every fourth k from 2 to 70.

    Args:
        sentences: sequence of text sentences to cluster.
        figsize: value written to ``plt.rcParams['figure.figsize']``.
        namecol: prefix used in the name of the saved figure.
        nro_top: topic number used in the name of the saved figure.
        save: when true, the figure is written to a JPG file (relative path).
        width_text: x position, in axes coordinates, of the "k óptimo" label.
        height_text: y position, in axes coordinates, of the "k óptimo" label.

    Returns:
        None. The result is printed and plotted.
    """
    import warnings

    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer, stop_words=stop_words, lowercase=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)  # TF-IDF matrix of the sentences

    # Only the range of candidate k values depends on the corpus size; the
    # fitting loop below is shared by the three cases.
    n_sentences = len(sentences)
    if n_sentences <= 30:
        list_k = list(range(1, n_sentences + 1))
    elif n_sentences <= 70:
        list_k = list(range(2, n_sentences + 1, 2))
    else:
        list_k = list(range(2, 71, 4))

    # Any ValueError raised below is reported with the same message as before.
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            elbow = []
            for k in list_k:
                kmeans = KMeans(n_clusters=k, init='k-means++', random_state=101)
                kmeans.fit(tfidf_matrix)
                elbow.append(kmeans.inertia_)

            from kneed import KneeLocator
            kn = KneeLocator(list_k, elbow, curve='convex', direction='decreasing')
            print(f'Número óptimo de clústers: {kn.knee}')

            plt.rcParams.update({'figure.figsize': figsize})
            plt.xlabel('Número de clusters k')
            plt.ylabel('Suma de distancias cuadradas')
            plt.plot(list_k, elbow, 'bo-')
            plt.title('Método del Elbow')
            plt.text(width_text, height_text, f'k óptimo: {kn.knee}', transform=plt.gca().transAxes)
            # No knee found -> nothing to mark; still show the curve.
            if kn.knee is not None:
                plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
            if save:
                # The ``quality`` keyword of savefig was removed from Matplotlib;
                # JPEG options are passed through ``pil_kwargs`` instead.
                plt.savefig(f'{namecol} {nro_top} - Método del elbow.jpg', dpi=200,
                            pil_kwargs={'quality': 95})
            plt.show()

    except ValueError:
        print('You must enter at least two sentences')
        
# K-means algorithm
def cluster_sentences(sentences, nb_of_clusters, random_state=None):
    """Cluster *sentences* with K-means and pick one representative per cluster.

    The sentences are vectorised with TF-IDF (``word_tokenizer`` plus the
    module-level ``stop_words``) and clustered with K-means. The representative
    of a cluster is the member with the smallest squared distance to that
    cluster's own centre.

    Args:
        sentences: sequence of text sentences to cluster.
        nb_of_clusters: number of clusters requested from K-means.
        random_state: forwarded to ``KMeans``; the default ``None`` keeps the
            previous unseeded behaviour, an integer makes runs repeatable.

    Returns:
        A tuple ``(clusters, representativos)``:
          * ``clusters`` maps a cluster id to the list of indices (into
            *sentences*) of its members. Ids are always ``0..m-1``, where ``m``
            is the number of clusters that actually received sentences.
          * ``representativos[c]`` is the position, inside ``clusters[c]``, of
            the representative sentence of cluster ``c``.
    """
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer, stop_words=stop_words, lowercase=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)  # TF-IDF matrix of the sentences

    kmeans = KMeans(n_clusters=nb_of_clusters, max_iter=300, random_state=random_state)
    # fit_transform gives the distance of every sentence to every centre
    # (one column per centre); square it to work with squared distances.
    squared_dist = kmeans.fit_transform(tfidf_matrix) ** 2

    # K-means may leave some labels unused. Renumber the labels that are in use
    # as 0..m-1 so that ids are contiguous and line up with the positions in
    # ``representativos``.
    used_labels = sorted({int(label) for label in kmeans.labels_})
    cluster_id_of = {label: pos for pos, label in enumerate(used_labels)}

    clusters = collections.defaultdict(list)
    own_centre_dist = collections.defaultdict(list)
    for row, label in enumerate(kmeans.labels_):
        cluster_id = cluster_id_of[int(label)]
        clusters[cluster_id].append(row)
        # Distance to the centre the sentence was assigned to. The previous
        # code summed the distances to all centres, which does not measure
        # closeness to the sentence's own cluster.
        own_centre_dist[cluster_id].append(squared_dist[row, label])

    # First member with the minimum distance wins, as before.
    representativos = [
        int(np.argmin(own_centre_dist[cluster_id]))
        for cluster_id in range(len(used_labels))
    ]

    return dict(clusters), representativos

# Plot the number of sentences per cluster
def len_clusters(dic, n, namecol=None, nro_top=None, save=False, width=0.6, color="royalblue", figsize=(8, 5)):
    """Draw a bar chart with the number of sentences in each cluster.

    Args:
        dic: mapping from cluster id to the list of its members.
        n: number of clusters to plot; bars are drawn for ids ``0..n-1``.
        namecol: prefix used in the name of the saved figure.
        nro_top: topic number used in the name of the saved figure.
        save: when true, the figure is written to a JPG file (relative path).
        width: bar width.
        color: bar colour.
        figsize: value written to ``plt.rcParams['figure.figsize']``.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    keys = list(range(n))
    # A cluster id missing from ``dic`` is drawn as an empty cluster instead of
    # raising KeyError.
    len_values = [len(dic.get(i, ())) for i in keys]
    plt.rcParams.update({'figure.figsize': figsize})
    plt.bar(keys, len_values, align='center', color=color, width=width)
    plt.title('Cantidad de OT por clúster')
    plt.xlabel('Clusters')
    plt.ylabel('Orden de Trabajo (OT)')
    # One tick per cluster id; also valid when n == 0.
    plt.xticks(np.arange(0, n, 1.0))
    if save:
        # The ``quality`` keyword of savefig was removed from Matplotlib;
        # JPEG options are passed through ``pil_kwargs`` instead.
        plt.savefig(f'{namecol} {nro_top} - Distribucion clusters .jpg', dpi=200,
                    pil_kwargs={'quality': 95})
    plt.show()

<div class="alert alert-success" style="margin: 20px">En todos los data frames ya está implementada la función word_cleaner() que permitió limpiar la base de datos. Para más información de esta función ir al segundo notebook.</div>


# II. RESULTADOS DE KMEANS - OBSERVACIONES

In [None]:
# Display the saved chart inline.
# NOTE(review): the folder name in the path is spelled 'Análsis'; confirm it matches the directory on disk.
from IPython.display import Image
Image(filename='Gráficos/Notebook 1 - Análsis BD original/2) Estacion.jpg', width=600, height=400)

In [None]:
# Load the data frame

# This data frame stores its records as lists of sentences ([sent1,sent2,...,sentn])
# NOTE(review): read_pickle can execute arbitrary code; only load pickle files from a trusted source.
df_estacion_sentences = pd.read_pickle('df_estacion_sentences.pkl')
df_estacion_sentences.head()

### 1. Tópico 0

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them.
# namecol prefixes the names of the saved figures and save enables saving in the cells below.
# NOTE(review): with namecol = '' the saved file names start with a space; confirm this is intended.
namecol = ''
save = True
sentences = df_estacion_sentences['Observación']['OPERACIONES Y SERVICIOS TERPEL LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=1, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 14
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 14)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=1, width=0.6, save=save)

### 2. Tópico 1

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['OPERACIONES Y SERVICIO TERPEL LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=2, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 26
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 26)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=2, width=0.6, save=save)

### 3. Tópico 2

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['Inversiones Enex']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=3, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=3, width=0.6, save=save)

### 4. Tópico 3

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['SOCIEDAD DE COMBUSTIBLES AUSTRAL LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=4, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 30
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 30)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=4, width=0.6, save=save)

### 5. Tópico 4

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['CL - Buses Vule S.A.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=5, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 18
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 18)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=5, width=0.6, save=save)

### 6. Tópico 5

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['Petrobras Chile Red Ltda. Salida Norte']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=6, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=6, width=0.6, save=save)

### 7. Tópico 6

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['INVERSIONES ENEX S.A.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=7, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=7, width=0.6, save=save)

### 8. Tópico 7

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['Petrobras Chile Red Ltda- Enea']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=8, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 14
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 14)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=8, width=0.6, save=save)

### 9. Tópico 8

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['SOCIEDAD COMERCIAL ANTUCO LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=9, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 26
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 26)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=9, width=0.6, save=save)

### 10. Tópico 9

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Observación']['Comercial Albano Ltda.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=10, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=10, width=0.6, save=save)

### 11. Tópico 10

### 12. Tópico 11

### 13. Tópico 12

### 14. Tópico 13

### 15. Tópico 14

### 16. Tópico 15

### 17. Tópico 16

### 18. Tópico 17

### 19. Tópico 18

### 20. Tópico 19

# III. RESULTADOS DE KMEANS - REQUERIMIENTOS

### 1. Tópico 0

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them.
# namecol prefixes the names of the saved figures and save enables saving in the cells below.
namecol = 'REQ Estacion'
save = True
sentences = df_estacion_sentences['Requerimiento']['OPERACIONES Y SERVICIOS TERPEL LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=1, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=1, width=0.6, save=save)

### 2. Tópico 1

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['OPERACIONES Y SERVICIO TERPEL LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=2, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 14
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 14)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=2, width=0.6, save=save)

### 3. Tópico 2

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['Inversiones Enex']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=3, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 30
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 30)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=3, width=0.6, save=save)

### 4. Tópico 3

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['SOCIEDAD DE COMBUSTIBLES AUSTRAL LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=4, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 26
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 26)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=4, width=0.6, save=save)

### 5. Tópico 4

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['CL - Buses Vule S.A.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=5, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=5, width=0.6, save=save)

### 6. Tópico 5

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['Petrobras Chile Red Ltda. Salida Norte']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=6, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=6, width=0.6, save=save)

### 7. Tópico 6

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['INVERSIONES ENEX S.A.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=7, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 30
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 30)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=7, width=0.6, save=save)

### 8. Tópico 7

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['Petrobras Chile Red Ltda- Enea']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=8, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 18
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 18)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=8, width=0.6, save=save)

### 9. Tópico 8

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['SOCIEDAD COMERCIAL ANTUCO LTDA.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=9, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=9, width=0.6, save=save)

### 10. Tópico 9

In [None]:
# Number of work orders (OT): select the sentences to cluster and count them
sentences = df_estacion_sentences['Requerimiento']['Comercial Albano Ltda.']
len(sentences)

In [None]:
%%time
# Find the optimal number of clusters with the elbow method
elbow_method(sentences, namecol=namecol, nro_top=10, save=save)

In [None]:
# Run K-means for the optimal number of clusters, then list every cluster
# under its representative sentence (printed in bold) and time the whole cell.
print('\033[1mCLUSTERS \n\033[0m')
start_time = time()

nclusters = 22
clusters, representativos = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
    members = clusters[cluster]
    representative = sentences[members[representativos[cluster]]]
    print(f"\033[1mCluster {cluster} : {representative}\033[0m")
    for i, sentence in enumerate(members):
        print("\tsentence ", i, ": ", sentences[sentence])

elapsed_time = time() - start_time
print("\033[1mElapsed time: %.10f seconds.\033[0m" % elapsed_time)

In [None]:
# Representative sentence of every cluster (n = 22)
for cluster in range(nclusters):
    representative_idx = clusters[cluster][representativos[cluster]]
    print("\033[1mCluster ", cluster, ":\033[0m", sentences[representative_idx])

In [None]:
# Bar chart: number of sentences per cluster
len_clusters(dic=clusters, n=nclusters, namecol=namecol, nro_top=10, width=0.6, save=save)

### 11. Tópico 10

### 12. Tópico 11

### 13. Tópico 12

### 14. Tópico 13

### 15. Tópico 14

### 16. Tópico 15

### 17. Tópico 16

### 18. Tópico 17

### 19. Tópico 18

### 20. Tópico 19