## Libraries and helpers

In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA, TruncatedSVD
from collections import Counter


In [6]:
def string_to_vector(string):
  """
  Turns a string of a list of numbers into a list of numbers
  """
  string = re.sub("[\[\]]", "", string)
  string = string.split(', ')
  return [float(x) for x in string]

def ideal_n_clusters(n_clusters, data, model):
    """
    Iterates over a range of n_clusters to find the optimal number of clusters
        Inputs: n_clusters (lst): list of integers to iterate over
        Outputs: performance_dict (dict): dictionary of silhouette scores for each n_cluster
    """
    performance_dict = {}
    for n in n_clusters:
        model = model(n_components=n, random_state=0)
        model.fit(data)
        cluster_labels = model.predict(data)
        score = silhouette_score(data, cluster_labels)
        performance_dict[n] = score
    return performance_dict

## Preprocessing

In [15]:
path = r"C:\Users\asarr\Documents\MACSS\Thesis\results\interventions_sample.pkl"
interventions_df = pd.read_pickle(path)
interventions_df.head()

Unnamed: 0,session_id,intervention_id,speaker_text,intervention_text,intervention_words,embeddings,embeddings_str
0,gaceta_459 (7),452004,resolucion numero 001 de 2006,(julio 21) por medio de la cual se crea una c...,"['accidental', 'acreditación', 'acrediten', 'a...","tensor([[ 2.7571e-01, -1.5672e-01, 5.2478e-02...","[0.2757052183151245, -0.15671707689762115, 0.0..."
1,gaceta_459 (7),452003,resolucion numero 33 de 2006,(septiembre 11) por medio de la cual se aclar...,"['aclara', 'aclárese', 'acto', 'administrativo...","tensor([[ 4.0235e-01, -1.5103e-01, 2.4900e-01...","[0.40234655141830444, -0.1510276049375534, 0.2..."
2,gaceta_459 (7),452002,resolucion numero 29 de 2006,(septiembre 8) por medio de la cual se design...,"['artículo', 'comisión', 'congreso', 'convenio...","tensor([[ 1.8694e-01, -1.2405e-01, 4.2312e-02...","[0.1869388073682785, -0.12404859066009521, 0.0..."
3,gaceta_459 (7),452001,iv. buenos resultados de la política social:,si bien el crecimiento económico es un factor ...,"['aumentado', 'crecimiento', 'distribución', '...","tensor([[ 5.9341e-02, -1.1320e-01, 1.7544e-01...","[0.05934141203761101, -0.1132017970085144, 0.1..."
5,gaceta_459 (7),451999,gráfico 5. distribución de ingreso – coeficien...,fuente: merpd la disminución en la inequidad ...,"['aquellos', 'aumentado', 'caído', 'coeficient...","tensor([[ 4.1566e-01, -1.8668e-02, 3.1906e-01...","[0.41566094756126404, -0.018667755648493767, 0..."


In [5]:
df = interventions_df[['session_id', 'intervention_id', 'embeddings_str']]
df['embeddings'] = df['embeddings_str'].apply(string_to_vector)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embeddings'] = df['embeddings_str'].apply(string_to_vector)


Unnamed: 0,session_id,intervention_id,embeddings_str,embeddings
0,gaceta_459 (7),452004,"[0.2757052183151245, -0.15671707689762115, 0.0...","[0.2757052183151245, -0.15671707689762115, 0.0..."
1,gaceta_459 (7),452003,"[0.40234655141830444, -0.1510276049375534, 0.2...","[0.40234655141830444, -0.1510276049375534, 0.2..."
2,gaceta_459 (7),452002,"[0.1869388073682785, -0.12404859066009521, 0.0...","[0.1869388073682785, -0.12404859066009521, 0.0..."
3,gaceta_459 (7),452001,"[0.05934141203761101, -0.1132017970085144, 0.1...","[0.05934141203761101, -0.1132017970085144, 0.1..."
5,gaceta_459 (7),451999,"[0.41566094756126404, -0.018667755648493767, 0...","[0.41566094756126404, -0.018667755648493767, 0..."


In [6]:
df.to_pickle(r"C:\Users\asarr\Documents\MACSS\Thesis\results\sample_embeddings.pkl")

In [2]:
df = pd.read_pickle(r"C:\Users\asarr\Documents\MACSS\Thesis\results\sample_embeddings.pkl")
df.head()

Unnamed: 0,session_id,intervention_id,embeddings_str,embeddings
0,gaceta_459 (7),452004,"[0.2757052183151245, -0.15671707689762115, 0.0...","[0.2757052183151245, -0.15671707689762115, 0.0..."
1,gaceta_459 (7),452003,"[0.40234655141830444, -0.1510276049375534, 0.2...","[0.40234655141830444, -0.1510276049375534, 0.2..."
2,gaceta_459 (7),452002,"[0.1869388073682785, -0.12404859066009521, 0.0...","[0.1869388073682785, -0.12404859066009521, 0.0..."
3,gaceta_459 (7),452001,"[0.05934141203761101, -0.1132017970085144, 0.1...","[0.05934141203761101, -0.1132017970085144, 0.1..."
5,gaceta_459 (7),451999,"[0.41566094756126404, -0.018667755648493767, 0...","[0.41566094756126404, -0.018667755648493767, 0..."


In [3]:
embeddings_array = np.vstack(df['embeddings'].values)
embeddings_array.shape

(96579, 768)

## Benchmarks, no dimension reductions

Initial set of clusters created without dimensionality reduction as a benchmark.

In [None]:
n_clusters = 5  
gmm = GaussianMixture(n_components=n_clusters, random_state=42)
gmm.fit(embeddings_array)
cluster_labels = gmm.predict(embeddings_array)

In [11]:
score = silhouette_score(embeddings_array, cluster_labels)
print(f"Silhouette Score: {score}")

Silhouette Score: 0.01159898702054076


In [45]:
k_means = KMeans(n_clusters=5, random_state=42)
k_means.fit(embeddings_array)
cluster_labels = k_means.predict(embeddings_array)
score = silhouette_score(embeddings_array, cluster_labels)
print(f"Silhouette Score: {score}")

Silhouette Score: 0.12528334636110114


Pretty bad. Lets see if dimentionality reduction can improve this

# Gaussian Mixture Models

## GMM, pca reduction

In [5]:
#determine the number of components
pca = PCA().fit(embeddings_array)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

num_components_90 = np.where(cumulative_variance > 0.9)[0][0] + 1
print(f"Number of components to explain 90% of the variance: {num_components_90}")


Number of components to explain 90% of the variance: 158


158 components explain 90% of variance, lets try that number of components

In [8]:
pca = PCA(n_components=num_components_90)
pca_result = pca.fit_transform(embeddings_array)

gmm_pca = GaussianMixture(n_components=n_clusters, random_state=42)
gmm_pca.fit(pca_result)
cluster_labels_pca = gmm_pca.predict(pca_result)

score_pca = silhouette_score(pca_result, cluster_labels_pca)
print(f"Silhouette Score: {score_pca}")

Silhouette Score: 0.029439254428256664


That's three times better. It would probably improve a lot by increasing number of clusters, but lets try SVD first

## GMM, svd

In [22]:
svd_result = TruncatedSVD(n_components=num_components_90).fit_transform(embeddings_array)

In [23]:
gmm_svd = GaussianMixture(n_components=n_clusters, random_state=42)
gmm_svd.fit(svd_result)
cluster_labels_svd = gmm_svd.predict(svd_result)

score_svd = silhouette_score(svd_result, cluster_labels_svd)
print(f"Silhouette Score: {score_svd}")

Silhouette Score: 0.026443614839753198


PCA works marginally better, let's move on to a higher number of cluster

## Cluster size evaluator

In [None]:
n_clusters = [10, 20, 30, 40, 50]

performance_dict = ideal_n_clusters(n_clusters, pca_result)

performance_dict


In [31]:
for k, v in performance_dict.items():
    print(f"Number of clusters: {k}, Silhouette Score: {v}")

Number of clusters: 10, Silhouette Score: 0.05345132207031303
Number of clusters: 20, Silhouette Score: 0.04889704517078761
Number of clusters: 30, Silhouette Score: 0.03957749485142707
Number of clusters: 40, Silhouette Score: 0.03377890168506085
Number of clusters: 50, Silhouette Score: 0.03903558424000448


In [34]:
n_clusters = [5, 8, 10, 12, 45, 55, 60]
performance_dict = ideal_n_clusters(n_clusters, svd_result)
for k, v in performance_dict.items():
    print(f"Number of clusters: {k}, Silhouette Score: {v}")

working on 5 clusters
working on 8 clusters
working on 10 clusters
working on 12 clusters
working on 45 clusters
working on 55 clusters
working on 60 clusters
Number of clusters: 5, Silhouette Score: 0.07802620494404573
Number of clusters: 8, Silhouette Score: 0.058949071808405584
Number of clusters: 10, Silhouette Score: 0.05168872323919132
Number of clusters: 12, Silhouette Score: 0.05127818677992968
Number of clusters: 45, Silhouette Score: 0.030900441391879016
Number of clusters: 55, Silhouette Score: 0.007043922029220892
Number of clusters: 60, Silhouette Score: 0.01037453586087734


In [35]:
n_clusters = [5, 8, 10, 23, 45, 55]
performance_dict = ideal_n_clusters(n_clusters, pca_result)
for k, v in performance_dict.items():
    print(f"Number of clusters: {k}, Silhouett  e Score: {v}")

working on 5 clusters
working on 8 clusters
working on 10 clusters
working on 12 clusters
working on 45 clusters
working on 55 clusters
working on 60 clusters
Number of clusters: 5, Silhouette Score: 0.07765081568002019
Number of clusters: 8, Silhouette Score: 0.06227079686176664
Number of clusters: 10, Silhouette Score: 0.05345132207031303
Number of clusters: 12, Silhouette Score: 0.05533508307308329
Number of clusters: 45, Silhouette Score: 0.03679734428612236
Number of clusters: 55, Silhouette Score: 0.0032561744593002746
Number of clusters: 60, Silhouette Score: 0.005456627192208797


In [38]:
n_clusters = [3, 4, 5, 6, 7]
print('PCA')
performance_dict = ideal_n_clusters(n_clusters, pca_result)
for k, v in performance_dict.items():
    print(f"Number of clusters: {k}, Silhouette Score: {v}")
print('SVD')
performance_dict = ideal_n_clusters(n_clusters, svd_result)
for k, v in performance_dict.items():
    print(f"Number of clusters: {k}, Silhouette Score: {v}")

PCA
Number of clusters: 3, Silhouette Score: -0.01083800018383823
Number of clusters: 4, Silhouette Score: 0.0733513807178851
Number of clusters: 5, Silhouette Score: 0.07765081568002019
Number of clusters: 6, Silhouette Score: 0.04084593789213388
Number of clusters: 7, Silhouette Score: 0.05456834237039192
SVD
Number of clusters: 3, Silhouette Score: -0.012046309421314523
Number of clusters: 4, Silhouette Score: 0.07293159107090035
Number of clusters: 5, Silhouette Score: 0.07802620494404573
Number of clusters: 6, Silhouette Score: 0.045405236024291334
Number of clusters: 7, Silhouette Score: 0.0601755348145092


Maybe GMM is not great, lets move try K-Means

# K-Means

In [40]:
n_clusters = [3, 4, 5, 6, 7, 10]

print('PCA')
for n in n_clusters:
    k_means = KMeans(n_clusters=n, random_state=0)
    k_means.fit(embeddings_array)
    k_means_labels = k_means.predict(embeddings_array)
    k_means_score = silhouette_score(embeddings_array, k_means_labels)
    print(f"Silhouette Score for {n}: {k_means_score}")

print('SVD')
for n in n_clusters:
    k_means = KMeans(n_clusters=n, random_state=0)
    k_means.fit(svd_result)
    k_means_labels = k_means.predict(svd_result)
    k_means_score = silhouette_score(svd_result, k_means_labels)
    print(f"Silhouette Score for {n}: {k_means_score}")

PCA
Silhouette Score for 3: 0.13288541356384917
Silhouette Score for 4: 0.1350182757882174
Silhouette Score for 5: 0.12616899755944722
Silhouette Score for 6: 0.11874287556099757
Silhouette Score for 7: 0.12288221125630347
Silhouette Score for 10: 0.1006036104587887
SVD
Silhouette Score for 3: 0.14659500070901288
Silhouette Score for 4: 0.14859290123621494
Silhouette Score for 5: 0.14145579012273848
Silhouette Score for 6: 0.09380878350973747
Silhouette Score for 7: 0.08364390205829669
Silhouette Score for 10: 0.07828206054911885


Best model: K-means, 4 clusters, svd vectors

# Assign cluster labels

In [41]:
df.head(1)

Unnamed: 0,session_id,intervention_id,embeddings_str,embeddings
0,gaceta_459 (7),452004,"[0.2757052183151245, -0.15671707689762115, 0.0...","[0.2757052183151245, -0.15671707689762115, 0.0..."


In [42]:
k_means = KMeans(n_clusters=4, random_state=0)
k_means.fit(svd_result)
k_means_labels = k_means.predict(svd_result)
df['cluster'] = k_means_labels
df.head()

Unnamed: 0,session_id,intervention_id,embeddings_str,embeddings,cluster
0,gaceta_459 (7),452004,"[0.2757052183151245, -0.15671707689762115, 0.0...","[0.2757052183151245, -0.15671707689762115, 0.0...",1
1,gaceta_459 (7),452003,"[0.40234655141830444, -0.1510276049375534, 0.2...","[0.40234655141830444, -0.1510276049375534, 0.2...",3
2,gaceta_459 (7),452002,"[0.1869388073682785, -0.12404859066009521, 0.0...","[0.1869388073682785, -0.12404859066009521, 0.0...",1
3,gaceta_459 (7),452001,"[0.05934141203761101, -0.1132017970085144, 0.1...","[0.05934141203761101, -0.1132017970085144, 0.1...",1
5,gaceta_459 (7),451999,"[0.41566094756126404, -0.018667755648493767, 0...","[0.41566094756126404, -0.018667755648493767, 0...",1


In [43]:
df['cluster'].value_counts()

cluster
1    42384
3    39818
0     7562
2     6815
Name: count, dtype: int64

In [63]:
#get 5 random samples from each cluster
df.groupby('cluster').apply(lambda x: x.sample(5))

#create a dictionary of cluster labels and intervention_ids sample
cluster_dict = {}
for cluster in df['cluster'].unique():
    cluster_dict[cluster] = df[df['cluster'] == cluster]['intervention_id'].sample(5).values
cluster_dict

  df.groupby('cluster').apply(lambda x: x.sample(5))


{1: array([417787, 389037, 422144, 302660, 403725], dtype=int64),
 3: array([366845, 401552, 373831, 449235, 392497], dtype=int64),
 0: array([341999, 332980, 338584, 333658, 342088], dtype=int64),
 2: array([370642, 437492, 358833, 358934, 342625], dtype=int64)}

In [66]:
#extract the text of the interventions from the interventions_df dataframe

for cluster in cluster_dict:
    print(f"Cluster {cluster}")
    for intervention_id in cluster_dict[cluster]:
        print(interventions_df[interventions_df['intervention_id'] == intervention_id]['intervention_text'].values[0][:100])
    print("\n")

Cluster 1
gracias señora presidenta, honorables senadores. a ver. yo estoy diciendo lo contrario, que se  estu
muchas gracias, señora presidenta, este es un  derecho a la réplica en respuesta a la alusión direct
sí, gracias, muy buenos días a todos los  compañeros.  ministra, bienvenida con su equipo, estoy un 
le rogaría que una vez termine el senador camilo romero, le den la primera oportunidad a la señora  
gracias, señora presidenta, señor ministro. pues  usted nos ha presentado un balance de lo ejecutado


Cluster 3
por el sí:	  por el no:	 04 por medio de la cual se reconoce y garantiza  la entrega del kit ‘mamá c
habiendo sido leídas las actas que se pondrán a  consideración, abro su discusión, anuncio que se va
señor presidente, el proyecto consta de tres artículos incluida la vigencia, no hay proposición al  
número 785 de 2008.  jueves 12 de marzo de 2009  autores: honorables senadores alexandra moreno  pir
( agosto 14)  por la cual se autoriza la inasistencia justi¿cada a un

Seems that cluster 2 is all interventions that should not be here. What happens if we remove and recluster?

# Reclustering

In [68]:
df_filtered = df[df.cluster != 2]
df_filtered['cluster'].value_counts()

cluster
1    42384
3    39818
0     7562
Name: count, dtype: int64

In [69]:
embeddings_array_filtered = np.vstack(df_filtered['embeddings'].values)
embeddings_array_filtered.shape

(89764, 768)

## Benchmarks

In [72]:
n_clusters = 4

gmm_filtered = GaussianMixture(n_components=n_clusters, random_state=0)
gmm_filtered.fit(embeddings_array_filtered)
cluster_labels_gmm_filtered = gmm_filtered.predict(embeddings_array_filtered)
score_gmm_filtered = silhouette_score(embeddings_array_filtered, cluster_labels_gmm_filtered)
print(f"Silhouette Score for gmm: {score_gmm_filtered}")

k_means_filtered = KMeans(n_clusters=n_clusters, random_state=0)
k_means_filtered.fit(embeddings_array_filtered)
cluster_labels_k_means_filtered = k_means_filtered.predict(embeddings_array_filtered)
score_k_means_filtered = silhouette_score(embeddings_array_filtered, cluster_labels_k_means_filtered)
print(f"Silhouette Score for k-means: {score_k_means_filtered}")

Silhouette Score for gmm: 0.06408101192390328
Silhouette Score for k-means: 0.11338644982480646


## Dimension reductions

In [73]:
#re-evaluate number of components
pca_filtered = PCA().fit(embeddings_array_filtered)
cumulative_variance_filtered = np.cumsum(pca_filtered.explained_variance_ratio_)
num_components_90_filtered = np.where(cumulative_variance_filtered > 0.9)[0][0] + 1
print(f"Number of components to explain 90% of the variance: {num_components_90_filtered}")

Number of components to explain 90% of the variance: 172


In [74]:
#pca vectors
pca_filtered = PCA(n_components=num_components_90_filtered)
pca_result_filtered = pca_filtered.fit_transform(embeddings_array_filtered)

#svd vectors
svd_result_filtered = TruncatedSVD(n_components=num_components_90_filtered).fit_transform(embeddings_array_filtered)

## Gaussian Mixture

In [76]:
n_clusters = [5, 8, 10, 23, 45, 55]

print('PCA')
for n in n_clusters:
    gmm = GaussianMixture(n_components=n, random_state=0)
    gmm.fit(pca_result_filtered)
    cluster_labels = gmm.predict(pca_result_filtered)
    score = silhouette_score(pca_result_filtered, cluster_labels)
    print(f"Silhouette Score for {n}: {score}")

print('SVD')
for n in n_clusters:
    gmm = GaussianMixture(n_components=n, random_state=0)
    gmm.fit(svd_result_filtered)
    cluster_labels = gmm.predict(svd_result_filtered)
    score = silhouette_score(svd_result_filtered, cluster_labels)
    print(f"Silhouette Score for {n}: {score}")

PCA
Silhouette Score for 5: 0.025947023919696884
Silhouette Score for 8: 0.03963488428063687
Silhouette Score for 10: 0.044201789958498286




Silhouette Score for 23: 0.015491667897847979
Silhouette Score for 45: -0.002720337390753053
Silhouette Score for 55: -0.0021640503311802137
SVD
Silhouette Score for 5: 0.02603564849646683
Silhouette Score for 8: 0.030916526079169542
Silhouette Score for 10: 0.03515839564033078
Silhouette Score for 23: 0.03432434263465254
Silhouette Score for 45: -0.003196269681586971
Silhouette Score for 55: 0.0039522406799611725


## K-Means

In [77]:
print('PCA')
for n in n_clusters:
    k_means = KMeans(n_clusters=n, random_state=0)
    k_means.fit(pca_result_filtered)
    k_means_labels = k_means.predict(pca_result_filtered)
    k_means_score = silhouette_score(pca_result_filtered, k_means_labels)
    print(f"Silhouette Score for {n}: {k_means_score}")

print('SVD')
for n in n_clusters:
    k_means = KMeans(n_clusters=n, random_state=0)
    k_means.fit(svd_result_filtered)
    k_means_labels = k_means.predict(svd_result_filtered)
    k_means_score = silhouette_score(svd_result_filtered, k_means_labels)
    print(f"Silhouette Score for {n}: {k_means_score}")

PCA
Silhouette Score for 5: 0.07487538558631139
Silhouette Score for 8: 0.07264063285800462
Silhouette Score for 10: 0.0763731365329157
Silhouette Score for 23: 0.06577836232166258
Silhouette Score for 45: 0.017078943151268056
Silhouette Score for 55: 0.01602990734341895
SVD
Silhouette Score for 5: 0.12951844548964025
Silhouette Score for 8: 0.06221019358582034
Silhouette Score for 10: 0.06304346355298597
Silhouette Score for 23: 0.06871096646280267
Silhouette Score for 45: 0.03419384458832527
Silhouette Score for 55: 0.040588346534950574


In [78]:
k_means = KMeans(n_clusters=4, random_state=0)
k_means.fit(svd_result_filtered)
k_means_labels = k_means.predict(svd_result_filtered)
k_means_score = silhouette_score(svd_result_filtered, k_means_labels)
print(f"Silhouette Score for {n}: {k_means_score}")

Silhouette Score for 55: 0.126126816642135


In [79]:
k_means = KMeans(n_clusters=6, random_state=0)
k_means.fit(svd_result_filtered)
k_means_labels = k_means.predict(svd_result_filtered)
k_means_score = silhouette_score(svd_result_filtered, k_means_labels)
print(f"Silhouette Score for {n}: {k_means_score}")

Silhouette Score for 55: 0.12210832940922821


Getting lower silhoutte scores after filtering out dirty interventions. Makes some sense? lets check the labels

# Assign new labels

In [83]:
import warnings
warnings.filterwarnings("ignore")

k_means = KMeans(n_clusters=5, random_state=0)
k_means.fit(svd_result_filtered)
k_means_labels = k_means.predict(svd_result_filtered)

df_filtered['cluster'] = k_means_labels
df_filtered['cluster'].value_counts()

cluster
0    32206
3    24625
4    15432
1    10400
2     7101
Name: count, dtype: int64

In [82]:
cluster_dict = {}
for cluster in df_filtered['cluster'].unique():
    cluster_dict[cluster] = df_filtered[df_filtered['cluster'] == cluster]['intervention_id'].sample(5).values

for cluster in cluster_dict:
    print(f"Cluster {cluster}")
    for intervention_id in cluster_dict[cluster]:
        print(interventions_df[interventions_df['intervention_id'] == intervention_id]['intervention_text'].values[0][:100])
    print("\n")

Cluster 3
número 1622 de 2022.  autores:  honorables representantes juan carlos wills  ospina, buenaventura le
al respecto se manifiesta que en lo concerniente al recaudo del mes de diciembre los  cuales asciend
sí, señor presidente, se anuncian por instrucciones  suyas los proyectos que se discutirán y votarán
número 361 de 2017.  autoras: ministras de relaciones exteriores,  doctora maría ángela holguín cuél
la facultad del ministerio del interior y justicia en esta materia no puede exceder la ley. es  deci


Cluster 0
vean  compañeros,  señor  ministro,  señor  superintendente. todos entendemos tue el propósito  de e
gracias, señor presidente, con los buenos días  para todos y para todas, a la mesa principal, a los 
muy bien. ministra, le cuento que también nos  faltan unos debates, tenemos programado fechas, nos  
buenos días a todos los asistentes, bienvenida  comisión primera de la honorable cámara de represent
senador delgado, antes de que intervenga el  doctor botero. igualment

These seem more interpretable!

In [90]:
df_filtered.to_csv(r"C:\Users\asarr\Documents\MACSS\Thesis\results\sample_clusters.csv", index=False)

In [None]:
df_filtered = pd.read_csv(r"C:\Users\asarr\Documents\MACSS\Thesis\results\sample_clusters.csv", index_col=0)

In [7]:
df_filtered['embeddings'] = df_filtered['embeddings'].apply(string_to_vector)

In [9]:
df_filtered.head()

Unnamed: 0_level_0,intervention_id,embeddings_str,embeddings,cluster
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gaceta_459 (7),452004,"[0.2757052183151245, -0.15671707689762115, 0.0...","[0.2757052183151245, -0.15671707689762115, 0.0...",3
gaceta_459 (7),452003,"[0.40234655141830444, -0.1510276049375534, 0.2...","[0.40234655141830444, -0.1510276049375534, 0.2...",3
gaceta_459 (7),452002,"[0.1869388073682785, -0.12404859066009521, 0.0...","[0.1869388073682785, -0.12404859066009521, 0.0...",3
gaceta_459 (7),452001,"[0.05934141203761101, -0.1132017970085144, 0.1...","[0.05934141203761101, -0.1132017970085144, 0.1...",0
gaceta_459 (7),451999,"[0.41566094756126404, -0.018667755648493767, 0...","[0.41566094756126404, -0.018667755648493767, 0...",0


In [11]:
embeddings_array_filtered = np.vstack(df_filtered['embeddings'].values)
svd_result_filtered = TruncatedSVD(n_components=172)
reduced_embeddings = svd_result_filtered.fit_transform(embeddings_array_filtered)

n = 10

k_means = KMeans(n_clusters=n, random_state=0)
k_means.fit(reduced_embeddings)
k_means_labels = k_means.predict(reduced_embeddings)
k_means_score = silhouette_score(reduced_embeddings, k_means_labels)
print(f"Silhouette Score for {n}: {k_means_score}")

Silhouette Score for 10: 0.07661439088026407


In [12]:
df_filtered['cluster'] = k_means_labels
df_filtered['cluster'].value_counts()

cluster
1    15342
7    14786
3    12606
5    12315
6     8023
2     6150
4     5974
8     5546
0     4798
9     4224
Name: count, dtype: int64

In [16]:
cluster_dict = {}
for cluster in df_filtered['cluster'].unique():
    cluster_dict[cluster] = df_filtered[df_filtered['cluster'] == cluster]['intervention_id'].sample(5).values

for cluster in cluster_dict:
    print(f"Cluster {cluster}")
    for intervention_id in cluster_dict[cluster]:
        print(interventions_df[interventions_df['intervention_id'] == intervention_id]['intervention_text'].values[0][:100])
    print("\n")

Cluster 5
doctor mora, nosotros teníamos varios interrogantes que se nos hacía en un cuestionario, muchas de l
nocimiento de los resultados, de la seguridad demorinde fondelibertad. el establecer el día de la li
gracias señor presidente, la verdad es que esta  propuesta fue analizada en la comisión de ponentes 
muchas gracias señor presidente, un saludo muy  especial a los altos funcionarios del gobierno que  
presidente muchas gracias, después de escuchar  a los compañeros pues no queda mucho que decir.  rea


Cluster 1
sí presidenta, buenos días llamado a lista sesión comisión segunda, cámara de representantes, abril 
1.al grado de vicealmirante, del contralmirante  daniel iriarte alvira.  ponente: nancy patricia gut
para que brinde las explicaciones a las glosas de la  contraloría general de la república, a las obs
número 798 de 2019.  11. por medio de la cual se crea el  sistema nacional de prevención y atención 
(negado)  solicito a la plenaria aceptar mi impedimento para  partici

In [17]:
df_filtered.to_csv(r"C:\Users\asarr\Documents\MACSS\Thesis\results\sample_clusters.csv", index=False)