In [132]:
import csv
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import plotly.graph_objs as pgo
from sklearn_extra.cluster import KMedoids



In [133]:
with open('../data/relation_mapping_sentence_transformer.csv') as f:
    data = csv.reader(f)
    relations = [row[0] for row in data][1:]

In [134]:
relations = list(set(relations))

In [135]:
model = SentenceTransformer('intfloat/e5-large-v2')

embeddings = model.encode(relations)

In [155]:
def cluster(embeddings, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, init='random')  # or KMenoids()
    kmeans.fit_predict(embeddings)
    labels = kmeans.labels_
    return kmeans, labels
    

In [139]:
def get_list_of_clusters(labels):
    n_clusters = len(set(labels))
    clustered_relations = [[] for i in range(n_clusters)]
    for id, cluster_id in enumerate(labels):
        clustered_relations[cluster_id].append(relations[id])
    return clustered_relations

In [140]:
def get_inertias(embeddings, max_clusters):
    inertias = np.zeros(max_clusters)
    for i in range(1, max_clusters):
        kmeans, _ = cluster(embeddings, n_clusters=i)
        inertias[i] = kmeans.inertia_
    return inertias

In [141]:
max_clusters = 500
inertias = get_inertias(embeddings, max_clusters)


In [142]:
data6 = pgo.Data([
    pgo.Scatter(
            x=[0, max_clusters],
            
            y=inertias[1:]
    )
])
layout6 = pgo.Layout(
    title='Optinal number of clusters for intfloat/e5-large-v2',
    xaxis=pgo.XAxis(title='Number of clusters',
                    range=[0, max_clusters]),
    yaxis=pgo.YAxis(title='Inertia')
)
fig6 = pgo.Figure(data=data6, layout=layout6)
fig6.show()



plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




In [143]:
kmeans, labels = cluster(embeddings, 50)
centroid_embeddings = kmeans.cluster_centers_
for centroid in centroid_embeddings:
    if centroid in embeddings:
        index = np.where(centroid == embeddings)[0][0]
        print(relations[index])
    else:
        print("Menoid not found")
            

Menoid not found
Menoid not found
remain in touch with
Menoid not found
Menoid not found
Menoid not found
Menoid not found
sustain
Menoid not found
Menoid not found
Menoid not found
Menoid not found
served as
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
recover
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
Menoid not found
transition
Menoid not found


In [144]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=2, metric='cosine').fit(embeddings)
distances, indices = nbrs.kneighbors(centroid_embeddings)
menoids = []
for index in indices:
   menoids.append(relations[index[0]])

In [93]:
clusters = get_list_of_clusters(labels)
with open('../data/50clusters_cosine.txt', 'w') as f:
    for m, c in zip(menoids, clusters):
        f.write(f"Cluster Center: {m}\n")
        f.write(",".join(c))
        f.write("\n\n\n")

In [147]:
df = pd.DataFrame(embeddings)
pca = PCA()
pca.fit(df)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
    title="Explained Variable per number of PCA components"
)

In [151]:
X = pd.DataFrame(embeddings)
pca = PCA(n_components=2)
components = pca.fit_transform(X)
X['labels'] = labels

fig = px.scatter(components, x=0, y=1, color=X['labels'], title="The clustered data projected into the latent space of 2 PCA components", labels={'0': 'X1', '1': 'X2'})
fig.show()

In [154]:
import plotly.express as px 
X = pd.DataFrame(embeddings)
pca = PCA(n_components=3)
components = pca.fit_transform(X)

total_var = pca.explained_variance_ratio_.sum() * 100
X['labels'] = labels
X['relation'] = relations
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=X['labels'],
    title='The clustered data projected into the latent space of 3 PCA components',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
    width=800, height=800
)
fig.update_traces(marker_size=5)
fig.update_layout(scattermode="group", scattergap=0.5)
fig.show()