In [None]:
# imports
import ast
import os

import numpy as np
import pandas as pd
import openai

# load data
datafile_path = ""  # path to a CSV file that contains an "embedding" column

df = pd.read_csv(datafile_path)

# Each embedding is read back from the CSV as a string and has to be parsed.
# ast.literal_eval only accepts Python literals, so unlike eval it cannot
# execute code that happens to be present in the data file.
# (assumes every cell holds a list literal such as "[0.1, 0.2, ...]" - TODO confirm)
df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)

# Stack the per-row vectors into a single 2-D array, one row per record.
matrix = np.vstack(df.embedding.values)
matrix.shape

In [29]:
# OpenAI connection settings.
# The values are read from environment variables so that credentials do not
# have to be typed into (and saved with) the notebook. A variable that is not
# set falls back to the empty-string placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
openai.api_type = os.environ.get("OPENAI_API_TYPE", "")
openai.api_base = os.environ.get("OPENAI_API_BASE", "")
openai.api_version = os.environ.get("OPENAI_API_VERSION", "")

## Find the clusters using K-means

The following cell partitions the data into the number of clusters set in the variable `n_clusters`. If you don't know the optimal number, run the cell after it instead: it evaluates a range of candidate cluster counts, picks the one with the highest silhouette score (`silhouette_score`), and clusters the dataset with that number.

In [None]:
from sklearn.cluster import KMeans

# How many groups K-means is asked to produce.
n_clusters = 10

# Fit the model on the embedding matrix and store the cluster index assigned
# to every row in a new "Cluster" column.
kmeans = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(matrix)
df["Cluster"] = labels

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score 

# Candidate numbers of clusters to evaluate.
range_n_clusters = range(8, 24)

# Best silhouette score seen so far, together with the cluster count and the
# fitted model that produced it.
best_score = -1
best_n_clusters = -1
best_kmeans = None

# The loop variable is deliberately not called n_clusters: reusing that name
# would leave n_clusters equal to the last candidate tried instead of the
# chosen value, and later cells read n_clusters.
for candidate_n_clusters in range_n_clusters:
    # Fit a K-means model for this candidate.
    candidate_kmeans = KMeans(
        n_clusters=candidate_n_clusters, init="k-means++", random_state=42
    )
    candidate_kmeans.fit(matrix)

    # Score the resulting partition.
    score = silhouette_score(matrix, candidate_kmeans.labels_)

    # Remember this candidate if it is the best so far (first one always wins).
    if best_kmeans is None or score > best_score:
        best_score = score
        best_n_clusters = candidate_n_clusters
        best_kmeans = candidate_kmeans

# Keep the winning model. random_state is fixed, so fitting again with
# best_n_clusters would reproduce this model; reusing it saves one fit.
kmeans = best_kmeans
labels = kmeans.labels_

# Keep n_clusters in sync with the labels that are written to the DataFrame.
n_clusters = best_n_clusters

# Add cluster labels to the DataFrame.
df["Cluster"] = labels

# Print the selected number of clusters.
print(best_n_clusters)

In [None]:
df.head(10)

In [None]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

# Project the embedding matrix to two dimensions for plotting.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

# Column 0 / column 1 of the projection are the x / y plot coordinates.
x = vis_dims2[:, 0]
y = vis_dims2[:, 1]

# Plot the cluster ids that are actually present in df.Cluster instead of
# assuming there are exactly ten: ids beyond a fixed colour list would not be
# drawn at all, and an id with no rows would produce a NaN centroid.
cluster_ids = sorted(df.Cluster.unique())

base_colors = ["purple", "green", "red", "blue", "yellow", "orange", "grey", "black", "brown", "pink"]
if len(cluster_ids) <= len(base_colors):
    # Same colour-per-index assignment as the fixed list.
    colors = base_colors[: len(cluster_ids)]
else:
    # More clusters than named colours: sample a continuous colormap evenly.
    cmap = plt.get_cmap("nipy_spectral")
    colors = [cmap(k / (len(cluster_ids) - 1)) for k in range(len(cluster_ids))]

for category, color in zip(cluster_ids, colors):
    in_cluster = (df.Cluster == category).to_numpy()
    xs = x[in_cluster]
    ys = y[in_cluster]
    plt.scatter(xs, ys, color=color, alpha=0.3)

    # Mark the mean position of the cluster with a larger "x".
    avg_x = xs.mean()
    avg_y = ys.mean()

    plt.scatter(avg_x, avg_y, marker="x", color=color, s=100)
plt.title("Clusters identified visualized in language 2d using t-SNE")

## Text samples in the clusters & naming the clusters

In [38]:
import openai

# Number of sample descriptions read from each cluster.
rev_per_cluster = 5

# For every cluster, ask the completion model for a common theme and write the
# theme plus the sampled descriptions to a text file.
output_path = 'data/categories.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # the folder may not exist yet

# An explicit encoding keeps the output independent of the platform default.
with open(output_path, 'w', encoding='utf-8') as file:
    # Iterate over the cluster ids actually present in the DataFrame rather
    # than range(n_clusters): n_clusters may not match the labels stored in
    # df.Cluster if the clustering cells were run in a different order.
    for i in sorted(df.Cluster.unique()):
        print(f"Cluster {i} Theme:", end=" ", file=file)

        cluster_rows = df[df.Cluster == i]
        # A cluster can hold fewer rows than rev_per_cluster; sampling more
        # rows than exist would raise.
        n_samples = min(rev_per_cluster, len(cluster_rows))
        # Sample once and reuse the rows for both the prompt and the listing.
        sample_cluster_rows = cluster_rows.sample(n_samples, random_state=42)

        descriptions = "\n".join(
            sample_cluster_rows.cambio
            .str.replace("Title: ", "", regex=False)
            .str.replace("\n\nContent: ", ":  ", regex=False)
            .values
        )
        # NOTE(review): this is the Completion endpoint with the
        # text-davinci-003 engine - confirm both are still available for the
        # installed openai package and the configured account.
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=f'What do the following descriptions have in common?\n\nDescriptions:\n"""\n{descriptions}\n"""\n\nTheme:',
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        print(response["choices"][0]["text"].replace("\n", ""), file=file)

        # List the first 70 characters of each sampled description.
        for snippet in sample_cluster_rows.cambio.str[:70].values:
            print(snippet, file=file)

        print("-" * 100, file=file)