In [1]:
import openai
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import plotly.graph_objects as go
import os
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

df = pd.read_csv("./posts.csv")

openai.api_key = os.environ["OPENAI_API_KEY"]

# convert all Content to string
df["content"] = df["content"].astype(str)

df = df.dropna()

In [None]:
df["embedding"] = df["content"].apply(lambda x: openai.Embedding.create(
    input="".join(x.split("\n")[1:8]),
    model="text-embedding-ada-002",
)["data"][0]["embedding"])

In [34]:
data = df["embedding"].to_list()

kmeans = KMeans(n_clusters=3, random_state=1)

kmeans.fit(data)

df["Cluster"] = kmeans.labels_

tsne = TSNE(n_components=3, verbose=1, perplexity=15, n_iter=300)

tsne_results = tsne.fit_transform(np.array(data))

x = tsne_results[:, 0]
y = tsne_results[:, 1]
z = tsne_results[:, 2]

colors = ["red", "green", "blue"]

# plot using go
fig = go.Figure(data=[go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    text=df["category"],
    marker=dict(
        size=12,
        color=df["Cluster"],
        colorscale='Viridis',
        opacity=0.8
    )
)])

# open in html
fig.write_html('first_figure.html', auto_open=True)

fig.show()





[t-SNE] Computing 46 nearest neighbors...
[t-SNE] Indexed 377 samples in 0.000s...
[t-SNE] Computed neighbors for 377 samples in 0.024s...
[t-SNE] Computed conditional probabilities for sample 377 / 377
[t-SNE] Mean sigma: 0.164136
[t-SNE] KL divergence after 250 iterations with early exaggeration: 63.579132
[t-SNE] KL divergence after 300 iterations: 0.807748
Sample size:  377


In [None]:
# plot embeddings with colours equal to category

import random

colors = {k: [random.random(), random.random(), random.random()] for k in df["category"].unique()}

for category in df["category"].unique():
    xs = x[df["category"] == category]
    ys = y[df["category"] == category]
    plt.scatter(xs, ys, c=colors[category], alpha=0.5, label=category)

In [None]:
# plot embeddings with colours equal to red, green, and blue, based on cluster

predict = kmeans.predict(data)

colors = ["red", "green", "blue"]

for c, color in enumerate(colors):
    xs = x[predict == c]
    ys = y[predict == c]
    plt.scatter(xs, ys, c=color, alpha=0.5)
    print("Cluster: ", c, "Size: ", len(xs))
    print(df[predict == c]["category"].value_counts())

plt.title("jamesg.blog, visualized with t-SNE")
plt.show()