In [1]:
import pandas as pd
import numpy as np
from openai import OpenAI

In [2]:
# API client used by get_embedding() below to request embeddings.
client = OpenAI()
# Paper records loaded from a CSV path relative to the working directory;
# later cells read its `summary` and `title` columns.
df = pd.read_csv("extracted_papers.csv")

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` as a NumPy array.

    Newlines in ``text`` are replaced by single spaces, the cleaned string is
    sent as a one-element batch to the module-level ``client``, and the
    embedding of the first (only) returned item is converted to an array.

    Args:
        text: String to embed.
        model: Name of the embedding model passed through to the API call.

    Returns:
        ``np.ndarray`` built from the ``embedding`` field of the first item in
        the response's ``data``.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return np.array(first_item.embedding)

In [3]:
# Embed every paper summary (get_embedding is invoked once per row) and store
# the resulting vectors in a new column on the same frame.
df["ada_embedding"] = df["summary"].apply(
    lambda summary: get_embedding(summary, model="text-embedding-3-small")
)

In [4]:
# Per-paper embedding vectors pulled out of the frame: an object array whose
# entries are the 1-D arrays stored in the `ada_embedding` column.
embedding_array = df['ada_embedding'].to_numpy()
# Stack the vectors row-wise into a single 2-D matrix with one row per paper
# and one column per embedding dimension. np.stack replaces the earlier
# newaxis / concatenate / transpose round trip: the values are the same, but
# the result is a C-contiguous array rather than a transposed view.
concatenated_array = np.stack(list(embedding_array), axis=0)

In [5]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Fixed seed so the projection and the cluster labels are reproducible across
# kernel restarts: KMeans initialisation is random, and PCA may select a
# randomized SVD solver depending on the input size.
RANDOM_SEED = 42
# Number of clusters. 8 is the value KMeans() used implicitly before; it is
# spelled out here so it is visible and easy to tune.
N_CLUSTERS = 8

# Reduce the embedding matrix to two principal components, giving each paper
# an (x, y) coordinate.
projected_array = PCA(n_components=2, random_state=RANDOM_SEED).fit_transform(concatenated_array)
# Cluster the papers in the 2-D projected space (not in the full embedding
# space) and keep the per-paper cluster label.
kmeans_array = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED).fit(projected_array).labels_

In [6]:
# Assemble the export table in one step: the paper title, its 2-D projected
# coordinates, and the cluster label assigned to it. The frame takes its index
# from df["title"]; the array columns are matched to it by position.
new_df = pd.DataFrame(
    {
        "paper_title": df["title"],
        "paper_x": projected_array[:, 0],
        "paper_y": projected_array[:, 1],
        "cluster_index": kmeans_array,
    }
)
# Write the result next to the notebook; the frame's index is included in the
# file because to_csv is called with its default options.
new_df.to_csv("clustered_papers.csv")