## Text Clustering and Labeling Using the OpenAI API


In [12]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import hdbscan
import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px
import requests
from openai import OpenAI
from umap import UMAP
import tiktoken
import sys
from pathlib import Path
from tqdm import tqdm
import sklearn
import ast
# Load .env before building the client: OpenAI() looks up OPENAI_API_KEY in the
# environment when it is constructed, so on a fresh kernel the key has to be
# present by the time this line runs.
load_dotenv(find_dotenv(), override=True)
client = OpenAI()

In [2]:
# Read the nearest .env file into the process environment; override=True lets
# values from .env replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

# Hand the key to the openai module.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Report whether a non-empty key was found.
key_status = (
    "OpenAI API key is loaded successfully."
    if openai.api_key
    else "Error: OpenAI API key is not found. Check your .env file."
)
print(key_status)



OpenAI API key is loaded successfully.


In [3]:
# Embedding configuration.
embedding_model: str = "text-embedding-3-large"  # default model for embedding requests
embedding_encoding: str = "cl100k_base"  # tiktoken encoding name used for token counting
# NOTE(review): max_tokens is not referenced by any of the visible cells.
max_tokens: int = 8_000


In [4]:
# Project paths are derived from the process's current working directory.
current_directory: Path = Path.cwd()
# Input data and outputs are addressed relative to the directory one level up.
parent_dir: Path = current_directory.parent

In [5]:
# Read the categorical survey extract.
file_path = f'{parent_dir}/outputs/data/categorical'
pcs_csv_path = f'{file_path}/Patient Characteristics Survey (Years 2013 - 2022) (categorical).csv'
PCS_df = pd.read_csv(pcs_csv_path)

# Work on a fixed-size random sample; the seed keeps the selection repeatable.
sample_size = 10000
sample_PCS_df = PCS_df.sample(n=sample_size, random_state=42)


In [7]:
# Build one "Column: Value; Column: Value; ..." string per sampled row.

# 1. Stack only the original survey columns. Columns this notebook adds to
#    sample_PCS_df (intake, n_tokens, cluster, ...) are left out so that
#    re-running the cell does not fold earlier outputs back into the text.
survey_only = sample_PCS_df[PCS_df.columns]

# Drop missing cells explicitly, before any string conversion. Whether stack()
# drops them on its own depends on the pandas version / stack implementation,
# and once a NaN has been through astype(str) it is the text "nan" and can no
# longer be filtered out with dropna().
stacked = survey_only.stack().dropna()

# 2. Get headers and values, enforcing string types for safety
# .astype(str) on the index handles cases where a column name might be NaN or an integer
headers = stacked.index.get_level_values(1).astype(str)
values = stacked.astype(str).str.strip()

# 3. Create the formatted string "Column: Value"
formatted_values = headers + ": " + values

# 4. Group by the original row index (level=0) and join; assignment aligns on
#    the row index, so rows with no data at all come back as NaN here.
sample_PCS_df['intake'] = formatted_values.groupby(level=0).agg('; '.join)

# 5. Fill any rows that had zero data with an empty string
sample_PCS_df['intake'] = sample_PCS_df['intake'].fillna("")

In [8]:
# Tokenizer for the encoding named in `embedding_encoding`.
encoding = tiktoken.get_encoding(embedding_encoding)


def _count_tokens(text):
    """Return the number of tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


# NOTE(review): n_tokens is recorded but not compared against max_tokens in the
# visible cells.
sample_PCS_df["n_tokens"] = sample_PCS_df["intake"].map(_count_tokens)

#### Get embeddings and save them for future reuse

In [9]:
# Ensure an "embeddings" directory exists under the current working directory.
# NOTE(review): the embeddings are saved under {parent_dir}/outputs/embeddings
# further down, not here - confirm this directory is still needed.
Path('embeddings').mkdir(parents=True, exist_ok=True)

In [None]:
from tenacity import retry, stop_after_attempt, wait_random_exponential

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10))
def get_embeddings_batch(texts, model=embedding_model):
    """Embed a batch of texts with a single embeddings request.

    The tenacity decorator re-invokes the function after a failure, waiting a
    randomized exponential interval (1-60 s) between tries, for at most 10
    attempts.

    Args:
        texts: Iterable of values to embed. Each one is converted with str()
            and has its newlines replaced by spaces before being sent.
        model: Embedding model name; defaults to the notebook-level
            `embedding_model`.

    Returns:
        A list with one embedding vector per input, in input order. An empty
        batch returns an empty list without calling the API.
    """
    clean = [str(t).replace("\n", " ") for t in texts]
    if not clean:
        # Nothing to embed; skip the request instead of sending an empty input.
        return []

    response = client.embeddings.create(input=clean, model=model)

    # Order by each item's `index` so that result i always belongs to texts[i],
    # regardless of the order in which the items are listed in the response.
    ordered = sorted(response.data, key=lambda d: d.index)
    return [d.embedding for d in ordered]

# Embed the intake texts in fixed-size batches, one request per batch.
batch_size = 100
intake_texts = sample_PCS_df['intake']
batch_starts = range(0, len(sample_PCS_df), batch_size)

all_embeddings = []
for start in tqdm(batch_starts, desc="Embedding batches"):
    chunk = intake_texts.iloc[start:start + batch_size].tolist()
    all_embeddings += get_embeddings_batch(chunk)

# One row per text, in the same order as sample_PCS_df.
embeddings = np.vstack(all_embeddings)

# Persist the matrix so later sessions can load it instead of re-embedding.
file_path = f'{parent_dir}/outputs/embeddings'
os.makedirs(file_path, exist_ok=True)
np.save(f'{file_path}/embeddings.npy', embeddings)

Embedding batches: 100%|██████████| 100/100 [03:42<00:00,  2.22s/it]


In [None]:
# embeddings = np.load(f'{parent_dir}/outputs/embeddings/embeddings.npy')

In [None]:
# Cluster the embedding vectors with HDBSCAN.
hdb = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=10)
hdb.fit(embeddings)

# Store each row's label as a string; the later cells treat '-1' as noise.
cluster_labels = hdb.labels_.astype(str)
sample_PCS_df["cluster"] = cluster_labels


In [9]:
# Reduce dimensionality with UMAP (2-D projection used only for plotting)
umap = UMAP(n_components=2, random_state=42, n_neighbors=80, min_dist=0.1)

# np.asarray avoids copying the matrix when `embeddings` is already an ndarray.
df_umap = (pd.DataFrame(umap.fit_transform(np.asarray(embeddings)), columns=["x", "y"])
           .assign(cluster=hdb.labels_.astype(str))
           # Leave the noise points ('-1') out of the plot.
           .query("cluster != '-1'")
           # Labels are strings, so sort by their integer value; a plain string
           # sort would place '10' before '2' and scramble the legend order.
           .sort_values(by="cluster", key=lambda labels: labels.astype(int)))

# Visualize clusters
fig = px.scatter(df_umap, x="x", y="y", color="cluster", title="UMAP Clusters")
fig.show()


  warn(


In [10]:
# HDBSCAN labels of -1 mark noise; every other label is a cluster member.
is_noise = hdb.labels_ == -1
n_noise = is_noise.sum()
n_clustered = (~is_noise).sum()

print(f"Noise points: {n_noise}")
print(f"Clustered points: {n_clustered}")

Noise points: 1605
Clustered points: 8395


In [13]:
def generate_cluster_text(df, output_col, system_message, user_template, text_col="intake",
                          model="gpt-4o-mini", max_examples=20, temperature=0.7, seed=42):
    """Ask a chat model to describe each cluster and store the reply on its rows.

    `df` is modified in place: `output_col` is created (or overwritten) with
    "Pending" for every row, then each non-noise cluster's rows receive the
    model's reply, or "Error" if the request for that cluster fails.

    Args:
        df: DataFrame with a "cluster" column and the `text_col` column.
        output_col: Name of the column that receives the generated text.
        system_message: System prompt sent with every request.
        user_template: User prompt containing one positional `{}` placeholder,
            filled with the numbered example texts of the cluster.
        text_col: Column holding the per-row text shown to the model.
        model: Chat model name.
        max_examples: Maximum number of rows per cluster included in the
            prompt; None sends every row.
        temperature: Sampling temperature passed to the API.
        seed: Seed passed to the API.

    Returns:
        None.
    """
    df[output_col] = "Pending"

    for c in tqdm(df["cluster"].unique(), desc=f"Generating {output_col}"):
        # Noise is never sent to the model. Comparing via str() also covers
        # frames where the labels are integers rather than strings.
        if str(c) == '-1':
            continue

        # Compute the membership mask once and reuse it for reads and writes.
        in_cluster = df["cluster"] == c

        # Keep only the first `max_examples` rows (in frame order, not a random
        # draw) so the prompt stays within the context window.
        cluster_data = df.loc[in_cluster, text_col].tolist()[:max_examples]
        if not cluster_data:
            continue

        data_str = "\n\n".join(f"{i+1}. {entry}" for i, entry in enumerate(cluster_data))

        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": system_message},
                          {"role": "user", "content": user_template.format(data_str)}],
                temperature=temperature,
                seed=seed,
            )
            df.loc[in_cluster, output_col] = response.choices[0].message.content.strip()
        except Exception as e:
            # Best effort: report the failure, mark the cluster, and carry on
            # with the remaining clusters.
            print(f"Error for cluster {c}: {e}")
            df.loc[in_cluster, output_col] = "Error"

# One entry per generated column: topic titles first, then clinical notes.
cluster_text_jobs = (
    {
        "output_col": "cluster_name",
        "system_message": "You are an expert health economist analyzing the New York Patient Characteristics Survey.",
        "user_template": "Based on the following patient intake data, write a short descriptive title (5-10 words) that captures the defining characteristics of this patient group.\n\nDATA:\n\n{}\n\nTOPIC TITLE:",
    },
    {
        "output_col": "cluster_notes",
        "system_message": "You are an expert health economist tasked with writing clinical notes summarizing patient characteristics.",
        "user_template": "Write a concise clinical summary (one paragraph) of the key characteristics and health statuses of the patients in this cluster.\n\nDATA:\n\n{}\n\nCLINICAL NOTES:",
    },
)

# Run the jobs in order; each call writes its column on sample_PCS_df in place.
for job in cluster_text_jobs:
    generate_cluster_text(sample_PCS_df, **job)

Generating cluster_name: 100%|██████████| 12/12 [00:40<00:00,  3.38s/it]
Generating cluster_notes: 100%|██████████| 12/12 [01:29<00:00,  7.46s/it]


In [19]:
# Cluster labels are stored as strings, so order them by integer value
# (a plain string sort would list '10' before '2').
clusters = sorted(sample_PCS_df['cluster'].unique(), key=int)

for c in clusters:
    # Boolean mask rather than a string-built query expression. Every label
    # comes from this same column, so the selection always has at least one row.
    cluster_rows = sample_PCS_df[sample_PCS_df['cluster'] == c]

    # All rows of a cluster carry the same generated text; read it from the first.
    print(f"Topic Title for Cluster {c}: {cluster_rows.cluster_name.values[0]}")
    print(f"Clinical Notes: {cluster_rows.cluster_notes.values[0]}")
    print(f"Size: {len(cluster_rows)} patients")

Topic Title for Cluster -1: Pending
Clinical Notes: Pending
Size: 1605 patients
Topic Title for Cluster 0: "Unemployed Adults with Mental Illness and Substance Use"
Clinical Notes: The patient cluster primarily consists of adults from the Western Region, with a mix of male and female individuals, predominantly identifying as heterosexual, and primarily of White or Black race. Most patients are unemployed and not looking for work, with various education levels ranging from middle school to college. Mental illness is a common diagnosis among this group, often accompanied by substance-related disorders; serious mental illness is noted in several cases. Many patients live alone or in private residences, and a significant number report issues with smoking. The insurance status varies, with some individuals lacking any form of insurance, while others are covered by Medicaid and Medicare. Notably, many of these patients have criminal justice involvement. Chronic health conditions are inconsis

In [22]:
# Write the labeled sample to disk.
file_path = f'{parent_dir}/outputs/clusters'
# exist_ok=True lets the cell be re-run once the directory already exists,
# matching the other makedirs calls in this notebook.
os.makedirs(file_path, exist_ok=True)
sample_PCS_df.to_csv(f'{file_path}/clustered_patients.csv', index=False)