## Text Clustering and Labeling Using the OpenAI API


In [None]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import hdbscan
import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px
import requests
from openai import OpenAI
from umap import UMAP
import tiktoken
import sys
from pathlib import Path
from tqdm import tqdm
import sklearn
import ast

In [None]:
# Load the environment variables from the .env file
load_dotenv(find_dotenv(), override=True)

# Read the key first: constructing OpenAI() without a configured key raises
# immediately, which would make the check below unreachable.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Stop here with an actionable message instead of failing later inside an API call.
if not openai.api_key:
    raise RuntimeError("Error: OpenAI API key is not found. Check your .env file.")

# Client used by the embedding and chat-completion calls in this notebook.
client = OpenAI()
print("OpenAI API key is loaded successfully.")



OpenAI API key is loaded successfully.


In [None]:
# Model name passed to the OpenAI embeddings endpoint.
embedding_model = "text-embedding-3-large"
# Name of the tiktoken encoding used to count tokens in the intake text.
embedding_encoding = "cl100k_base"
# NOTE(review): not referenced by any cell visible in this part of the notebook;
# presumably the intended per-input token cap — confirm it is enforced somewhere or remove it.
max_tokens = 8000


In [5]:
# Current working directory of the kernel process.
current_directory = Path.cwd()
# Its parent directory; the data and output paths used in this notebook are built from it.
parent_dir = current_directory.parent

In [5]:
# Directory that holds the categorical export of the survey.
file_path = f'{parent_dir}/outputs/data/categorical'
PCS_df = pd.read_csv(f'{file_path}/Patient Characteristics Survey (Years 2013 - 2022) (categorical).csv')

# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when n exceeds the population and replace=False.
sample_size = min(10000, len(PCS_df))
# Fixed random_state keeps the sample identical across kernel restarts.
sample_PCS_df = PCS_df.sample(n=sample_size, random_state=42)


In [7]:
# Columns this notebook derives itself. They are excluded from the source
# columns so that re-running this cell does not fold a previous 'intake'
# string (or its token count) back into the new text.
derived_cols = ['intake', 'n_tokens']
source_df = sample_PCS_df.drop(columns=derived_cols, errors='ignore')

# 1. Stack to long form: one entry per (row, column) pair. Missing cells are
#    dropped explicitly *before* the string conversion below; after
#    astype(str) a NaN would become the literal text "nan" and could no
#    longer be removed with dropna().
stacked = source_df.stack().dropna()

# 2. Get headers and values, enforcing string types for safety
#    (.astype(str) on the index handles non-string column names).
headers = stacked.index.get_level_values(1).astype(str)
values = stacked.astype(str).str.strip()

# 3. Create the formatted string "Column: Value"
formatted_values = headers + ": " + values

# 4. Group by the original row index (level=0) and join into one string per row
sample_PCS_df['intake'] = formatted_values.groupby(level=0).agg('; '.join)

# 5. Rows that had no data at all get an empty string instead of NaN
sample_PCS_df['intake'] = sample_PCS_df['intake'].fillna("")

In [8]:
# Tokenizer selected by the encoding name configured for the embedding model.
encoding = tiktoken.get_encoding(embedding_encoding)


def _count_tokens(text):
    """Return how many tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


# Token count of every intake string, stored next to the text itself.
sample_PCS_df["n_tokens"] = sample_PCS_df["intake"].map(_count_tokens)

#### Get embeddings and save them for future reuse

In [9]:
# Creates an `embeddings` directory relative to the current working directory.
# NOTE(review): the embedding artefacts visible in this notebook are written to and
# read from f'{parent_dir}/outputs/embeddings', not this directory — confirm whether
# this call is still needed.
os.makedirs('embeddings', exist_ok=True)

In [None]:
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)

# Failures that can succeed on a later attempt. Anything else (for example an
# invalid-request or authentication error) is raised immediately instead of
# being retried ten times with exponential back-off.
_TRANSIENT_OPENAI_ERRORS = (
    openai.RateLimitError,
    openai.APIConnectionError,
    openai.APITimeoutError,
    openai.InternalServerError,
)


@retry(
    retry=retry_if_exception_type(_TRANSIENT_OPENAI_ERRORS),
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(10),
)
def get_embeddings_batch(texts, model=embedding_model):
    """Embed a batch of texts with a single embeddings request.

    Parameters
    ----------
    texts : iterable
        Items to embed. Each item is converted with ``str()`` and its newlines
        are replaced by spaces before being sent.
    model : str
        Embedding model name; defaults to the notebook-level ``embedding_model``.

    Returns
    -------
    list
        One embedding per input, in the same order as ``texts``. An empty
        input yields an empty list without calling the API.

    Raises
    ------
    tenacity.RetryError
        When a transient error persists through all ten attempts.
    Exception
        Any non-transient error from the client is propagated unchanged.
    """
    clean = [str(t).replace("\n", " ") for t in texts]
    if not clean:
        # Nothing to embed; skip the request entirely.
        return []

    response = client.embeddings.create(input=clean, model=model)

    # Each returned item carries the position of its input; order by it so the
    # result lines up with `texts` regardless of the order of `response.data`.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

# Number of intake strings sent per embeddings request.
batch_size = 100

# Materialise the text column once and slice the plain list per batch.
intake_texts = sample_PCS_df['intake'].tolist()
batch_starts = range(0, len(intake_texts), batch_size)

all_embeddings = []
for start in tqdm(batch_starts, desc="Embedding batches"):
    all_embeddings += get_embeddings_batch(intake_texts[start:start + batch_size])

# One row per intake string.
embeddings = np.vstack(all_embeddings)

# Persist the matrix so later cells can reload it without calling the API again.
file_path = f'{parent_dir}/outputs/embeddings'
os.makedirs(file_path, exist_ok=True)
np.save(f'{file_path}/embeddings.npy', embeddings)

Embedding batches: 100%|██████████| 100/100 [03:42<00:00,  2.22s/it]


In [22]:
# Reload the saved embedding matrix and the accompanying table from disk.
# NOTE(review): no cell visible in this part of the notebook writes
# embeddings.csv — confirm where it is produced and that its row order matches
# the rows of embeddings.npy.
embeddings = np.load(f'{parent_dir}/outputs/embeddings/embeddings.npy')
sample_PCS_df = pd.read_csv(f'{parent_dir}/outputs/embeddings/embeddings.csv')

# Cluster labels computed from `embeddings` are later assigned to
# `sample_PCS_df` positionally, so the two must have the same number of rows.
if len(embeddings) != len(sample_PCS_df):
    raise ValueError(
        f"embeddings.npy has {len(embeddings)} rows but embeddings.csv has "
        f"{len(sample_PCS_df)}; the two artefacts are out of sync."
    )

In [23]:
# Project the embeddings down to 30 dimensions; the result is what the
# clustering cells are fitted on.
umap_cluster = UMAP(
    n_components=30,
    n_neighbors=80,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced = umap_cluster.fit_transform(embeddings)



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [24]:
from itertools import product

# Grid search over HDBSCAN settings on the reduced embeddings. Each
# combination is scored by hdbscan's relative validity (reported as 'dbcv'),
# which requires the minimum spanning tree to be generated.
results = []
for mcs, ms, method in product([75, 100, 150, 200],
                               [5, 10, 20],
                               ['eom', 'leaf']):
    hdb = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=ms,
                          cluster_selection_method=method,
                          gen_min_span_tree=True).fit(reduced)

    labels = hdb.labels_
    # Count distinct non-noise labels. Subtracting 1 from the number of unique
    # labels is only correct when a noise label (-1) is actually present and
    # undercounts by one for runs without any noise points.
    n_clusters = int(np.unique(labels[labels != -1]).size)
    noise_pct = (labels == -1).mean()
    validity = hdb.relative_validity_

    results.append({'min_cluster_size': mcs, 'min_samples': ms,
                    'method': method, 'n_clusters': n_clusters,
                    'noise_pct': round(noise_pct, 3), 'dbcv': round(validity, 4)})

# Best-scoring configurations first.
results_df = pd.DataFrame(results).sort_values('dbcv', ascending=False)
print(results_df.to_string(index=False))

 min_cluster_size  min_samples method  n_clusters  noise_pct   dbcv
               75            5    eom          42      0.014 0.7480
              100           20    eom          32      0.009 0.7367
               75           10    eom          42      0.017 0.7268
              150           20    eom          22      0.022 0.7265
               75           20    eom          42      0.014 0.7191
              100           10    eom          33      0.005 0.7117
              200            5    eom          19      0.030 0.6764
              200           10    eom          19      0.030 0.6731
              100            5    eom          37      0.010 0.6681
              200           20    eom          19      0.054 0.6673
              150            5    eom          25      0.022 0.6404
              150           10    eom          25      0.015 0.6395
              150            5   leaf          33      0.040 0.4392
              200           10   leaf          2

In [25]:
# Final clustering run with hand-picked parameters (edit these to use a
# different configuration from the grid search).
# NOTE(review): min_cluster_size=250 lies outside the grid searched above
# (75-200) — confirm this choice is intentional.
final_hdbscan_params = {
    "min_cluster_size": 250,
    "min_samples": 10,
    "cluster_selection_method": 'eom',
}
hdb = hdbscan.HDBSCAN(**final_hdbscan_params).fit(reduced)

# Store the labels as strings; noise points carry the label "-1".
sample_PCS_df["cluster"] = hdb.labels_.astype(str)

In [26]:
# Separate 2-D projection used only for plotting.
umap_viz = UMAP(n_components=2, random_state=42, n_neighbors=80, min_dist=0.1)
viz_coords = umap_viz.fit_transform(embeddings)


# Attach the cluster labels, drop noise points, and order rows by the numeric
# value of the label. Sorting the string labels directly is lexicographic
# ("0", "1", "10", "11", ..., "2"), and the legend follows the order in which
# categories first appear in the data.
df_umap = (pd.DataFrame(viz_coords, columns=["x", "y"])
           .assign(cluster=hdb.labels_.astype(str))
           .query("cluster != '-1'")
           .sort_values(by="cluster", key=lambda s: s.astype(int)))

fig = px.scatter(df_umap, x="x", y="y", color="cluster", title="UMAP Clusters")
fig.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [19]:
# Boolean mask of the points HDBSCAN labelled as noise (-1).
is_noise = hdb.labels_ == -1

print(f"Noise points: {is_noise.sum()}")
print(f"Clustered points: {(~is_noise).sum()}")

Noise points: 46
Clustered points: 9954


In [27]:
# Size of every real cluster (noise label "-1" excluded), smallest first.
is_clustered = sample_PCS_df["cluster"] != "-1"
sample_PCS_df.loc[is_clustered, "cluster"].value_counts().sort_values()

cluster
8      253
4      262
5      265
15     360
14     382
9      433
7      495
10     496
13     501
3      506
6      511
16     556
11     560
12     560
1      727
0     1148
2     1258
Name: count, dtype: int64

In [47]:
# Columns left out of the text that is shown to the language model.
drop_cols = ['Region Served',
             'Three Digit Residence Zip Code',
             'Survey Year',
             'intake',
             'n_tokens',
             'embedding',
             'cluster_name',
             'cluster_notes']

# errors='ignore': several of these columns only exist when the reloaded CSV
# was written by an earlier run; a table without them must not raise KeyError.
intake_df = sample_PCS_df.drop(columns=drop_cols, errors='ignore')

In [48]:
# Columns that are not patient characteristics: the cluster label and the
# columns this notebook adds to intake_df. Leaving them in would write
# "cluster: <id>" into every prompt entry and, on a re-run of this cell, fold
# previously generated text back into the new 'intake' string.
non_feature_cols = ['cluster', 'intake', 'cluster_name', 'cluster_notes']
feature_df = intake_df.drop(columns=non_feature_cols, errors='ignore')

# 1. Stack to long form: one entry per (row, column) pair. Missing cells are
#    dropped explicitly *before* the string conversion below; after
#    astype(str) a NaN would become the literal text "nan" and could no
#    longer be removed with dropna().
stacked = feature_df.stack().dropna()

# 2. Get headers and values, enforcing string types for safety
#    (.astype(str) on the index handles non-string column names).
headers = stacked.index.get_level_values(1).astype(str)
values = stacked.astype(str).str.strip()

# 3. Create the formatted string "Column: Value"
formatted_values = headers + ": " + values

# 4. Group by the original row index (level=0) and join into one string per row
intake_df['intake'] = formatted_values.groupby(level=0).agg('; '.join)

# 5. Rows that had no data at all get an empty string instead of NaN
intake_df['intake'] = intake_df['intake'].fillna("")

In [55]:
import random

def generate_cluster_text(df, output_col, system_message, user_template, text_col="intake",
                          sample_size=20, model="gpt-4o-mini", seed=42):
    """Ask a chat model for one piece of text per cluster and store it on ``df``.

    For every cluster label except the noise label, up to ``sample_size``
    entries of ``text_col`` are drawn, numbered, and inserted into
    ``user_template``. The model's reply is written to ``output_col`` for
    every row of that cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"cluster"`` column and ``text_col``. Modified in
        place: ``output_col`` is created (or overwritten) on it.
    output_col : str
        Column that receives the generated text. Rows start as ``"Pending"``
        (noise rows keep that value); rows of a cluster whose request failed
        are set to ``"Error"``.
    system_message : str
        System prompt sent with every request.
    user_template : str
        ``str.format`` template with two positional fields: the full cluster
        size and the numbered sample text.
    text_col : str
        Column holding the per-row text to sample from.
    sample_size : int
        Maximum number of entries shown to the model per cluster.
    model : str
        Chat model name.
    seed : int or None
        Seeds both the local sampling and the API request so that repeated
        calls select the same examples.

    Returns
    -------
    None
    """
    df[output_col] = "Pending"

    # Private generator: sampling is reproducible and leaves the global
    # `random` state untouched.
    rng = random.Random(seed)

    for c in tqdm(df["cluster"].unique(), desc=f"Generating {output_col}"):
        # Noise is skipped whether the label is stored as the string '-1' or the integer -1.
        if str(c) == '-1':
            continue

        # Computed once and reused for both the read and the write-back.
        in_cluster = df["cluster"] == c
        cluster_data = df.loc[in_cluster, text_col].tolist()
        if not cluster_data:
            continue

        # The size reported to the model is the full cluster, not the sample.
        cluster_size = len(cluster_data)

        if cluster_size > sample_size:
            cluster_data = rng.sample(cluster_data, sample_size)

        data_str = "\n\n".join(f"{i+1}. {entry}" for i, entry in enumerate(cluster_data))

        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": system_message},
                          {"role": "user", "content": user_template.format(cluster_size, data_str)},],
                temperature=0.7, seed=seed,)
            df.loc[in_cluster, output_col] = response.choices[0].message.content.strip()
        except Exception as e:
            # Best effort: one failed cluster is reported and marked, the rest still run.
            print(f"Error for cluster {c}: {e}")
            df.loc[in_cluster, output_col] = "Error"

# One entry per generated column: the topic title first, then the clinical notes.
cluster_text_jobs = [
    {
        "output_col": "cluster_name",
        "system_message": "You are an expert health economist analyzing the New York Patient Characteristics Survey.",
        "user_template": "Based on the following sample of patient intake data from a cluster of {} patients, write a short descriptive title (5-10 words) that captures the defining characteristics of this patient group.\n\nDATA:\n\n{}\n\nTOPIC TITLE:",
    },
    {
        "output_col": "cluster_notes",
        "system_message": "You are an expert health economist tasked with writing clinical notes summarizing patient characteristics.",
        "user_template": "Write a concise clinical summary (one paragraph) of the key characteristics and health statuses of the patients in this cluster of {} patients. Include what distinguishes this group from a general mental health population, and end with 1-2 sentences on the specific needs and recommended interventions for this population.\n\nDATA:\n\n{}\n\nCLINICAL NOTES:",
    },
]

# Each job fills its own column of intake_df in place.
for job in cluster_text_jobs:
    generate_cluster_text(intake_df, **job)

Generating cluster_name: 100%|██████████| 18/18 [00:42<00:00,  2.37s/it]
Generating cluster_notes: 100%|██████████| 18/18 [01:39<00:00,  5.55s/it]


In [56]:
# Cluster ids are stored as strings; order them by integer value so the
# report reads -1, 0, 1, 2, ... instead of the lexicographic -1, 0, 1, 10, ...
clusters = sorted(intake_df['cluster'].unique(), key=int)

# Plain print() of Python strings is not truncated by pandas display options,
# so no option_context is needed to show the full text.
for c in clusters:
    cluster_rows = intake_df[intake_df['cluster'] == c]
    if not cluster_rows.empty:
        # Every row of a cluster carries the same generated text; read the first.
        print(f"Topic Title for Cluster {c}: {cluster_rows.cluster_name.values[0]}")
        print(f"Clinical Notes: {cluster_rows.cluster_notes.values[0]}")
        print(f"Size: {len(cluster_rows)} patients")
    else:
        print(f"No data found for cluster {c}")

Topic Title for Cluster -1: Pending
Clinical Notes: Pending
Size: 727 patients
Topic Title for Cluster 0: "Unemployed Adults with Serious Mental Illness"
Clinical Notes: This clinical summary encapsulates a diverse cohort of 1,148 patients primarily consisting of adult males (with some females and children) from various living situations, predominantly residing alone or cohabitating. Most patients are White, with a significant representation of individuals identifying as Hispanic/Latino, particularly among adults. A notable feature of this group is the high prevalence of serious mental illnesses, including substance-related disorders, with many patients also facing comorbid health conditions such as hypertension, diabetes, and mobility impairments. Compared to a general mental health population, this cluster is distinguished by a higher unemployment rate, a significant portion being out of the labor force and actively seeking work, alongside substantial engagement with public assistanc