In [35]:
from dotenv import load_dotenv
import os
from openai import OpenAI
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Read key/value pairs from a local .env file into the process environment,
# so the API key never has to be written into the notebook itself.
load_dotenv()

# The key is looked up from the environment; it is None when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY')

# Shared OpenAI client used by the embedding calls in later cells.
client = OpenAI(api_key = api_key)

def embed_text_list(
    openai_client
    , openai_embedding_model
    , text_list
):
    """Embed a list of texts with an OpenAI embedding model.

    Parameters
    ----------
    openai_client
        Client object exposing ``embeddings.create(input=..., model=...)``.
    openai_embedding_model
        Name of the embedding model, passed through as ``model``.
    text_list
        The texts to embed, passed through unchanged as ``input``.

    Returns
    -------
    numpy.ndarray
        Array with one COLUMN per input text and one row per embedding
        dimension, i.e. shape ``(embedding_dim, n_texts)``. Column ``j``
        corresponds to ``text_list[j]``.
    """
    response = openai_client.embeddings.create(
        input = text_list
        , model = openai_embedding_model
    )

    def _input_position(pair):
        # Each returned item reports the position of the input it belongs to
        # in its `index` field; fall back to the position in the response
        # list when that field is absent.
        list_position, item = pair
        reported = getattr(item, 'index', None)
        return list_position if reported is None else reported

    # Order by the reported input position so that column j is guaranteed to be
    # the embedding of text_list[j], rather than relying on response ordering.
    # sorted() is stable, so already-ordered responses are left untouched.
    ordered = sorted(enumerate(response.data), key = _input_position)

    return np.transpose(np.array([item.embedding for _, item in ordered]))

In [32]:
# The data directory is machine-specific, so it can be overridden with the
# AKI_LLM_DATA_DIR environment variable (for example in the .env file read by
# load_dotenv()). The default is the original local path, so existing setups
# keep working unchanged.
text_csv_path = os.path.join(
    os.getenv(
        'AKI_LLM_DATA_DIR'
        , 'C:/Users/alire/OneDrive/data/statman_bitbucket/aki/LLM'
    )
    , 'all_text_columns_na_handled.csv'
)

# Free-text clinical columns, one row per operation.
dfText = pd.read_csv(text_csv_path)
dfText.head()

Unnamed: 0,project_id,operation_no,2.01 Diagnosis,2.02 Previous Procedure,2.07 Comorbid Conditions,3.09 Operation performed
0,PR-00000001,1,155500. Cardiac conduit complication;010125. P...,123610. Replacement of cardiac conduit;123601....,140206. 22q11 microdeletion with full DiGeorge...,123610. Replacement of cardiac conduit
1,PR-00000002,2,155516. Cardiac conduit failure;103604. Aortic...,No previous procedure,102000. No pre-procedural risk factors,123610. Replacement of cardiac conduit;121614....
2,PR-00000003,3,010101. Tetralogy of Fallot;090591. Pulmonary ...,No previous procedure,102000. No pre-procedural risk factors,122621. Absent pulmonary valve syndrome (Fallo...
3,PR-00000004,4,091591. Aortic regurgitation;091519. Congenita...,123601. RV to pulmonary artery conduit constru...,102000. No pre-procedural risk factors,121791. Aortic root replacement: valve sparing...
4,PR-00000005,5,155516. Cardiac conduit failure;090101. Common...,123601. RV to pulmonary artery conduit constru...,160305. Lung disease;158210. Renal failure;140...,123610. Replacement of cardiac conduit;123452....


In [33]:
# The data directory is machine-specific, so it can be overridden with the
# AKI_LLM_DATA_DIR environment variable (for example in the .env file read by
# load_dotenv()). The default is the original local path, so existing setups
# keep working unchanged.
embeddings_csv_path = os.path.join(
    os.getenv(
        'AKI_LLM_DATA_DIR'
        , 'C:/Users/alire/OneDrive/data/statman_bitbucket/aki/LLM'
    )
    , 'March2024'
    , 'openai_3large_operation.csv'
)

# Pre-computed embeddings: identifier columns followed by X1..Xn embedding dimensions.
dfEmbeddings = pd.read_csv(embeddings_csv_path)
dfEmbeddings.head()

Unnamed: 0,project_id,operation_no,X1,X2,X3,X4,X5,X6,X7,X8,...,X3063,X3064,X3065,X3066,X3067,X3068,X3069,X3070,X3071,X3072
0,PR-00000001,1,0.011898,0.036309,-0.005463,0.008832,0.01476,0.027967,-0.035566,0.017884,...,-0.010715,0.0108,-0.012413,0.012261,0.008452,0.008917,0.010977,-0.029892,-0.010766,-0.010437
1,PR-00000002,2,0.000332,0.027386,-0.020735,-0.030093,0.011078,0.034497,-0.051592,-0.001025,...,-0.004109,0.002861,0.003123,-0.000474,0.001826,0.023946,0.022539,-0.020903,-0.003339,0.002045
2,PR-00000003,3,-0.013588,0.006824,-0.002654,0.027271,0.033145,0.019402,-0.031663,0.027783,...,0.004871,0.006225,-2.5e-05,0.002941,-0.011971,0.019294,0.002917,0.003523,0.004716,0.002019
3,PR-00000004,4,0.028444,0.027613,-0.017272,-0.006343,0.022245,0.052718,-0.014736,0.047054,...,0.011954,-0.00659,0.000384,-0.010179,0.002074,0.015004,0.001219,-0.010087,0.011813,0.037503
4,PR-00000005,5,0.015713,0.057943,-0.016102,0.00951,-0.026413,0.027086,-0.050581,0.033581,...,-0.001399,-0.003857,-0.01281,-0.008986,-0.001552,0.009922,0.005754,-0.01391,-0.013042,-0.006614


In [31]:
# Embed the '2.02 Previous Procedure' text of the rows labelled 0 through 4
# (label-based .loc slicing includes the end label) as a quick check of the
# embedding helper against the live API.
my_embeddings = embed_text_list(
    client
    , 'text-embedding-3-large'
    , dfText['2.02 Previous Procedure'].loc[:4].tolist()
)

array([[ 0.00302609, -0.00116791, -0.00116791, -0.00629944, -0.00614498,
        -0.00116791],
       [ 0.03218896, -0.00585787, -0.00585787,  0.03419048,  0.03267848,
        -0.00585787],
       [-0.01271945, -0.00282817, -0.00282817, -0.00728773, -0.01119278,
        -0.00282817],
       ...,
       [-0.02166959,  0.00860589,  0.00860589, -0.00698596, -0.01503684,
         0.00860589],
       [-0.01968242, -0.00999365, -0.00999365, -0.00292528, -0.01933872,
        -0.00999365],
       [-0.01315316, -0.00654488, -0.00654488, -0.00990558, -0.00945624,
        -0.00654488]])

In [43]:
# Cluster on the embedding dimensions only. Selecting the X<n> columns by name
# instead of by position keeps identifier columns such as operation_no out of
# the feature matrix, however many leading id/index columns the CSV carries.
embedding_features = dfEmbeddings.filter(regex = r'^X\d+$')
assert not embedding_features.empty, "no X<n> embedding columns found in dfEmbeddings"

# A fixed random_state makes the cluster assignments reproducible across
# kernel restarts; k-means initialisation is otherwise random.
my_kmeans = KMeans(n_clusters = 10, n_init = 10, random_state = 0).fit(embedding_features)
my_labels = my_kmeans.labels_
my_labels

array([4, 4, 7, 4, 4, 0, 4, 4, 6, 9, 4, 4, 4, 0, 4, 4, 9, 2, 4, 4, 4, 4,
       2, 4, 4, 1, 4, 4, 0, 4, 4, 4, 4, 4, 2, 6, 4, 4, 9, 4, 4, 4, 4, 9,
       9, 6, 4, 7, 9, 4, 4, 4, 5, 4, 4, 5, 4, 4, 4, 4, 4, 4, 6, 9, 4, 6,
       5, 2, 4, 6, 4, 4, 4, 4, 5, 5, 5, 6, 9, 4, 5, 9, 0, 9, 3, 4, 4, 5,
       5, 2, 4, 0, 4, 3, 4, 2, 5, 4, 3, 4, 2, 3, 4, 3, 4, 3, 4, 3, 6, 6,
       9, 0, 4, 5, 3, 3, 4, 6, 4, 4, 4, 5, 4, 2, 3, 4, 9, 4, 6, 4, 6, 4,
       4, 3, 6, 6, 4, 4, 9, 4, 3, 5, 4, 4, 4, 0, 2, 9, 3, 6, 4, 3, 2, 9,
       5, 7, 5, 4, 5, 0, 9, 5, 4, 5, 9, 5, 0, 9, 9, 3, 6, 4, 3, 1, 3, 7,
       4, 9, 0, 4, 6, 5, 4, 6, 5, 9, 3, 1, 6, 5, 9, 0, 5, 4, 1, 6, 6, 2,
       5, 7, 5, 4, 6, 5, 0, 5, 9, 1, 7, 4, 5, 1, 1, 4, 4, 4, 5, 3, 2, 5,
       5, 6, 4, 3, 1, 7, 9, 5, 5, 7, 2, 4, 6, 5, 1, 2, 0, 9, 2, 3, 2, 1,
       7, 7, 2, 7, 1, 2, 4, 0, 8, 9, 4, 7, 1, 3, 0, 3, 2, 4, 2, 2, 7, 0,
       7, 4, 1, 5, 4, 0, 3, 2, 9, 2, 1, 5, 5, 1, 3, 4, 9, 2, 4, 4, 5, 5,
       4, 5, 7, 7, 1, 1, 0, 2, 4, 3, 7, 2, 5, 2, 2,