### Generate ICD-10 code embeddings using GPT-3 to obtain a reduced_validation set for faster validation during training.


In [None]:
!pip install --upgrade openai -q

In [None]:
import openai
import os
# NOTE(review): the key is intentionally blank here; fill it in for a run and
# avoid saving/committing the notebook with a real key in this cell.
openai.api_key = ''
# Mirror the same key into the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = openai.api_key

import json
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Do not truncate long cell values (e.g. code names) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
import os
from google.colab import drive

# Mount Google Drive in the Colab runtime, then record the dataset folder that
# the later cells read from and write to.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

DS_HOME = '/content/gdrive/Shareddrives/PROJECT_ROOT_DIR/injury-icd-dataset'

In [None]:
# Load the distinct (label, label_name) pairs from the regular and the
# 5-character label files, each ordered by label, then stack the two tables.
label_columns = ['label', 'label_name']

icd_codes = pd.read_csv(os.path.join(DS_HOME, 'case-labels.csv'), usecols=label_columns)
icd_codes = icd_codes.drop_duplicates().sort_values('label')

icd_codes_5_char = pd.read_csv(os.path.join(DS_HOME, 'case-labels-5-char.csv'), usecols=label_columns)
icd_codes_5_char = icd_codes_5_char.drop_duplicates().sort_values('label')

icd_codes = pd.concat([icd_codes, icd_codes_5_char])

# Code names in the same order as the rows of `icd_codes`.
code_names = icd_codes['label_name'].tolist()
len(code_names)

In [None]:
# Request one embedding per ICD code name from the OpenAI embeddings endpoint.
model_id = 'text-similarity-davinci-001'
# `code_names` already holds the names from both label files (they are
# concatenated when `icd_codes` is built); the previously referenced
# `code_names_5_char` is never defined and raised a NameError. Passing exactly
# `code_names` also keeps each result's `index` aligned with that list, which
# the next cell relies on when it looks names up by index.
res = openai.Embedding.create(input=code_names, engine=model_id)

In [None]:
# Pair every returned embedding with the code name it was computed for; the
# response's `index` is the position of the name in the request input.
embeddings = pd.DataFrame(
    [
        {'label_name': code_names[item['index']], 'embedding': item['embedding']}
        for item in res['data']
    ],
    columns=['label_name', 'embedding'],
)
# Store each vector as a JSON string so it survives the CSV round trip.
embeddings.embedding = embeddings.embedding.apply(json.dumps)
# The same name can occur more than once in `code_names` (e.g. under two
# different labels). Keep a single embedding per name so the merge below
# attaches exactly one vector to each code row instead of multiplying rows.
embeddings = embeddings.drop_duplicates(subset='label_name')
embeddings = icd_codes.merge(embeddings, on='label_name')
embeddings.to_csv(os.path.join(DS_HOME, 'icd-name-davinci-001-embeddings.csv'), index=False)

In [None]:
# Reload the saved embeddings and decode each JSON-encoded vector back into a
# NumPy array.
embeddings_path = os.path.join(DS_HOME, 'icd-name-davinci-001-embeddings.csv')
embeddings = pd.read_csv(embeddings_path)
embeddings.embedding = embeddings.embedding.apply(
    lambda encoded: np.array(json.loads(encoded))
)
len(embeddings)

In [None]:
# Build a table with the cosine similarity of every ordered pair of codes whose
# labels differ, plus a min-max scaled copy of that score in `sim`.
labels = embeddings.label.to_numpy()
names = embeddings.label_name.to_numpy()

# One call on the stacked matrix computes all pairs at once, replacing the
# nested iterrows loops that called sklearn once per pair.
# Requires every embedding vector to have the same length.
vectors = np.stack(embeddings.embedding.tolist())
sim_matrix = cosine_similarity(vectors)

# Keep only pairs with different labels. np.nonzero walks the mask row by row,
# so the pairs come out in the same order the nested loops produced them.
first, second = np.nonzero(labels[:, None] != labels[None, :])

label_sim = pd.DataFrame({
    'label_1': labels[first],
    'label_2': labels[second],
    'label_name_1': names[first],
    'label_name_2': names[second],
    'davinci_cosine_similarity': sim_matrix[first, second],
})

min_sim = label_sim.davinci_cosine_similarity.min()
max_sim = label_sim.davinci_cosine_similarity.max()
sim_range = max_sim - min_sim
if sim_range > 0:
    label_sim['sim'] = (label_sim.davinci_cosine_similarity - min_sim) / sim_range
else:
    # No pairs, or all scores identical: scaling would divide by zero.
    label_sim['sim'] = 0.0
        


In [None]:
# Spot check: draw one code at random and list the codes paired with it,
# most similar first.
sampled_label = icd_codes.sample(1).iloc[0].label
sampled_pairs = label_sim[label_sim.label_1 == sampled_label]
sampled_pairs.sort_values('davinci_cosine_similarity', ascending=False)

In [None]:
# Write the pairwise similarity table to the dataset folder (no index column).
# The file name, including its spelling, is the on-disk name and is kept as-is.
scores_path = os.path.join(DS_HOME, 'icd-name-davinci-001-simularity-scores.csv')
label_sim.to_csv(scores_path, index=False)