### Classification Performance: Cosine Similarity of Vector Embeddings vs. LLM

##### Load Data

In [9]:
from datasets import load_dataset, concatenate_datasets

In [10]:
# Fetch the ICD description-mapping dataset.
# NOTE(review): trust_remote_code=True lets the dataset repository's own
# loading script run locally — confirm the source is trusted.
icd_dataset = load_dataset('krishnareddy/icddxdescmap', trust_remote_code=True)

# Stack the three splits (train, validation, test — in that order) into a
# single dataset so the whole corpus can be evaluated together.
split_names = ('train', 'validation', 'test')
icd_concat = concatenate_datasets([icd_dataset[name] for name in split_names])

# Work with the combined data as a pandas DataFrame from here on.
icd_data = icd_concat.to_pandas()
icd_data.head()

Unnamed: 0,docdesc,dxcode,shortdesc,longdesc
0,12 week IUP,Z3A.12,12 weeks gestation of pregnancy,12 weeks gestation of pregnancy
1,14 weeks pregnant,Z3A.14,14 weeks gestation of pregnancy,14 weeks gestation of pregnancy
2,15 weeks pregnant,Z3A.15,15 weeks gestation of pregnancy,15 weeks gestation of pregnancy
3,17 wks pregnant,Z3A.17,17 weeks gestation of pregnancy,17 weeks gestation of pregnancy
4,2 weeks pregnant,Z3A.20,20 weeks gestation of pregnancy,20 weeks gestation of pregnancy


##### Clean

In [11]:
# Remove '.' from the dataset's codes to match the ICD code set that we have
# regex=False: the '.' is a literal dot to delete, not the regex wildcard.
dotted_codes = icd_data['dxcode']
icd_data['dxcode'] = dotted_codes.str.replace('.', '', regex=False)

icd_data.head()

Unnamed: 0,docdesc,dxcode,shortdesc,longdesc
0,12 week IUP,Z3A12,12 weeks gestation of pregnancy,12 weeks gestation of pregnancy
1,14 weeks pregnant,Z3A14,14 weeks gestation of pregnancy,14 weeks gestation of pregnancy
2,15 weeks pregnant,Z3A15,15 weeks gestation of pregnancy,15 weeks gestation of pregnancy
3,17 wks pregnant,Z3A17,17 weeks gestation of pregnancy,17 weeks gestation of pregnancy
4,2 weeks pregnant,Z3A20,20 weeks gestation of pregnancy,20 weeks gestation of pregnancy


##### Load ICD Embeddings

In [12]:
from helpers.icd import open_icd_embeddings

# Load the stored ICD embeddings identified by the 'pritamdeka' key.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# './Saved Data/ICD Pickles/pritamdeka.pkl' — presumably the helper resolves
# that path relative to the working directory; confirm, and make sure the
# pickle exists before running the cells that use `embeddings`.
embedding_key = 'pritamdeka'
embeddings = open_icd_embeddings(embedding_key)

FileNotFoundError: [Errno 2] No such file or directory: './Saved Data/ICD Pickles/pritamdeka.pkl'

##### Embed Text from Dataset

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint used to encode the dataset's `docdesc` text.
# NOTE(review): presumably the same model that produced the stored
# 'pritamdeka' ICD embeddings — confirm, otherwise similarities are not comparable.
model_name = 'pritamdeka/S-PubMedBert-MS-MARCO'
model = SentenceTransformer(model_name)

In [None]:
# Encode the description of the first row (index label 0) as a smoke test.
first_description = icd_data.loc[0, 'docdesc']
encoding = model.encode(first_description)

In [None]:
from helpers.icd import get_top_k_similar

In [None]:
# Compare the encoded first-row description against the loaded ICD embeddings.
# In the recorded run this emitted five "CODE: similarity = score" lines.
# NOTE(review): the helper appears to print its results rather than return
# them (it also produces output when called inside a loop) — confirm.
get_top_k_similar(encoding, embeddings)

Z3A12: similarity = 0.9186
Z3A24: similarity = 0.9098
Z3A18: similarity = 0.9064
Z3A08: similarity = 0.9054
Z3A36: similarity = 0.9042


In [None]:
# Spot-check ten consecutive rows (positions 1000-1009): show each row's
# labelled ICD code, then the closest embedding matches for its description.
for position in range(1000, 1010):
    sample = icd_data.iloc[position]
    # One print call, same three output lines: separator, code, separator.
    print("-----------", sample['dxcode'], "-----", sep="\n")
    # Rebinds the notebook-level `encoding` on every pass.
    encoding = model.encode(sample['docdesc'])
    get_top_k_similar(encoding, embeddings)

-----------
D34
-----
D34: similarity = 1.0000
C73: similarity = 0.9728
D351: similarity = 0.9684
D110: similarity = 0.9676
D352: similarity = 0.9637
-----------
H8110
-----
H8113: similarity = 0.9590
H8112: similarity = 0.9569
H8111: similarity = 0.9527
H8110: similarity = 0.9414
H81313: similarity = 0.9150
-----------
H8110
-----
H8113: similarity = 0.9498
H8112: similarity = 0.9470
H8111: similarity = 0.9429
H8110: similarity = 0.9310
H81313: similarity = 0.9188
-----------
N400
-----
D291: similarity = 0.9466
N62: similarity = 0.9322
N8501: similarity = 0.9289
N400: similarity = 0.9262
D352: similarity = 0.9248
-----------
N400
-----
N401: similarity = 0.9309
N400: similarity = 0.9293
D291: similarity = 0.9166
N8501: similarity = 0.9104
M2603: similarity = 0.9091
-----------
N400
-----
D291: similarity = 0.9217
N401: similarity = 0.9149
D2921: similarity = 0.9147
N400: similarity = 0.9144
D3501: similarity = 0.9111
-----------
N401
-----
N401: similarity = 0.9247
N46123: similarity