In [None]:
# DATA PREPROCESSING
# - removed two empty lines
# - dropped duplicates
# - add ICPC codes as search terms
# - add ICD codes as search terms

In [1]:
import pandas as pd
import os
import csv


data_path = os.path.join("med_code_search","data", "icpc_3_pt.csv")
df = pd.read_csv(data_path, names=["icpc_3_code", "category", "value"], escapechar='\\')

In [2]:
df[df["category"]=="indexwords"]

Unnamed: 0,icpc_3_code,category,value
5,-101,indexwords,"exame físico, corpo inteiro"
6,-101,indexwords,avaliação do corpo todo
25,-102,indexwords,anamnese
26,-102,indexwords,avaliação da função cardiovascular
27,-102,indexwords,avaliação do sistema genital masculino
...,...,...,...
14006,ZC99.01,indexwords,exclusão social
14007,ZC99.01,indexwords,retirar-se do social
14011,ZC99.02,indexwords,imigrante ilegal
14012,ZC99.02,indexwords,"problema de vida cívica, comunitária e social"


In [8]:
import openai
from openai.embeddings_utils import get_embedding
import streamlit as st

api_key = st.secrets['openai']['key']
openai.api_key = api_key

def get_input_embedding(input):
    input_vector = get_embedding(input, engine="text-embedding-ada-002")
    return input_vector

2023-09-09 15:42:18.041 No runtime found, using MemoryCacheStorageManager


In [18]:
df.iloc[0]

icpc_3_code                                             -1
category                                         preferred
value          Intervenções de diagnóstico e monitoramento
Name: 0, dtype: object

In [11]:
len(get_input_embedding("tosse"))

1536

In [36]:
from tqdm import tqdm
import nanoid
import openai
from openai.embeddings_utils import get_embedding
import streamlit as st
import pandas as pd
import pinecone
import asyncio

import pandas as pd
import os
import csv


data_path = os.path.join("med_code_search","data", "icpc_3_pt.csv")
df = pd.read_csv(data_path, names=["icpc_3_code", "category", "value"], escapechar='\\')

# Initialize API keys
api_key_openai = st.secrets['openai']['key']
openai.api_key = api_key_openai
api_key_pinecone = st.secrets['pinecone']['key']
pinecone.init(api_key=api_key_pinecone, environment='us-east1-gcp')
index = pinecone.Index("icpc-3")


In [38]:
import xml.etree.ElementTree as ET

def remove_tags_and_text(xml_string, tag_name):
    # Wrap the XML string in a root element to make it well-formed
    root = ET.fromstring(f'<root>{xml_string}</root>')

    # List to hold elements to be removed
    elements_to_remove = []

    # Iterate through the XML tree and find elements to remove
    for parent in root.iter():
        elements_to_remove += [(parent, elem) for elem in parent.findall(tag_name)]

    # Remove the found elements from their respective parents
    for parent, elem in elements_to_remove:
        parent.remove(elem)

    # Convert the modified XML tree back to a string
    return ET.tostring(root, encoding='unicode')[6:-7]  # Remove the added <root> and </root> tags


def extract_text_from_xml(xml_string, tag_name):
    root = ET.fromstring(f'<Root>{xml_string}</Root>')  # Wrap in a root tag for parsing
    return [elem.text for elem in root.findall(f"{tag_name}")]


async def get_input_embedding(input):
    input_vector = get_embedding(input, engine="text-embedding-ada-002")
    return input_vector


def batch_generator(df: pd.DataFrame, batch_size: int = 100):
    num_batches = (df.shape[0] + batch_size - 1) // batch_size
    for i in tqdm(range(num_batches)):
        yield df.iloc[i * batch_size: (i + 1) * batch_size]


async def process_row(row):
    id = "_".join([row["icpc_3_code"], row["category"], nanoid.generate(size=5)])
    vector = await get_input_embedding(row["value"])
    metadata = row
    return (id, vector, metadata)


async def chunk_preprocessing(chunk: pd.DataFrame):
    tasks = [process_row(row) for row in chunk.to_dict('records')]
    return await asyncio.gather(*tasks)


async def upsert_to_pinecone(df, namespace: str):
    generator = batch_generator(df)
    for batch in generator:
        processed_chunk = await chunk_preprocessing(batch)
        index.upsert(vectors=processed_chunk, namespace=namespace)

# To run the entire asynchronous workflow
# await main(df)


In [44]:
input = "dificuldade para andar"


pinecone.init(api_key=api_key_pinecone, environment='us-east1-gcp')
index = pinecone.Index("icpc-3")

input_embedding = await get_input_embedding(input)

async def get_icd_10(icpc3_code: str, query_vector: list):
    result = index.query(
        namespace="icpc-3-v0",
        top_k=3,
        # include_values=True,
        include_metadata=True,
        vector=query_vector,
        filter={
            "category": "icd10",
            "icpc_3_code": icpc3_code
        }
    )
    return result["matches"]


result = index.query(
    namespace="icpc-3-v0",
    top_k=3,
    include_values=False,
    include_metadata=True,
    vector=input_embedding,
)

print(result["matches"])

for match in result["matches"]:
    print("CIAP-3", match["metadata"]["icpc_3_code"], match["metadata"]["value"])
    icd_codes = await get_icd_10(match["metadata"]["icpc_3_code"], input_embedding)
    for icd_code in icd_codes:
        xml_string = icd_code["metadata"]["value"]
        extracted_title = remove_tags_and_text(xml_string, 'Reference')
        extracted_code = extract_text_from_xml(xml_string, 'Reference')[0]
        print("CID-10", extracted_code, extracted_title)
    print("--"*25)

[{'id': 'RS04_indexwords_GcO7t',
 'metadata': {'category': 'indexwords',
              'icpc_3_code': 'RS04',
              'value': 'dificuldade ao respirar'},
 'score': 0.892814457,
 'values': []}, {'id': '2F31_inclusion_5PxVU',
 'metadata': {'category': 'inclusion',
              'icpc_3_code': '2F31',
              'value': 'usar um andador'},
 'score': 0.888801336,
 'values': []}, {'id': 'NS99_inclusion_8D8Jl',
 'metadata': {'category': 'inclusion',
              'icpc_3_code': 'NS99',
              'value': 'anormalidades da marcha'},
 'score': 0.877397478,
 'values': []}]
CIAP-3 RS04 dificuldade ao respirar
CID-10 R06.4 Hiperventilação 
CID-10 R06.3 Respiraçăo periódica 
CID-10 R06.1 Estridor 
--------------------------------------------------
CIAP-3 2F31 usar um andador
--------------------------------------------------
CIAP-3 NS99 anormalidades da marcha
CID-10 R26 Anormalidades da marcha e da mobilidade 
CID-10 R27 Outros distúrbios da coordenação 
CID-10 R29.8 Outros sintoma

In [43]:
df["category"].unique()

array(['preferred', 'description', 'exclusion', 'indexwords', 'inclusion',
       'note', 'shortTitle', 'codinghint', 'hasExtension', 'icd10'],
      dtype=object)

In [16]:
import xml.etree.ElementTree as ET

def extract_text_from_xml(xml_string, tag_name):
    root = ET.fromstring(f'<Root>{xml_string}</Root>')  # Wrap in a root tag for parsing
    return [elem.text for elem in root.findall(f"{tag_name}")]

xml_string = 'Problemas nas relações com cônjuge ou parceiro <Reference scheme="icd-10">Z63.0</Reference>'
extracted_texts = extract_text_from_xml(xml_string, 'Reference')
print(extract_text_from_xml(xml_string, 'Root'))

print(extracted_texts)


[]
['Z63.0']


In [20]:
import xml.etree.ElementTree as ET

def remove_tags_and_text(xml_string, tag_name):
    # Wrap the XML string in a root element to make it well-formed
    root = ET.fromstring(f'<root>{xml_string}</root>')

    # List to hold elements to be removed
    elements_to_remove = []

    # Iterate through the XML tree and find elements to remove
    for parent in root.iter():
        elements_to_remove += [(parent, elem) for elem in parent.findall(tag_name)]

    # Remove the found elements from their respective parents
    for parent, elem in elements_to_remove:
        parent.remove(elem)

    # Convert the modified XML tree back to a string
    return ET.tostring(root, encoding='unicode')[6:-7]  # Remove the added <root> and </root> tags


def extract_text_from_xml(xml_string, tag_name):
    root = ET.fromstring(f'<Root>{xml_string}</Root>')  # Wrap in a root tag for parsing
    return [elem.text for elem in root.findall(f"{tag_name}")]


xml_string = 'Problemas nas relações com cônjuge ou parceiro <Reference scheme="icd-10">Z63.0</Reference>'
modified_string = remove_tags_and_text(xml_string, 'Reference')
extracted_texts = extract_text_from_xml(xml_string, 'Reference')

print(modified_string)
print(extracted_texts)



Problemas nas relações com cônjuge ou parceiro 
['Z63.0']


In [8]:
df[
    (df["icpc_3_code"]=="AD66")
    &
    (df["category"]=="indexwords")
]

Unnamed: 0,icpc_3_code,category,value
2211,AD66,indexwords,asfixia em nascido vivo
2212,AD66,indexwords,sepse neonatal bacteriana
2213,AD66,indexwords,asfixia no parto
2214,AD66,indexwords,lesão no parto
2215,AD66,indexwords,trauma fetal no parto
2216,AD66,indexwords,distúrbios cardiovasculares presentes no perío...
2217,AD66,indexwords,distúrbios da regulação da temperatura do recé...
2218,AD66,indexwords,síndrome de abstinência de drogas em recém-nas...
2219,AD66,indexwords,paralisia de Erb Duchenne
2220,AD66,indexwords,paralisia de Erb devido a lesão no nascimento


In [22]:
len(df['icpc_3_code'].unique())

icpc_code_as_query = []
for code in df['icpc_3_code'].unique():
    row = {
        "icpc_3_code": code,
        "category": "codequery",
        "value": "CIAP-3 " + code
    }
    icpc_code_as_query.append(row)

icpc_code_as_query_df = pd.DataFrame.from_records(icpc_code_as_query)
icpc_code_as_query_df

Unnamed: 0,icpc_3_code,category,value
0,-1,codequery,CIAP-3 -1
1,-101,codequery,CIAP-3 -101
2,-102,codequery,CIAP-3 -102
3,-103,codequery,CIAP-3 -103
4,-104,codequery,CIAP-3 -104
...,...,...,...
1593,ZC99,codequery,CIAP-3 ZC99
1594,ZC99.00,codequery,CIAP-3 ZC99.00
1595,ZC99.01,codequery,CIAP-3 ZC99.01
1596,ZC99.02,codequery,CIAP-3 ZC99.02


In [23]:
df[df["category"]=='icd10']

icd_code_as_query = []
for row in df[df["category"]=='icd10'].to_dict('records'):
    extracted_code = extract_text_from_xml(row["value"], 'Reference')[0]
    new_row = {
        "icpc_3_code": row["icpc_3_code"],
        "category": "codequery",
        "value": "CID-10 " + extracted_code
    }
    icd_code_as_query.append(new_row)

icd_code_as_query_df = pd.DataFrame.from_records(icd_code_as_query)
icd_code_as_query_df

Unnamed: 0,icpc_3_code,category,value
0,AD01,codequery,CID-10 B05
1,AD02,codequery,CID-10 B01
2,AD03,codequery,CID-10 B06
3,AD04,codequery,CID-10 B27
4,AD13,codequery,CID-10 B08.0
...,...,...,...
2947,ZC99,codequery,CID-10 Z73.5
2948,ZC99,codequery,CID-10 Z58.0
2949,ZC99.00,codequery,CID-10 Z60.4
2950,ZC99.00,codequery,CID-10 Z60.5


In [32]:
df_with_codequery = pd.concat([df, icpc_code_as_query_df, icd_code_as_query_df])
df_with_codequery = df_with_codequery.drop_duplicates()
df_with_codequery.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18555 entries, 0 to 2951
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   icpc_3_code  18555 non-null  object
 1   category     18555 non-null  object
 2   value        18555 non-null  object
dtypes: object(3)
memory usage: 579.8+ KB


In [35]:
await upsert_to_pinecone(df_with_codequery, namespace="icpc-3-v0")

100%|██████████| 186/186 [1:38:04<00:00, 31.64s/it]


In [14]:


code_as_query = []
for code in df['icpc_3_code'].unique():
    row = {
        "icpc_3_code": code,
        "category": "codequery",
        "value": "CIAP-3 " + code
    }
    code_as_query.append(row)

code_as_query_df = pd.DataFrame.from_records(code_as_query)
code_as_query_df

AttributeError: 'DataFrame' object has no attribute 'unique'