In [21]:
# DATA PREPROCESSING
# - removed two empty lines
# - dropped duplicates
# - add ICPC codes as search terms
# - add ICD codes as search terms

In [22]:
import pandas as pd
import os
import csv


data_path = os.path.join("med_code_search","data", "icpc_3_pt.csv")
df = pd.read_csv(data_path, names=["icpc_3_code", "category", "value"], escapechar='\\')

In [23]:
df[df["category"]=="indexwords"]

Unnamed: 0,icpc_3_code,category,value
5,-101,indexwords,"exame físico, corpo inteiro"
6,-101,indexwords,avaliação do corpo todo
25,-102,indexwords,anamnese
26,-102,indexwords,avaliação da função cardiovascular
27,-102,indexwords,avaliação do sistema genital masculino
...,...,...,...
14004,ZC99.01,indexwords,exclusão social
14005,ZC99.01,indexwords,retirar-se do social
14009,ZC99.02,indexwords,imigrante ilegal
14010,ZC99.02,indexwords,"problema de vida cívica, comunitária e social"


In [24]:
import openai
from openai.embeddings_utils import get_embedding
import streamlit as st

api_key = st.secrets['openai']['key']
openai.api_key = api_key

def get_input_embedding(input):
    input_vector = get_embedding(input, engine="text-embedding-ada-002")
    return input_vector

KeyError: 'st.secrets has no key "openai". Did you forget to add it to secrets.toml or the app settings on Streamlit Cloud? More info: https://docs.streamlit.io/streamlit-cloud/get-started/deploy-an-app/connect-to-data-sources/secrets-management'

In [None]:
df.iloc[0]

In [None]:
len(get_input_embedding("tosse"))

In [None]:
from tqdm import tqdm
import nanoid
import openai
from openai.embeddings_utils import get_embedding
import streamlit as st
import pandas as pd
import pinecone
import asyncio

import pandas as pd
import os
import csv


data_path = os.path.join("med_code_search","data", "icpc_3_pt.csv")
df = pd.read_csv(data_path, names=["icpc_3_code", "category", "value"], escapechar='\\')

# Initialize API keys
api_key_openai = st.secrets['openai']['key']
openai.api_key = api_key_openai
api_key_pinecone = st.secrets['pinecone']['key']
pinecone.init(api_key=api_key_pinecone, environment='us-east1-gcp')
index = pinecone.Index("icpc-3")


In [None]:
import xml.etree.ElementTree as ET

def remove_tags_and_text(xml_string, tag_name):
    # Wrap the XML string in a root element to make it well-formed
    root = ET.fromstring(f'<root>{xml_string}</root>')

    # List to hold elements to be removed
    elements_to_remove = []

    # Iterate through the XML tree and find elements to remove
    for parent in root.iter():
        elements_to_remove += [(parent, elem) for elem in parent.findall(tag_name)]

    # Remove the found elements from their respective parents
    for parent, elem in elements_to_remove:
        parent.remove(elem)

    # Convert the modified XML tree back to a string
    return ET.tostring(root, encoding='unicode')[6:-7]  # Remove the added <root> and </root> tags


def extract_text_from_xml(xml_string, tag_name):
    root = ET.fromstring(f'<Root>{xml_string}</Root>')  # Wrap in a root tag for parsing
    return [elem.text for elem in root.findall(f"{tag_name}")]


async def get_input_embedding(input):
    input_vector = get_embedding(input, engine="text-embedding-ada-002")
    return input_vector


def batch_generator(df: pd.DataFrame, batch_size: int = 100):
    num_batches = (df.shape[0] + batch_size - 1) // batch_size
    for i in tqdm(range(num_batches)):
        yield df.iloc[i * batch_size: (i + 1) * batch_size]


async def process_row(row):
    id = "_".join([row["icpc_3_code"], row["category"], nanoid.generate(size=5)])
    vector = await get_input_embedding(row["value"])
    metadata = row
    return (id, vector, metadata)


async def chunk_preprocessing(chunk: pd.DataFrame):
    tasks = [process_row(row) for row in chunk.to_dict('records')]
    return await asyncio.gather(*tasks)


async def upsert_to_pinecone(df, namespace: str):
    generator = batch_generator(df)
    for batch in generator:
        processed_chunk = await chunk_preprocessing(batch)
        index.upsert(vectors=processed_chunk, namespace=namespace)

# To run the entire asynchronous workflow
# await main(df)


In [None]:
input = "dificuldade para andar"


pinecone.init(api_key=api_key_pinecone, environment='us-east1-gcp')
index = pinecone.Index("icpc-3")

input_embedding = await get_input_embedding(input)

async def get_icd_10(icpc3_code: str, query_vector: list):
    result = index.query(
        namespace="icpc-3-v0",
        top_k=3,
        # include_values=True,
        include_metadata=True,
        vector=query_vector,
        filter={
            "category": "icd10",
            "icpc_3_code": icpc3_code
        }
    )
    return result["matches"]


result = index.query(
    namespace="icpc-3-v0",
    top_k=3,
    include_values=False,
    include_metadata=True,
    vector=input_embedding,
)

print(result["matches"])

for match in result["matches"]:
    print("CIAP-3", match["metadata"]["icpc_3_code"], match["metadata"]["value"])
    icd_codes = await get_icd_10(match["metadata"]["icpc_3_code"], input_embedding)
    for icd_code in icd_codes:
        xml_string = icd_code["metadata"]["value"]
        extracted_title = remove_tags_and_text(xml_string, 'Reference')
        extracted_code = extract_text_from_xml(xml_string, 'Reference')[0]
        print("CID-10", extracted_code, extracted_title)
    print("--"*25)

In [None]:
df["category"].unique()

In [None]:
import xml.etree.ElementTree as ET

def extract_text_from_xml(xml_string, tag_name):
    root = ET.fromstring(f'<Root>{xml_string}</Root>')  # Wrap in a root tag for parsing
    return [elem.text for elem in root.findall(f"{tag_name}")]

xml_string = 'Problemas nas relações com cônjuge ou parceiro <Reference scheme="icd-10">Z63.0</Reference>'
extracted_texts = extract_text_from_xml(xml_string, 'Reference')
print(extract_text_from_xml(xml_string, 'Root'))

print(extracted_texts)


In [None]:
import xml.etree.ElementTree as ET

def remove_tags_and_text(xml_string, tag_name):
    # Wrap the XML string in a root element to make it well-formed
    root = ET.fromstring(f'<root>{xml_string}</root>')

    # List to hold elements to be removed
    elements_to_remove = []

    # Iterate through the XML tree and find elements to remove
    for parent in root.iter():
        elements_to_remove += [(parent, elem) for elem in parent.findall(tag_name)]

    # Remove the found elements from their respective parents
    for parent, elem in elements_to_remove:
        parent.remove(elem)

    # Convert the modified XML tree back to a string
    return ET.tostring(root, encoding='unicode')[6:-7]  # Remove the added <root> and </root> tags


def extract_text_from_xml(xml_string, tag_name):
    root = ET.fromstring(f'<Root>{xml_string}</Root>')  # Wrap in a root tag for parsing
    return [elem.text for elem in root.findall(f"{tag_name}")]


xml_string = 'Problemas nas relações com cônjuge ou parceiro <Reference scheme="icd-10">Z63.0</Reference>'
modified_string = remove_tags_and_text(xml_string, 'Reference')
extracted_texts = extract_text_from_xml(xml_string, 'Reference')

print(modified_string)
print(extracted_texts)



In [None]:
df[
    (df["icpc_3_code"]=="AD66")
    &
    (df["category"]=="indexwords")
]

In [None]:
len(df['icpc_3_code'].unique())

icpc_code_as_query = []
for code in df['icpc_3_code'].unique():
    row = {
        "icpc_3_code": code,
        "category": "codequery",
        "value": "CIAP-3 " + code
    }
    icpc_code_as_query.append(row)

icpc_code_as_query_df = pd.DataFrame.from_records(icpc_code_as_query)
icpc_code_as_query_df

In [None]:
df[df["category"]=='icd10']

icd_code_as_query = []
for row in df[df["category"]=='icd10'].to_dict('records'):
    extracted_code = extract_text_from_xml(row["value"], 'Reference')[0]
    new_row = {
        "icpc_3_code": row["icpc_3_code"],
        "category": "codequery",
        "value": "CID-10 " + extracted_code
    }
    icd_code_as_query.append(new_row)

icd_code_as_query_df = pd.DataFrame.from_records(icd_code_as_query)
icd_code_as_query_df

In [None]:
df_with_codequery = pd.concat([df, icpc_code_as_query_df, icd_code_as_query_df])
df_with_codequery = df_with_codequery.drop_duplicates()
df_with_codequery.info()

In [None]:
await upsert_to_pinecone(df_with_codequery, namespace="icpc-3-v0")

In [None]:


code_as_query = []
for code in df['icpc_3_code'].unique():
    row = {
        "icpc_3_code": code,
        "category": "codequery",
        "value": "CIAP-3 " + code
    }
    code_as_query.append(row)

code_as_query_df = pd.DataFrame.from_records(code_as_query)
code_as_query_df