In [7]:
import os
import shutil

In [8]:
sub_dir_path = "../THOR/Dataset/Disease A-Z/Annotated_Text"
target_dir_path = "./files"
extracted_dir_path = "./extracted_files"

os.makedirs(target_dir_path, exist_ok=True)
os.makedirs(extracted_dir_path, exist_ok=True)

In [9]:

for subdir, _, files in os.walk(sub_dir_path):
    for file in files:
        source_file_path = os.path.join(subdir, file)
        target_file_path = os.path.join(target_dir_path, file)
        

        if not os.path.exists(target_file_path):
            shutil.copy2(source_file_path, target_file_path)

In [10]:
def count_files(directory):
    # List all items in the directory
    items = os.listdir(directory)
    # Filter out only files (excluding directories)
    files = [item for item in items if os.path.isfile(os.path.join(directory, item))]
    return len(files)

In [11]:
print(count_files(target_dir_path))

314


In [12]:
import json
def read_jsonl_with_json(file_path):
    json_objects = []
    with open(file_path, "r") as file:
        for line in file:
            json_object = json.loads(line)
            json_objects.append(json_object)
    return json_objects

In [25]:
for filename in os.listdir(target_dir_path):
    file_path = os.path.join(target_dir_path,filename)
    if os.path.isfile(file_path):
        json_objects = read_jsonl_with_json(file_path)
        # print(json_objects)
        texts = [obj["text"] for obj in json_objects]
        combine_text = "\n".join(texts)
        # print(combine_text)
        extracted_file_path = os.path.join(extracted_dir_path, filename)
        with open(extracted_file_path, 'w') as file:
            file.write(combine_text)

In [14]:
def custom_relevance_score_fn(similarity_score: float) -> float:
    # Example calculation (customize as needed)
    relevance_score = 1 / (1 + similarity_score)
    return relevance_score

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, length_function=len, add_start_index=True)

modelPath = "sentence-transformers/sentence-t5-base"
custom_cache_dir = "/scratch/general/vast/u1471428/dcp/models"
chroma_persist_directory = "/scratch/general/vast/u1471428/dcp/chroma_db2"

embeddings_model = HuggingFaceEmbeddings(model_name=modelPath, cache_folder=custom_cache_dir)
db = Chroma(persist_directory= chroma_persist_directory , embedding_function=embeddings_model, relevance_score_fn=custom_relevance_score_fn,collection_name="try3")

In [11]:
print(db)

<langchain_community.vectorstores.chroma.Chroma object at 0x14a1743a4100>


In [31]:
for filename in os.listdir(extracted_dir_path):
    file_path = os.path.join(extracted_dir_path,filename)
    if os.path.isfile(file_path):
        long_text = ""
        with open(file_path, "r") as file:
            long_text = file.read()
        # print(long_text)
        long_text = long_text.replace("\n"," ")
        # print(long_text)

        chunks = splitter.split_text(long_text)
        chunks = splitter.create_documents(chunks)
        for chunk in chunks:
            # print(chunk)
            db.add_documents([chunk])

In [16]:
query = "Give overview for disease Chronic Dacryoadentitis?"
results = db.similarity_search_with_score(query,k=10)

In [23]:
print(result)

(Document(metadata={'start_index': 0}, page_content='long and joins lacrimal sac with the nasal cavity (nasolacrimal duct opens into the inferior meatus of the nose). #Treatment - Dacryocystitis - 9A11.2 Management should be carried out under medical supervision. Congenital dacryocystitis: Local massage over lacrimal sac: This is carried out by blocking the common canaliculus with finger and then stroking downward to increase hydrostatic pressure within lacrimal sac, which may open the membranous obstruction in the nasolacrimal duct. Topical'), 0.25609061121940613)


In [17]:
for result in results:
    print(result[0].page_content)
    print(result[1])

Dacryoadenitis often lacks the inflammatory signs and may present with enlargement only, then it has to be distinguished from the neoplasm of the gland. #Treatment - Dacryoadenitis - 1D80.Y Treatment of the disease should be under medical supervision. Treatment depends upon the onset of disease process and its aetiology. Acute dacryoadenitis: Viral dacryoadenitis: It is usually self-limiting and does not require any treatment. Bacterial dacryoadenitis: Broad spectrum antibiotics (such as
0.24124547839164734
carries a good prognosis. Chronic dacryoadenitis: Prognosis depends upon the underlying disease and its treatment. #Diagnosis - Dacryoadenitis - 1D80.Y Diagnosis is based on the clinical presentation of the patient and the supportive tests being conducted. Patient may present with: In acute dacryoadenitis: There is usually unilateral, severe pain, associated redness with pressure sensation in superotemporal region of the eye. In chronic dacryoadenitis: Unilateral or bilateral painle

In [26]:
db.delete_collection()

OperationalError: attempt to write a readonly database

In [17]:
## training 
import json
from constants import *
import pandas as pd
with open(qa_disease_precaution, "r") as file:
    json_data_p = json.load(file)

with open(qa_disease_symptom, "r") as file:
    json_data_s = json.load(file)

precaution_qa_df = pd.read_json(qa_disease_precaution)

symptom_qa_df = pd.read_json(qa_disease_symptom)


train_df = pd.concat([precaution_qa_df, symptom_qa_df], ignore_index=True)

In [20]:
def get_context(query):
    results = db.similarity_search_with_score(query, k=10)
    contexts = ""
    flag = False
    for result in results:
        context = (result[0].page_content)
        score = (result[1])
        if score >0.1:
            flag = True
            contexts+= context + " "
    if flag:
        return context
    return None

        
    

In [23]:
json_context_data = []
none_count = 0
for ind, row in train_df.iterrows():
    question = row["Question"]
    answer = row["answer"]
    context = get_context(question)
    if not context:
        none_count+=1
        continue
    
    query = f"question: {question} context: {context}"
    obj = {}
    obj["query"] = query
    obj["answer"] = answer
    json_context_data.append(obj)

print("none count", none_count)
with open("context_query_QA_diesease.json",'w') as file:
    json.dump(json_context_data, file)

none count 0


In [25]:
test_df = pd.read_json("test_disease_QA.json")
test_json_context_data = []
none_count = 0
for ind, row in test_df.iterrows():
    question = row["Question"]
    answer = row["answer"]
    context = get_context(question)
    if not context:
        none_count += 1
        continue

    query = f"question: {question} context: {context}"
    obj = {}
    obj["query"] = query
    obj["answer"] = answer
    test_json_context_data.append(obj)

print("none count", none_count)
with open("test_context_query_QA_diesease.json", "w") as file:
    json.dump(test_json_context_data, file)

none count 0


In [18]:
from constants import *

In [21]:
import pandas as pd
import json

train_df = pd.read_json(train_disease_file)
json_context_data = []

none_count = 0
for ind, row in train_df.iterrows():
    question = row["question"]
    answer = row["answer"]
    context = get_context(question)
    if not context:
        none_count+=1
        continue
    
    query = f"question: {question} context: {context}"
    obj = {}
    obj["query"] = query
    obj["answer"] = answer
    json_context_data.append(obj)

print("none count", none_count)
with open("context_query_QA_diesease_1.json",'w') as file:
    json.dump(json_context_data, file)

none count 0


In [22]:
test_df = pd.read_json(test_disease_file)
test_json_context_data = []
none_count = 0
for ind, row in test_df.iterrows():
    question = row["question"]
    answer = row["answer"]
    context = get_context(question)
    if not context:
        none_count += 1
        continue

    query = f"question: {question} context: {context}"
    obj = {}
    obj["query"] = query
    obj["answer"] = answer
    test_json_context_data.append(obj)

print("none count", none_count)
with open("test_context_query_QA_diesease_1.json", "w") as file:
    json.dump(test_json_context_data, file)

none count 0


## Get disease names from the extracted_files

In [1]:
import os
extracted_dir_path = "./extracted_files"

In [6]:
file_names = os.listdir(extracted_dir_path)
print(len(file_names))
disease_set = ()

for file_name in file_names:
    disease = file_name.split('-')[0]
    disease = disease.replace('_', ' ')
    disease = disease.lower()
    print('"'+disease+'",')

314
"acl injury",
"abdominal pain",
"abdominal aortic aneurysm",
"abducens nerve palsy",
"abnormal uterine bleeding",
"absence seizure",
"acanthosis nigricans",
"achalasia",
"achilles tendinitis",
"achilles tendon rupture",
"acne",
"acoustic neuroma",
"acquired immuno deficiency syndrome (aids)",
"acromegaly",
"actinic keratosis",
"acute coronary syndrome",
"acute encephalitis",
"acute flaccid myelitis (afm)",
"adult inclusion conjunctivitis",
"alcohol abuse and alcoholism",
"alopecia (hair loss)",
"alzheimers disease",
"amaurosis fugax",
"amblyopia",
"amoebiasis",
"anaemia",
"aniseikonia",
"anisometropia",
"ankyloblepharon",
"antepartum hemorrhage",
"anthrax",
"anxiety",
"aphakia",
"appendicitis",
"arthritis",
"asbestos",
"asthma",
"astigmatism",
"atherosclerosis",
"atopic keratoconjunctivitis",
"attention deficit hyperactivity disorder",
"autism",
"avian influenza",
"back pain",
"bad breath",
"band shaped keratopathy",
"bedsores",
"bells palsy",
"benign essential blepharospasm",
"ble