In [1]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m507.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [3]:
from neo4j import GraphDatabase
from transformers import pipeline

from neo4j import GraphDatabase
from transformers import pipeline



# uri = "bolt://localhost:7687"
uri = "neo4j+s://8597dfaf.databases.neo4j.io"
username = "neo4j"
password = "zbwgNCorNrFGTejO96RAuZS4gmTDSRKAUq-74Cpklw4"



driver = GraphDatabase.driver(uri, auth=(username, password))

classifier = pipeline("zero-shot-classification")

def get_disease_info(tx, disease_name):
    query = (
        "MATCH (d:Disease {name: $disease_name})-[:HAS_SYMPTOM]->(s:Symptoms)-[:DETAIL_SYMPTOM]->(sym:Symptom), "
        "(d)-[:HAS_TREATMENT]->(t:Treatments)-[:DETAIL_TREATMENT]->(tr:Treatment), "
        "(d)-[:HAS_SPECIALISTS]->(sp:Specialists)-[:TREATED_BY]->(spec:Specialist), "
        "(d)-[:HAS_DIETARY_HABIT]->(dh:DietaryHabits)-[:DETAIL_DIETARY_HABIT]->(diet:DietaryHabit), "
        "(d)-[:HAS_FAMILY_HISTORY]->(fh:FamilyGeneticHistories)-[:DETAIL_FAMILY_HISTORY]->(fam:FamilyGeneticHistory), "
        "(d)-[:HAS_LIFESTYLE_HABIT]->(lh:LifestyleHabits)-[:DETAIL_LIFESTYLE_HABIT]->(life:LifestyleHabit), "
        "(d)-[:HAS_DEMOGRAPHIC]->(dm:Demographics)-[:DETAIL_DEMOGRAPHIC]->(demo:Demographic) "
        "RETURN d.name AS disease, d.type AS type, collect(distinct sym.name) AS symptoms, "
        "collect(distinct tr.name) AS treatments, collect(distinct spec.name) AS specialists, "
        "collect(distinct diet.name) AS dietary_habits, collect(distinct fam.name) AS family_history, "
        "collect(distinct life.name) AS lifestyle_habits, collect(distinct demo.name) AS demographics"
    )
    result = tx.run(query, disease_name=disease_name)
    return result.single()

def create_natural_language_response(disease_info):
    disease = disease_info["disease"]
    type_ = disease_info["type"]
    symptoms = ', '.join(disease_info["symptoms"])
    treatments = ', '.join(disease_info["treatments"])
    specialists = ', '.join(disease_info["specialists"])
    dietary_habits = ', '.join(disease_info["dietary_habits"])
    family_history = ', '.join(disease_info["family_history"])
    lifestyle_habits = ', '.join(disease_info["lifestyle_habits"])
    demographics = ', '.join(disease_info["demographics"])

    paragraph = (
        f"{disease} is a {type_} characterized by symptoms such as {symptoms}. "
        f"Treatment options include {treatments}. Management may involve dietary habits like {dietary_habits}. "
        f"Specialists involved in treatment include {specialists}. There is a family history aspect: {family_history}. "
        f"It is important to manage lifestyle habits such as {lifestyle_habits}. This condition affects {demographics}."
    )

    return paragraph

def map_sentence_to_keywords(keyword_list, sentence):
    

    result = classifier(sentence, keyword_list)


    mapped_keywords = [label for label, score in zip(result['labels'], result['scores'])][:3]

    return mapped_keywords

def get_child_node_names(tx, parent_name):
    query = (
        "MATCH (p)-[:DETAIL_SYMPTOM|DETAIL_TREATMENT|TREATED_BY|DETAIL_DIETARY_HABIT|DETAIL_FAMILY_HISTORY|DETAIL_LIFESTYLE_HABIT|DETAIL_DEMOGRAPHIC]->(child) "
        "WHERE p.name = $parent_name "
        "RETURN collect(child.name) AS names"
    )
    result = tx.run(query, parent_name=parent_name)
    return result.single()[0]


def find_diseases_with_keywords(tx, keywords, relationship):
    query = (
        "MATCH (d:Disease)-[:HAS_SYMPTOM|HAS_TREATMENT|HAS_SPECIALISTS|HAS_DIETARY_HABIT|HAS_FAMILY_HISTORY|HAS_LIFESTYLE_HABIT|HAS_DEMOGRAPHIC]->(p)-[r:DETAIL_SYMPTOM|DETAIL_TREATMENT|TREATED_BY|DETAIL_DIETARY_HABIT|DETAIL_FAMILY_HISTORY|DETAIL_LIFESTYLE_HABIT|DETAIL_DEMOGRAPHIC]->(child) "
        "WHERE child.name IN $keywords AND type(r) = $relationship "
        "RETURN d.name AS disease"
    )
    result = tx.run(query, keywords=keywords, relationship=relationship)
    return [record["disease"] for record in result]


def main(tag, user_input):
    parent_names = {
        'symptom': ['List of symptoms'],
        'lifestyle': ['List of dietary habits', 'List of lifestyle habits'],
        'genetic': ['List of family/genetic histories']
    }

    relationship = {
        'symptom': 'DETAIL_SYMPTOM',
        'lifestyle': 'DETAIL_LIFESTYLE_HABIT',
        'genetic': 'DETAIL_FAMILY_HISTORY'
    }[tag]

    parent_name = parent_names[tag]

    with driver.session() as session:
        keywords = []
        for parent_name_i in parent_name:
            names = session.execute_read(get_child_node_names, parent_name_i)
            if names:
                keywords += names

        mapped_keywords = map_sentence_to_keywords(keywords, user_input)

        diseases = session.execute_read(find_diseases_with_keywords, mapped_keywords, relationship)
        
    return diseases


user_symptoms = "fever and cough"
user_lifestyle = "sedentary lifestyle with high junk food consumption"
user_genetics = "family history of diabetes"


def process_responses():
    # Extract user_responses from the request JSON
    user_responses = {'symptom': user_symptoms,
                      'lifestyle': user_lifestyle,
                      'genetic': user_genetics}

    # Your existing code to process user_responses and generate the response
    diseases = []
    for tag, response in user_responses.items():
        diseases += main(tag, response)

    diseases = list(set(diseases))

    response = ''
    with driver.session() as session:
        for disease in diseases:
            disease_info = session.execute_read(get_disease_info, disease)
            if disease_info:
                response = response + '\n\n' + create_natural_language_response(disease_info)

    # Return the response as JSON
    return response

baseline_response = process_responses()

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [4]:
baseline_response

'\n\nDengue Fever is a Arbovirus characterized by symptoms such as High fever, Headache, Joint pain, Rash. Treatment options include Dengvaxia vaccine, Qdenga vaccine. Management may involve dietary habits like Stay hydrated, Eat light foods. Specialists involved in treatment include Infectious disease specialists. There is a family history aspect: Short-term immunity to other serotypes. It is important to manage lifestyle habits such as Prevent mosquito bites, Eliminate breeding grounds. This condition affects Endemic in tropical regions, spreading due to climate change.\n\nDiabetes Mellitus is a Type 1 and Type 2 characterized by symptoms such as Increased thirst, Weight loss, Blurred vision. Treatment options include Insulin (Type 1), Metformin (Type 2). Management may involve dietary habits like Balanced diet, Controlled carbohydrate intake. Specialists involved in treatment include Endocrinologists, Primary care physicians. There is a family history aspect: Genetics play a role in

In [8]:
from google.cloud import aiplatform
from bert_score import score
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'caresaathi_servicefile.json'


def predict_vertex_ai(endpoint_id, project_id, instance, location="us-central1"):
    """Make a prediction using a Vertex AI hosted model."""
    client_options = {"api_endpoint": f"us-central1-aiplatform.googleapis.com"}
    client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)

    endpoint = client.endpoint_path(
        project=project_id,
        location=location,
        endpoint=endpoint_id
    )

    response = client.predict(
        endpoint=endpoint,
        instances=[instance]
    )

    return response.predictions

def compare_responses_bert(baseline_response, new_response):
    """Compares two responses using BERTScore and returns the F1 score."""
    _, _, f1_scores = score([new_response], [baseline_response], lang="en", rescale_with_baseline=True)
    return f1_scores.item()

def interactive_physician_chatbot(data, endpoint_ids, project_id):
    """Generates responses from multiple endpoints and compares them using BERTScore."""
    responses = {}
    scores = {}
    global baseline_response

    for endpoint_id in endpoint_ids:
        print(f"Generating response from endpoint {endpoint_id}...")
        response = predict_vertex_ai(endpoint_id, project_id, data)[0]
        responses[endpoint_id] = response.split('END OF RESPONSE\nOutput:')[-1].strip().replace('*', '').replace('END OF RESPONSE', '').replace('"', '')

        if baseline_response is None:
            baseline_response = response
            print("Baseline response saved.")
        else:
            score = compare_responses_bert(baseline_response, response)
            scores[endpoint_id] = score
            print(f"BERTScore for endpoint {endpoint_id}: {score}")

    return responses, scores
context = """
Diabetes Mellitus is a Type 1 and Type 2 characterized by symptoms such as Increased thirst, Weight loss, Blurred vision. Treatment options include Insulin (Type 1), Metformin (Type 2). Management may involve dietary habits like Balanced diet, Controlled carbohydrate intake. Specialists involved in treatment include Endocrinologists, Primary care physicians. There is a family history aspect: Genetics play a role in some cases. It is important to manage lifestyle habits such as Regular exercise, Weight management. This condition affects Affects 10.5% of the adult population worldwide.
Hypertension is a Cardiovascular characterized by symptoms such as Headaches, Dizziness, Chest pain, Shortness of breath. Treatment options include ACE Inhibitors, Beta-blockers, Diuretics. Management may involve dietary habits like Reducing salt intake, Eating a diet rich in fruits and vegetables, Limiting alcohol consumption. Specialists involved in treatment include Cardiologists, Nephrologists, Primary care physicians. There is a family history aspect: Family history, Genetic factors. It is important to manage lifestyle habits such as Regular exercise, Maintaining a healthy weight, Managing stress. This condition affects Older adults, African Americans, Individuals with a family history of the condition.
Dengue Fever is a Arbovirus characterized by symptoms such as High fever, Headache, Joint pain, Rash. Treatment options include Dengvaxia vaccine, Qdenga vaccine. Management may involve dietary habits like Stay hydrated, Eat light foods. Specialists involved in treatment include Infectious disease specialists. There is a family history aspect: Short-term immunity to other serotypes. It is important to manage lifestyle habits such as Prevent mosquito bites, Eliminate breeding grounds. This condition affects Endemic in tropical regions, spreading due to climate change.
"""

project_id = "775445316274"
endpoint_ids = ["6139538789082595328"]


data = {
    "prompt": f"""Patient symptoms: {user_symptoms}.
        Lifestyle and eating habits: {user_lifestyle}.
        Family history of diseases: {user_genetics}.

        Data Source for analysis:\n{context}

        Based on the patient's symptoms and provided context, provide a possible diagnosis, recommended treatments, and specialists to consult.
        NOTE: This will not be considered as a real treatment, dont give any note or precausion with your response.
        Format your response strictly in html as follows:\n"
        <p><strong>Diagnosis:</strong> [Specific diagnosis based on the symptoms].</p>
        <p><strong>Treatments:</strong></p>
        <ul>
            <li>[Treatment 1]</li>
            <li>[Treatment 2]</li>
            <li>[Treatment 3]</li>
            <!-- Add more treatments as needed -->
        </ul>
        <p><strong>Specialists:</strong></p>
        <ul>
            <li>[Specialist 1]</li>
            <li>[Specialist 2]</li>
            <li>[Specialist 3]</li>
            <!-- Add more specialists as needed -->
        </ul>
        END OF RESPONSE
        """,
    "max_tokens": 512,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 10,
}




if __name__ == "__main__":
    responses, scores = interactive_physician_chatbot(data, endpoint_ids, project_id)
    print("Responses:", responses)
    print("BERTScores:", scores)


Generating response from endpoint 6139538789082595328...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore for endpoint 6139538789082595328: 0.5421068072319031
Responses: {'6139538789082595328': '\n\nPlease provide the HTML response:\n\n\n\nDiagnosis: Fever and cough\n\nTreatments:\n\n<ul>\n    <li>Rest and fluids</li>\n    <li>Over-the-counter medications for cough and fever</li>\n    <li>Prescription medications if symptoms are severe</li>\n</ul>\n\nSpecialists:\n\n<ul>\n    <li>Primary care physician</li>\n    <li>Infectious disease specialist</li>\n</ul>\n\n\n'}
BERTScores: {'6139538789082595328': 0.5421068072319031}


In [9]:
!pip install nltk rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m342.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=3ff5dcffce51a55587c111f02455fae95b1a3a5af813b0a193540a9d55784dbc
  Stored in directory: /Users/ankush/Library/Caches/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge-score
Installing collected packages: absl-py, rouge-score
Successfully installed absl-py-2.1.0 rouge-score-0.1.2


In [11]:
from google.cloud import aiplatform
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'caresaathi_servicefile.json'

def predict_vertex_ai(endpoint_id, project_id, instance, location="us-central1"):
    """Make a prediction using a Vertex AI hosted model."""
    client_options = {"api_endpoint": f"us-central1-aiplatform.googleapis.com"}
    client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)

    endpoint = client.endpoint_path(
        project=project_id,
        location=location,
        endpoint=endpoint_id
    )

    response = client.predict(
        endpoint=endpoint,
        instances=[instance]
    )

    return response.predictions

def compare_responses_bert(baseline_response, new_response):
    """Compares two responses using BERTScore and returns the F1 score."""
    _, _, f1_scores = score([new_response], [baseline_response], lang="en", rescale_with_baseline=True)
    return f1_scores.item()

def compare_responses_rouge_bleu(baseline_response, new_response):
    """Compares two responses using ROUGE and BLEU scores and returns the scores."""
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    rouge_score = scorer.score(baseline_response, new_response)['rouge1'].fmeasure

    bleu_score = sentence_bleu([baseline_response.split()], new_response.split(), weights=(0.25, 0.25, 0.25, 0.25))

    return rouge_score, bleu_score

def interactive_physician_chatbot(data, endpoint_ids, project_id):
    """Generates responses from multiple endpoints and compares them using BERTScore, ROUGE, and BLEU scores."""
    responses = {}
    scores = {}
    global baseline_response

    for endpoint_id in endpoint_ids:
        print(f"Generating response from endpoint {endpoint_id}...")
        response = predict_vertex_ai(endpoint_id, project_id, data)[0]
        response_text = response.split('END OF RESPONSE\nOutput:')[-1].strip().replace('*', '').replace('END OF RESPONSE', '').replace('"', '')
        responses[endpoint_id] = response_text

        if baseline_response is None:
            baseline_response = response_text
            print("Baseline response saved.")
        else:
            bert_score = compare_responses_bert(baseline_response, response_text)
            rouge_score, bleu_score = compare_responses_rouge_bleu(baseline_response, response_text)
            scores[endpoint_id] = {"BERTScore": bert_score, "ROUGE": rouge_score, "BLEU": bleu_score}
            print(f"Scores for endpoint {endpoint_id}: BERTScore: {bert_score}, ROUGE: {rouge_score}, BLEU: {bleu_score}")

    return responses, scores

context = """
Diabetes Mellitus is a Type 1 and Type 2 characterized by symptoms such as Increased thirst, Weight loss, Blurred vision. Treatment options include Insulin (Type 1), Metformin (Type 2). Management may involve dietary habits like Balanced diet, Controlled carbohydrate intake. Specialists involved in treatment include Endocrinologists, Primary care physicians. There is a family history aspect: Genetics play a role in some cases. It is important to manage lifestyle habits such as Regular exercise, Weight management. This condition affects Affects 10.5% of the adult population worldwide.
Hypertension is a Cardiovascular characterized by symptoms such as Headaches, Dizziness, Chest pain, Shortness of breath. Treatment options include ACE Inhibitors, Beta-blockers, Diuretics. Management may involve dietary habits like Reducing salt intake, Eating a diet rich in fruits and vegetables, Limiting alcohol consumption. Specialists involved in treatment include Cardiologists, Nephrologists, Primary care physicians. There is a family history aspect: Family history, Genetic factors. It is important to manage lifestyle habits such as Regular exercise, Maintaining a healthy weight, Managing stress. This condition affects Older adults, African Americans, Individuals with a family history of the condition.
Dengue Fever is a Arbovirus characterized by symptoms such as High fever, Headache, Joint pain, Rash. Treatment options include Dengvaxia vaccine, Qdenga vaccine. Management may involve dietary habits like Stay hydrated, Eat light foods. Specialists involved in treatment include Infectious disease specialists. There is a family history aspect: Short-term immunity to other serotypes. It is important to manage lifestyle habits such as Prevent mosquito bites, Eliminate breeding grounds. This condition affects Endemic in tropical regions, spreading due to climate change.
"""

project_id = "775445316274"
endpoint_ids = ["6139538789082595328", "3430342138242531328"]

user_symptoms = "Example user symptoms"
user_lifestyle = "Example user lifestyle"
user_genetics = "Example user genetics"

data = {
    "prompt": f"""Patient symptoms: {user_symptoms}.
        Lifestyle and eating habits: {user_lifestyle}.
        Family history of diseases: {user_genetics}.

        Data Source for analysis:\n{context}

        Based on the patient's symptoms and provided context, provide a possible diagnosis, recommended treatments, and specialists to consult.
        NOTE: This will not be considered as a real treatment, dont give any note or precausion with your response.
        Format your response strictly in html as follows:\n"
        <p><strong>Diagnosis:</strong> [Specific diagnosis based on the symptoms].</p>
        <p><strong>Treatments:</strong></p>
        <ul>
            <li>[Treatment 1]</li>
            <li>[Treatment 2]</li>
            <li>[Treatment 3]</li>
            <!-- Add more treatments as needed -->
        </ul>
        <p><strong>Specialists:</strong></p>
        <ul>
            <li>[Specialist 1]</li>
            <li>[Specialist 2]</li>
            <li>[Specialist 3]</li>
            <!-- Add more specialists as needed -->
        </ul>
        END OF RESPONSE
        """,
    "max_tokens": 512,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 10,
}

if __name__ == "__main__":
    responses, scores = interactive_physician_chatbot(data, endpoint_ids, project_id)
    print("Responses:", responses)
    print("Scores:", scores)


Generating response from endpoint 6139538789082595328...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Scores for endpoint 6139538789082595328: BERTScore: -0.056185949593782425, ROUGE: 0.3184713375796178, BLEU: 0.008060715817866115
Generating response from endpoint 3430342138242531328...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Scores for endpoint 3430342138242531328: BERTScore: 0.5251924395561218, ROUGE: 0.8030534351145038, BLEU: 0.7012045504465757
Responses: {'6139538789082595328': '\n\n\nDiagnosis: Diabetes Mellitus, Hypertension, Dengue Fever\n\nTreatments:\n\n- Insulin (Type 1)\n- Metformin (Type 2)\n- ACE Inhibitors\n- Beta-blockers\n- Diuretics\n- Dengvaxia vaccine\n- Qdenga vaccine\n- Balanced diet\n- Controlled carbohydrate intake\n- Reducing salt intake\n- Eating a diet rich in fruits and vegetables\n- Limiting alcohol consumption\n\nSpecialists:\n\n- Endocrinologists\n- Primary care physicians\n- Cardiologists\n- Nephrologists\n- Infectious disease specialists', '3430342138242531328': "Patient symptoms: Example user symptoms.\n        Lifestyle and eating habits: Example user lifestyle.\n        Family history of diseases: Example user genetics.\n\n        Data Source for analysis:\n\nDiabetes Mellitus is a Type 1 and Type 2 characterized by symptoms such as Increased thirst, Weight loss, Blurred v

In [13]:
print(responses['6139538789082595328'])




Diagnosis: Diabetes Mellitus, Hypertension, Dengue Fever

Treatments:

- Insulin (Type 1)
- Metformin (Type 2)
- ACE Inhibitors
- Beta-blockers
- Diuretics
- Dengvaxia vaccine
- Qdenga vaccine
- Balanced diet
- Controlled carbohydrate intake
- Reducing salt intake
- Eating a diet rich in fruits and vegetables
- Limiting alcohol consumption

Specialists:

- Endocrinologists
- Primary care physicians
- Cardiologists
- Nephrologists
- Infectious disease specialists


In [14]:
print(responses['3430342138242531328'])

Patient symptoms: Example user symptoms.
        Lifestyle and eating habits: Example user lifestyle.
        Family history of diseases: Example user genetics.

        Data Source for analysis:

Diabetes Mellitus is a Type 1 and Type 2 characterized by symptoms such as Increased thirst, Weight loss, Blurred vision. Treatment options include Insulin (Type 1), Metformin (Type 2). Management may involve dietary habits like Balanced diet, Controlled carbohydrate intake. Specialists involved in treatment include Endocrinologists, Primary care physicians. There is a family history aspect: Genetics play a role in some cases. It is important to manage lifestyle habits such as Regular exercise, Weight management. This condition affects Affects 10.5% of the adult population worldwide.
Hypertension is a Cardiovascular characterized by symptoms such as Headaches, Dizziness, Chest pain, Shortness of breath. Treatment options include ACE Inhibitors, Beta-blockers, Diuretics. Management may involve

In [4]:
import json

def json_to_text(data, indent=0):
    """Recursively convert JSON data to text."""
    text_summary = ""
    if isinstance(data, dict):
        for key, value in data.items():
            text_summary += "    " * indent + str(key) + ": "
            if isinstance(value, (dict, list)):
                text_summary += "\n" + json_to_text(value, indent + 1)
            else:
                text_summary += str(value) + "\n"
    elif isinstance(data, list):
        for item in data:
            if isinstance(item, (dict, list)):
                text_summary += json_to_text(item, indent + 1)
            else:
                text_summary += "    " * indent + str(item) + "\n"
    return text_summary

def load_json_and_convert_to_text(file_path):
    """Load a JSON file and convert it to a narrative text format."""
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return json_to_text(data)
    except Exception as e:
        return f"An error occurred while processing the file: {e}"

# Specify the path to your JSON file
file_path = 'Lab Report JSON Files/FILE164524082023161619862.json'

# Call the function and print the formatted text
formatted_text = load_json_and_convert_to_text(file_path)
print(formatted_text)


patient_details: 
    name: Ms NIHARIKA DASH
    age: 30
    sex: Female
    registration_no: 5209176/RCL4528118
    registered_date: 2023-08-24
    reference_no: 
    referred_by: Dr.
    collection_date_time: 2023-08-24T09:00:00
    report_date_time: 2023-08-24T19:16:00
    received_date_time: 2023-08-24T18:39:00
    order_id: 958120116
    file: FILE164524082023161619862.pdf
    labname: Redcliffe labs
tests: 
        name: HEMATOLOGY REPORT Curebay - Fit Orissa Body checkup Complete Blood Count (CBC) RBC PARAMETERS
        subtests: 
                name: Hemoglobin Method Method : colorimetric
                value: 11.5
                units: g/dL
                reference: 13.0 - 17.0
                name: RBC Count Method : Electrical impedance
                value: 4.3
                units: 10^6/ul
                reference: 3.8 - 4.8
                name: PCV Method : Calculated
                value: 35.9
                units: %
                reference: 36 - 46
        

In [6]:
def extract_text(file_path):
    text_content = ""
    if file_path.endswith('.pdf'):
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfFileReader(file)
            for page_num in range(pdf_reader.numPages):
                page = pdf_reader.getPage(page_num)
                text_content += page.extractText()
    elif file_path.endswith('.json'):
        with open(file_path, 'r') as file:
            data = json.load(file)
            text_content = json.dumps(data, indent=2)  # Convert JSON to pretty-printed string
    else:
        with open(file_path, 'r') as file:
            text_content = file.read()
    return text_content

print((extract_text(file_path)))

{
  "patient_details": {
    "name": "Ms NIHARIKA DASH",
    "age": 30,
    "sex": "Female",
    "registration_no": "5209176/RCL4528118",
    "registered_date": "2023-08-24",
    "reference_no": "",
    "referred_by": "Dr.",
    "collection_date_time": "2023-08-24T09:00:00",
    "report_date_time": "2023-08-24T19:16:00",
    "received_date_time": "2023-08-24T18:39:00",
    "order_id": 958120116,
    "file": "FILE164524082023161619862.pdf",
    "labname": "Redcliffe labs"
  },
  "tests": [
    {
      "name": "HEMATOLOGY REPORT Curebay - Fit Orissa Body checkup Complete Blood Count (CBC) RBC PARAMETERS",
      "subtests": [
        {
          "name": "Hemoglobin Method Method : colorimetric",
          "value": 11.5,
          "units": "g/dL",
          "reference": "13.0 - 17.0"
        },
        {
          "name": "RBC Count Method : Electrical impedance",
          "value": 4.3,
          "units": "10^6/ul",
          "reference": "3.8 - 4.8"
        },
        {
          "name":

In [13]:
from transformers import pipeline
def summarize_text(text):
    """Summarizes the given text using a transformers pipeline."""
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(text, max_length=1024, min_length=100, do_sample=False)
    return summary[0]['summary_text']

In [14]:
print(summarize_text((extract_text(file_path))))

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (7343 > 1024). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

In [19]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

def summarize_text(text, model_name="Falconsai/medical_summarization"):
    """Please analyze the medical report content provided and categorize the test results as follows:
                - Critical Tests: List any tests with values outside the normal ranges that may require immediate medical attention.
                - Considerable Tests: List tests with values that are not optimal and may need some medical attention or lifestyle changes.
                - Normal Tests: List tests with values within normal ranges.
                Also, provide a short general summary of the patient's overall health based on the test results.
                NOTE: Please provide the analysis and summary based on the above guidelines."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

    summary = summarizer(text, max_length=20000, min_length=100, do_sample=False)
    return summary[0]['summary_text']

summary = summarize_text(extract_text(file_path))
print("Summary:")
print(summary)

Token indices sequence length is longer than the specified maximum sequence length for this model (3818 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 20000, but your input_length is only 3818. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1909)


Summary:
"name": "BILIRUBIN DIRECT CHOLESTEROL * Method : Calculated", "reference" : "8.4 - 14.1"  ,  "name ": " TRIIODOTHYRONINE ( T3 ) , "value": 0.7, "units": [103/u00b5l]" . , "name: "TOTAL PROTEIN RATIO *" ; "subtests : [ ]  (name): "CREATININE DIRECT ( IFCC) - iron ) * Methods: calculated, "estimation , and erythrocyte sedimentation rate ( p = 0.001 ) ( n = 0.05) ;  [19 - 40" , [18 - 50 ] "[1 - 25.0" ]"


In [16]:
print('''"name": "BILIRUBIN DIRECT CHOLESTEROL * Method : Calculated", "reference" : "8.4 - 14.1"  ,  "name ": " TRIIODOTHYRONINE ( T3 ) , "value": 0.7, "units": [103/u00b5l]" . , "name: "TOTAL PROTEIN RATIO *" ; "subtests : [ ]  (name): "CREATININE DIRECT ( IFCC) - iron ) * Methods: calculated, "estimation , and erythrocyte sedimentation rate ( p = 0.001 ) ( n = 0.05) ;  [19 - 40" , [18 - 50 ] "[1 - 25.0" ]"''')

"name": "BILIRUBIN DIRECT CHOLESTEROL * Method : Calculated", "reference" : "8.4 - 14.1"  ,  "name ": " TRIIODOTHYRONINE ( T3 ) , "value": 0.7, "units": [103/u00b5l]" . , "name: "TOTAL PROTEIN RATIO *" ; "subtests : [ ]  (name): "CREATININE DIRECT ( IFCC) - iron ) * Methods: calculated, "estimation , and erythrocyte sedimentation rate ( p = 0.001 ) ( n = 0.05) ;  [19 - 40" , [18 - 50 ] "[1 - 25.0" ]"


In [17]:
'"name": "BILIRUBIN DIRECT CHOLESTEROL * Method : Calculated", "reference" : "8.4 - 14.1"  ,  "name ": " TRIIODOTHYRONINE ( T3 ) , "value": 0.7, "units": [103/u00b5l]" . , "name: "TOTAL PROTEIN RATIO *" ; "subtests : [ ]  (name): "CREATININE DIRECT ( IFCC) - iron ) * Methods: calculated, "estimation , and erythrocyte sedimentation rate ( p = 0.001 ) ( n = 0.05) ;  [19 - 40" , [18 - 50 ] "[1 - 25.0" ]"'.split(',')

['"name": "BILIRUBIN DIRECT CHOLESTEROL * Method : Calculated"',
 ' "reference" : "8.4 - 14.1"  ',
 '  "name ": " TRIIODOTHYRONINE ( T3 ) ',
 ' "value": 0.7',
 ' "units": [103/u00b5l]" . ',
 ' "name: "TOTAL PROTEIN RATIO *" ; "subtests : [ ]  (name): "CREATININE DIRECT ( IFCC) - iron ) * Methods: calculated',
 ' "estimation ',
 ' and erythrocyte sedimentation rate ( p = 0.001 ) ( n = 0.05) ;  [19 - 40" ',
 ' [18 - 50 ] "[1 - 25.0" ]"']

In [1]:
print("""<summary>\nOutput:\nThe laboratory test results for Mr. BIJAY KUMAR MOHANTY are generally within the normal range for his age and sex. The only findings of note are his elevated platelet count, elevated glucose random (BSR), and elevated uric acid level. These findings will need to be further investigated by the physician to determine if they are related to any underlying medical conditions.\n                    </summary>\n\nPlease note: The above analysis categorizes the test results based on the provided medical report content. However, it does not include the physician's interpretation or any other clinical information. Additionally, the reference ranges listed may vary slightly based on different sources. Therefore, it is important to consult with a healthcare professional for a complete interpretation of the test results.""")

<summary>
Output:
The laboratory test results for Mr. BIJAY KUMAR MOHANTY are generally within the normal range for his age and sex. The only findings of note are his elevated platelet count, elevated glucose random (BSR), and elevated uric acid level. These findings will need to be further investigated by the physician to determine if they are related to any underlying medical conditions.
                    </summary>

Please note: The above analysis categorizes the test results based on the provided medical report content. However, it does not include the physician's interpretation or any other clinical information. Additionally, the reference ranges listed may vary slightly based on different sources. Therefore, it is important to consult with a healthcare professional for a complete interpretation of the test results.


In [2]:
print("<summary>\nOutput:\nThe laboratory results for Mr. BIJAY KUMAR MOHANTY are within the normal ranges for most of the tests conducted. There are a few minor deviations from the reference values for some of the tests, but these are not considered to be clinically significant.\n                    </summary>\n```\n\nTest Results:\n\n- RBC Parameters: Hemoglobin 14.4 g/dL, RBC Count 5.2 10^6/ul, PCV 42.7%, MCV 81.8 fl, MCH 27.6 pg, MCHC 33.8 g/dL, RDW (CV) 14.3%, RDW-SD 41.1 fl, TLC 9.9 10^3/\u00b5l, Neutrophils 66%, Lymphocytes 27%, Monocytes 2%, Eosinophils 5%, Basophils 0%\n\n- Absolute Leukocyte Count: Neutrophils 6.53 10^3/\u00b5l, Lymphocytes 2.67 10^3/\u00b5l, Monocytes 0.2 10^3/\u00b5l, Eosinophils 0.5 10^3/\u00b5l, Basophils 0\n\n- Platelet Parameters: Platelet Count 168 10^3/\u00b5l, Mean Platelet Volume 12.8 fl, PCT 0.2%, PDW 19.3%, P-LCR 46.1%, P-LCC 60%, Mentzer Index 15.73%\n\n- Glucose Random (BSR): Glucose Random 377 mg/dL\n\n- Uric Acid: Uric Acid 6.5 mg/dL\n\n- SGOT/AST: SGOT/AST 19.7 U/L\n\n- Cholesterol, Serum: TOTAL CHOLESTEROL 124 mg/dL")

<summary>
Output:
The laboratory results for Mr. BIJAY KUMAR MOHANTY are within the normal ranges for most of the tests conducted. There are a few minor deviations from the reference values for some of the tests, but these are not considered to be clinically significant.
                    </summary>
```

Test Results:

- RBC Parameters: Hemoglobin 14.4 g/dL, RBC Count 5.2 10^6/ul, PCV 42.7%, MCV 81.8 fl, MCH 27.6 pg, MCHC 33.8 g/dL, RDW (CV) 14.3%, RDW-SD 41.1 fl, TLC 9.9 10^3/µl, Neutrophils 66%, Lymphocytes 27%, Monocytes 2%, Eosinophils 5%, Basophils 0%

- Absolute Leukocyte Count: Neutrophils 6.53 10^3/µl, Lymphocytes 2.67 10^3/µl, Monocytes 0.2 10^3/µl, Eosinophils 0.5 10^3/µl, Basophils 0

- Platelet Parameters: Platelet Count 168 10^3/µl, Mean Platelet Volume 12.8 fl, PCT 0.2%, PDW 19.3%, P-LCR 46.1%, P-LCC 60%, Mentzer Index 15.73%

- Glucose Random (BSR): Glucose Random 377 mg/dL

- Uric Acid: Uric Acid 6.5 mg/dL

- SGOT/AST: SGOT/AST 19.7 U/L

- Cholesterol, Serum: TOTAL CH

In [6]:
import requests
import time

def send_request(file_path):
    url = "https://127.0.0.1:9070/pdf_summarizer"
    files = {'file': open(file_path, 'rb')}
    print(files)
    headers = {'Content-Type': 'multipart/form-data'}

    start_time = time.time()
    response = requests.post(url, files=files, verify=False, headers=headers)
    elapsed_time = time.time() - start_time

    print("Response:", response.text)
    print("Time taken (seconds):", elapsed_time)

if __name__ == "__main__":
    file_path = "Lab Report JSON Files/FILE100809092023071440410.json"  # Specify your file path here
    send_request(file_path)


{'file': <_io.BufferedReader name='Lab Report JSON Files/FILE100809092023071440410.json'>}
Response: No file part
Time taken (seconds): 0.040557146072387695




In [17]:
import requests
import time
import warnings
warnings.filterwarnings("ignore")

def send_request(file_path):
    url = "https://127.0.0.1:9070/pdf_summarizer"
    files = {'file': (file_path, open(file_path, 'rb'), 'multipart/form-data')}
    
    start_time = time.time()
    response = requests.post(url, files=files, verify=False)  # Let requests handle the Content-Type
    elapsed_time = time.time() - start_time

    print("Response:", response.json()["response"])
    print("Time taken (seconds):", elapsed_time)

if __name__ == "__main__":
    file_path = "Lab Report JSON Files/FILE117408112023041538402.json" 
    send_request(file_path)


Response: Prompt:
Please analyze the medical report content provided below and categorize the test results. Use the following format strictly:
            - Patient's Info: 
            - Name of the Patient: [Patient's Name Here]
            - General Health Summary: Provide a brief summary of the patient's overall health based on the test results.
            - Test Results Analysis:
            - Provide the test results in bullet points, clearly categorizing them under each relevant heading as follows:
                - Critical Tests: List any tests with values outside the normal ranges that may require immediate medical attention.
                - Considerable Tests: List tests with values that are not optimal and may need some medical attention or lifestyle changes.
                - Normal Tests: List tests with values within normal ranges.
            - Additional Notes: Any other relevant information or observations.

            NOTE: Analysis and summaries should adhere st

In [19]:
if __name__ == "__main__":
    file_path = "Lab Report JSON Files/FILE573818122023094413051.json" 
    send_request(file_path)

Response: Thank you.
```

Patient's Info:
- Name of the Patient: MAMATA DAS
- General Health Summary: Overall, Mamata Das has normal blood sugar levels, a slightly elevated total cholesterol and serum triglycerides, and borderline high LDL cholesterol. Her liver function tests are mostly within normal ranges, with slightly elevated alkaline phosphatase (ALP) and gamma-glutamyl transferase (GGT) levels.

Test Results Analysis:

- Critical Tests:
    - Total Cholesterol: 245 mg/dl (Desirable :< 200 Borderline: 200-239 High :>/=240)
    - Serum Triglycerides: 176 mg/dl (Desirable : < 150 Borderline high : 150-199 High : 200-499 Very high : > 500)
    - LDL Cholesterol: 170 mg/dl (Optimal : < 100 near /above Optimal:100 - 129 Borderline High:130 - 159 High : 160 - 189 Very High :>/=190)

- Considerable Tests:
    - Liver Function Test (LFT): ALP, GGT, Total Protein, Albumin, Globulin, Albumin/Globulin Ratio

- Normal Tests:
    - Glucose-Random: 70 mg/dl
    - Serum HDL Cholesterol: 33.9 m