<a href="https://colab.research.google.com/github/aman-17/actualwise/blob/main/simplified_chunking_actualwise_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract pdf2image
!apt-get install poppler-utils
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install openai==0.28

In [None]:
!mkdir 'images'
!mkdir 'chunk_deid1'
!mkdir 'chunk1'
!mkdir 'chunk2'
!mkdir 'chunk1_outputs'
!mkdir 'chunk2_outputs'
!mkdir 'merged_deid1'

In [None]:
#
from pdf2image import convert_from_path
import re
import os
import time
import subprocess
from multiprocessing import Pool

# Specify the folders
pdf_folder = '/content/pdf'
image_output_folder = '/content/image/'
text_output_folder = '/content/text/'

# Ensure the output folders exist
os.makedirs(image_output_folder, exist_ok=True)
os.makedirs(text_output_folder, exist_ok=True)

oem_val = 3  # Example OEM value
psm_val = 6  # Example PSM value

def clean_embedded_page_numbers(text):
    """Strip page markers that were printed on the page from OCR text.

    Two case-insensitive patterns are removed, in this order:
    'Page X', 'Page X/Y/Z' and 'Page X of Y' (any spacing), then the
    space-free form 'PageXofY'.

    Args:
        text: OCR text of a page.

    Returns:
        The text with every matched marker replaced by an empty string.
    """
    page_marker_patterns = (
        r'Page\s+\d+(\s*/\s*\d+)*(\s+of\s+\d+)?',
        r'Page\d+of\d+',
    )
    for marker in page_marker_patterns:
        text = re.sub(marker, '', text, flags=re.IGNORECASE)
    return text

def ocr_image(args):
    """Run the Tesseract command-line tool on one image and return its text.

    Args:
        args: Tuple ``(image_path, config, image_output_path)``; a single
            tuple so the function can be handed to ``Pool.map``.
            ``config`` is a string of command-line options such as
            ``'--oem 3 --psm 6'``. ``image_output_path`` is the output base
            name; the text is read back from ``<image_output_path>.txt``.

    Returns:
        The contents of ``<image_output_path>.txt``.

    Raises:
        RuntimeError: If Tesseract exits with a non-zero status.
    """
    import shlex  # local import: only this function needs it

    image_path, config, image_output_path = args
    # Each option has to be its own argv entry. Passing the whole config
    # string as one element hands Tesseract a single unknown argument.
    command = ['tesseract', image_path, image_output_path, *shlex.split(config)]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        # Surface the captured error instead of failing later on a missing
        # (or stale) output file.
        details = result.stderr.decode('utf-8', errors='replace').strip()
        raise RuntimeError(f"tesseract failed for {image_path}: {details}")
    # Read explicitly as UTF-8 rather than with the platform default.
    with open(f"{image_output_path}.txt", 'r', encoding='utf-8') as file:
        return file.read()

# Run OCR on every PDF in pdf_folder and write one text file per PDF.
for pdf_file in os.listdir(pdf_folder):
    if not pdf_file.endswith(".pdf"):
        continue  # ignore anything that is not a PDF

    start_time_pdf = time.time()
    pdf_path = os.path.join(pdf_folder, pdf_file)
    print(f"Processing PDF: {pdf_path}")

    pdf_base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    text_output_file_name = f"{pdf_base_name}_oem{oem_val}_psm{psm_val}_600_dpi.txt"
    text_output_file_path = os.path.join(text_output_folder, text_output_file_name)

    # Render the pages of the PDF (second argument 600, matching the
    # "600_dpi" tag in the output file name).
    images = convert_from_path(pdf_path, 600)

    # Save each page as a PNG and queue one (image, options, output base)
    # job per page for the worker pool.
    ocr_args = []
    for page_number, image in enumerate(images, start=1):
        image_output_path = os.path.join(
            image_output_folder, f"{pdf_base_name}_page_{str(page_number).zfill(2)}"
        )
        image_file_path = f"{image_output_path}.png"
        image.save(image_file_path)
        config = f'--oem {oem_val} --psm {psm_val}'
        ocr_args.append((image_file_path, config, image_output_path))

    # OCR the pages in parallel worker processes.
    with Pool() as pool:
        texts = pool.map(ocr_image, ocr_args)

    # Write the pages back in order, each under a page-number header and
    # followed by a dashed separator line.
    with open(text_output_file_path, 'w', encoding='utf-8') as text_output_file:
        for page_number, text in enumerate(texts, start=1):
            cleaned_text = clean_embedded_page_numbers(text)
            text_output_file.write(f'PDF Page Number {page_number}\n{cleaned_text}\n\n' + '-'*60 + '\n')

    print(f"OCR text for {pdf_base_name} saved to {text_output_file_path} in {time.time() - start_time_pdf:.2f} seconds.")

#

In [None]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import re
import os


oem_val = 3  # Example OEM value
psm_val = 6  # Example PSM value - 3, 4, 6? 1, 5, 7, 11, 12 (6 is best for Case 1)
custom_config = f'--oem {oem_val} --psm {psm_val}'

pdf_path = '/content/CaseStudy_3_ZB.pdf'

output_folder = '/content/images'
pdf_base_name = os.path.splitext(os.path.basename(pdf_path))[0]

output_file_name = f"{pdf_base_name}_oem{oem_val}_psm{psm_val}_pytesseract_poppler_dpi600.txt"
output_file_path = os.path.join(output_folder, output_file_name)

def clean_embedded_page_numbers(text):
    """Remove printed page markers from OCR text.

    Args:
        text: OCR text of a page.

    Returns:
        The text with all matched page markers deleted.
    """
    # 'Page X', 'Page X/Y/Z' and 'Page X of Y', with flexible spacing.
    spaced_marker = re.compile(r'Page\s+\d+(\s*/\s*\d+)*(\s+of\s+\d+)?', re.IGNORECASE)
    # 'PageXofY' written without any spaces.
    compact_marker = re.compile(r'Page\d+of\d+', re.IGNORECASE)

    # The spaced form is removed first, then the compact form.
    without_spaced = spaced_marker.sub('', text)
    return compact_marker.sub('', without_spaced)

# Render the pages of the PDF (second argument 600, matching the "dpi600"
# tag in the output file name).
images = convert_from_path(pdf_path, 600)

# OCR page by page and write each page under a header carrying its
# 1-based position in the PDF, followed by a dashed separator line.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for page_number, image in enumerate(images, start=1):
        cleaned_text = clean_embedded_page_numbers(
            pytesseract.image_to_string(image, config=custom_config)
        )
        output_file.write(f'PDF Page Number {page_number}\n{cleaned_text}\n\n' + '-'*60 + '\n')

print(f"OCR text saved to {output_file_path}")

OCR text saved to /content/images/CaseStudy_3_ZB_oem3_psm6_pytesseract_poppler_dpi600.txt


In [None]:
import re

# Insurer names and abbreviations to look for in the OCR text.
insurance_companies = [
    "Blue Cross and Blue Shield of Illinois", "BCBSIL", "Health Care Service Corporation", "HCSC",
    "Aetna",
    "UnitedHealthcare", "UHC", "UHG",
    "Cigna",
    "Humana",
    "Health Alliance",
    "Molina Health care", "Molina",
    "Meridian Health", "Meridian",
    "Medicaid",
    "Medicare",
    "insurance"
]

# The OCR cell writes this file as UTF-8, so it is read back with the same
# encoding instead of the platform default.
with open('/content/images/CaseStudy_3_ZB_oem3_psm6_pytesseract_poppler_dpi600.txt', 'r', encoding='utf-8') as file:
    data = file.read()

# Report every listed name that occurs anywhere in the text
# (case-insensitive substring match). re.escape makes each name match
# literally, even if an entry ever contains regex metacharacters.
# A long and a short form of the same insurer are reported separately.
for company in insurance_companies:
    if re.search(re.escape(company), data, re.IGNORECASE):
        print(f"The patient is insured with {company}")


The patient is insured with Meridian Health
The patient is insured with Meridian
The patient is insured with Medicaid


In [None]:
# Simplified chunking
import os
import re
from transformers import AutoTokenizer

# Setup
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
print("Tokenizer loaded.")

def tokenize_text_continuously(text, max_length):
    """Split ``text`` into consecutive chunks of at most ``max_length`` tokens.

    The text is tokenized once with the module-level ``tokenizer``; the token
    ids are cut into back-to-back, non-overlapping slices and each slice is
    decoded back into a string.

    Args:
        text: The text to split.
        max_length: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        A list of ``(chunk_text, first_token_index, last_token_index)``
        tuples with inclusive indices into the token sequence of ``text``.
        The list is empty when the text produces no tokens.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        # A non-positive chunk size would never advance through the tokens.
        raise ValueError(f"max_length must be at least 1, got {max_length}")

    print("Tokenizing text and creating continuous chunks...")
    # Special tokens are not encoded: they are stripped again on decode, so
    # including them only shifts the reported indices, shortens the first
    # chunk, and can leave a final chunk that decodes to an empty string.
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    # NOTE(review): a chunk of max_length tokens plus the special tokens the
    # model adds itself may exceed the model's input limit when the chunk is
    # later passed to NER -- confirm against the model configuration.
    chunks = []
    for token_start in range(0, len(token_ids), max_length):
        token_end = min(token_start + max_length, len(token_ids))
        chunk = tokenizer.decode(token_ids[token_start:token_end], skip_special_tokens=True)
        chunks.append((chunk, token_start, token_end - 1))
    return chunks

def chunk_files(input_directory, output_directory, max_length=512):
    """Split every ``.txt`` file in a directory into token-limited chunk files.

    Each input file is read, split with ``tokenize_text_continuously`` and
    written as ``Chunk<NN>_Tokens<start>-<end>.txt`` files, where ``<NN>`` is
    the 1-based chunk number zero-padded to the width of the chunk count.

    Args:
        input_directory: Directory scanned (non-recursively) for ``.txt`` files.
        output_directory: Directory for the chunk files; created if missing.
        max_length: Maximum number of tokens per chunk.

    Note:
        Chunk file names do not include the source file name, so chunks from
        different input files with the same number and token range overwrite
        each other. Files are processed in sorted order so the result is at
        least reproducible.
    """
    os.makedirs(output_directory, exist_ok=True)
    print(f"Processing files from {input_directory} to save chunks in {output_directory}")

    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".txt"):
            continue
        print(f"Processing file: {filename}")
        # The OCR step writes its output as UTF-8; read and write with the
        # same encoding instead of the platform default.
        with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
            text = file.read()

        first_pass_chunks = tokenize_text_continuously(text, max_length)
        # Pad chunk numbers to a common width so file names sort in order.
        max_digits_chunk = len(str(len(first_pass_chunks)))

        for i, (chunk, start_token, end_token) in enumerate(first_pass_chunks, start=1):
            chunk_num_padded = str(i).zfill(max_digits_chunk)
            chunk_filename = f"Chunk{chunk_num_padded}_Tokens{start_token}-{end_token}.txt"
            with open(os.path.join(output_directory, chunk_filename), 'w', encoding='utf-8') as chunk_file:
                chunk_file.write(chunk)
            print(f"Saved chunk {chunk_filename}.")

# Example usage
input_directory = "/content/images/"
output_directory = "/content/chunks/"
chunk_files(input_directory, output_directory)

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model from the pretrained "obi/deid_roberta_i2b2".
# The checkpoint is loaded once, with a token-classification head, and that
# same instance is handed to the NER pipeline. Loading it as a masked-LM
# model creates a randomly initialised lm_head that nothing uses, while the
# pipeline loads its own second copy of the weights.
from transformers import AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2").to(device)
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

"""
tokenizer = AutoTokenizer.from_pretrained("nlpie/clinical-distilbert-i2b2-2010")
model = AutoModelForMaskedLM.from_pretrained("nlpie/clinical-distilbert-i2b2-2010").to(device)
nlp=pipeline(task="ner", model="obi/deid_bert_i2b2",device=0)
"""

def extract_info(text, ner_results):
    """Collect the text spans of 'age', 'gender' and 'name' entities.

    Entities are sorted by start offset; an entity that starts at or before
    the end of the previous one is merged into it (the merged span takes the
    later entity's label and the larger end offset).

    Args:
        text: The text the NER results refer to.
        ner_results: Iterable of dicts with 'entity', 'start' and 'end' keys.
            The dicts are not modified.

    Returns:
        Dict mapping entity label to the matching slice of ``text``. When a
        label occurs more than once, the last occurrence wins.
    """
    # NOTE(review): the notebook's recorded output shows a label written as
    # "[U-ID]", which suggests the model emits prefixed labels; confirm that
    # 'age' / 'gender' / 'name' are labels the pipeline actually produces.
    wanted_labels = ('age', 'gender', 'name')

    merged_entities = []
    for entity in sorted(ner_results, key=lambda x: x['start']):
        if merged_entities and entity['start'] <= merged_entities[-1]['end']:
            previous = merged_entities[-1]
            previous['end'] = max(previous['end'], entity['end'])
            previous['entity'] = entity['entity']
        else:
            # Work on a copy so merging never edits the caller's NER results.
            merged_entities.append(dict(entity))

    return {
        entity['entity']: text[entity['start']:entity['end']]
        for entity in merged_entities
        if entity['entity'] in wanted_labels
    }

# Input: directory holding the chunk text files.
directory = "/content/chunks/"
# Output directory for de-identified files (only referenced by the disabled
# write at the end of the loop).
new_directory = "/content/chunk_deid1"

# Run NER on every chunk file and print what extract_info selects from it.
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(directory, filename), 'r') as file:
        text = file.read()
    # Perform NER on the text
    ner_results = nlp(text)
    # Despite its name, this holds the dict returned by extract_info.
    deidentified_text = extract_info(text, ner_results)
    print(deidentified_text)
    # Write the deidentified text to a new file
    # with open(os.path.join(new_directory, "deidentified_" + filename), 'w') as file:
        # file.write(deidentified_text)

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model from the pretrained "obi/deid_roberta_i2b2".
# The checkpoint is loaded once, with a token-classification head, and that
# same instance is handed to the NER pipeline. Loading it as a masked-LM
# model is what produces the "newly initialized lm_head" warning in this
# cell's output, and the pipeline then loads a second copy of the weights.
from transformers import AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2").to(device)
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

"""
tokenizer = AutoTokenizer.from_pretrained("nlpie/clinical-distilbert-i2b2-2010")
model = AutoModelForMaskedLM.from_pretrained("nlpie/clinical-distilbert-i2b2-2010").to(device)
nlp=pipeline(task="ner", model="obi/deid_bert_i2b2",device=0)
"""

# Define a function to deidentify text based on the NER results
def deidentify_text(text, ner_results):
    """Replace every detected entity span in ``text`` with ``[<label>]``.

    Entities are sorted by start offset; an entity that starts at or before
    the end of the previous one is merged into it (the merged span takes the
    later entity's label and the larger end offset).

    Args:
        text: The text the NER results refer to.
        ner_results: Iterable of dicts with 'entity', 'start' and 'end' keys.
            The dicts are not modified.

    Returns:
        A copy of ``text`` in which each merged span is replaced by its
        entity label in square brackets.
    """
    merged_entities = []
    for entity in sorted(ner_results, key=lambda x: x['start']):
        if merged_entities and entity['start'] <= merged_entities[-1]['end']:
            previous = merged_entities[-1]
            previous['end'] = max(previous['end'], entity['end'])
            previous['entity'] = entity['entity']
        else:
            # Work on a copy so merging never edits the caller's NER results.
            merged_entities.append(dict(entity))

    # Replace from the last span to the first so that the offsets of spans
    # still to be processed are not shifted by earlier replacements.
    deidentified_text = text
    for entity in reversed(merged_entities):
        start = entity['start']
        end = entity['end']
        deidentified_text = deidentified_text[:start] + "[" + entity['entity'] + "]" + deidentified_text[end:]
    return deidentified_text

# Input: directory holding the chunk text files.
directory = "/content/chunks/"
# Output: directory that receives one de-identified file per chunk.
new_directory = "/content/chunk_deid1"

# De-identify every chunk file and save it as "deidentified_<original name>".
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue
    source_path = os.path.join(directory, filename)
    with open(source_path, 'r') as file:
        text = file.read()
    # Perform NER on the text, then replace the detected spans.
    ner_results = nlp(text)
    deidentified_text = deidentify_text(text, ner_results)
    target_path = os.path.join(new_directory, "deidentified_" + filename)
    with open(target_path, 'w') as file:
        file.write(deidentified_text)

config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at obi/deid_roberta_i2b2 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Merge deid files
import os

# Directory holding the de-identified chunk files.
input_directory = "/content/chunk_deid1/"
# Destination of the merged text.
output_file_path = "/content/merged_deid1/merged_deidentified_text.txt"

# Plain lexicographic order; this assumes the file names are structured so
# that sorting them yields the correct chunk order.
filenames = os.listdir(input_directory)
sorted_filenames = sorted(filenames)

if sorted_filenames:
    # Concatenate the files in sorted order into a single output file.
    with open(output_file_path, 'w') as output_file:
        for filename in sorted_filenames:
            file_path = os.path.join(input_directory, filename)
            with open(file_path, 'r') as file:
                content = file.read()
            # A newline separates the content of consecutive files.
            output_file.write(content + "\n")

    print(f"All files have been successfully merged into {output_file_path}")
else:
    print("No files found in the directory.")



All files have been successfully merged into /content/merged_deid1/merged_deidentified_text.txt


In [None]:
# 2 Chunks for GPT_3.5 model
def split_file(filename):
    """Split a text file into two files at the midpoint of its line count.

    The first ``len(lines) // 2`` lines go to ``chunked_deid_file1.txt`` and
    the remaining lines to ``chunked_deid_file2.txt``. Both names are
    relative, so the files are created in the current working directory.

    Args:
        filename: Path of the text file to split.
    """
    with open(filename, 'r') as source:
        lines = source.readlines()

    midpoint = len(lines) // 2
    halves = (
        ('chunked_deid_file1.txt', lines[:midpoint]),
        ('chunked_deid_file2.txt', lines[midpoint:]),
    )
    for target_name, part in halves:
        with open(target_name, 'w') as target:
            target.writelines(part)

split_file('/content/merged_deid1/merged_deidentified_text.txt')


In [None]:
# !pip install openai==0.28

import pandas as pd
import openai
from google.colab import userdata
import time

# Ensure your OpenAI API key is correctly set here or through environment variables
openai.api_key = userdata.get('GPT_API')

# Read the file content
with open('/content/chunked_deid_file1.txt', 'r') as file:
    content1 = file.read()

df = pd.read_excel('./ChatGPT prompts v5.xlsx', sheet_name=1)
questions = df['Rafi\'s Qs'].tolist()

def get_gpt3_5_turbo_response(prompt):
    """Send one prompt to gpt-3.5-turbo-0125 and return the reply text.

    Args:
        prompt: Text sent as the user message; a fixed system message is
            placed in front of it.

    Returns:
        The stripped content of the first choice, the string "No response"
        when the reply has no choices, or None when the request raised (the
        error is printed, not re-raised).
    """
    conversation = [
        {"role": "system", "content": "You are a helpful medical record retriever."},
        {"role": "user", "content": prompt},
    ]
    try:
        # Only model and messages are sent; max_tokens, temperature, top_p and
        # the frequency/presence penalties are left unset.
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=conversation,
        )
        if not completion.choices:
            return "No response"
        return completion.choices[0].message['content'].strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Ask every question against the first half of the record (content1) and
# log each question/answer pair to chunk1_answers.txt.
with open('chunk1_answers.txt', 'w') as f:
    for question in questions:
        # The full text of the first half is placed in front of each question.
        prompt = content1 + '\n Based on the above patient\'s medical record. Answer the following question. \n' + question
        # None is returned when the request raised; it is then written to
        # the file as the text "None".
        response_text = get_gpt3_5_turbo_response(prompt)
        print(response_text)
        f.write(f"Question:\n{question}\nAnswer:\n{response_text}\n{'-'*32}\n")
        # Pause between requests -- presumably to stay under an API rate
        # limit; TODO confirm.
        time.sleep(10)


CRE: No
C-Auris: No
MRSA: No
C-DIFF: No
MDRO: No
COVID: No
A&Ox4 - Pages 11, 12
COVID vaccination: No
COVID test: NA.
The patient has a short-term care plan. Here is the plan listed in bullet format along with the page number(s) as the source:

- Inpatient Medication Plan: 
  1. Acetaminophen 325 mg tablet every 4 hours PRN for pain
  2. Lorazepam 1 mg tablet every 6 hours PRN for anxiety
  3. Dicyclomine 20 mg tablet three times daily for stomach cramps
  4. Gabapentin 600 mg tablet three times daily for anxiety
  5. Loperamide 2 mg capsule every 2 hours PRN for diarrhea (not more than 16 mg per day)
  6. Lorazepam 2 mg IM every 4 hours PRN for seizures only

Expected duration is not specified in the provided information.

Source: Pages 13, [U-ID]
No long-term care plan mentioned.
- Major depressive disorder
- Polysubstance abuse
- Seizures related to alcohol withdrawal
- Left lower extremity deep vein thrombosis (DVT) and pulmonary embolism (PE) treated with Xarelto 
- Gastroesophage

In [None]:
import pandas as pd
import openai
from google.colab import userdata

# Ensure your OpenAI API key is correctly set here or through environment variables
openai.api_key = userdata.get('GPT_API')

# Read the file content
with open('/content/chunked_deid_file2.txt', 'r') as file:
    content2 = file.read()

df = pd.read_excel('./ChatGPT prompts v5.xlsx', sheet_name=1)
questions = df['Rafi\'s Qs'].tolist()

def get_gpt3_5_turbo_response(prompt):
    """Ask gpt-3.5-turbo-0125 a single question and return its answer text.

    Args:
        prompt: Text sent as the user message, after a fixed system message.

    Returns:
        The stripped content of the first choice; "No response" when the
        reply carries no choices; None when the request raised (the error is
        printed and swallowed).
    """
    system_message = {"role": "system", "content": "You are a helpful medical record retriever."}
    user_message = {"role": "user", "content": prompt}
    try:
        reply = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[system_message, user_message],
        )
        choices = reply.choices
        if choices:
            return choices[0].message['content'].strip()
        return "No response"
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Ask every question against the second half of the record (content2) and
# log each question/answer pair to chunk2_answers.txt.
with open('chunk2_answers.txt', 'w') as f:
    for question in questions:
        # The full text of the second half is placed in front of each question.
        prompt = content2 + '\n Based on the above patient\'s medical record. Answer the following question. \n' + question
        # None is returned when the request raised; it is then written to
        # the file as the text "None".
        response_text = get_gpt3_5_turbo_response(prompt)
        f.write(f"Question:\n{question}\nAnswer:\n{response_text}\n{'-'*32}\n")


In [None]:
import pandas as pd
import openai
from google.colab import userdata

# Ensure your OpenAI API key is correctly set here or through environment variables
openai.api_key = userdata.get('GPT_API')

with open('/content/chunk1_answers.txt', 'r') as file:
    content1 = file.read()

with open('/content/chunk2_answers.txt', 'r') as file:
    content2 = file.read()

def get_gpt3_5_turbo_response(prompt):
    """Return the gpt-3.5-turbo-0125 reply to ``prompt``.

    Args:
        prompt: Text sent as the user message, after a fixed system message.

    Returns:
        The stripped content of the first choice, "No response" if the reply
        has no choices, or None if the request raised (the error is printed
        rather than propagated).
    """
    request = {
        "model": "gpt-3.5-turbo-0125",
        "messages": [
            {"role": "system", "content": "You are a helpful medical record retriever."},
            {"role": "user", "content": prompt},
        ],
    }
    try:
        response = openai.ChatCompletion.create(**request)
        if not response.choices:
            return "No response"
        first_choice = response.choices[0]
        return first_choice.message['content'].strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Combine the answers from both halves and ask the model to aggregate them.
# The instruction text ends at the final full stop; the unmatched double
# quote that used to trail it has been removed from the prompt.
prompt = content1 + '\n\n' + content2 + '\n Summarize these statements, grouping similar items together, but include all page number references. Where statements include both \'Yes\' and \'No\', keep the \'Yes\' response with its page number references and omit the \'No\' statements. The goal is to aggregate the existing text, keeping it as concise as possible without adding any new text.'
response_text = get_gpt3_5_turbo_response(prompt)

# The output file is opened only after the request has returned, so an
# interrupted request does not leave behind an emptied final_answers.txt.
# response_text is None when the request raised; it is then written as the
# text "None".
with open('final_answers.txt', 'w') as f:
    f.write(f"{response_text}\n{'-'*32}\n")


In [None]:
# GPT_4 Model

# import openai
# from google.colab import userdata
# import pandas as pd

# # Load the data
# df = pd.read_excel('./ChatGPT prompts v5.xlsx', sheet_name=1)
# questions = df['Rafi\'s Qs'].tolist()

# # Read the file content
# with open('/content/merged_deid1/merged_deidentified_text.txt', 'r') as file:
#     content = file.read()

# # Convert the list into a string with each question numbered
# questions_str = '\n'.join([f'{i+1}. {q}' for i, q in enumerate(questions)])

# # Concatenate the strings
# prompt = content + '\n Based on the above patient\'s medical record. Answer the following questions? \n' + questions_str

# # print(prompt)


# # Ensure your OpenAI API key is correctly set here or through environment variables
# openai.api_key = userdata.get('GPT_API')

# def get_gpt3_5_turbo_response(prompt):
#     try:
#         # Construct the messages parameter correctly as a list of dictionaries
#         messages = [
#             {"role": "system", "content": "You are a helpful medical record retriever."},
#             {"role": "user", "content": prompt}
#         ]

#         response = openai.ChatCompletion.create(
#             model="gpt-3.5-turbo-0125",
#             # model = "gpt-4-0125-preview",
#             messages=messages,
#             # max_tokens=4096 ,  # Adjust based on the expected length of completion, ensuring total does not exceed 16385
#             # temperature=0.7,
#             # top_p=1,
#             # frequency_penalty=0,
#             # presence_penalty=0
#         )

#         # Assuming the response structure matches expected output
#         return response.choices[0].message['content'].strip() if response.choices else "No response"
#     except Exception as e:
#         print(f"An error occurred: {e}")
#         return None

# # Example usage
# # prompt = "What is the capital of France?"
# response_text = get_gpt3_5_turbo_response(prompt)
# print(response_text)


1. CRE: No
2. C-Auris: No
3. MRSA: No
4. C-DIFF: No
5. MDRO: No
6. COVID: No
7. A&OxNA
8. COVID vaccination: NA
9. COVID test: NA
10. No short-term care plan mentioned
11. No long-term care plan mentioned
12. Diagnoses: NA
13. • No past medical history (pmhx) mentioned
14. Mobility Aids: NA
15. Patient Monitoring & Safety Equipment: NA
16. Bed Equipment: NA
17. Specialized Medical Equipment: NA
18. Assistance: NA
19. IV: NA
20. Pressure ulcers: NA
21. Complex wounds: No
22. Wound Vac: No
23. Specialty Wound Equipment: No
24. No
25. NA
26. NA
27. Therapy: NA
28. Concerns: NA
29. Tuberculosis: No
30. VRE: No
31. HIV: No
32. ESBL: No
33. AIDS: No
34. ECG: No
35. PT/INR: No
36. eCardio: No
37. Bladder scan: No
38. Central line: No
39. Midline catheter: No
40. Implantable ports: No
41. JP/Penrose drain: No
42. Nephrostomy: No
43. Colostomy: No
44. Suprapubic Catheter: No
45. Dialysis Shunt: No
46. Dialysis Catheter: No
47. BiPAP/CPAP: No
48. Tracheostomy: No
49. Oxygen: No
50. Portable/Astr

In [None]:
# End