<a href="https://colab.research.google.com/github/aman-17/actualwise/blob/main/simplified_chunking_actualwise_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract pdf2image
!apt-get install poppler-utils
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install openai==0.28

In [2]:
!mkdir 'images1'
!mkdir 'images2'
!mkdir 'images3'
!mkdir 'chunk_deid1'
!mkdir 'chunk_deid2'
!mkdir 'chunk1'
!mkdir 'chunk2'
!mkdir 'chunk1_outputs'
!mkdir 'chunk2_outputs'

In [None]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import re
import os


oem_val = 3  # Example OEM value
psm_val = 6  # Example PSM value - 3, 4, 6? 1, 5, 7, 11, 12 (6 is best for Case 1)
custom_config = f'--oem {oem_val} --psm {psm_val}'

pdf_path = '/content/Case Study 2 SD.pdf'

output_folder = '/content/images2'
pdf_base_name = os.path.splitext(os.path.basename(pdf_path))[0]

output_file_name = f"{pdf_base_name}_oem{oem_val}_psm{psm_val}_pytesseract_poppler_dpi600.txt"
output_file_path = os.path.join(output_folder, output_file_name)

def clean_embedded_page_numbers(text):
    return re.sub(r'Page\s+\d+', '', text)

images = convert_from_path(pdf_path, 600)

with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for i, image in enumerate(images):
        text = pytesseract.image_to_string(image, config=custom_config)
        cleaned_text = clean_embedded_page_numbers(text)
        # Inserting PDF page number
        output_file.write(f'PDF Page Number {i + 1}\n{cleaned_text}\n\n' + '-'*60 + '\n')

print(f"OCR text saved to {output_file_path}")

OCR text saved to /content/images1/Case_study_1_CO_oem3_psm6_pytesseract_poppler_dpi600.txt


In [None]:
# Simplified chunking
import os
import re
from transformers import AutoTokenizer

# Setup
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
print("Tokenizer loaded.")

def tokenize_text_continuously(text, max_length):
    print("Tokenizing text and creating continuous chunks...")
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    chunks = []
    token_start = 0
    while token_start < len(token_ids):
        token_end = min(token_start + max_length, len(token_ids))
        chunk_token_ids = token_ids[token_start:token_end]
        chunk = tokenizer.decode(chunk_token_ids, skip_special_tokens=True)
        chunks.append((chunk, token_start, token_end-1))
        token_start = token_end
    return chunks

def chunk_files(input_directory, output_directory, max_length=512):
    os.makedirs(output_directory, exist_ok=True)
    print(f"Processing files from {input_directory} to save chunks in {output_directory}")

    for filename in os.listdir(input_directory):
        if filename.endswith(".txt"):
            print(f"Processing file: {filename}")
            with open(os.path.join(input_directory, filename), 'r') as file:
                text = file.read()

            first_pass_chunks = tokenize_text_continuously(text, max_length)
            total_chunks = len(first_pass_chunks)
            max_digits_chunk = len(str(total_chunks))

            for i, (chunk, start_token, end_token) in enumerate(first_pass_chunks, start=1):
                chunk_num_padded = str(i).zfill(max_digits_chunk)
                chunk_filename = f"Chunk{chunk_num_padded}_Tokens{start_token}-{end_token}.txt"
                with open(os.path.join(output_directory, chunk_filename), 'w') as chunk_file:
                    chunk_file.write(chunk)
                    print(f"Saved chunk {chunk_filename}.")

# Example usage
input_directory = "/content/"
output_directory = "/content/chunks/"
chunk_files(input_directory, output_directory)

In [6]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model from the pretrained "obi/deid_roberta_i2b2"
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForMaskedLM.from_pretrained("obi/deid_roberta_i2b2").to(device)
nlp = pipeline(task="ner", model="obi/deid_roberta_i2b2",device=0 if torch.cuda.is_available() else -1)

"""
tokenizer = AutoTokenizer.from_pretrained("nlpie/clinical-distilbert-i2b2-2010")
model = AutoModelForMaskedLM.from_pretrained("nlpie/clinical-distilbert-i2b2-2010").to(device)
nlp=pipeline(task="ner", model="obi/deid_bert_i2b2",device=0)
"""

# Define a function to deidentify text based on the NER results
def deidentify_text(text, ner_results):
    # Sort the entities based on their start position
    sorted_entities = sorted(ner_results, key=lambda x: x['start'])
    merged_entities = []

    # Merge overlapping entities
    for entity in sorted_entities:
        if merged_entities and entity['start'] <= merged_entities[-1]['end']:
            merged_entities[-1]['end'] = max(merged_entities[-1]['end'], entity['end'])
            merged_entities[-1]['entity'] = entity['entity']
        else:
            merged_entities.append(entity)

    # Replace the identified entities in the text with their entity type
    deidentified_text = text
    for entity in reversed(merged_entities):
        start = entity['start']
        end = entity['end']
        deidentified_text = deidentified_text[:start] + "[" + entity['entity'] + "]" + deidentified_text[end:]
    return deidentified_text

# Define the directory containing the text files
directory = "/content/chunks/"
# Define the directory to save the deidentified text files
new_directory = "/content/chunk_deid1"

# Loop through the text files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        # Open the text file and read the text
        with open(os.path.join(directory, filename), 'r') as file:
            text = file.read()
            # Perform NER on the text
            ner_results = nlp(text)
            # Deidentify the text
            deidentified_text = deidentify_text(text, ner_results)
        # Write the deidentified text to a new file
        with open(os.path.join(new_directory, "deidentified_" + filename), 'w') as file:
            file.write(deidentified_text)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at obi/deid_roberta_i2b2 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os

def chunk_file(file_path, output_folder, max_length=8192):
    with open(file_path, 'r') as file:
        content = file.read().split('------------------------------------------------------------')

    i = 0
    chunk_files = []  # List to store the paths of the chunk files
    while i < len(content):
        chunk = []
        length = 0
        while i < len(content) and length + len(content[i]) + len('------------------------------------------------------------') <= max_length:
            chunk.append(content[i])
            length += len(content[i]) + len('------------------------------------------------------------')
            i += 1

        chunk_text = '------------------------------------------------------------'.join(chunk)

        # Define the output file path
        j = i - len(chunk) + 1  # Start page number
        k = i  # End page number
        output_file_path = os.path.join(output_folder, f'1_chunk_{len(chunk_files) + 1}_page{j}_to_page_{k}.txt')

        with open(output_file_path, 'w') as file:
            file.write(chunk_text)

        chunk_files.append(output_file_path)  # Add the path of the chunk file to the list

    # Delete the last chunk file if there is more than one chunk file
    # if len(chunk_files) > 1:
    #     os.remove(chunk_files[-1])

# Call the function with the path to your file and the output folder
chunk_file('/content/Case_study_1_CO_oem3_psm6_pytesseract_poppler_dpi600_colab.txt', '/content/')


In [None]:
# Download all the file

import os
from google.colab import files

def download_all_files_in_folder(folder_path):
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # Ensure it's a file
        if os.path.isfile(file_path):
            print(f"Downloading {filename}...")
            files.download(file_path)
        else:
            print(f"Skipping {filename}, not a file.")

# Example usage:
# folder_path = '/content/chunks'  # Change this to your specific folder
download_all_files_in_folder('/content/')
# download_all_files_in_folder('/content/deid')

Skipping .config, not a file.
Downloading 1_chunk_5_page17_to_page_21.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading Case_study_1_CO_oem3_psm6_pytesseract_poppler_dpi600_colab.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading 1_chunk_2_page3_to_page_5.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading 1_chunk_6_page22_to_page_27.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading 1_chunk_3_page6_to_page_11.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading 1_chunk_1_page1_to_page_2.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading 1_chunk_4_page12_to_page_16.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Skipping sample_data, not a file.


In [None]:
def chunk_file(file_path, output_folder, max_length=4096):
    with open(file_path, 'r') as file:
        content = file.read().split('------------------------------------------------------------')

    i = 0
    chunk_files = []  # List to store the paths of the chunk files
    while i < len(content):
        chunk = []
        length = 0
        while i < len(content) and length + len(content[i]) + len('------------------------------------------------------------') <= max_length:
            chunk.append(content[i])
            length += len(content[i]) + len('------------------------------------------------------------')
            i += 1

        chunk_text = '------------------------------------------------------------'.join(chunk)

        # Define the output file path
        j = i - len(chunk) + 1  # Start page number
        k = i  # End page number
        output_file_path = os.path.join(output_folder, f'2_chunk_{len(chunk_files) + 1}_page{j}_to_page_{k}.txt')

        with open(output_file_path, 'w') as file:
            file.write(chunk_text)

        chunk_files.append(output_file_path)  # Add the path of the chunk file to the list

    # Delete the last chunk file if there is more than one chunk file
    # if len(chunk_files) > 1:
    #     os.remove(chunk_files[-1])

# Call the function with the path to your file and the output folder
chunk_file('/content/images1/Case_study_1_CO_oem3_psm6_pytesseract_poppler_dpi600.txt', '/content/chunk2/')


In [None]:
import os

# Define the directory containing the text files
directory = "/content/chunk2/"

# Loop through the text files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        # Open the text file and read the text
        with open(os.path.join(directory, filename), 'r') as file:
            text = file.read()

        # Print the filename and the length of the text
        print(f"{filename}: {len(text)} characters")


In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model from the pretrained "obi/deid_roberta_i2b2"
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForMaskedLM.from_pretrained("obi/deid_roberta_i2b2").to(device)
nlp = pipeline(task="ner", model="obi/deid_roberta_i2b2",device=0 if torch.cuda.is_available() else -1)

"""
tokenizer = AutoTokenizer.from_pretrained("nlpie/clinical-distilbert-i2b2-2010")
model = AutoModelForMaskedLM.from_pretrained("nlpie/clinical-distilbert-i2b2-2010").to(device)
nlp=pipeline(task="ner", model="obi/deid_bert_i2b2",device=0)
"""

# Define a function to deidentify text based on the NER results
def deidentify_text(text, ner_results):
    # Sort the entities based on their start position
    sorted_entities = sorted(ner_results, key=lambda x: x['start'])
    merged_entities = []

    # Merge overlapping entities
    for entity in sorted_entities:
        if merged_entities and entity['start'] <= merged_entities[-1]['end']:
            merged_entities[-1]['end'] = max(merged_entities[-1]['end'], entity['end'])
            merged_entities[-1]['entity'] = entity['entity']
        else:
            merged_entities.append(entity)

    # Replace the identified entities in the text with their entity type
    deidentified_text = text
    for entity in reversed(merged_entities):
        start = entity['start']
        end = entity['end']
        deidentified_text = deidentified_text[:start] + "[" + entity['entity'] + "]" + deidentified_text[end:]
    return deidentified_text

# Define the directory containing the text files
directory = "/content/"
# Define the directory to save the deidentified text files
new_directory = "/content/"

# Loop through the text files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        # Open the text file and read the text
        with open(os.path.join(directory, filename), 'r') as file:
            text = file.read()
            # Perform NER on the text
            ner_results = nlp(text)
            # Deidentify the text
            deidentified_text = deidentify_text(text, ner_results)
        # Write the deidentified text to a new file
        with open(os.path.join(new_directory, "deidentified_" + filename), 'w') as file:
            file.write(deidentified_text)

tokenizer_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at obi/deid_roberta_i2b2 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the directory containing the text files
directory = "/content/chunk2/"
# Define the directory to save the deidentified text files
new_directory = "/content/chunk_deid2/"

# Loop through the text files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        # Open the text file and read the text
        with open(os.path.join(directory, filename), 'r') as file:
            text = file.read()
            # Perform NER on the text
            ner_results = nlp(text)
            # Deidentify the text
            deidentified_text = deidentify_text(text, ner_results)
        # Write the deidentified text to a new file
        with open(os.path.join(new_directory, "deidentified_" + filename), 'w') as file:
            file.write(deidentified_text)



In [None]:
import os
import openai
import json
import pandas as pd
from google.colab import userdata
gpt4_api = userdata.get('gpt4_api')

openai.api_type = "azure"
openai.api_key = gpt4_api
openai.api_base = "https://test-gpt-4-ks.openai.azure.com"
openai.api_version = "2023-05-15"
openai.log = "debug"

def chunk_text(text, chunk_size=8192):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def process_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    df = pd.read_excel('./ChatGPT prompts v5.xlsx',sheet_name=1)
    questions = df['Rafi\'s Qs'].tolist()
    chunks = chunk_text(text)
    messages = [
        # {
        #     "role": "system",
        #     "content": "You are a helpful assistant."
        # },
    ]
    for question in questions:
        for chunk in chunks:
            if len(chunk) > 8192:
                print(f"Warning: chunk is too large ({len(chunk)} tokens)")
                continue
            messages.append({
                "role": "user",
                "content": chunk + question
            })
    return messages, questions

def get_responses(messages):
    responses = []
    for message in messages:
        response = openai.ChatCompletion.create(
            # engine="GUISE",
            deployment_id="test",
            messages=[message],
        )
        responses.append(response["choices"][0]["message"]["content"])
    return responses

def save_responses_to_file(responses, questions, output_file):
    with open(output_file, 'w') as f:
        for question, response in zip(questions, responses):
            f.write(f"{question}\n{response}\n{'-'*30}\n")

def process_files_in_folder(folder_path):
    files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]
    for filename in files:
        file_path = os.path.join(folder_path, filename)
        output_file = os.path.join('/content/chunk1_outputs/', f'output_{filename}')
        messages, questions = process_file(file_path)
        responses = get_responses(messages)
        save_responses_to_file(responses, questions, output_file)

folder_path = './chunk_deid1/'
process_files_in_folder(folder_path)

In [None]:
import os

# Define the directory containing the text files
directory = "/content/chunk1_outputs"

# Get a list of all text files in the directory
files = [f for f in os.listdir(directory) if f.endswith(".txt")]

# Sort the files in ascending order of their numerical part
files.sort(key=lambda x: int(x.split('_')[4].split('to')[0]))

# Create a new file to store the merged text
with open(os.path.join('/content/', "chunk1_outputs_merged.txt"), 'w') as outfile:
    for filename in files:
        # Open each text file and append its contents to the merged file
        with open(os.path.join(directory, filename), 'r') as infile:
            outfile.write(infile.read())
            # Add a newline character between files
            outfile.write("\n")



In [None]:
import os
import openai
import json
import pandas as pd
from google.colab import userdata
gpt4_api = userdata.get('gpt4_api')

openai.api_type = "azure"
openai.api_key = gpt4_api
openai.api_base = "https://test-gpt-4-ks.openai.azure.com"
openai.api_version = "2023-05-15"
openai.log = "debug"

def chunk_text(text, chunk_size=8192):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def process_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    chunks = chunk_text(text)
    messages = [
        # {
        #     "role": "system",
        #     "content": "You are a helpful assistant."
        # },
    ]
    for chunk in chunks:
        if len(chunk) > 8192:
            print(f"Warning: chunk is too large ({len(chunk)} tokens)")
            continue
        messages.append({
            "role": "user",
            "content": chunk + 'Summarize these question response statements, grouping similar items together, but include all page number references. Where statements include both \'Yes\' and \'No\', keep the \'Yes\' response with its page number references and omit the \'No\' statements. The goal is to aggregate the existing text, keeping it as concise as possible without adding any new text.'
        })
    return messages

def get_responses(messages):
    responses = []
    for message in messages:
        response = openai.ChatCompletion.create(
            # engine="GUISE",
            deployment_id="test",
            messages=[message],
        )
        responses.append(response["choices"][0]["message"]["content"])
    return responses

def save_responses_to_file(responses, output_file):
    with open(output_file, 'w') as f:
        for response in responses:
            f.write(f"{response}\n{'-'*30}\n")

def process_files_in_folder(folder_path):
    files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]
    for filename in files:
        file_path = os.path.join(folder_path, filename)
        output_file = os.path.join('/content/', f'output_{filename}')
        messages = process_file(file_path)
        responses = get_responses(messages)
        save_responses_to_file(responses, output_file)

folder_path = '/content/'
process_files_in_folder(folder_path)