# Retrieve Relevant Portions of Document

In [1]:
import torch

## Read Protocol Document

In [17]:
# extract all the text from reader
page_content_lookup = [page.extract_text() for page in reader.pages]

## Segment Document

In [18]:
from dataclasses import dataclass

@dataclass
class OutlineSegment:
    title: str
    start_page: int
    end_page: int
    content: str = ""

### Segment - Get PDF Outline

In [19]:
import typing
from pypdf.generic import Destination


def segment_outline(
    outlines: "list[typing.Union[Destination, list]]", reader: PdfReader
) -> "list[OutlineSegment]":
    """Given a list of pypdf.Destinations and a reader, segment document by outline with start and end pages"""

    def _segment_destination(dest: "Destination") -> "OutlineSegment":
        return OutlineSegment(dest.title, reader._get_page_number_by_indirect(dest.page), -1)

    segments = []
    switch_case = {
        Destination: lambda x: [_segment_destination(x)],
        list: lambda x: segment_outline(x, reader),
    }

    # handle first segment
    outline_entry = outlines[0]
    segments += switch_case[type(outline_entry)](outline_entry)

    for outline_entry in outlines[1:]:
        sub_segment = switch_case[type(outline_entry)](outline_entry)
        segments[-1].end_page = sub_segment[-1].start_page + 1
        segments += switch_case[type(outline_entry)](outline_entry)

    return segments

segments = segment_outline(reader.outline, reader)
for segment in segments:
    segment_pages = page_content_lookup[segment.start_page-1:segment.end_page-1]
    segment.content = "\n\n\n".join(segment_pages)

title_segment_lookup: "dict[str, OutlineSegment]" = {
    s.title: s for s in segments
}
len(title_segment_lookup)

181

### Segment - Sentences

In [20]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/btor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
from nltk import sent_tokenize

pdf_sentences = sent_tokenize("\n".join(page_content_lookup))
pdf_sentences = [
    s.replace('\n', '') for s in pdf_sentences if len(s.split(" ")) > 5
]  # ignore sentences that are too short, likely don't contain descriptive information
f"There are {len(pdf_sentences)} sentences in this document"

'There are 930 sentences in this document'

### Segment - Font Size (invalid)

Segmenting based off of differences in font doesn't work for a few reasons:
* across ICFs, larger font sizes don't always indicate a new segment
* font sizes are based off of the font text matrix, which is not consistent between PDFs
* bold and italicized elements don't consistently indicate new sections
* numbered sections (i.e. 1. Introduction) capture the number and the title in distinct PDF elements

In [22]:
# not worth it, skip
"""
parts = []
page = reader.pages[12]
def visitor_body(text, cm, tm, font_dict, font_size):
    y = cm[5]
    print(text.strip()[:50], "|", y, tm, font_size, font_dict)
    if 50 < y < 720:
        parts.append(text)

text = page.extract_text(visitor_text=visitor_body)
"""

'\nparts = []\npage = reader.pages[12]\ndef visitor_body(text, cm, tm, font_dict, font_size):\n    y = cm[5]\n    print(text.strip()[:50], "|", y, tm, font_size, font_dict)\n    if 50 < y < 720:\n        parts.append(text)\n\ntext = page.extract_text(visitor_text=visitor_body)\n'

## Retrieval

### Retrieval - Study Title

It would be good to know the study title. We guess what it is based on the first 3 pages.

In [53]:
text_first_three_pages

'Abbreviated Title: AMP-224 SBRT Met Colorectal Ca   \nVersion Date: 08/ 15/2016 \n \nConfidential   1 Abbreviated Title:  AMP-224 SBRT  Met Colorectal Ca   \nNCI Protocol #:   15-C-0021 B \nVersion Date:   08/15/2016 \nTitle: A Pilot Study of AMP-224 – a PD-1 Inhibitor – in Combination with Stereotactic Body \nRadiation Therapy (SBRT) in Patients with Metastatic Colorectal Cancer  \n \nPrincipal Investigator:     Tim Greten, MD A-F \nThoracic & GI Oncology Branch  \nNational Cancer Institute   \nBuilding 10, Room 3B43  \n9000 Rockville Pike  \nBethesda, MD 20892 \n301-451-4723 \nFAX: 301-480-8780 \ngretentf@mail.nih.gov  \nLead Associate Investigator :      Austin Duffy, MDA-F \nAssociate Investigators :    Deborah Citrin, MD, ROB , CCR, NCIA, B, E  \nBrad Wood MD, RAD IS, CC, NIHA, B, E \nWilliam D. Figg, PharmD, GMB , CCR, NCIB, E  \nSuzanne Fioravanti, RN, OCD, CCR, NCIA,B  \nMelissa Walker RN, OCD, CCR, NCIA, B  \nJennifer Jones MD, PhD , VB, CCR, NCIA, B,E \nSeth Steinberg, PhD ,

['Abbreviated Title: AMP-224 SBRT Met Colorectal Ca',
 'Confidential   1 Abbreviated Title:  AMP-224 SBRT  Met Colorectal Ca',
 'Title: A Pilot Study of AMP-224 – a PD-1 Inhibitor – in Combination with Stereotactic Body ',
 'Abbreviated Title: AMP-224 SBRT Met Colorectal Ca']

In [47]:
text_first_three_pages = "\n".join([p.extract_text() for p in reader.pages[:2]])
message = f"""-Context-
{text_first_three_pages.split(' ')}

-Question-
What is the full title of this study?"""
response = llm_model.invoke(message)
response

{'messages': [{'role': 'system', 'content': [{'type': 'text', 'text': '-Goal-\nGiven a snipped of text from a clinical study protocol, a trained medical professional asks a question. Answer the question based on the given context.\n          \n-Steps-\n1. Identify if the context answers the question or not.\n2. From the context, identify the most relevant portions to the question.\n3. Based on the context and the identified relevant sections, answer the question. Format your answer in a bullet point list.\n'}]}, {'role': 'user', 'content': [{'type': 'text', 'text': "-Context-\n['Abbreviated', 'Title:', 'AMP-224', 'SBRT', 'Met', 'Colorectal', 'Ca', '', '', '\\nVersion', 'Date:', '08/', '15/2016', '\\n', '\\nConfidential', '', '', '1', 'Abbreviated', 'Title:', '', 'AMP-224', 'SBRT', '', 'Met', 'Colorectal', 'Ca', '', '', '\\nNCI', 'Protocol', '#:', '', '', '15-C-0021', 'B', '\\nVersion', 'Date:', '', '', '08/15/2016', '\\nTitle:', 'A', 'Pilot', 'Study', 'of', 'AMP-224', '–', 'a', 'PD-1',

{'choices': [{'content_filter_results': {'hate': {'filtered': False,
     'severity': 'safe'},
    'self_harm': {'filtered': False, 'severity': 'safe'},
    'sexual': {'filtered': False, 'severity': 'safe'},
    'violence': {'filtered': False, 'severity': 'safe'}},
   'finish_reason': 'length',
   'index': 0,
   'logprobs': None,
   'message': {'content': '- The context provides the descriptive title under the "Title:" section followed by the details of the study',
    'role': 'assistant'}}],
 'created': 1722832656,
 'id': 'chatcmpl-9sjlQhVbF8dhbs1ksOkLSkXe9eeVX',
 'model': 'gpt-4-turbo-2024-04-09',
 'object': 'chat.completion',
 'prompt_filter_results': [{'prompt_index': 0,
   'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
    'self_harm': {'filtered': False, 'severity': 'safe'},
    'sexual': {'filtered': False, 'severity': 'safe'},
    'violence': {'filtered': False, 'severity': 'safe'}}}],
 'system_fingerprint': 'fp_811936bd4f',
 'usage': {'completion_t

### Retrieval - Semantic Similarity

Calculate semantic similarity of the categories (purpose, procedures, risks, benefits) against the document.

In [23]:
from sentence_transformers import SentenceTransformer

categories = ["purpose", "procedures", "risks", "benefits"]

model = SentenceTransformer("all-MiniLM-L6-v2")
category_embeddings = model.encode(categories)

  from tqdm.autonotebook import tqdm, trange
  return torch._C._cuda_getDeviceCount() > 0


In [24]:
def min_max_normalization(features: torch.tensor):
    feature_max = features.max()
    feature_min = features.min()
    return (features - feature_min) / (feature_max - feature_min)

#### Retrieval - Semantic Similarity - Topic Outline

In [25]:
topic_embeddings = model.encode([str(key) for key in title_segment_lookup.keys()])
topic_similarities = model.similarity(category_embeddings, topic_embeddings)
topic_similarities.shape

torch.Size([4, 181])

#### Retrieval - Semantic Similarity - Sentences

In [26]:
sentence_embeddings = model.encode(pdf_sentences)
sentence_similarity = model.similarity(category_embeddings, sentence_embeddings)
sentence_similarity.shape

torch.Size([4, 930])

In [94]:
titles = [t.title for t in title_segment_lookup.values()]
for i in topic_similarities[0].argsort(descending=True)[:10]:
    print(categories[0], i, titles[i])

purpose tensor(3) 1 INTRODUCTION
purpose tensor(1) SCHEMA
purpose tensor(6) 1.1.2 Secondary Objectives
purpose tensor(177) 12 REFERENCES
purpose tensor(4) 1.1 Study Objectives
purpose tensor(0) PRÉCIS
purpose tensor(157) 10.3 Evaluation of Benefits and Risks/Discomforts
purpose tensor(156) 10.2 Participation of Children
purpose tensor(155) 10.1 Rationale For Subject Selection
purpose tensor(154) 10 HUMAN SUBJECTS PROTECTIONS


In [87]:
for i in sentence_similarity[1].argsort(descending=True)[:10]:
    print(categories[1], pdf_sentences[i])

procedures The procedure for protecting against or minimizing risks will be to medically evaluate patients on a regular basis as described.
procedures During that meeting, the investigator will inform patients of the purpose, alternatives, treatment plan, research objectives and follow -up of this trial.
procedures For each sample, there are notes associated with the processing method (delay in sample processing, storage conditions on the ward, etc.).
procedures A preliminary report from a phase 2 trial.
procedures It is the responsibility of the NCI Principal Investigator to ensure that the samples requested are being used in a manner consistent with IRB approval.
procedures All specimens obtained in the protocol are used as defined in the protocol.
procedures The investigator will then provide a copy of t he IRB-approved informed consent document that is included in this protocol.
procedures The informed consent pr ocess will be documented on a progress note by the consenting  invest

## Generation - Generate Outputs

In [29]:
TOKEN_MAX = 500

In [30]:
# using a custom azure openai module for simpler control than langchain
from llm import LLM

prompt = """-Goal-
Given a snipped of text from a clinical study protocol, a trained medical professional asks a question. Answer the question based on the given context.
          
-Steps-
1. Identify if the context answers the question or not.
2. From the context, identify the most relevant portions to the question.
3. Based on the context and the identified relevant sections, answer the question. Format your answer in a bullet point list.
"""
llm_model = LLM(prompt=prompt)

### Generation - Study Purpose

Try against sentence based embeddings

In [34]:
question = "What is the purpose of the clinical trial study?"
available_prompt_space = TOKEN_MAX - len(prompt.split(" ")) - len(question.split(" ")) - 200

current_index = 0
sentence_similarities_rankings = sentence_similarity[0].argsort(descending=True)
context = ""

while available_prompt_space > 0:
    sentence_index = sentence_similarities_rankings[current_index]
    current_sentence = pdf_sentences[sentence_index]

    if available_prompt_space - len(current_sentence) > 0:
        available_prompt_space -= len(current_sentence)
        context += current_sentence
    else:
        break

In [33]:
response = llm_model.invoke(f"""
-Context-
{context}

-Question-
{question}""", max_tokens=100)

{'messages': [{'role': 'system', 'content': [{'type': 'text', 'text': '-Goal-\nGiven a snipped of text from a clinical study protocol, a trained medical professional asks a question. Answer the question based on the given context.\n          \n-Steps-\n1. Identify if the context answers the question or not.\n2. From the context, identify the most relevant portions to the question.\n3. Based on the context and the identified relevant sections, answer the question. Format your answer in a bullet point list.\n'}]}, {'role': 'user', 'content': [{'type': 'text', 'text': '\n-Context-\nDuring that meeting, the investigator will inform patients of the purpose, alternatives, treatment plan, research objectives and follow -up of this trial.\n\n-Question-\nWhat is the purpose of this study?'}]}], 'max_tokens': 100}


{'choices': [{'content_filter_results': {'hate': {'filtered': False,
     'severity': 'safe'},
    'self_harm': {'filtered': False, 'severity': 'safe'},
    'sexual': {'filtered': False, 'severity': 'safe'},
    'violence': {'filtered': False, 'severity': 'safe'}},
   'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': '- The context does not provide specific details about the purpose of the study.\n- The context mentions that during the meeting, the investigator will inform patients about the purpose of the trial, but it does not specify what the purpose is.',
    'role': 'assistant'}}],
 'created': 1722832194,
 'id': 'chatcmpl-9sjdy1ZxZhdBo8eX1C4QG4Z9tSy15',
 'model': 'gpt-4-turbo-2024-04-09',
 'object': 'chat.completion',
 'prompt_filter_results': [{'prompt_index': 0,
   'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
    'self_harm': {'filtered': False, 'severity': 'safe'},
    'sexual': {'filtered': False, 'severity': 

In [60]:
titles = [
    "A Nurse-led Family-oriented Resilience Program for Caregivers of Community-dwelling Dependent Older Adults",
    "Artificial Intelligence Assisted Breast Ultrasound in Breast Cancer Screening",
    "Active Cycle Breathing Technique (ACBT) on Respiratory Function Forced Expiration Technique (FET)",
    "Novel Locator-Positioner Device for Temporomandibular Joint Arthroscopy",
    "Comparison Between Continuous and Pulsed Oral Doxycycline Treatment Protocols for Refractory Meibomian Gland Dysfunction",
    "Integrated Genetic and Functional Analysis of the Influence of Menstrual Hygiene Products on Female Health",
    "Liver-gut Axis Study Through Identification of Liver Disease-specific Microbiome",
    "Treatment With Psilocybin for Chronic Neuropathic Pain and Depression (TRANSCEND)",
    "Investigating the Influence of Catheter Advancement Techniques on Needle Tip Movement During Intravenous Insertion",
    "Efficacy and Safety of Different Hyaluronic Acid Tear Substitutes Formulations in Evaporative Dry Eye",
]
title_embedding = model.encode(titles)
title_embedding.shape

(10, 384)

In [61]:
type(title_embedding)

numpy.ndarray

In [79]:
import numpy as np
centroid = np.median(title_embedding, axis=0)
centroid.shape

(384,)

In [73]:
with open("title_centroid.bytes", "wb") as f:
    f.write(centroid.tobytes())

In [86]:
[int(i) for i in model.similarity(centroid, title_embedding).flatten().argsort()]

[6, 1, 0, 8, 7, 9, 3, 2, 5, 4]

In [71]:
deserialized_bytes = np.frombuffer(centroid.tobytes(), dtype=np.float32)
deserialized_bytes == centroid

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,