In [None]:
from openai import OpenAI
import os
import pandas as pd

# NOTE: the OpenAI client is intentionally not created in this cell. It is built
# further down, after load_dotenv() has populated OPENAI_API_KEY; instantiating
# OpenAI() here fails on a fresh kernel when the key lives only in the .env file.


In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

In [2]:
# Shared API client used by every request helper in this notebook; the key is
# read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
def _request_completion(model, prompt, seed):
    """Send `prompt` as a single user message and return the reply text."""
    completion = client.chat.completions.create(
        model=model,  # e.g. "gpt-3.5-turbo-0125" or "gpt-4-0125-preview"
        temperature=0.0,  # lowest sampling temperature: least random output
        top_p=0.0,  # nucleus sampling restricted to the most likely tokens
        seed=seed,  # fixed seed so repeated runs sample the same way where the API allows
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content


def get_extracted_topics(model, extraction_template1, extraction_template2):
    """Run both extraction prompts and return the two reply texts.

    Each prompt is sent as its own, independent single-message request.

    Args:
        model: Name of the chat model to query.
        extraction_template1: Fully formatted prompt for the first question set.
        extraction_template2: Fully formatted prompt for the second question set.

    Returns:
        A tuple ``(reply1, reply2)`` with the text of each reply, or ``None``
        if either request fails. The error is printed, and the second request
        is not sent when the first one fails.
    """
    replies = []
    for prompt in (extraction_template1, extraction_template2):
        try:
            replies.append(_request_completion(model, prompt, seed=100))
        except Exception as e:
            # Best effort: report the failure and let the caller decide what to do
            print(f"Error: {e}")
            return None
    return tuple(replies)
    

In [None]:
def get_summarized_text(model, summarization_template):
    """Send one summarization prompt to the chat model and return its reply.

    Args:
        model: Name of the chat model, e.g. "gpt-3.5-turbo-0125" or
            "gpt-4-0125-preview".
        summarization_template: Fully formatted prompt, sent as a single
            user message.

    Returns:
        The text content of the first choice in the completion.
    """
    request = {
        "model": model,
        "temperature": 0.0,
        "top_p": 0.0,
        "seed": 200,  # fixed seed so the same prompt is sampled the same way
        "messages": [{"role": "user", "content": summarization_template}],
    }
    reply = client.chat.completions.create(**request)
    first_choice = reply.choices[0]
    return first_choice.message.content


In [None]:
# Extraction prompt for questions 1-8 of the consent form.
# Rendered with str.format(form_text=...); the doubled braces {{ }} are format
# escapes that come out as literal { } in the final prompt.
extraction_template1 = """You are a smart principal investigator for a clinical trial, and I will give you the text of the consent form. Below are questions separated by |. I want you to get the full exact text for each question if it's in the form else return "na".
Provide responses with the exact text from the form in defined format. Rephrasing text is not allowed, and you should not skip over information on the form.
Points: [1. Does the study involve medical research?|
2. What is the purpose of this research study?|
3. How long will the participant be involved in the study?|
4. What procedures will the participant need to follow?|
5. Are any of the procedures experimental?|
6. What are the risks or discomforts of participating?|
7. What are the benefits of participating?|
8. How many people will be participating in the study?]

response1 = {{"study_research": exact_text_1,
"purpose": exact_text_2,
"duration": exact_text_3,
"procedures": exact_text_4,
"experimental_procedures": exact_text_5,
"risks": exact_text_6,
"benefits": exact_text_7,
"participants": exact_text_8}}
 
Be consistent with the naming of the keys and the format of the values.

Form text:  {form_text}"""


# Extraction prompt for questions 9-17 of the consent form.
# Rendered with str.format(form_text=...); the doubled braces {{ }} are format
# escapes that come out as literal { } in the final prompt.
# NOTE(review): the wording "use same text of the consent form from previous
# message" suggests a shared conversation, but get_extracted_topics sends this
# prompt as its own single-message request; the form text is supplied again via
# {form_text} at the end. Confirm whether the wording should be revised.
extraction_template2 = """You are a smart principal investigator for a clinical trial, and use same text of the consent form from previous message. Below are questions separated by |. I want you to get the full exact text for each question if it's in the form else return "na".
Provide responses with the exact text from the form in the defined format. Rephrasing text is not allowed, and you should not skip over information on the form.
Points: [9. What other treatment options may help the participant besides this research study?|
10. How will the researchers keep the participant's records private?|
11. What are details on compensation, medical treatments, injuries, and where individuals can find more information?|
12. Who can the participant contact if they have questions or get hurt in the research?|
13. Can the participant drop out of the study anytime without consequences?|
14. For what reasons could the researchers remove the participant from the study?|
15. Will participating cost the participant anything?|
16. If the participant wants to stop the study, what should they do and what happens next?|
17. Will the researchers let the participant know if they find anything important that could change their decision to stay in the study?]

response2 = {{"alternative_procedures": exact_text_1,
"confidentiality": exact_text_2,
"compensation": exact_text_3,
"contact_info": exact_text_4,
"voluntary_participation": exact_text_5,
"discontinue_cooperation": exact_text_6,
"additional_costs": exact_text_7,
"withdrawal_effects": exact_text_8,
"new_findings": exact_text_9}}

Be consistent with the naming of the keys and the format of the values.

Form text:  {form_text}"""

Analysis of 11 trials

In [None]:
import os

# Resolve the data folder relative to the notebook's working directory:
# <cwd>/../../Data/ICF/11trials_for_analysis
current_directory = os.getcwd()

# Move up two levels from the working directory
two_levels_up = os.path.dirname(os.path.dirname(current_directory))

# Pass each path component separately so the separator matches the host OS
# (a literal with backslashes only resolves correctly on Windows)
folder_path = os.path.join(two_levels_up, "Data", "ICF", "11trials_for_analysis")

# os.listdir returns entries in arbitrary order; sort them so every run
# processes the files in the same sequence
files = sorted(os.listdir(folder_path))
files

In [None]:
import tiktoken
def count_tokens(text, model="gpt-3.5-turbo"):
    """Return the number of tokens `text` encodes to for the given model.

    Args:
        text: The string to tokenize.
        model: Model name whose tokenizer is looked up with
            tiktoken.encoding_for_model. Defaults to "gpt-3.5-turbo", the
            value this function previously hard-coded.

    Returns:
        The length of the encoded token sequence.
    """
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

In [None]:
# Report the token count of every consent form (.txt file) in the data folder
for file in files:
    if not file.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, file)
    # Files are read as UTF-8 (ISO-8859-1 was previously tried as an alternative)
    with open(file_path, "r", encoding="utf8") as f:
        form_text = f.read()
    print(f"File {file} has {count_tokens(form_text)} tokens")

Model selection

In [None]:
# Chat model passed to every extraction and summarization request below
model = "gpt-4-0125-preview"

In [None]:
# Empty results table: bookkeeping fields first, then one column per topic
# requested by the two extraction prompts
df = pd.DataFrame(
    columns=[
        # bookkeeping and raw model replies
        "file_name",
        "token_count",
        "extracted_topics1",
        "extracted_topics2",
        # topics from the first extraction prompt
        "study_research",
        "purpose",
        "duration",
        "procedures",
        "experimental_procedures",
        "risks",
        "benefits",
        "participants",
        # topics from the second extraction prompt
        "alternative_procedures",
        "confidentiality",
        "compensation",
        "contact_info",
        "voluntary_participation",
        "discontinue_cooperation",
        "additional_costs",
        "withdrawal_effects",
        "new_findings",
    ]
)

In [None]:

# Run both extraction prompts on every consent form and collect one row per file
extraction_records = []
for file in files:
    if not file.endswith(".txt"):
        continue
    # Close the file before the (slow) API calls are made
    with open(os.path.join(folder_path, file), "r", encoding="utf8") as f:
        form_text = f.read()

    token_count = count_tokens(form_text)
    extracted = get_extracted_topics(
        model,
        extraction_template1.format(form_text=form_text),
        extraction_template2.format(form_text=form_text),
    )
    if extracted is None:
        # get_extracted_topics returns None when a request failed; unpacking
        # that value would raise TypeError and abort the whole batch
        print(f"Skipping {file}: topic extraction failed")
        continue

    extracted_content1, extracted_content2 = extracted
    extraction_records.append({
        "file_name": file.split('.')[0],
        "token_count": token_count,
        "extracted_topics1": extracted_content1,
        "extracted_topics2": extracted_content2,
    })
    print(f"Extracted topics for {file}")

# DataFrame.append was removed in pandas 2.0: build all rows first, then add
# them in one step. Columns are aligned to df and kept as object dtype so the
# topic columns can later receive text values.
new_rows = pd.DataFrame(extraction_records).reindex(columns=df.columns).astype(object)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

Separate the extracted topics into different columns

In [None]:
import json


def _parse_json_object(text):
    """Return the first JSON object embedded in a model reply.

    The reply may carry text around the object (for example a leading
    ``response1 =``). Decoding starts at the first ``{`` and stops where that
    JSON value ends, so "=" or "}" characters inside the quoted text do not
    truncate it. ``strict=False`` accepts raw newlines inside string values.

    Raises:
        ValueError: if `text` is not a string, has no ``{``, or the object is
            not valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    if not isinstance(text, str):
        raise ValueError("model reply is missing or not text")
    start = text.index("{")
    parsed, _ = json.JSONDecoder(strict=False).raw_decode(text, start)
    return parsed


# Copy every topic found in the two raw replies into its own column of df
for row_index, row in df.iterrows():
    parsed_replies = []
    for column in ("extracted_topics1", "extracted_topics2"):
        try:
            parsed_replies.append(_parse_json_object(row[column]))
        except ValueError as err:
            # Re-raise with the file name so the offending reply can be found
            raise ValueError(
                f"Could not parse {column} for file {row['file_name']!r}"
            ) from err

    for topics in parsed_replies:
        for key, value in topics.items():
            # Ignore keys the model invented that have no matching column
            if key in df.columns:
                df.at[row_index, key] = value
    

In [None]:
# Join the file name with os.path.join instead of embedding a backslash in the
# literal: "\e" is an invalid escape sequence and ties the path to Windows.
df.to_excel(os.path.join(folder_path, "extracted_11trials_gpt4.xlsx"), index=False, engine='openpyxl')

<h4>Proof of concept: two approaches to generating summaries</h4>
Approach 1: Using extracted topics from ICF

Approach 2: Using form text directly


In [None]:
# Adjust the file name to match the model that produced the extraction.
# The path is joined with os.path.join instead of embedding a backslash in the
# literal: "\e" is an invalid escape sequence and ties the path to Windows.
df = pd.read_excel(os.path.join(folder_path, "extracted_11trials_gpt4.xlsx"))

In [13]:
# Approach 1 prompt (v5 in the prompt Excel file): summarize from the topics
# that were already extracted. Rendered with str.format(extracted_content=...).
summary_template_approach1 = """As the intelligent principal investigator of a clinical trial, you've received extracted text from the consent form. You must provide a clear summary.

The summary should include a statement that explain the purpose of the research, a description of the procedures, how long the subject will be involved in the study, the risks, the benefits, 
the subject's participation is voluntary, and the alternatives if the subject decides not to participate. 

Use the following guidelines to create a summary in a paragraph style:
- Summary must be at most 150 words.
- Simplify any complex terms or concepts.
- Make the summary highly understandable, recommended for eighth-grade level audience.
- Use respectful and empowering language for patients.
- Spell out acronyms upon first use.
- Include all relevant information without adding extra details.
- Use active voice
- Keep the summary concise and to the point

Your extracted text is: {extracted_content}"""

# Approach 2 prompt: summarize directly from the full consent form text.
# Rendered with str.format(form_text=...).
summary_template_approach2 = """As an intelligent principal investigator of a clinical trial, you must provide a clear summary using the consent form text.

The summary should include a statement that explain the purpose of the research, a description of the procedures, how long the subject will be involved in the study, the risks, the benefits, 
the subject's participation is voluntary, and the alternatives if the subject decides not to participate. 

Use the following guidelines to create a summary in a paragraph style:
- Summary must be at most 150 words.
- Simplify any complex terms or concepts.
- Make the summary highly understandable, recommended for eighth-grade level audience.
- Use respectful and empowering language for patients.
- Spell out acronyms upon first use.
- Include all relevant information without adding extra details.
- Use active voice.
- Keep the summary concise and to the point.

Form Text: {form_text}"""


In [15]:
# Empty table that collects one approach-1 summary per consent form
df_approach1 = pd.DataFrame(
    columns=[
        "file_name",
        "summarized_text_approach1",
    ],
)


In [None]:

# Approach 1: summarize each consent form from the topics already stored in df.
# The order of these columns is the order in which the texts are concatenated.
summary_topic_columns = [
    "purpose",
    "procedures",
    "duration",
    "risks",
    "benefits",
    "alternative_procedures",
    "voluntary_participation",
]

approach1_records = []
for file in files:
    if not file.endswith(".txt"):
        continue
    file_name = file.split('.')[0]

    # Look the row up once. Compare as strings because read_excel may have
    # parsed numeric-looking file names as numbers.
    matches = df[df['file_name'].astype(str) == file_name]
    if matches.empty:
        print(f"Skipping {file}: no extracted topics found")
        continue
    topics = matches.iloc[0]

    extracted_content = " ".join(str(topics[column]) for column in summary_topic_columns)

    summary = get_summarized_text(model, summary_template_approach1.format(extracted_content=extracted_content))
    print(summary)
    print("\n")
    approach1_records.append({"file_name": file_name, "summarized_text_approach1": summary})

# DataFrame.append was removed in pandas 2.0: build all rows first, then add
# them in one step
new_rows = pd.DataFrame(approach1_records, columns=df_approach1.columns)
df_approach1 = new_rows if df_approach1.empty else pd.concat([df_approach1, new_rows], ignore_index=True)


In [17]:
# Flatten each summary onto one line. The vectorised .str accessor leaves a
# missing summary as missing instead of raising AttributeError on it.
df_approach1['summarized_text_approach1'] = df_approach1['summarized_text_approach1'].str.replace("\n", " ", regex=False)

In [19]:
# Save the approach-1 summaries to the current working directory.
# NOTE(review): the file name says "10trials" while this notebook analyses
# 11 trials; confirm which is intended.
df_approach1.to_excel('summarized_10trials_gpt4-approach1.xlsx', index=False, engine='openpyxl')

In [None]:
# Empty table that collects one approach-2 summary per consent form
df_approach2 = pd.DataFrame(
    columns=[
        "file_name",
        "summarized_text_approach2",
    ],
)


In [None]:

# Approach 2: summarize each consent form directly from its full text
approach2_records = []
for file in files:
    if not file.endswith(".txt"):
        continue
    # Close the file before the (slow) API call is made
    with open(os.path.join(folder_path, file), "r", encoding="utf8") as f:
        form_text = f.read()
    file_name = file.split('.')[0]

    summary = get_summarized_text(model, summary_template_approach2.format(form_text=form_text))
    print(summary)
    print("\n")
    approach2_records.append({"file_name": file_name, "summarized_text_approach2": summary})

# DataFrame.append was removed in pandas 2.0: build all rows first, then add
# them in one step
new_rows = pd.DataFrame(approach2_records, columns=df_approach2.columns)
df_approach2 = new_rows if df_approach2.empty else pd.concat([df_approach2, new_rows], ignore_index=True)


In [None]:
# Flatten each summary onto one line. The vectorised .str accessor leaves a
# missing summary as missing instead of raising AttributeError on it.
df_approach2['summarized_text_approach2'] = df_approach2['summarized_text_approach2'].str.replace("\n", " ", regex=False)

In [None]:
# Put both summaries side by side: join the two tables on file_name
# (default inner join, so only files present in both tables are kept)
df_new_approach = df_approach1.merge(df_approach2, on="file_name")
df_new_approach

In [None]:
# Save the side-by-side summaries of both approaches to the current working directory
df_new_approach.to_excel('summary_trials_gpt4_031124.xlsx', index=False, engine='openpyxl')