In [6]:
import os, sys
import random
import time
from dotenv.main import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
openai_api_key = os.environ['OPENAI_API_KEY']

from openai import OpenAI
from pathlib import Path
import glob
import json

import pandas as pd
import numpy as np
import collections
from collections import defaultdict

from pprint import pprint

import matplotlib.pyplot as plt 
import seaborn as sns
import scipy
import ast

import nltk
from nltk import tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ytcao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# --- Active model: GPT-4 Turbo preview (prices are per 1K tokens) ---
# To switch to GPT-3.5 Turbo instead, use:
#   model_name = "gpt-3.5-turbo-0125"; pricing_in = 0.0005; pricing_out = 0.0015
model_name, pricing_in, pricing_out = "gpt-4-0125-preview", 0.01, 0.03

# Default sampling temperature for requests made through this notebook.
temp = 0.5

# The key loaded from the environment above is passed explicitly
# (the client would otherwise fall back to os.environ.get("OPENAI_API_KEY")).
client = OpenAI(api_key=openai_api_key)

In [8]:
# Per-model token prices (per 1K tokens). The keys double as the list of accepted model names.
GPT_MODELS = {
    "gpt-3.5-turbo-0125": {"pricing_in": 0.0005, "pricing_out": 0.0015},
    "gpt-4-0125-preview": {"pricing_in": 0.01, "pricing_out": 0.03},
}

def get_gpt_response(messages_list, model_name=model_name, temperature=temp):
    '''Send one chat-completion request and return the reply with its price.

    Parameters
    ----------
    messages_list : list
        Chat messages forwarded unchanged to the completions endpoint.
    model_name : str
        Must be a key of GPT_MODELS; defaults to the notebook-level model_name.
    temperature : float
        Sampling temperature; defaults to the notebook-level temp.

    Returns
    -------
    (content, total_price)
        Text of the first choice, and prompt + completion token counts priced
        with the per-1K rates from GPT_MODELS.

    Raises
    ------
    AssertionError
        If model_name is not a key of GPT_MODELS.
    '''
    known_models = list(GPT_MODELS)
    assert model_name in known_models, f"make sure {model_name} is one of the following: {known_models}"

    response = client.chat.completions.create(
        messages=messages_list,
        model=model_name,
        temperature=temperature,
    )

    reply_text = response.choices[0].message.content
    usage = response.usage
    rates = GPT_MODELS[model_name]

    # Rates are quoted per 1K tokens, hence the division by 1000.
    completion_price = int(usage.completion_tokens) * rates['pricing_out'] / 1000
    prompt_price = int(usage.prompt_tokens) * rates['pricing_in'] / 1000

    return reply_text, completion_price + prompt_price

In [9]:
# patient analysis functions
def get_user_summary(patient_lines):
    '''Ask the model to fill in a fixed list of patient details from the patient's lines.

    Parameters
    ----------
    patient_lines : str
        The patient's side of the interview; it is embedded verbatim between
        <patient-lines> tags in the prompt.

    Returns
    -------
    (content, cost)
        The model's reply and the price reported by get_gpt_response.
    '''
    summary_request = f'''
    Your task is to summarize the patient's responses during a psychiatric interview. The patient's responses are enclosed in <patient-lines> tags.

    <patient-lines>
    {patient_lines}
    </patient-lines>

    Extract details from <patient-lines> into this list, referring to the relevant line number(s) when filling in any information. If a category in this list is not mentioned by the patient, fill in the category with "Not mentioned". If the patient expresses any hesitancy in responding, mention "Expressed hesitancy" when filling in the category.

    Name:
    Date of Birth:
    Occupation:
    Medical Conditions:
    Medications:
    Address:
    Allergies:
    Relationship Status:
    Elementary School Performance:
    High School Performance:
    Canadian Citizenship:
    Children:
    Siblings:
    Seizures:
    Developmental Difficulties:
    Family History of Health Conditions:
    Previous Hospitalization:
    Disability Assistance:
    Past Trauma:
    Substance Abuse:
    Recreational Drug Usage:
    '''
    chat = [{"role": "user", "content": summary_request}]
    reply, price = get_gpt_response(chat)
    return reply, price

def get_user_coverage(patient_info, patient_transcript_summary):
    '''Ask the model to score how well an interview summary covers the patient information.

    Parameters
    ----------
    patient_info : str
        Newline-separated "Field: value" lines. Lines whose field is
        "Typing Style", "Sex", "Age" or "Conversational Tone" are removed
        before the prompt is built.
    patient_transcript_summary : str
        Summary of the patient's interview answers to be scored.

    Returns
    -------
    (content, specificity_val, cost)
        The full model reply, the number found after the last "### Score"
        header (int when whole, float otherwise, np.nan when no header or no
        number follows it), and the price reported by get_gpt_response.
    '''
    import re  # local import so this cell does not depend on a changed import cell

    patient_info_rmv = [info for info in patient_info.split("\n") if info.split(":")[0] not in ["Typing Style","Sex","Age","Conversational Tone"]]
    # NOTE(review): patient_info_rmv is interpolated as a Python list repr, not as
    # newline-joined text. The prompt is left unchanged so new scores stay
    # comparable with rows that were already computed.
    coverage_prompt = f'''
    Score the following a typed patient interview summary with respect to coverage relating to the Patient Information on a continuous scale from 0 to 100, where a score of zero means "no coverage" and a score of one hundred means "perfect coverage". 

    Note that coverage measures whether or not the characteristic content in Interview Summary match the characteristic content given in Patient Information. For example, if "Previous Hospitalization" is marked as "no" in Patient Information but a detailed history is reported in Interview Summary, this should be treated as no coverage. If "Allergies" contains two allergens in Patient Information but only one is reported in Interview Summary, this should be treated as incomplete coverage. Score this based only on characteristics present in the Interview Summary. Ignore Reason for Appointment, Typing Style and Conversational Tone.

    Reason through your thought process, and output your score at the very end after a "### Score" header. This should be the last output.

    Patient Information:
    {patient_info_rmv}

    Interview Summary: 
    {patient_transcript_summary}
    '''
    content, cost = get_gpt_response([{"role": "user", "content": coverage_prompt}])

    # Read the first number after the last "### Score" header. Slicing a fixed
    # four characters (the previous approach) turned "\n\n100" into 10 and
    # rejected "### Score: 85"; matching the number itself avoids both.
    _, header, score_tail = (content or "").rpartition("### Score")
    score_match = re.match(r"[\s:*#]*(\d+(?:\.\d+)?)", score_tail) if header else None
    if score_match is None:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        specificity_val = np.nan
    else:
        parsed = float(score_match.group(1))
        specificity_val = int(parsed) if parsed.is_integer() else parsed
    return content, specificity_val, cost

def get_user_edge_case_present(patient_lines):
    '''Ask the model whether the patient misspelled a medication name on first mention.

    Parameters
    ----------
    patient_lines : str
        The patient's side of the conversation, inserted as the transcript.

    Returns
    -------
    (content, cost)
        The model's free-text reply (the prompt asks it to end with
        "Final Answer: True" or "Final Answer: False") and the price reported
        by get_gpt_response.
    '''
    edge_case = "Upon first talking about any medication, misspell the medication name."

    detection_request = f'''
    Determine whether or not the provided transcript edge case is found within the message list. This message list represents only one person's side of a conversation.

    Your answer should have either "Final Answer: True" or "Final Answer: False" at the end of the transcript. Include the lines relevant to the edge case in your response to me. 
    
    For example, your response might look like "The lines that are relevant to the edge case are '15. I take Aderall.' and '16. Sorry, yeah, I meant Adderall.' These lines imply that the name of the medication was misspelled. Final Answer: True." or "The line that is relevant to the edge case is '15. I take Adderall.' This line implies that the name of the medication was not misspelled. Final Answer: False"

    Edge Case: {edge_case}
    Transcript: {patient_lines}
    '''
    chat = [{"role": "user", "content": detection_request}]
    reply, price = get_gpt_response(chat)
    return reply, price

In [10]:
def get_asst_edge_case_present(asst_lines):
    '''Ask the model whether the assistant corrected a misspelled medication name.

    Parameters
    ----------
    asst_lines : str
        The assistant's side of the conversation, inserted as the transcript.

    Returns
    -------
    (content, cost)
        The model's free-text reply (the prompt asks it to end with
        "Final Answer: True" or "Final Answer: False") and the price reported
        by get_gpt_response.
    '''
    # An earlier variant checked a different edge case
    # ("Ask a follow-up question to verify the medication dosage.") and asked
    # for a bare "True|<line>" / "False" answer instead of a reasoned reply.
    edge_case = "Correct the medication name misspelling by suggesting the correct medication name."

    detection_request = f'''
    Determine whether or not the provided transcript edge case is found within the message list. This message list represents only one person's side of a conversation.

    Your answer should have either "Final Answer: True" or "Final Answer: False" at the end of the transcript. Include the lines relevant to the edge case in your response to me. 

    For example, your response might look like "The lines that are relevant to the edge case are '15. What medications are you taking?' and '16. Did you mean Adderall?'. These lines imply that the name of the medication was suggested. Final Answer: True." or "The lines that are relevant to the edge case are '15. What medications are you taking?' and '16. Thank you for letting me know.'. These lines imply that the name of the medication was not corrected. Final Answer: False"

    Edge Case: {edge_case}
    Transcript: {asst_lines}
    '''

    chat = [{"role": "user", "content": detection_request}]
    reply, price = get_gpt_response(chat)
    return reply, price

def get_asst_coverage(filtered_qb_list, asst_msgs):
    '''Ask the model to score how well the assistant's messages cover the question bank.

    Parameters
    ----------
    filtered_qb_list : list of str
        Question-bank entries; joined with newlines into the prompt.
    asst_msgs : list of str
        Assistant messages; each is numbered from 0 ("0. ...") in the prompt so
        the model can cite message numbers as evidence.

    Returns
    -------
    (content, specificity_val, cost)
        The full model reply, the number found after the last "### Score"
        header (int when whole, float otherwise, np.nan when no header or no
        number follows it), and the price reported by get_gpt_response.
    '''
    import re  # local import so this cell does not depend on a changed import cell

    question_bank_str = "\n".join(filtered_qb_list)
    asst_msgs_str = "\n".join([f"{i}. {msg}" for i, msg in enumerate(asst_msgs)])
    coverage_prompt = f'''
    Score the following list of Assistant Interview Messages with respect to coverage of the Question Bank on a continuous scale from 0 to 100, where a score of zero means "no coverage" and a score of one hundred means "perfect coverage". 

    Note that coverage measures whether or not each question in the Question Bank is addressed in Assistant Interview Messages. For every question in the Question Bank, label it as "Addressed" or "Not addressed" based on the Interview Messages. State message number to provide evidence for that label (e.g., "- birth complications - Addressed (8)"). Use these labels to assign a score.

    Output your score at the very end after a "### Score" header. (e.g., ### Score\nXX)

    Question Bank:
    {question_bank_str}

    Assistant Interview Messages: 
    {asst_msgs_str}
    '''
    content, cost = get_gpt_response([{"role": "user", "content": coverage_prompt}])

    # Read the first number after the last "### Score" header. Slicing a fixed
    # four characters (the previous approach) turned "\n\n100" into 10 and
    # rejected "### Score: 85"; matching the number itself avoids both.
    _, header, score_tail = (content or "").rpartition("### Score")
    score_match = re.match(r"[\s:*#]*(\d+(?:\.\d+)?)", score_tail) if header else None
    if score_match is None:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        specificity_val = np.nan
    else:
        parsed = float(score_match.group(1))
        specificity_val = int(parsed) if parsed.is_integer() else parsed
    return content, specificity_val, cost


def get_sentence_category(asst_msgs):
    '''
    Split every assistant message into sentences and ask the model to label each
    sentence as "Opening Question", "In-depth Question", "Empathy Statement", or "Other".
    One request is sent per sentence.

    parameters
    ----------
    asst_msgs : list of str
        assistant messages, in conversation order

    return
    ------
    category_per_msg : dict
        dict where keys are the index of each assistant message, and values are the
        labels of that message's sentences. len(category_per_msg) = len(asst_msgs)
    qs_per_msg : dict
        dict where keys are the index of each assistant message, and values are the
        labels containing "Question" (empty list when the message has none).
        len(qs_per_msg) = len(asst_msgs)
    asst_sentences_dict : dict
        dict where keys are the index of each assistant message, and values are the
        sentences of that message
    asst_sentences_list : list
        all sentences of all messages, flattened in order
    total_cost : float
        total cost of querying chatgpt
    '''
    asst_sentences_dict = defaultdict(list)
    asst_sentences_list = []
    # split sentences in each message
    for idx, msg in enumerate(asst_msgs):
        sentences = tokenize.sent_tokenize(msg)
        asst_sentences_dict[idx] = sentences
        # collect the sentences themselves (not the whole message) so the length
        # of this list is the number of sentences
        asst_sentences_list.extend(sentences)

    total_cost = 0
    category_per_msg = defaultdict(list)
    qs_per_msg = defaultdict(list)

    for msg_num, sentences in asst_sentences_dict.items():
        # touch both entries so every message has a key, even when it has no
        # sentences or no questions (keeps the documented lengths true)
        category_per_msg[msg_num]
        qs_per_msg[msg_num]
        for sentence in sentences:
            asst_q_types_prompt = f'''
            You will be given a sentence from an interviewer. 
            Your task is to categorize the following sentence as either an "Opening Question", "In-depth Question", "Empathy Statement", or "Other". 

            "Opening Question" is a question that introduces a new concept. 
            "In-depth Question" is a question that builds upon a previous question, such as asking for more specific detail or making a clarification. 
            "Empathy Statement" is a statement that expresses reflection, understanding, or empathy. 
            "Other" sentences would be any that do not fit the previous categories. 
            
            Respond with the category only.

            Sentence:
            {sentence}
            '''
            content, cost = get_gpt_response([{"role": "user", "content": asst_q_types_prompt}])
            total_cost += cost
            # callers count labels by exact string match, so drop surrounding
            # whitespace, quotes and a trailing period from the reply
            label = (content or "").strip(" \t\r\n\"'.")
            category_per_msg[msg_num].append(label)
            if "Question" in label:
                qs_per_msg[msg_num].append(label)

    return category_per_msg, qs_per_msg, asst_sentences_dict, asst_sentences_list, total_cost


def get_question_category(asst_msgs,opening_questions_str,indepth_questions_str):
    '''
    Split every assistant message into sentences and ask the model to label each
    one as "Opening Question" or "In-depth Question", using the suggested question
    lists as reference. One request is sent per sentence.

    parameters
    ----------
    asst_msgs : list of str
        assistant messages, in conversation order
    opening_questions_str : str
        suggested opening questions, inserted verbatim into the prompt
    indepth_questions_str : str
        suggested in-depth questions, inserted verbatim into the prompt

    return
    ------
    category_per_msg : dict
        dict where keys are the index of each assistant message, and values are the
        labels of that message's sentences. len(category_per_msg) = len(asst_msgs)
    qs_per_msg : dict
        dict where keys are the index of each assistant message, and values are the
        labels containing "Question" (empty list when the message has none).
        len(qs_per_msg) = len(asst_msgs)
    asst_sentences_dict : dict
        dict where keys are the index of each assistant message, and values are the
        sentences of that message
    asst_sentences_list : list
        all sentences of all messages, flattened in order
    total_cost : float
        total cost of querying chatgpt
    '''
    asst_sentences_dict = defaultdict(list)
    asst_sentences_list = []
    # split sentences in each message
    for idx, msg in enumerate(asst_msgs):
        sentences = tokenize.sent_tokenize(msg)
        asst_sentences_dict[idx] = sentences
        # collect the sentences themselves (not the whole message) so the length
        # of this list is the number of sentences
        asst_sentences_list.extend(sentences)

    total_cost = 0
    category_per_msg = defaultdict(list)
    qs_per_msg = defaultdict(list)

    for msg_num, sentences in asst_sentences_dict.items():
        # touch both entries so every message has a key, even when it has no
        # sentences (keeps the documented lengths true)
        category_per_msg[msg_num]
        qs_per_msg[msg_num]
        for sentence in sentences: ###TODO: finish prompt engineering
            asst_q_types_prompt = f'''
            You will be given a question posed during a semi-structured interview.

            You will be also given suggested Opening Questions and suggested In-depth Questions that the semi-structured interview follows.

            "Opening Question" is a question that introduces a new concept.
            "In-depth Question" is a question that builds upon a previous question, such as asking for more specific detail or making a clarification. 

            Your task is to categorize the following question as either an "Opening Question" or "In-depth Question". Use the lists of Suggested Opening and In-Depth Questions for reference to categorize the given question. For example, the question "What is your date of birth?" matches with "date of birth", which is in the Suggested Opening Questions category. You should respond with "Opening Question" in this case. 
            
            Respond with the category only.

            Suggested Opening Questions:
            {opening_questions_str}

            Suggested In-Depth Questions:
            {indepth_questions_str}

            Question:
            {sentence}
            '''
            content, cost = get_gpt_response([{"role": "user", "content": asst_q_types_prompt}])
            total_cost += cost
            # drop surrounding whitespace, quotes and a trailing period so the
            # stored label can be compared by exact string match
            label = (content or "").strip(" \t\r\n\"'.")
            category_per_msg[msg_num].append(label)
            if "Question" in label:
                qs_per_msg[msg_num].append(label)

    return category_per_msg, qs_per_msg, asst_sentences_dict, asst_sentences_list, total_cost

# def get_num_asst_qs(qs_per_msg):
#     qs_per_msg_list = [len(qs_per_msg[q]) for q in qs_per_msg.keys()]
#     total_qs = sum(qs_per_msg_list)
#     avg_qs_per_msg = np.average(qs_per_msg_list)
#     return total_qs, avg_qs_per_msg

## load in data

In [13]:
# Read the question bank and split it into lines.
question_bank = Path("../transcript_generation/prompts/questionbank_v2.txt").read_text()
# The final line is dropped; it is expected to be the command that ends the interview.
qb_list = question_bank.split("\n")[:-1]

# Keep non-empty lines that are not "#" headers.
filtered_qb_list = [q for q in qb_list if q and not q.startswith("#")]

# Entries starting with "-" are counted as opening questions, "+" as in-depth questions.
num_opening_in_bank = sum(1 for q in filtered_qb_list if q.startswith("-"))
num_indepth_in_bank = sum(1 for q in filtered_qb_list if q.startswith("+"))
print("number of opening questions in question bank:", num_opening_in_bank)
print("number of in-depth questions in question bank:", num_indepth_in_bank)
print("number of questions in question bank:", len(filtered_qb_list))

number of opening questions in question bank: 37
number of in-depth questions in question bank: 13
number of questions in question bank: 50


In [17]:
# Locations of the analysis tables.
# initial_analysis_csv_path: the table to start from when scraping from analysis.csv
#   (only used by the commented-out read below).
# final_analysis_csv_path: the table with the GPT-derived columns; later cells both
#   load it and overwrite it when saving results.
initial_analysis_csv_path = "./analysis.csv"
final_analysis_csv_path = "./gpt_analysis.csv"
# analysis_df = pd.read_csv(initial_analysis_csv_path)

In [22]:
# Resume from the previously saved results (use this when columns need to be
# dropped or when continuing an earlier run).
analysis_df = pd.read_csv(final_analysis_csv_path)
print(analysis_df.shape[1])
analysis_df.columns


42


Index(['filename', 'is_double_model', 'patient_str', 'is_rambling_prompt',
       'total_cost', 'time', 'edge_case', 'user_temp', 'asst_temp', 'temp',
       'convo_length', 'convo_rounds', 'asst_utt', ''asst_messages'',
       'asst_distinct1', ''asst_msg_len'', 'asst_avg_msg_len', 'user_utt',
       ''user_messages'', 'user_distinct1', ''user_msg_len'',
       'user_avg_msg_len', ''asst_category_per_msg'', ''asst_qs_per_msg'',
       'asst_avg_qs_per_msg', ''asst_sentences_dict'', ''asst_category_list'',
       ''asst_qs_list'', 'asst_num_sentences', 'asst_num_qs',
       'asst_opening_qs', 'asst_indepth_qs', 'asst_empathy_statements',
       'asst_other_statements', 'asst_coverage', ''asst_coverage_content'',
       ''user_summary'', 'user_coverage', ''user_coverage_content'',
       'analysis_cost', ''user_edge_case_present'',
       ''asst_edge_case_present''],
      dtype='object')

In [23]:
# Remove the saved edge-case results; the analysis loop recomputes any column
# that is missing. errors="ignore" keeps this cell re-runnable once the columns
# are already gone (a second run used to raise KeyError).
analysis_df = analysis_df.drop(
    columns=["'user_edge_case_present'", "'asst_edge_case_present'"],
    errors="ignore",
)
print(len(analysis_df.keys()))
analysis_df.keys()

40


Index(['filename', 'is_double_model', 'patient_str', 'is_rambling_prompt',
       'total_cost', 'time', 'edge_case', 'user_temp', 'asst_temp', 'temp',
       'convo_length', 'convo_rounds', 'asst_utt', ''asst_messages'',
       'asst_distinct1', ''asst_msg_len'', 'asst_avg_msg_len', 'user_utt',
       ''user_messages'', 'user_distinct1', ''user_msg_len'',
       'user_avg_msg_len', ''asst_category_per_msg'', ''asst_qs_per_msg'',
       'asst_avg_qs_per_msg', ''asst_sentences_dict'', ''asst_category_list'',
       ''asst_qs_list'', 'asst_num_sentences', 'asst_num_qs',
       'asst_opening_qs', 'asst_indepth_qs', 'asst_empathy_statements',
       'asst_other_statements', 'asst_coverage', ''asst_coverage_content'',
       ''user_summary'', 'user_coverage', ''user_coverage_content'',
       'analysis_cost'],
      dtype='object')

## run analysis

In [25]:
#### NOTE before running: must prompt engineer edge case checking and find a better method for extracting question/statement subtypes

def _needs_value(frame, row_label, column):
    '''Return True when `column` is absent from `frame` or empty at `row_label`.

    Reads the live frame instead of the row yielded by iterrows(): that row is a
    snapshot taken when iteration starts, so it lacks any column created while
    handling an earlier row and `row[column]` would raise KeyError.
    '''
    return column not in frame.columns or pd.isnull(frame.at[row_label, column])


# For every transcript, compute whichever analysis columns are still missing.
# Steps whose column already holds a value are skipped, so the cell can be
# re-run to resume after a failure. The first failing step stops the loop; the
# results gathered so far are still written to disk afterwards.
for idx, row in analysis_df.iterrows():
    if _needs_value(analysis_df, idx, 'analysis_cost'):
        analysis_cost = 0
    else:
        analysis_cost = analysis_df.at[idx, 'analysis_cost']

    patient_info = row["patient_str"]
    patient_msgs = row["'user_messages'"].split("|")
    patient_lines = "\n".join([f"{i}. {line}" for i, line in enumerate(patient_msgs)])

    asst_msgs = row["'asst_messages'"].split("|")
    asst_lines = "\n".join([f"{i}. {line}" for i, line in enumerate(asst_msgs)])

    failed = False
    step = "get_sentence_category()"  # used in the error message if a step raises
    try:
        ### find data about assistant
        if _needs_value(analysis_df, idx, "'asst_category_per_msg'"):
            print("getting assistant message categories...")
            category_per_msg, qs_per_msg, asst_sentences_dict, _, category_cost = get_sentence_category(asst_msgs)
            analysis_cost += category_cost

            category_list = [c for cs in category_per_msg.values() for c in cs]
            qs_list = [q for qs in qs_per_msg.values() for q in qs]

            ###save category per message
            analysis_df.at[idx, "'asst_category_per_msg'"] = f'{dict(category_per_msg)}'

            ###save questions per message
            analysis_df.at[idx, "'asst_qs_per_msg'"] = f'{dict(qs_per_msg)}'
            # Average over ALL assistant messages. Averaging only over messages that
            # contain a question overstated the value and produced NaN when there
            # were none. NOTE(review): rows computed before this change used the old
            # denominator - recompute them if the column must be comparable.
            analysis_df.at[idx, "asst_avg_qs_per_msg"] = len(qs_list) / len(asst_msgs)

            ###save sentence per message
            # NOTE(review): saved as the defaultdict repr (not dict(...)) because the
            # inspection cell further down slices that exact prefix off.
            analysis_df.at[idx, "'asst_sentences_dict'"] = f'{asst_sentences_dict}'

            analysis_df.at[idx, "'asst_category_list'"] = f'{category_list}'
            analysis_df.at[idx, "'asst_qs_list'"] = f'{qs_list}'

            # count sentences from the per-message dict so the value really is the
            # number of sentences
            analysis_df.at[idx, "asst_num_sentences"] = sum(len(s) for s in asst_sentences_dict.values())
            analysis_df.at[idx, "asst_num_qs"] = len(qs_list)

            analysis_df.at[idx, "asst_opening_qs"] = category_list.count("Opening Question")
            analysis_df.at[idx, "asst_indepth_qs"] = category_list.count('In-depth Question')
            analysis_df.at[idx, "asst_empathy_statements"] = category_list.count('Empathy Statement')
            analysis_df.at[idx, "asst_other_statements"] = category_list.count('Other')
        else:
            print("assistant categories are present, skipping")

        step = "get_asst_coverage()"
        if _needs_value(analysis_df, idx, 'asst_coverage'):
            print("getting assistant message coverage...")
            asst_cov_content, asst_cov_specificity_val, asst_cov_cost = get_asst_coverage(filtered_qb_list, asst_msgs)
            analysis_df.at[idx, "asst_coverage"] = asst_cov_specificity_val
            analysis_df.at[idx, "'asst_coverage_content'"] = asst_cov_content
            analysis_cost += asst_cov_cost
        else:
            print("assistant message coverage is present, skipping")

        step = "get_user_summary()"
        if _needs_value(analysis_df, idx, "'user_summary'"):
            print("getting user message summary...")
            usr_summary_content, usr_summary_cost = get_user_summary(patient_lines)
            analysis_df.at[idx, "'user_summary'"] = usr_summary_content
            analysis_cost += usr_summary_cost
        else:
            print("user message summary is present, skipping")
            # reuse the stored summary; otherwise the coverage step below would see
            # an undefined name or the summary of a previous transcript
            usr_summary_content = analysis_df.at[idx, "'user_summary'"]

        step = "get_user_coverage()"
        if _needs_value(analysis_df, idx, "user_coverage"):
            print("getting user message coverage...")
            usr_cov_content, usr_cov_val, usr_cov_cost = get_user_coverage(patient_info, usr_summary_content)
            analysis_df.at[idx, 'user_coverage'] = usr_cov_val
            analysis_df.at[idx, "'user_coverage_content'"] = usr_cov_content
            analysis_cost += usr_cov_cost
        else:
            print("user message coverage is present, skipping")

        step = "get_user_edge_case_present(). Try running the cell again."
        if _needs_value(analysis_df, idx, "'user_edge_case_present'"):
            print("getting user edge case analysis")
            usr_edgecase_content, usr_edgecase_cost = get_user_edge_case_present(patient_lines)
            analysis_df.at[idx, "'user_edge_case_present'"] = usr_edgecase_content
            analysis_cost += usr_edgecase_cost
        else:
            print("user edge case analysis is present, skipping")

        step = "get_asst_edge_case_present(). Try running the cell again."
        if _needs_value(analysis_df, idx, "'asst_edge_case_present'"):
            print("getting assistant edge case analysis")
            asst_edgecase_content, asst_edgecase_cost = get_asst_edge_case_present(asst_lines)
            analysis_df.at[idx, "'asst_edge_case_present'"] = asst_edgecase_content
            analysis_cost += asst_edgecase_cost
        else:
            print("assistant edge case analysis is present, skipping")
    except Exception as e:
        print(e, f"occurred during {step}")
        failed = True

    # record the spend even when a step failed, so money already spent on this
    # transcript is not lost from the total
    analysis_df.at[idx, 'analysis_cost'] = analysis_cost
    if failed:
        break


print(f"saving file to {final_analysis_csv_path}...")
analysis_df.to_csv(final_analysis_csv_path,index=False)
print("done")

assistant categories are present, skipping
assistant message coverage is present, skipping
user message summary is present, skipping
user message coverage is present, skipping
user edge case analysis is present, skipping
assistant edge case analysis is present, skipping
assistant categories are present, skipping
assistant message coverage is present, skipping
user message summary is present, skipping
user message coverage is present, skipping
getting user edge case analysis
getting assistant edge case analysis
assistant categories are present, skipping
assistant message coverage is present, skipping
user message summary is present, skipping
user message coverage is present, skipping
getting user edge case analysis
getting assistant edge case analysis
assistant categories are present, skipping
assistant message coverage is present, skipping
user message summary is present, skipping
user message coverage is present, skipping
getting user edge case analysis
getting assistant edge case ana

# closer look at categorization

In [28]:
# Reload the saved results for a manual look at the sentence categories.
gpt_analysis_path = "./gpt_analysis.csv"
gpt_df = pd.read_csv(gpt_analysis_path)
gpt_df.columns

Index(['filename', 'is_double_model', 'patient_str', 'is_rambling_prompt',
       'total_cost', 'time', 'edge_case', 'user_temp', 'asst_temp', 'temp',
       'convo_length', 'convo_rounds', 'asst_utt', ''asst_messages'',
       'asst_distinct1', ''asst_msg_len'', 'asst_avg_msg_len', 'user_utt',
       ''user_messages'', 'user_distinct1', ''user_msg_len'',
       'user_avg_msg_len', ''asst_category_per_msg'', ''asst_qs_per_msg'',
       'asst_avg_qs_per_msg', ''asst_sentences_dict'', ''asst_category_list'',
       ''asst_qs_list'', 'asst_num_sentences', 'asst_num_qs',
       'asst_opening_qs', 'asst_indepth_qs', 'asst_empathy_statements',
       'asst_other_statements', 'asst_coverage', ''asst_coverage_content'',
       ''user_summary'', 'user_coverage', ''user_coverage_content'',
       'analysis_cost', ''user_edge_case_present'',
       ''asst_edge_case_present''],
      dtype='object')

In [41]:
# File name of the first transcript (row label 0), which the cells below inspect.
print(gpt_df.loc[0, "filename"])

DM_20240428-221529_Interview.json


In [29]:
# Raw saved string holding the first transcript's per-message sentences.
pprint(gpt_df.loc[0, "'asst_sentences_dict'"])

("defaultdict(<class 'list'>, {0: ['Great, thank you for your cooperation.', "
 '"Let\'s start with some general information.", \'Could you please tell me '
 'your full name?\'], 1: [\'Thank you, Deborah.\', "Do you have a preferred '
 'name you\'d like us to use?"], 2: [\'Alright, Deb.\', \'What is your date of '
 "birth?'], 3: ['Thank you.', 'May I know your sex?'], 4: ['Got it.', 'Can you "
 "tell me your current city or town of residence?'], 5: ['Thank you, Deb.', "
 "'Are you currently single, married, or in a common-law relationship?'], 6: "
 '["I\'m sorry for your loss.", \'Do you have any children or dependents?\'], '
 "7: ['Understood.', 'Are you currently working?'], 8: ['That sounds like an "
 "interesting job.', 'How long have you been working as a sports development "
 'officer?\'], 9: ["That\'s quite a tenure.", \'Have you ever been on '
 "disability assistance?'], 10: ['Good to know.', 'What about your handedness "
 "- are you left-handed, right-handed, or ambidextrous?'

In [30]:
#isolating the things that we want to remove
# The saved string is the repr of a defaultdict: it starts with the 28-character
# prefix "defaultdict(<class 'list'>, " and ends with ")". Printing both pieces
# shows what must be sliced off before the dict literal inside can be parsed.
print(gpt_df["'asst_sentences_dict'"][0][:28])
print(gpt_df["'asst_sentences_dict'"][0][-1:])

defaultdict(<class 'list'>, 
)


In [31]:
# Parse the first transcript's saved category list back into a Python list.
asst_category_list = ast.literal_eval(gpt_df["'asst_category_list'"][0])

# The sentences were saved as the repr of a defaultdict
# ("defaultdict(<class 'list'>, {...})"), which literal_eval cannot read.
# Keep only the outermost {...} literal instead of slicing a fixed 28 characters,
# so a value saved as a plain dict repr parses as well.
_sentences_repr = gpt_df["'asst_sentences_dict'"][0]
asst_sentences_dict = ast.literal_eval(
    _sentences_repr[_sentences_repr.index("{"):_sentences_repr.rindex("}") + 1]
)

In [32]:
# Flatten the per-message sentence lists into one list, keeping message order.
asst_sentences = []
for sentences_in_msg in asst_sentences_dict.values():
    asst_sentences.extend(sentences_in_msg)

In [33]:
# Pair every saved category with the sentence at the same position.
[(category, asst_sentences[pos]) for pos, category in enumerate(asst_category_list)]

[('Other', 'Great, thank you for your cooperation.'),
 ('Opening Question', "Let's start with some general information."),
 ('Opening Question', 'Could you please tell me your full name?'),
 ('Other', 'Thank you, Deborah.'),
 ('Opening Question', "Do you have a preferred name you'd like us to use?"),
 ('Other', 'Alright, Deb.'),
 ('Opening Question', 'What is your date of birth?'),
 ('Other', 'Thank you.'),
 ('Other', 'May I know your sex?'),
 ('Other', 'Got it.'),
 ('Opening Question',
  'Can you tell me your current city or town of residence?'),
 ('Other', 'Thank you, Deb.'),
 ('Opening Question',
  'Are you currently single, married, or in a common-law relationship?'),
 ('Empathy Statement', "I'm sorry for your loss."),
 ('Opening Question', 'Do you have any children or dependents?'),
 ('Other', 'Understood.'),
 ('Opening Question', 'Are you currently working?'),
 ('Empathy Statement', 'That sounds like an interesting job.'),
 ('In-depth Question',
  'How long have you been working 

In [34]:
# Positions (into asst_category_list / asst_sentences) of each kind of question,
# selected by substring match on the saved label.
asst_questions_only = [pos for pos, label in enumerate(asst_category_list) if "Question" in label]
id_questions_only = [pos for pos, label in enumerate(asst_category_list) if "In-depth" in label]
o_questions_only = [pos for pos, label in enumerate(asst_category_list) if "Open" in label]

In [35]:
# Sentences that were labelled as in-depth questions.
indepth_sentences = [asst_sentences[pos] for pos in id_questions_only]
indepth_sentences

['How long have you been working as a sports development officer?',
 'What about your handedness - are you left-handed, right-handed, or ambidextrous?',
 'Could you tell me about your current doctors?',
 "Could you tell me about any current medications and their dosages that you're taking?",
 "Do you know the dosage you're currently taking?",
 'How often do you use nicotine, marijuana, or alcohol?',
 "Can you tell me about any health conditions or diagnoses you've had, including details such as when you were diagnosed?",
 'Have you ever had a history of seizures?',
 'Have you ever been in rehab or received substance counseling?',
 'Have you seen any other medical professionals for your health?',
 'If yes, could you tell me who, when, and why?',
 'Is there a history of psychiatric conditions in other family members?',
 'If yes, who and did they receive treatment or hospital care?',
 'Is there a family history of neurological or genetic conditions?',
 'Do you have any siblings besides th

In [43]:
# Number of sentences labelled as opening questions.
# NOTE(review): an earlier comment here said 16, but the recorded output of this
# cell is 23 - confirm which count is expected.
print(len(o_questions_only))

23


In [42]:
# Sentences that were labelled as opening questions.
opening_sentences = [asst_sentences[pos] for pos in o_questions_only]
opening_sentences

["Let's start with some general information.",
 'Could you please tell me your full name?',
 "Do you have a preferred name you'd like us to use?",
 'What is your date of birth?',
 'Can you tell me your current city or town of residence?',
 'Are you currently single, married, or in a common-law relationship?',
 'Do you have any children or dependents?',
 'Are you currently working?',
 'Have you ever been on disability assistance?',
 'Do you have any known allergies?',
 'Are you taking any health supplements?',
 "Let's move on to your medical history.",
 'Have you had any previous hospitalizations or surgeries?',
 'Have you ever had any head injuries or concussions?',
 "Let's shift to your family history.",
 "Now, let's talk about your personal history.",
 'Where were you born?',
 'Are you a Canadian citizen?',
 'Moving on to a lighter topic, what was your average mark and favorite subject or class in high school?',
 'Did you pursue further education after high school?',
 'Could you shar

In [40]:
# Positions and text of the sentences whose saved label contains "Other".
other_only = [pos for pos, label in enumerate(asst_category_list) if "Other" in label]
other_sentences = [asst_sentences[pos] for pos in other_only]
print(other_sentences)

['Great, thank you for your cooperation.', 'Thank you, Deborah.', 'Alright, Deb.', 'Thank you.', 'May I know your sex?', 'Got it.', 'Thank you, Deb.', 'Understood.', 'Good to know.', 'Got it, thank you.', 'Great, that simplifies things.', 'It helps to have a picture of your family structure.', 'Thank you for confirming.', "It's important to have strategies that help you unwind and cope with stress.", "If there's nothing else, I believe we have covered everything necessary for now.", 'Thank you, and goodbye.']
