In [None]:
!python3 -m venv sdoh-venv
!source ./sdoh-venv/bin/activate
!pip install -r requirements.txt

In [None]:
# Name of the method being run; used below for the log file name and the results directory.
METHOD_NAME = "llm_multi_stage"
# Dataset variant to run on; one of "crisis" or "categorical".
DATA_TYPE = "crisis"
# Input CSV for the selected dataset variant.
DATA_CSV_PATH = f"/data/sampled_balanced_{DATA_TYPE}_data.csv"
# JSON file mapping each factor name to its definition. The constant says CRISIS,
# but the path follows DATA_TYPE, so it also points at the "categorical" definitions
# when that variant is selected.
CRISIS_DEFINITION_JSON_PATH = f"/data/{DATA_TYPE}_data_definition.json"

# Initialize the LLM client (Azure OpenAI, gpt-35-turbo)

In [None]:
import os
from openai import AzureOpenAI
import pandas as pd 
import json 
import logging 
# Send INFO-and-above log records to "<METHOD_NAME>.log"; filemode='w' truncates any
# log left over from a previous run.
logging.basicConfig(level=logging.INFO, filename=f"{METHOD_NAME}.log", filemode='w')

# Read the Azure OpenAI credentials from the environment instead of typing them into the
# notebook, so secrets never end up in the saved file. A missing variable raises KeyError
# right here rather than surfacing later as a failed request.
client = AzureOpenAI(
  api_key = os.environ["AZURE_OPENAI_API_KEY"],
  api_version = "2024-05-01-preview",
  azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
)

# Load data

In [4]:
# Evaluation table: one row per (person, factor) pair, read from the configured CSV.
df = pd.read_csv(DATA_CSV_PATH)

# Factor definitions: parsed from the configured JSON file.
with open(CRISIS_DEFINITION_JSON_PATH, 'r') as definition_file:
    crisis_definition_dict = json.load(definition_file)

In [6]:
# Names of all factors to process, in the order they appear in the definition file.
CRISIS_LIST = list(crisis_definition_dict.keys())

# Stage 1 - Relevance Extraction

In [8]:
from json.decoder import JSONDecodeError
from retry import retry

def parse_llm_stage1_response_content(response_content):
    """Extract the list of relevant sentences from a stage-1 LLM reply.

    Args:
        response_content: JSON text expected to contain the key
            "Relevant Descriptions".

    Returns:
        A list of sentences. A JSON ``null`` or a blank string yields an empty
        list, and a single string is wrapped in a one-element list.

    Raises:
        json.JSONDecodeError: if ``response_content`` is not valid JSON.
        KeyError: if the "Relevant Descriptions" key is missing.
    """
    payload = json.loads(response_content)
    descriptions = payload['Relevant Descriptions']
    if descriptions is None:
        return []
    if isinstance(descriptions, str):
        # list("some sentence") would split the sentence into single characters,
        # each of which would then be sent to stage 2 as its own "sentence".
        return [descriptions] if descriptions.strip() else []
    return list(descriptions)

class _Stage1ParseError(ValueError):
    """Signals that a stage-1 LLM reply could not be parsed into a list of sentences."""


@retry((JSONDecodeError, ValueError), tries=3, delay=1)
def _request_stage1_relevant_sentences(narrative, crisis_definition):
    """Send one stage-1 request and parse the reply; raise _Stage1ParseError if parsing fails."""
    response = client.chat.completions.create(
        model="azure-gpt-35-turbo-0125",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system",
             "content": "You are a helpful assistant. You will be given a report narrative, and the definition of a social determinants of health factor. Please read the given narrative, and find the sentences that are closely relevant to the factor of interest."},
            {"role": "user",
             "content": f"This is the definition of the factor of interest: {crisis_definition}. This is the report narrative: {narrative}. Output the list of closely relevant sentences, or an empty list if there is no relevant sentence, in the format of a valid JSON payload with keys \"Relevant Descriptions\"."}
        ]
    )
    response_content = response.choices[0].message.content
    try:
        return parse_llm_stage1_response_content(response_content)
    except Exception as e:
        # Show the offending reply, then re-raise as a ValueError subclass so that
        # @retry actually asks the model again instead of the error being swallowed.
        print(response_content)
        raise _Stage1ParseError(f"unparseable stage-1 response: {e!r}") from e


def llm_stage1_relevant_extraction(narrative, crisis_definition):
    """Ask the LLM for the sentences of a narrative that are relevant to a factor.

    The request is attempted up to three times (one second apart) when the reply
    cannot be parsed. Previously the retry decorator never fired, because every
    parsing error was caught inside the decorated function.

    Args:
        narrative: Report narrative text to search.
        crisis_definition: Definition of the factor of interest.

    Returns:
        The list of relevant sentences, or an empty list if no attempt produced a
        parseable reply. Errors raised by the API call itself are not caught here.
    """
    try:
        return _request_stage1_relevant_sentences(narrative, crisis_definition)
    except _Stage1ParseError:
        return []

In [None]:
from tqdm import tqdm 

# Stage 1 driver: for every factor, extract the relevant sentences of each person's
# narrative and save them as one JSON file per factor.

# Create the results directory up front so that a missing directory fails immediately
# rather than after all LLM calls for the first factor have been made.
os.makedirs(f'/{METHOD_NAME}_results', exist_ok=True)

for CRISIS in CRISIS_LIST:
    CRISIS_RESULT_DICT = []  # despite the name, a list with one record per person
    CRISIS_DEFINITION = crisis_definition_dict[CRISIS]
    CRISIS_TEST_DATA = df[df['Target_Class'].isin([CRISIS])].reset_index(drop=True)
    for idx, row in tqdm(CRISIS_TEST_DATA.iterrows(), total=len(CRISIS_TEST_DATA)):
        narrative_cme, narrative_le = row['NarrativeCME'], row['NarrativeLE']
        # Join the available narratives with a space; plain concatenation fused the last
        # sentence of the first narrative with the first sentence of the second.
        narrative_parts = [part for part in (narrative_cme, narrative_le)
                           if not pd.isna(part) and part != ""]
        narrative = " ".join(narrative_parts)

        # Rows without any narrative text are skipped and get no stage-1 record.
        if narrative != "":
            relevant_sentences = llm_stage1_relevant_extraction(narrative, CRISIS_DEFINITION)
            CRISIS_RESULT_DICT.append({'PersonID': row['PersonID'],
                                       'Ground_Truth': row['Target_Class_Value'],
                                       'Stage_1_Relevant_Sentences': relevant_sentences})

    with open(f'/{METHOD_NAME}_results/stage_1_relevant_sentences_{CRISIS}_rerun.json', 'w') as file:
        json.dump(CRISIS_RESULT_DICT, file, indent=2)


# Stage 2 - Relevance Verification

In [11]:
from json.decoder import JSONDecodeError
from retry import retry

def parse_llm_stage2_response_content(response_content):
    """Decode a stage-2 LLM reply and return the value stored under "Relevant".

    Raises json.JSONDecodeError for invalid JSON and KeyError when the key is absent.
    """
    payload = json.loads(response_content)
    return payload['Relevant']

class _Stage2ParseError(ValueError):
    """Signals that a stage-2 LLM reply could not be parsed into a relevance verdict."""


@retry((JSONDecodeError, ValueError), tries=3, delay=1)
def _request_stage2_verdict(sentence, crisis_definition):
    """Send one stage-2 request and parse the reply; raise _Stage2ParseError if parsing fails."""
    response = client.chat.completions.create(
        model="azure-gpt-35-turbo-0125",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system",
             "content": "You are a helpful assistant. You will be given a sentence, and the definition of a social determinants of health factor. Please read the given sentence, and answer if the sentence is describing the factor of interest."},
            {"role": "user",
             "content": f"This is the definition of the factor of interest: {crisis_definition}. This is the sentence: {sentence}. Output True or False in the format of a valid JSON payload with keys \"Relevant\"."}
        ]
    )
    response_content = response.choices[0].message.content
    try:
        return parse_llm_stage2_response_content(response_content)
    except Exception as e:
        # Show the offending reply, then re-raise as a ValueError subclass so that
        # @retry actually asks the model again instead of the error being swallowed.
        print(response_content)
        raise _Stage2ParseError(f"unparseable stage-2 response: {e!r}") from e


def llm_stage2_relevant_verification(sentence, crisis_definition):
    """Ask the LLM whether a single sentence describes the factor of interest.

    The request is attempted up to three times (one second apart) when the reply
    cannot be parsed. Previously the retry decorator never fired, because every
    parsing error was caught inside the decorated function.

    Args:
        sentence: Candidate sentence produced by stage 1.
        crisis_definition: Definition of the factor of interest.

    Returns:
        The value of the "Relevant" key in the model's reply, or False if no
        attempt produced a parseable reply. Errors raised by the API call itself
        are not caught here.
    """
    try:
        return _request_stage2_verdict(sentence, crisis_definition)
    except _Stage2ParseError:
        return False

In [None]:
# Stage 2 driver: re-check every stage-1 sentence individually and keep only those the
# model confirms as relevant; results are saved as one JSON file per factor.
for CRISIS in CRISIS_LIST:
    CRISIS_DEFINITION = crisis_definition_dict[CRISIS]
    with open(f'/{METHOD_NAME}_results/stage_1_relevant_sentences_{CRISIS}_rerun.json', 'r') as file:
        CRISIS_RESULT_DICT = json.load(file)

    CRISIS_STAGE2_RESULT_DICT = []
    for entry in tqdm(CRISIS_RESULT_DICT):
        personid = entry['PersonID']
        ground_truth = entry['Ground_Truth']
        relevant_sentences = entry['Stage_1_Relevant_Sentences']

        # A missing stage-1 result is treated like an empty one: nothing to verify.
        candidates = [] if relevant_sentences is None else relevant_sentences
        verified_relevant_sentences = [
            sentence
            for sentence in candidates
            # The verdict may come back as a bool or as text, hence the string comparison.
            if str(llm_stage2_relevant_verification(sentence, CRISIS_DEFINITION)).lower() == 'true'
        ]

        stage2_record = {'PersonID': personid,
                         'Ground_Truth': ground_truth,
                         'Stage_1_Relevant_Sentences': relevant_sentences,
                         'Stage_2_Relevant_Sentences_Verified': verified_relevant_sentences}
        CRISIS_STAGE2_RESULT_DICT.append(stage2_record)

    with open(f'/{METHOD_NAME}_results/stage_2_relevant_sentences_{CRISIS}_rerun.json', 'w') as file:
        json.dump(CRISIS_STAGE2_RESULT_DICT, file, indent=2)

100%|██████████| 600/600 [09:39<00:00,  1.04it/s]
100%|██████████| 600/600 [10:23<00:00,  1.04s/it]


# Stage 3 - Decision Making

In [14]:
from json.decoder import JSONDecodeError
from retry import retry
from tqdm import tqdm

def parse_llm_stage3_response_content(response_content):
    """Decode a stage-3 LLM reply and return the value stored under "Prediction".

    Raises json.JSONDecodeError for invalid JSON and KeyError when the key is absent.
    """
    payload = json.loads(response_content)
    return payload['Prediction']

class _Stage3ParseError(ValueError):
    """Signals that a stage-3 LLM reply could not be parsed into a prediction."""


@retry((JSONDecodeError, ValueError), tries=3, delay=1)
def _request_stage3_prediction(sentences, crisis_definition):
    """Send one stage-3 request and parse the reply; raise _Stage3ParseError if parsing fails."""
    response = client.chat.completions.create(
        model="azure-gpt-35-turbo-0125",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system",
             "content": "You are a helpful assistant. You will be given a list of sentences, and the definition of a social determinants of health factor. Please read the given sentences, and answer if the factor of interest contributed to the suicide incident."},
            # The prompt wording is kept exactly as it was so results stay comparable.
            {"role": "user",
             "content": f"This is the definition of the factor of interest: {crisis_definition}. This are the sentences: {sentences}. Output True or False in the format of a valid JSON payload with key \"Prediction\"."}
        ]
    )
    response_content = response.choices[0].message.content
    try:
        return parse_llm_stage3_response_content(response_content)
    except Exception as e:
        # Show the offending reply, then re-raise as a ValueError subclass so that
        # @retry actually asks the model again instead of the error being swallowed.
        print(response_content)
        raise _Stage3ParseError(f"unparseable stage-3 response: {e!r}") from e


def llm_stage3_decision_making(sentences, crisis_definition):
    """Ask the LLM whether the factor of interest contributed to the incident.

    The request is attempted up to three times (one second apart) when the reply
    cannot be parsed. Previously the retry decorator never fired, because every
    parsing error was caught inside the decorated function.

    Args:
        sentences: Verified relevant sentences from stage 2.
        crisis_definition: Definition of the factor of interest.

    Returns:
        The value of the "Prediction" key in the model's reply, or False if no
        attempt produced a parseable reply. Errors raised by the API call itself
        are not caught here.
    """
    try:
        return _request_stage3_prediction(sentences, crisis_definition)
    except _Stage3ParseError:
        return False

In [None]:
# Stage 3 driver: turn the verified sentences of each person into a binary prediction
# and save the full per-person trace as one JSON file per factor.
for CRISIS in CRISIS_LIST:
    CRISIS_DEFINITION = crisis_definition_dict[CRISIS]
    with open(f'/{METHOD_NAME}_results/stage_2_relevant_sentences_{CRISIS}_rerun.json', 'r') as file:
        CRISIS_RESULT_DICT = json.load(file)

    CRISIS_STAGE3_RESULT_DICT = []
    for entry in tqdm(CRISIS_RESULT_DICT):
        personid = entry['PersonID']
        ground_truth = entry['Ground_Truth']
        relevant_sentences = entry['Stage_1_Relevant_Sentences']
        verified_relevant_sentences = entry['Stage_2_Relevant_Sentences_Verified']

        # Without any verified sentence the answer is negative and no LLM call is made.
        prediction_value = 0
        if verified_relevant_sentences is not None and len(verified_relevant_sentences) > 0:
            prediction = llm_stage3_decision_making(verified_relevant_sentences, CRISIS_DEFINITION)
            # The reply may be a bool or text, hence the string comparison.
            prediction_value = 1 if str(prediction).lower() == 'true' else 0

        stage3_record = {'PersonID': personid,
                         'Ground_Truth': ground_truth,
                         'Stage_1_Relevant_Sentences': relevant_sentences,
                         'Stage_2_Relevant_Sentences_Verified': verified_relevant_sentences,
                         'Stage_3_Decision_Making': prediction_value}
        CRISIS_STAGE3_RESULT_DICT.append(stage3_record)

    with open(f'/{METHOD_NAME}_results/stage_3_prediction_{CRISIS}_rerun.json', 'w') as file:
        json.dump(CRISIS_STAGE3_RESULT_DICT, file, indent=2)


100%|██████████| 600/600 [03:19<00:00,  3.01it/s]
100%|██████████| 600/600 [03:10<00:00,  3.16it/s]


In [None]:
# Reduce each stage-3 file to the three fields the bootstrap analysis reads
# (PersonID, ground_truth, prediction_value) and save one compact file per factor.
for CRISIS in CRISIS_LIST:
    with open(f'/{METHOD_NAME}_results/stage_3_prediction_{CRISIS}_rerun.json', 'r') as file:
        stage3_results = json.load(file)

    # The ground truth is taken from the value stored with each stage-3 record.
    CRISIS_RESULT_DICT = [
        {'PersonID': entry['PersonID'],
         'ground_truth': entry['Ground_Truth'],
         'prediction_value': entry['Stage_3_Decision_Making']}
        for entry in stage3_results
    ]

    with open(f'/{METHOD_NAME}_results/{CRISIS}_rerun.json', 'w') as file:
        json.dump(CRISIS_RESULT_DICT, file, indent=2)

# Evaluation

In [None]:
from sklearn.metrics import classification_report

# Print a classification report per factor, comparing the stage-3 predictions with the
# ground truth looked up from the source table.
for CRISIS in CRISIS_LIST:
    CRISIS_TEST_DATA = df[df['Target_Class'].isin([CRISIS])].reset_index(drop=True)
    # PersonID -> ground-truth label for this factor (a later row overwrites an earlier
    # one if a PersonID occurs more than once).
    gt_dict = {}
    for idx, row in CRISIS_TEST_DATA.iterrows():
        gt_dict[row['PersonID']] = row['Target_Class_Value']

    # Read the "_rerun" file that the stage-3 cell of this notebook writes; the name
    # without the suffix is never produced here, so it was either missing or stale.
    with open(f'/{METHOD_NAME}_results/stage_3_prediction_{CRISIS}_rerun.json', 'r') as file:
        CRISIS_RESULT_DICT = json.load(file)

    gt_list, pred_list = [], []
    for entry in CRISIS_RESULT_DICT:
        personid = entry['PersonID']
        gt_list.append(gt_dict[personid])
        pred_list.append(entry['Stage_3_Decision_Making'])

    print(f'{CRISIS}')
    print(classification_report(y_pred=pred_list, y_true=gt_list, digits=3))
    print('-------')

# Bootstrap Analysis

In [19]:
import random
def sample_test_data(data):
    """Draw one bootstrap resample (with replacement, same size as ``data``).

    Args:
        data: Sequence of records holding 'prediction_value' and 'ground_truth'.

    Returns:
        A ``(predictions, ground_truths)`` pair of equally long lists, where the
        i-th elements of both lists come from the same sampled record.
    """
    resampled = random.choices(data, k=len(data))
    predictions = [record['prediction_value'] for record in resampled]
    ground_truths = [record['ground_truth'] for record in resampled]
    return predictions, ground_truths

In [None]:
def bootstrap_data(crisis, data, n=1000):
    """Generate ``n`` bootstrap resamples of ``data`` and save them as JSON.

    Args:
        crisis: Factor name; used only in the output file name.
        data: Records accepted by ``sample_test_data``.
        n: Number of resamples to draw.

    Returns:
        A dict mapping the resample index (0..n-1) to
        ``{'prediction': [...], 'ground_truth': [...]}``. In the JSON file the
        integer keys are written as strings.
    """
    bootstrap_dict = {}
    for i in range(n):
        predictions, ground_truths = sample_test_data(data)
        bootstrap_dict[i] = {'prediction': predictions,
                             'ground_truth': ground_truths}

    # Nothing else in the notebook creates this sub-directory, so make sure it exists
    # before writing into it.
    output_dir = f"/{METHOD_NAME}_results/bootstrap_samples"
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/bootstrap_{crisis}_rerun.json", "w") as file:
        json.dump(bootstrap_dict, file, indent=2)

    return bootstrap_dict

In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score

def compute_bootstrapped_f1(bootstrap_dict):
    """Compute macro-averaged precision, recall and F1 for every bootstrap resample.

    Args:
        bootstrap_dict: Mapping whose values hold 'prediction' and 'ground_truth' lists.

    Returns:
        ``(precision_list, recall_list, f1_list)``, each with one score per
        resample, in the iteration order of ``bootstrap_dict``.
    """
    precision_list, recall_list, f1_list = [], [], []
    for resample in bootstrap_dict.values():
        pred = resample['prediction']
        gt = resample['ground_truth']
        precision_list.append(precision_score(y_true=gt, y_pred=pred, average='macro'))
        recall_list.append(recall_score(y_true=gt, y_pred=pred, average='macro'))
        f1_list.append(f1_score(y_true=gt, y_pred=pred, average='macro'))
    return precision_list, recall_list, f1_list

In [None]:
import os 
# Single JSON file that accumulates the mean and interval bounds of every factor and score.
bootstrap_confidence_interval_result_file = f'/{METHOD_NAME}_results/bootstrap_confidence_interval_results_rerun.json'


def write_bootstrap_results_to_file(crisis, score_name, mean, lower, higher):
    """Insert or overwrite one score's summary in the shared results file.

    The file is read if it exists, the entry at ``[crisis][score_name]`` gets its
    'mean', 'ci_lower' and 'ci_higher' fields set, and the whole structure is
    written back. Entries for other factors and scores are left untouched.

    Args:
        crisis: Factor name (first-level key).
        score_name: Metric name (second-level key).
        mean: Value stored under 'mean'.
        lower: Value stored under 'ci_lower'.
        higher: Value stored under 'ci_higher'.
    """
    confidence_interval_dict = {}
    if os.path.exists(bootstrap_confidence_interval_result_file):
        with open(bootstrap_confidence_interval_result_file, 'r') as existing_file:
            confidence_interval_dict = json.load(existing_file)

    # Reuse the nested entries when present so that unrelated fields survive the update.
    crisis_dict = confidence_interval_dict[crisis] if crisis in confidence_interval_dict else {}
    score_dict = crisis_dict[score_name] if score_name in crisis_dict else {}

    score_dict.update({'mean': mean, 'ci_lower': lower, 'ci_higher': higher})
    crisis_dict[score_name] = score_dict
    confidence_interval_dict[crisis] = crisis_dict

    with open(bootstrap_confidence_interval_result_file, 'w') as output_file:
        json.dump(confidence_interval_dict, output_file, indent=2)

In [None]:
import numpy as np
from scipy.stats import bootstrap

# Bootstrap driver: for every factor, resample the predictions, score each resample,
# save the per-resample scores, and record the mean and a 95% percentile interval for
# precision, recall and F1.
for CRISIS in CRISIS_LIST:
    with open(f'/{METHOD_NAME}_results/{CRISIS}_rerun.json', 'r') as file:
        CRISIS_RESULT_VALUE_DICT = json.load(file)

    bootstrap_dict = bootstrap_data(CRISIS, CRISIS_RESULT_VALUE_DICT)
    precision_list, recall_list, f1_list = compute_bootstrapped_f1(bootstrap_dict)
    score_lists = {'precision': precision_list, 'recall': recall_list, 'f1': f1_list}

    # Keep the raw per-resample scores next to the resamples themselves.
    for score_name, score_values in score_lists.items():
        with open(f"/{METHOD_NAME}_results/bootstrap_samples/bootstrap_{score_name}_list_{CRISIS}_rerun.json", "w") as file:
            json.dump(score_values, file, indent=2)

    print(f"---{CRISIS}---")
    for score_name, score_values in score_lists.items():
        data = (score_values,)  # samples must be in a sequence
        # NOTE(review): scipy resamples the list of scores and returns an interval for
        # their *mean*, not for the spread of the scores themselves - confirm that this
        # is the interval that should be reported.
        bootstrap_ci = bootstrap(data, np.mean, confidence_level=0.95, random_state=1, method='percentile')
        if score_name == 'f1':
            print(f"({bootstrap_ci.confidence_interval.low:.4};{bootstrap_ci.confidence_interval.high:.4})")
        # Each score's mean comes from its own list; the recall and F1 entries were
        # previously stored with the mean of the precision list.
        write_bootstrap_results_to_file(CRISIS, score_name, pd.Series(score_values).mean(), bootstrap_ci.confidence_interval.low, bootstrap_ci.confidence_interval.high)
