In [1]:
import os, sys
import random
import time
from dotenv.main import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
openai_api_key = os.environ['OPENAI_API_KEY']

from openai import OpenAI
from pathlib import Path
import glob
import json

import pandas as pd
import numpy as np
import collections

from pprint import pprint

import matplotlib.pyplot as plt 
import seaborn as sns
import scipy
import ast

import nltk
from nltk import tokenize
nltk.download('punkt')

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /Users/ytcao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ytcao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
out_csv_path = "./analysis.csv"

In [4]:
tracking_csv_path = "../transcript_generation/tracking.csv"
tracking_df = pd.read_csv(tracking_csv_path, sep="|")

In [6]:
#load in the relevant files from a directory
dm_transcript_dir= Path("../transcript_generation/transcripts/gpt4_v2/DM")
sm_transcript_dir= Path("../transcript_generation/transcripts/gpt4_v2/SM")
dm_transcript_list=glob.glob(str(dm_transcript_dir)+"/*.json")
sm_transcript_list=glob.glob(str(sm_transcript_dir)+"/*.json")
print(dm_transcript_list)
print(sm_transcript_list)

['../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-221529_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-223120_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-220138_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-222702_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-215539_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-221850_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-222418_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-221238_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-220641_Interview.json', '../transcript_generation/transcripts/gpt4_v2/DM/DM_20240428-220920_Interview.json']
['../transcript_generation/transcripts/gpt4_v2/SM/SM_20240428-220920_Interview.json', '../transcript_generation/transcripts/gpt4_v2/SM/SM_20240428-22

In [None]:
# tracking_df[(tracking_df["interview_path"]==sm_transcript_list[1].split('/')[-1])].dropna(axis=1)
# tracking_df[(tracking_df["interview_path"])==dm_transcript_list[0].split('/')[-1]]["patient_str"].to_list()
# print(tracking_df[(tracking_df["interview_path"])==sm_transcript_list[1].split('/')[-1]]["patient_temp"].to_list()[0] == np.nan)

In [None]:
def lexical_diversity(word_list):
    '''word_list is a list of individual words.'''
    return len(set(word_list))/len(word_list)

def create_word_list(message_list):
    word_str = " ".join(message_list)
    word_str=word_str.lower()
    # print(asst_str)
    word_list = tokenize.word_tokenize(word_str)
    return word_list

In [None]:
def get_convo_stats(transcript_path):
    out_dict = {}
    out_dict["filename"] = transcript_path.split('/')[-1]
    
    if out_dict["filename"][:2] == "DM":
        out_dict["is_double_model"] = True
    else:
        out_dict["is_double_model"] = False

    ### match tracking info
    tracked_info = tracking_df[(tracking_df["interview_path"])==transcript_path.split('/')[-1]].dropna(axis=1)
    out_dict["patient_str"] = tracked_info["patient_str"].to_list()[0]
    if "rambling" in out_dict["patient_str"]:
        out_dict["is_rambling_prompt"]=True
    else:
        out_dict["is_rambling_prompt"]=False
    out_dict["total_cost"] = tracked_info["total_cost"].to_list()[0]
    out_dict["time"] = tracked_info["time"].to_list()[0]
    
    if "edge_case" in tracked_info:
        out_dict["edge_case"] = tracked_info["edge_case"].to_list()[0]
    else:
        out_dict["edge_case"] = ""

    if "patient_temp" in tracked_info:
        out_dict["user_temp"] = tracked_info["patient_temp"].to_list()[0]
        out_dict["asst_temp"] = tracked_info["assistant_temp"].to_list()[0]
        out_dict["temp"] = ""
    else:
        out_dict["user_temp"] = ""
        out_dict["asst_temp"] = ""
        out_dict["temp"] = tracked_info["temp"].to_list()[0]
    
    ### load transcript info
    with open(transcript_path) as f:
        convo = json.load(f)
        f.close()
    # print(len(convo))
    # print((len(convo)-1)/2)
    out_dict["convo_length"] = len(convo)-1
    out_dict["convo_rounds"] = (len(convo)-1)/2 #number of user-assistant messages; first message is automated

    ### load assistant messages
    asst_msgs = [message["content"] for message in convo if message["role"]=="assistant"][1:] #ignore the first message
    # print(len(asst_msgs))
    out_dict["asst_utt"] = len(asst_msgs)
    out_dict["'asst_messages'"] = "|".join(asst_msgs)

    ### get distinct-1
    asst_text = create_word_list(asst_msgs)
    out_dict["asst_distinct1"] = lexical_diversity(asst_text)

    # look at every single assistant message
    asst_msg_length = []
    # asst_sentences = [] #process the assistant sentences in gpt_analysis
    for msg in asst_msgs:
        asst_msg_length.append(len(msg))
        # asst_sentences.append(tokenize.sent_tokenize(msg))
    # print(asst_msg_length)
    # print(np.average(asst_msg_length))
    # out_dict["'asst_sentences'"] = "|".join([item for sublist in asst_sentences for item in sublist])
    
    out_dict["'asst_msg_len'"] = "|".join([str(length) for length in asst_msg_length])
    out_dict["asst_avg_msg_len"] = np.average(asst_msg_length)

    ### load user messages
    user_msgs = [message["content"] for message in convo if message["role"]=="user"]
    # print(len(user_msgs))
    out_dict["user_utt"] = len(user_msgs)
    out_dict["'user_messages'"] = "|".join(user_msgs)
    
    ### get distinct-1
    user_text = create_word_list(user_msgs)
    out_dict["user_distinct1"] = lexical_diversity(user_text)

    # look at every single user message
    user_msg_length = []
    # user_sentences = []
    for msg in user_msgs:
        user_msg_length.append(len(msg))
        # user_sentences.append(tokenize.sent_tokenize(msg))
    # print(user_msg_length)
    # print(np.average(user_msg_length))
    # out_dict["'user_sentences'"] = "|".join([item for sublist in user_sentences for item in sublist])
    out_dict["'user_msg_len'"] = "|".join([str(length) for length in user_msg_length])
    out_dict["user_avg_msg_len"] = np.average(user_msg_length)
    
    return out_dict

## get convo statistics/analysis.csv file

In [None]:
out_dict_list = []
for transcript in dm_transcript_list:
    print(transcript)
    out_dict = get_convo_stats(transcript)
    out_dict_list.append(out_dict)
for transcript in sm_transcript_list:
    print(transcript)
    out_dict = get_convo_stats(transcript)
    out_dict_list.append(out_dict)

### save out_df

In [None]:
super_out_dict = {}
for d in out_dict_list:
    for k, v in d.items():
        super_out_dict.setdefault(k,[]).append(v)

out_df = pd.DataFrame(super_out_dict)
out_df.to_csv(out_csv_path,index=False)

# testing

looking at distinct-1 measures

In [None]:
test_transcript = dm_transcript_list[0]
with open(test_transcript) as f:
    convo = json.load(f)
    f.close()

asst_msgs = [message["content"] for message in convo if message["role"]=="assistant"][1:]
user_msgs = [message["content"] for message in convo if message["role"]=="user"]

#turn message list into one string
asst_wordlist = create_word_list(user_msgs)
user_wordlist = create_word_list(user_msgs)

looking at word frequency

In [None]:
stops = set(stopwords.words('english'))

In [None]:
asst_wordlist_nostops =  set(asst_wordlist) - set(stops)


fd_asst = nltk.FreqDist(asst_wordlist_nostops)
fd_asst1 = nltk.FreqDist(asst_wordlist)

In [None]:
fd_asst.most_common(10)
# fd_asst1.most_common(10)

## graphing
read the analysis csv

In [None]:
analysis_df = pd.read_csv(out_csv_path)

In [None]:
analysis_df.head()

In [None]:
analysis_df[analysis_df["is_double_model"]==True][analysis_df["is_rambling_prompt"]==False]

In [None]:
bar_col = {
    "DM-rambling": "blue",
    "DM-brief":"green",
    "SM-rambling": "yellow",
    "SM-brief":"red",
}

In [None]:
labeled_df = pd.concat(
    [analysis_df[analysis_df["is_double_model"]==True][analysis_df["is_rambling_prompt"]==True].assign(dataset="DM-rambling", bar_col=bar_col["DM-rambling"]), 
    analysis_df[analysis_df["is_double_model"]==True][analysis_df["is_rambling_prompt"]==False].assign(dataset="DM-brief", bar_col=bar_col["DM-brief"]),
    analysis_df[analysis_df["is_double_model"]==False][analysis_df["is_rambling_prompt"]==True].assign(dataset="SM-rambling", bar_col=bar_col["SM-rambling"]),
    analysis_df[analysis_df["is_double_model"]==False][analysis_df["is_rambling_prompt"]==False].assign(dataset="SM-brief", bar_col=bar_col["SM-brief"])])

In [None]:
labeled_dfm = pd.melt(labeled_df, 
                      id_vars=["dataset","bar_col"], #data to keep as the identifier
                      value_vars=["asst_avg_msg_len", "user_avg_msg_len"], #columns to "unpivot" or "melt" together
                      var_name="speaker", #name of new unpivoted col
                      value_name="avg_msg_len") #name of the new col value
labeled_dfm = labeled_dfm.replace("asst_avg_msg_len", "assistant")
labeled_dfm = labeled_dfm.replace("user_avg_msg_len", "patient")
labeled_dfm.head()

In [None]:
def convert_pvalue_to_asterisks(pvalue, bf_correction):
    if pvalue <= 0.0001/bf_correction:
        return "****"
    elif pvalue <= 0.001/bf_correction:
        return "***"
    elif pvalue <= 0.01/bf_correction:
        return "**"
    elif pvalue <= 0.05/bf_correction:
        return "*"
    return "ns"

def get_dataset_comparison_pvalues(dfm, col, bf_correction = False):
    '''
    Parameters
    ----------
    dfm : pandas DataFrame 
        with columns "dataset" and "speaker" as either assistant/patient. values in the "dataset" column will be paired.
    col : str
        name of column in dfm that pvalue should be calculated from
    
    Output
    ------
    list
    '''
    x_values = dfm["dataset"].unique()
    pvalues_list = []
    done = []
    for x in x_values:
        for x1 in x_values:
            if x != x1 and x1 not in done:
                asst_stat, asst_pvalue = scipy.stats.ttest_ind(
                    dfm[(dfm["dataset"] == x) & (dfm["speaker"] == "assistant")][col],
                    dfm[(dfm["dataset"] == x1) & (dfm["speaker"] == "assistant")][col]
                )
                user_stat, user_pvalue = scipy.stats.ttest_ind(
                    dfm[(dfm["dataset"] == x) & (dfm["speaker"] == "patient")][col],
                    dfm[(dfm["dataset"] == x1) & (dfm["speaker"] == "patient")][col]
                )
                pvalues_list.append(((x, x1), 
                                        {"assistant":(asst_pvalue), 
                                        "patient":(user_pvalue)}))
                
            done.append(x)
    if bf_correction:
        corr_val = len(done)
        pvalues_list = [((x, x1), 
                         {"assistant": (pvalue_dict["assistant"], 
                                        convert_pvalue_to_asterisks(pvalue_dict["assistant"], corr_val)),
                          "patient":  (pvalue_dict["patient"], 
                                       convert_pvalue_to_asterisks(pvalue_dict["patient"], corr_val))
                          }) for ((x, x1), pvalue_dict) in pvalues_list]
    else:
        pvalues_list = [((x, x1), 
                         {"assistant": (pvalue_dict["assistant"], 
                                        convert_pvalue_to_asterisks(pvalue_dict["assistant"], 1)),
                          "patient":  (pvalue_dict["patient"], 
                                       convert_pvalue_to_asterisks(pvalue_dict["patient"], 1))
                          }) for ((x, x1), pvalue_dict) in pvalues_list]
    return pvalues_list

def get_pvalues(dfm, col, x_val="dataset", bf_correction=False):
    '''
    Parameters
    ----------
    dfm : pandas DataFrame 
        with columns "dataset" and "speaker" as either assistant/patient. values in the "dataset" column will be paired.
    col : str
        name of column in dfm that pvalue should be calculated from
    x_val : str
        name of the column in dfm that is the x axis grouping
    bf_correction : bool
        whether or not bonferroni correction is applied
    '''
    x_values = dfm[x_val].unique()
    pvalues_list = []
    done = []
    for x in x_values:
        for x1 in x_values:
            if x != x1 and x1 not in done:
                stat, pvalue = scipy.stats.ttest_ind(
                    dfm[dfm[x_val] == x][col],
                    dfm[dfm[x_val] == x1][col]
                )
                pvalues_list.append(((x, x1), pvalue))
            done.append(x)

    #add asterisks for significance
    if bf_correction:
        corr_val = len(done)
        pvalues_list = [((x, x1), pvalue, convert_pvalue_to_asterisks(pvalue, corr_val)) for ((x, x1), pvalue) in pvalues_list]
    else:
        pvalues_list = [((x, x1), pvalue, convert_pvalue_to_asterisks(pvalue, 1)) for ((x, x1), pvalue) in pvalues_list]
    return pvalues_list

### graph Average Message Length per Role Utterance

In [None]:
### average message length per role utterance
g = sns.catplot(
    data=labeled_dfm, kind="bar",
    x="dataset", y="avg_msg_len", hue="speaker",
    errorbar="sd", palette="dark", color=bar_col, alpha=.6, height=6
)

g.despine(left=True)
g.set_axis_labels("Transcript Creation Method", "Avg Message Length (characters)")
g.legend.set_title("Role")


g = sns.swarmplot(x="dataset", y="avg_msg_len", hue='speaker', palette="dark:black", alpha=.5, dodge=True,data=labeled_dfm)

plt.title("Average Message Length per Role Utterance")
plt.legend([],[], frameon=False)

In [None]:
### average message length per role utterance
g = sns.catplot(
    data=labeled_dfm, kind="bar",
    x="speaker", y="avg_msg_len", hue="dataset",
    errorbar="sd", 
    # color="bar_col",
    palette="dark", 
    alpha=.6, height=6
)

g.despine(left=True)
g.set_axis_labels("Role", "Avg Message Length (characters)")
g.legend.set_title("Transcript Creation Method")

g = sns.swarmplot(x="speaker", y="avg_msg_len", hue='dataset', palette="dark:black", alpha=.5, dodge=True,data=labeled_dfm)

plt.title("Average Message Length per Role Utterance")
plt.legend([],[], frameon=False)

In [None]:
avg_msg_len_pvalues = get_dataset_comparison_pvalues(labeled_dfm, "avg_msg_len")
avg_msg_len_pvalues
#do the corrections for multiple comparisons

In [None]:
avg_msg_len_pvalues_bfcorr = get_dataset_comparison_pvalues(labeled_dfm, "avg_msg_len", bf_correction=True) 
#bonferroni correction: divide by the number of tests (in this case, transcript pairs)
avg_msg_len_pvalues_bfcorr

## conversational rounds

In [None]:
### conversational rounds
g = sns.barplot(
    data=labeled_df, 
    x="dataset", y="convo_rounds",
    errorbar="sd", alpha=.6
)
# g.set_axis_labels("Transcript Creation Method", "Conversational Rounds")
g = sns.swarmplot(x="dataset", y="convo_rounds",color="black", alpha=.5, dodge=True,data=labeled_df)

In [None]:
get_pvalues(labeled_df, "convo_rounds", "dataset")

look at distinct-1 comparisons

In [None]:
labeled_dfm1 = pd.melt(labeled_df, 
                      id_vars="dataset", #data to keep as the identifier
                      value_vars=["asst_distinct1", "user_distinct1"], #columns to "unpivot" or "melt" together
                      var_name="speaker", #name of new unpivoted col
                      value_name="distinct-1") #name of the new col value
labeled_dfm1 = labeled_dfm1.replace("asst_distinct1", "assistant")
labeled_dfm1 = labeled_dfm1.replace("user_distinct1", "patient")
labeled_dfm1.head()

In [None]:
g = sns.catplot(
    data=labeled_dfm1, kind="bar",
    x="dataset", y="distinct-1", hue="speaker",
    errorbar="sd", palette="dark", alpha=.6, height=6
)

g.despine(left=True)
g.set_axis_labels("Transcript Creation Method", "Distinct-1 Score")
g.legend.set_title("Role")


g = sns.swarmplot(x="dataset", y="distinct-1", hue='speaker', palette="dark:black", alpha=.5, dodge=True,data=labeled_dfm1)

plt.title("Distinct-1 Score per Role")
plt.legend([],[], frameon=False)

In [None]:
### average message length per role utterance
g = sns.catplot(
    data=labeled_dfm1, kind="bar",
    x="speaker", y="distinct-1", hue="dataset",
    errorbar="sd", 
    # color="bar_col",
    palette="dark", 
    alpha=.6, height=6
)

g.despine(left=True)
g.set_axis_labels("Role", "Distinct-1 Score")
sns.move_legend(g, "upper left", bbox_to_anchor=(0.8, 0.5))
g.legend.set_title("Transcript Creation Method")

g = sns.swarmplot(x="speaker", y="distinct-1", hue='dataset', palette="dark:black", alpha=.5, dodge=True,data=labeled_dfm1)

plt.title("Average Message Length per Role Utterance")
plt.legend([],[], frameon=False,bbox_to_anchor=(1, 1))

In [None]:
get_dataset_comparison_pvalues(labeled_dfm1, "distinct-1")