In [7]:
import pandas as pd
import json
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
import math

def read_transcript_from_id(id):
    """Load the JSON transcript associated with a recording id.

    The id is resolved to a file path through the ``grade_lookupv5.csv``
    lookup table; the last four characters of that path are replaced with
    ``.json`` and the resulting file is parsed.

    Parameters
    ----------
    id : str
        Value matched against the ``id`` column of the lookup table.
        (The name shadows the ``id`` builtin; kept so existing callers
        that pass it positionally or by keyword keep working.)

    Returns
    -------
    object
        Whatever ``json.load`` produces for the transcript file.

    Raises
    ------
    IndexError
        If no row of the lookup table has the requested id.
    """
    path_to_data_folder = '/archive/shared/sim_center/shared/ameer/'
    # The lookup table maps each id to the path of its source file.
    # NOTE(review): the CSV is re-read on every call; consider caching it
    # if this function is ever called in a loop over many ids.
    merged_lookup = pd.read_csv(path_to_data_folder + 'grade_lookupv5.csv')

    matching_paths = merged_lookup.loc[merged_lookup.id == id, 'path']
    if matching_paths.empty:
        # Same exception type the bare .iloc[0] used to raise, but with a
        # message that says which id was missing.
        raise IndexError(f"id {id!r} not found in grade_lookupv5.csv")
    path = matching_paths.iloc[0]

    # Swap the trailing 4 characters for '.json'
    # (assumes a 3-letter extension on the stored path — TODO confirm).
    path = path[:-4] + '.json'

    # Context manager guarantees the handle is closed even if parsing fails.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [2]:
# Example recording id used for the single-file walkthrough.
# NOTE(review): `id` shadows Python's builtin id() for the rest of the session.
id = '10_1005_331402'
# Load the JSON transcript that the lookup table associates with this id.
transcript = read_transcript_from_id(id)

In [3]:
# Display the loaded transcript object.
transcript

[{'timestamp': [0.0, 42.0],
  'text': " remove your cover sheet jot down any notes knock and then enter My name is Sarah, I'm one of the medical students working in clinic today."},
 {'timestamp': [42.0, 45.68], 'text': ' How do you prefer to be addressed?'},
 {'timestamp': [46.36, 47.18], 'text': ' You can call me Sean.'},
 {'timestamp': [47.34, 50.02],
  'text': ' Hey, Sean, tell me what brings you into the clinic'},
 {'timestamp': [50.02, 50.34], 'text': ' today?'},
 {'timestamp': [51.44, 53.4],
  'text': " I've been feeling sick to my stomach."},
 {'timestamp': [53.68, 56.1], 'text': ' I can see'},
 {'timestamp': [56.1, 57.26], 'text': " that you're in some discomfort."},
 {'timestamp': [57.9, 59.56], 'text': ' When did this first start?'},
 {'timestamp': [59.9, 60.86], 'text': ' About two days ago.'},
 {'timestamp': [61.86, 63.8], 'text': ' Do you remember'},
 {'timestamp': [63.8, 65.6],
  'text': ' if anything happened before it first started?'},
 {'timestamp': [65.78, 67.28], 't

In [4]:
def get_diarized_transcript(id, numchunks=3):
    """Read the diarized transcript text file for a recording id.

    Parameters
    ----------
    id : str
        File stem; the file read is
        ``<data folder>/GPT4 <numchunks>-chunk/<id>.txt``.
    numchunks : int, optional
        Which chunking variant to read. Only 3, 6 and 9 are supported.

    Returns
    -------
    list of str
        The file's lines as returned by ``readlines()`` (newline
        characters are preserved).

    Raises
    ------
    ValueError
        If ``numchunks`` is not 3, 6 or 9. Previously this case crashed
        with an UnboundLocalError because ``path`` was never assigned.
    """
    path_to_data_folder = '/archive/shared/sim_center/shared/annie/'

    # One sub-folder per supported chunking variant.
    chunk_folders = {
        3: 'GPT4 3-chunk/',
        6: 'GPT4 6-chunk/',
        9: 'GPT4 9-chunk/',
    }
    if numchunks not in chunk_folders:
        raise ValueError(
            f"numchunks must be one of {sorted(chunk_folders)}, got {numchunks!r}"
        )
    path = path_to_data_folder + chunk_folders[numchunks] + id + '.txt'

    with open(path, 'r', encoding='utf-8') as file:
        return file.readlines()

In [5]:
# Read the diarized transcript lines for the same id (default numchunks=3)
# and display them.
diar = get_diarized_transcript(id)
diar

['ID: 10_1005_331402\n',
 '\n',
 'Instructions: remove your cover sheet jot down any notes knock and then enter\n',
 '\n',
 "Student: My name is Sarah, I'm one of the medical students working in clinic today. How do you prefer to be addressed?\n",
 '\n',
 'Patient: You can call me Sean.\n',
 '\n',
 'Student: Hey, Sean, tell me what brings you into the clinic today?\n',
 '\n',
 "Patient: I've been feeling sick to my stomach.\n",
 '\n',
 "Student: I can see that you're in some discomfort. When did this first start?\n",
 '\n',
 'Patient: About two days ago.\n',
 '\n',
 'Student: Do you remember if anything happened before it first started? New foods, anything like that?\n',
 '\n',
 "Patient: No, no. I don't really eat new things except for what I normally eat.\n",
 '\n',
 "Student: Okay, so you've been feeling nauseous for the past few days, and then did you say you just recently started vomiting?\n",
 '\n',
 'Patient: First time was yesterday morning, about 20 minutes after I ate breakfa

In [17]:
# Shared scorer instance; its similarity() method ranks candidate segments below.
normalized_levenshtein = NormalizedLevenshtein()

def levenshtein_timestamp_match(id, numchunks=3):
    """Attach a timestamp to every diarized line of a recording.

    For each non-blank diarized line (the first line of the file is
    skipped), the transcript segment whose ``'text'`` is most similar to the
    line under normalized Levenshtein similarity is selected, and that
    segment's ``'timestamp'`` is copied onto the line. Ties go to the
    earliest segment.

    NOTE(review): exactly one segment is chosen per line, so a diarized line
    that spans several transcript segments only receives the timestamp of
    its single best-matching segment. The comparison also uses the raw line,
    including any speaker label and trailing newline.

    Parameters
    ----------
    id : str
        Recording id, forwarded to ``read_transcript_from_id`` and
        ``get_diarized_transcript``.
    numchunks : int, optional
        Forwarded to ``get_diarized_transcript``.

    Returns
    -------
    list of dict
        One ``{'text': line, 'timestamp': timestamp}`` entry per kept line,
        in file order. ``timestamp`` is ``''`` when the transcript has no
        segments at all.
    """
    # Transcript segments (JSON) and diarized text lines for the same id.
    transcript = read_transcript_from_id(id)
    diar = get_diarized_transcript(id, numchunks=numchunks)
    new_transcript = []

    # diar[0] is skipped — in the sample file it is the "ID: ..." header line.
    for line in diar[1:]:
        # Skip separator lines, including ones holding only spaces or tabs,
        # which would otherwise be matched to an arbitrary segment.
        if not line.strip():
            continue

        # max() returns the first maximal element, matching the strict '>'
        # comparison of a manual running-maximum loop.
        best_segment = max(
            transcript,
            key=lambda segment: normalized_levenshtein.similarity(line, segment['text']),
            default=None,
        )
        timestamp = best_segment['timestamp'] if best_segment is not None else ''
        new_transcript.append({'text': line, 'timestamp': timestamp})

    return new_transcript

        




In [18]:
# Build the timestamp-annotated version of the diarized transcript for the example id.
new_trans = levenshtein_timestamp_match(id)

{'text': 'Instructions: remove your cover sheet jot down any notes knock and then enter\n', 'timestamp': [0.0, 42.0]} 0.36690647482014394
{'text': "Student: My name is Sarah, I'm one of the medical students working in clinic today. How do you prefer to be addressed?\n", 'timestamp': [0.0, 42.0]} 0.3165467625899281
{'text': 'Patient: You can call me Sean.\n', 'timestamp': [46.36, 47.18]} 0.7096774193548387
{'text': 'Student: Hey, Sean, tell me what brings you into the clinic today?\n', 'timestamp': [47.34, 50.02]} 0.7611940298507462
{'text': "Patient: I've been feeling sick to my stomach.\n", 'timestamp': [51.44, 53.4]} 0.8085106382978724
{'text': "Student: I can see that you're in some discomfort. When did this first start?\n", 'timestamp': [56.1, 57.26]} 0.41025641025641024
{'text': 'Patient: About two days ago.\n', 'timestamp': [59.9, 60.86]} 0.6896551724137931
{'text': 'Student: Do you remember if anything happened before it first started? New foods, anything like that?\n', 'timesta

In [19]:
# Write the timestamp-annotated transcript to "<id>.json" in the working directory.
with open(id + ".json", mode="w") as json_out:
    json.dump(new_trans, fp=json_out)

In [None]:
# First batch of recording ids (ten entries).
id_set1 = [
    '01_0542_298135',
    '02_0036_174595',
    '03_0028_174553',
    '04_0043_174686',
    '05_0033_174804',
    '06_0079_175106',
    '07_0068_174641',
    '08_0029_174576',
    '09_0029_174582',
    '10_0991_331330',
]