In [1]:
import transformers
import torch

# Filesystem path handed to the pipeline as its `model` argument.
model_id = "/archive/shared/sim_center/shared/annie/hf_models/8b-instruct"


# Text-generation pipeline shared by every later cell through the global name
# `pipeline`. The model is requested in bfloat16 and device placement is left
# to `device_map="auto"`.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Smoke-test conversation: one system instruction followed by one user turn.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Token ids passed as `eos_token_id` below: the tokenizer's own EOS id plus
# the id the tokenizer maps "<|eot_id|>" to.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Sampled generation (do_sample=True) limited to 256 new tokens.
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# Show the last entry of the returned "generated_text" list (the assistant
# message, per the recorded output of this cell).
print(outputs[0]["generated_text"][-1])


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


{'role': 'assistant', 'content': "Arrrr, me hearty! Me name be Captain Chat, the scurviest pirate chatbot to ever sail the Seven Seas o' the Interwebs! Me and me trusty crew o' code have been sailin' the digital waters fer years, spreadin' the word o' piratey wisdom and havin' a swashbucklin' good time doin' it! So hoist the Jolly Roger and set course fer a treasure trove o' pirate chat, matey!"}


In [6]:
# Example transcript ID used as input by the exploratory cells in this notebook.
transcript_id = '10_0991_331330'
import pandas as pd
import nltk
import json
from nltk.tokenize import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
from typing import List

def _join_lines(lines) -> str:
    """Concatenate each entry's 'text', prefixing every kept entry with one space.

    Entries whose text is exactly a single newline character are skipped.
    """
    return ''.join(' ' + line['text'] for line in lines if line['text'] != '\n')


def read_transcript_from_id(transcript_id: str, chunk_num: int=1,
                            path_to_data_folder: str='/archive/shared/sim_center/shared/ameer/')->List[str]:
    """Load a transcript by ID and return its text as a list of chunk strings.

    The JSON location is found by looking the ID up in ``grade_lookupv5.csv``
    (columns ``id`` and ``path``) inside ``path_to_data_folder``; the last four
    characters of the stored path are replaced with ``.json``.

    Args:
        transcript_id: Value to match against the ``id`` column of the lookup CSV.
        chunk_num: Number of chunks to split the transcript lines into. With
            ``1`` the whole transcript is returned as a single-element list.
            Lines are divided evenly (integer division) and the final chunk
            absorbs any remainder. Values below 1 yield an empty list.
        path_to_data_folder: Folder containing ``grade_lookupv5.csv``. It is
            joined by plain string concatenation, so it must end with a path
            separator.

    Returns:
        A list with ``chunk_num`` strings. Every kept line is preceded by a
        single space, so each non-empty chunk starts with a space.

    Raises:
        IndexError: If ``transcript_id`` is not present in the lookup CSV.
    """
    merged_lookup = pd.read_csv(path_to_data_folder + 'grade_lookupv5.csv')

    matching_paths = merged_lookup[merged_lookup.id == transcript_id].path
    if matching_paths.empty:
        # Same exception type `.iloc[0]` would raise, but with a useful message.
        raise IndexError(
            'transcript id {!r} not found in grade_lookupv5.csv'.format(transcript_id)
        )

    # Swap the stored file's 4-character suffix for '.json'.
    # NOTE(review): assumes the stored path ends in a dot plus 3 characters.
    json_path = matching_paths.iloc[0][:-4] + '.json'

    # Context manager guarantees the file handle is closed after parsing.
    with open(json_path) as f:
        lines = json.load(f)

    if chunk_num == 1:
        return [_join_lines(lines)]

    if chunk_num < 1:
        # Nothing to iterate over; callers get an empty list.
        return []

    lines_per_chunk = len(lines) // chunk_num
    transcript_chunks = []
    for n in range(chunk_num):
        start = n * lines_per_chunk
        # The last chunk runs to the end so leftover lines are not dropped.
        end = len(lines) if n == chunk_num - 1 else (n + 1) * lines_per_chunk
        transcript_chunks.append(_join_lines(lines[start:end]))

    return transcript_chunks

In [8]:
# Display the single-chunk transcript returned for the example ID.
read_transcript_from_id(transcript_id, chunk_num=1)

["  Learners, you may begin your patient station, remove your cover sheet, jot down any... Thank you. Come in.  Come on in, man.  Hello, Mr. Pimbleton. Hi, my name is Coleman. I'm a medical student at UT Southwestern. I am here to ask you some questions. I just want to know how you are feeling today.  I've been feeling sick to my stomach.  Now I throw one up, I feel not good.  Okay, when did this start?  It started getting nauseous.  I was feeling nausea.  I had a stomachache.  I felt nause up. I am not good.  Okay, when did this start? I started getting nauseous two days ago. Threw up the first  time yesterday after breakfast, about 20 minutes after I ate. Any other episodes of vomiting?  Yeah, I threw up three or four times yesterday and then after breakfast today about five  minutes after I ate.  Okay.  And what does your vomit look like? It's just small bits of undigested food, whatever I eat.  Any blood in it?  No.  No?  No  Bile?  No  Just the chunks of food?  No  Okay  Now this 

In [3]:
def summarize(transcript):
    """Ask the global ``pipeline`` for a one-paragraph summary of ``transcript``.

    Args:
        transcript: Text placed verbatim in the user message.

    Returns:
        The last entry of the first result's ``"generated_text"`` list, as
        returned by the pipeline.
    """
    system_turn = {
        "role": "system",
        "content": "Please summarize the following text in one paragraph. 100 words. Do not add any information that is not in the text",
    }
    user_turn = {"role": "user", "content": transcript}
    chat = [system_turn, user_turn]

    # Stop on either the tokenizer's EOS id or the id mapped to "<|eot_id|>".
    tokenizer = pipeline.tokenizer
    stop_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

    generation_options = dict(
        max_new_tokens=256,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    results = pipeline(chat, **generation_options)

    first_result = results[0]
    return first_result["generated_text"][-1]

In [11]:
# read_transcript_from_id returns a list of chunk strings; join them so the
# user message content handed to summarize() is a plain string.
summary = summarize(''.join(read_transcript_from_id(transcript_id)))

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [16]:
# Display only the text of the summary message produced above.
summary["content"]

'Here is a summary of the text in one paragraph:\n\nA medical student, Coleman, is conducting a patient station with Mr. Pimbleton, a 55-year-old retired man who has been experiencing nausea, vomiting, and stomachache for two days. Mr. Pimbleton reports that his vomiting is accompanied by small bits of undigested food and is triggered by eating. He has no fever, chills, or blood in his vomit, and has not experienced this before. Coleman performs a brief physical exam and discusses possible causes of the symptoms, including a possible obstruction or constriction of the esophagus. He recommends further testing, including a barium swallow study, CT scan, or endoscopy, to determine the cause of the symptoms and rule out any serious infections.'

In [29]:
# Display the transcript for the first ID of id_set1.
# NOTE(review): id_set1 is assigned in a later cell of this notebook, so this
# cell fails on a fresh kernel unless that cell has been run first.
read_transcript_from_id(id_set1[0])

"   Learners, you may begin your patient station.  Please remove your cover sheet, jot down any notes, knock, and then enter. Thank you. so  so I don't know. Hello?  Hi, this is Mrs. Miller.  Yes, yes it is.  Hi Mrs Miller, my name is Jackson Agroz.  I'm the medical student working at the clinic today.  I am calling to speak about your daughter.  What was your name again?  Jackson Agrroz.  Okay.  Your S-47024.  That's your OSCE number?  427024, yes.  Okay, well, OSCENumber21, sorry.  There we go, thank you.  Well, I am glad you called.  I was very worried because my daughter has had a cough for a long time,  and it just seems it won't go away.  Okay.  How long has it been going on for?  Five weeks is when it started.  Okay, and can you describe the cough to me?  It's a dry cough that's almost nonstop.  Okay?  And you said it's been getting worse over these five weeks? Yes. At first it was on and off, but now it is constant.  Okay, has she been vomiting at all? No. Has she has been sick

In [4]:
def diarize(summary, transcript, max_new_tokens=10000):
    """Ask the global ``pipeline`` to add speaker labels to ``transcript``, given a summary.

    Args:
        summary: Mapping whose ``"content"`` entry holds the summary text
            (the shape returned by ``summarize``).
        transcript: Transcript text as a single string; it is concatenated
            into the user message.
        max_new_tokens: Upper bound on generated tokens. Defaults to 10000,
            the value previously hard-coded here; exposed as a parameter to
            match ``diarize_nosum``.

    Returns:
        The last entry of the first result's ``"generated_text"`` list, as
        returned by the pipeline.
    """
    # The prompt text (including its indentation and trailing spaces) is sent
    # to the model as-is, so it is kept exactly as originally written.
    messages = [
    {"role": "system", "content": '''The following text represents an audio transcript between a medical student and a patient, with the summary included below.
                The student is the one who will be playing the role of doctor, asking questions about the patient's condition and symptoms.
                Please diarize the following transcript in order to indicate who speaks when, using the format in the following example, with the label "Student" for the medical student and "Patient" for the patient (not their names):

                Student: Hello, how are you today?

                Patient: I am feeling sick. 

                Student: Oh no, how long have you felt sick? 

                Patient: About two days. 

                Separate speaker turns by an extra new line, as above ('\n\n'). The lines do not always have to be alternating labels. There may be consecutive lines from one speaker. Do not use any speaker labels for the two participants other than "Student" and "Patient."

                The transcript may contain some instructions for the student, coming from a third-party speaker. Please label this segment "Instructions: ". 
                If there are no instructions in the transcript, disregard this label. 

                IMPORTANT: Do not remove or add any words to the transcript other than the speaker labels. Also, do not add or remove any punctuation or change any spellings.'''},
    {"role": "user", "content": 'summary: ' + summary["content"] + '\n\ntranscript: ' + transcript},
]
    # Stop on either the tokenizer's EOS id or the id mapped to "<|eot_id|>".
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    return outputs[0]["generated_text"][-1]
   

In [30]:
# read_transcript_from_id returns a list of chunk strings, while diarize()
# concatenates its transcript argument to a str; join the chunks first.
diarize(summary, ''.join(read_transcript_from_id(transcript_id)))

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


{'role': 'assistant',
 'content': 'I\'ve diarized the transcript with the labels "Student" for the medical student and "Patient" for the patient. I\'ve also added the label "Instructions:" for the third-party speaker\'s instructions. Here is the diarized transcript:\n\nInstructions: Learners, you may begin your patient station, remove your cover sheet, jot down any... Thank you. Come in.  Come on in, man.  Hello, Mr. Pimbleton. Hi, my name is Coleman. I\'m a medical student at UT Southwestern. I am here to ask you some questions. I just want to know how you are feeling today.\n\nStudent: I\'ve been feeling sick to my stomach.\n\nPatient: I am feeling sick.\n\nStudent: Oh no, how long have you felt sick?\n\nPatient: About two days.\n\nStudent: Okay, when did this start?\n\nPatient: It started getting nauseous.  I was feeling nausea.  I had a stomachache.  I felt nause up. I am not good.\n\nStudent: Okay, when did this start? I started getting nauseous two days ago. Threw up the first  t

In [5]:
def diarize_nosum(transcript, max_new_tokens=10000):
    """Ask the global ``pipeline`` to add speaker labels to ``transcript`` without a summary.

    Args:
        transcript: Transcript text as a single string; it is appended to
            the ``'transcript: '`` prefix of the user message.
        max_new_tokens: Upper bound on generated tokens (default 10000).

    Returns:
        The last entry of the first result's ``"generated_text"`` list, as
        returned by the pipeline.
    """
    # Sent to the model verbatim, indentation and trailing spaces included.
    system_prompt = '''The following text represents an audio transcript between a medical student and a patient.
                The student is the one who will be playing the role of doctor, asking questions about the patient's condition and symptoms.
                Please diarize the following transcript in order to indicate who speaks when, using the format in the following example, with the label "Student" for the medical student and "Patient" for the patient (not their names):

                Student: Hello, how are you today?

                Patient: I am feeling sick. 

                Student: Oh no, how long have you felt sick? 

                Patient: About two days. 

                Separate speaker turns by an extra new line, as above ('\n\n'). The lines do not always have to be alternating labels. There may be consecutive lines from one speaker. Do not use any speaker labels for the two participants other than "Student" and "Patient."

                The transcript may contain some instructions for the student, coming from a third-party speaker. Please label this segment "Instructions: ". 
                If there are no instructions in the transcript, disregard this label. 

                IMPORTANT: Do not remove or add any words to the transcript other than the speaker labels. Also, do not add or remove any punctuation or change any spellings.'''
    user_prompt = 'transcript: ' + transcript
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Stop on either the tokenizer's EOS id or the id mapped to "<|eot_id|>".
    tokenizer = pipeline.tokenizer
    stop_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

    generation_options = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    results = pipeline(chat, **generation_options)

    return results[0]["generated_text"][-1]

In [6]:
def save_file(diarization, transcript_id, path = '/archive/shared/sim_center/shared/annie/experiments/llama-8B-instruct/'):
    """Write ``diarization`` to ``<path><transcript_id>.txt`` under an ``ID:`` header.

    ``path`` is joined by plain concatenation, so it must already end with a
    path separator. An existing file of the same name is overwritten.
    """
    # Alternative output folder kept from the original:
    # '/archive/shared/sim_center/shared/annie/claude-opus/'
    destination = ''.join((path, transcript_id, ".txt"))
    with open(destination, "w") as outfile:
        # Header line, a blank line, then the diarized text.
        outfile.write(''.join(('ID: ', transcript_id, '\n\n', diarization)))

In [7]:
def llama_diarize(transcript_id):
    """Run the full pipeline for one transcript: load, summarize, diarize, save.

    Prints ``transcript_id`` first as a progress marker, then writes the
    diarized text through ``save_file`` using its default output folder.

    Args:
        transcript_id: ID passed to ``read_transcript_from_id`` and used as
            the output file name.
    """
    print(transcript_id)
    # read_transcript_from_id returns a list of chunk strings, while the
    # prompt builders concatenate the transcript to a str. ''.join() yields
    # the full text for a list of chunks and leaves a plain string unchanged.
    transcript = ''.join(read_transcript_from_id(transcript_id))
    summary = summarize(transcript)
    diarization = diarize(summary, transcript)["content"]
    save_file(diarization, transcript_id)

In [8]:
# First batch of transcript IDs.
id_set1 = [
    '01_0542_298135',
    '02_0036_174595',
    '03_0028_174553',
    '04_0043_174686',
    '05_0033_174804',
    '06_0079_175106',
    '07_0068_174641',
    '08_0029_174576',
    '09_0029_174582',
    '10_0991_331330',
]

# Second batch of transcript IDs.
id_set2 = [
    '01_1080_366142',
    '02_1056_380177',
    '03_1500_380168',
    '04_1512_380182',
    '05_1066_380195',
    '06_1048_365209',
    '07_1111_380134',
    '08_1044_380133',
    '09_1039_380193',
    '10_1005_331402',
]

In [None]:
# temp for formatting
def remove_extraspace(path, ids):
    """Rewrite ``<path><id>.txt`` in place for each id, turning double spaces into single ones.

    Each file is read in full, every non-overlapping pair of spaces is
    replaced by one space, a single space is prepended, and the result is
    written back to the same file.

    NOTE(review): because a space is prepended on every call, running this
    twice on the same file adds a second leading space; runs of three or more
    spaces are shortened but not collapsed to one.
    """
    for file_id in ids:
        target = path + file_id + '.txt'
        with open(target, 'r') as source:
            original_text = source.read()
        # Same result as splitting on two spaces and prefixing each piece with ' '.
        rewritten = ' ' + original_text.replace('  ', ' ')
        with open(target, "w") as outfile:
            outfile.write(rewritten)

In [9]:
# Diarize every transcript in the second batch. The loop variable is named so
# that it neither shadows the builtin `id` nor overwrites the notebook-level
# `transcript_id` example.
for batch_transcript_id in id_set2:
    llama_diarize(batch_transcript_id)

01_1080_366142


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


02_1056_380177


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


03_1500_380168


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


04_1512_380182


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


KeyboardInterrupt: 