In [1]:
import transformers
import torch

# Absolute path handed to the pipeline as the model identifier.
model_id = "/archive/shared/sim_center/shared/annie/hf_models/70b-instruct"


# Build the text-generation pipeline. `pipeline` is a notebook-level global that
# the summarize/diarize functions in later cells read directly.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Demo chat: one system message followed by one user message.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Token ids passed as `eos_token_id` below: the tokenizer's own EOS id plus the
# id of the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# NOTE(review): sampling is enabled and no random seed is set in this cell, so
# the generated text will presumably differ between runs.
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# The last element of "generated_text" is the assistant's reply message
# (a dict with "role" and "content", as the captured output shows).
print(outputs[0]["generated_text"][-1])

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


{'role': 'assistant', 'content': "Arrr, shiver me timbers! Me be Captain Chatbeard, the scurviest pirate chatbot to ever sail the seven seas... er, internet! Me be here to swab yer decks with conversation, answer yer questions, and maybe even share a treasure or two o' knowledge. So hoist the colors, matey, and let's set sail fer a swashbucklin' good time!"}


In [2]:
# Example transcript ID. The functions and call cells visible in this notebook
# receive IDs as explicit arguments and do not read this global.
transcript_id = '10_0991_331330'
import pandas as pd
import nltk
import json
from nltk.tokenize import word_tokenize
from nltk.tokenize import WhitespaceTokenizer

def _join_transcript_lines(lines):
    """Concatenate the 'text' of each line, each preceded by a single space.

    Lines whose text is exactly a newline character are skipped.
    """
    return ''.join(' ' + line['text'] for line in lines if line['text'] != '\n')


def read_transcript_from_id(transcript_id, chunk_num=1,
                            path_to_data_folder='/archive/shared/sim_center/shared/ameer/'):
    """Load the JSON transcript registered under ``transcript_id``.

    The transcript's location is looked up in ``grade_lookupv5.csv`` inside
    ``path_to_data_folder``; the CSV must provide ``id`` and ``path`` columns.
    The JSON file is expected to hold a list of objects with a ``'text'`` key.

    Parameters
    ----------
    transcript_id : str
        Value to match against the ``id`` column of the lookup table.
    chunk_num : int, default 1
        ``1`` returns the whole transcript as one string. Any other value
        splits the lines into ``chunk_num`` consecutive groups of
        ``len(lines) // chunk_num`` lines (the last group also takes the
        remainder) and returns a list with one string per group. Values below
        1 yield an empty list.
    path_to_data_folder : str
        Folder prefix (including the trailing separator) that contains the
        lookup CSV.

    Returns
    -------
    str or list of str
        For ``chunk_num == 1`` the string starts with two spaces (one initial
        space plus the space preceding the first line); each chunk string
        starts with a single space.

    Raises
    ------
    IndexError
        If ``transcript_id`` does not appear in the lookup table.
    """
    lookup_csv = path_to_data_folder + 'grade_lookupv5.csv'
    merged_lookup = pd.read_csv(lookup_csv)

    matching_paths = merged_lookup.loc[merged_lookup['id'] == transcript_id, 'path']
    if matching_paths.empty:
        # Same exception type the bare `.iloc[0]` raised, with a usable message.
        raise IndexError(
            'transcript id %r not found in %s' % (transcript_id, lookup_csv)
        )

    # Swap the last four characters of the stored path for '.json'
    # (presumably a three-letter extension such as '.txt' -- TODO confirm).
    path = matching_paths.iloc[0][:-4] + '.json'

    # Context manager so the file handle is always released.
    with open(path, encoding='utf-8') as json_file:
        lines = json.load(json_file)

    if chunk_num == 1:
        return ' ' + _join_transcript_lines(lines)

    if chunk_num < 1:
        # Nothing to iterate over; mirrors the empty result of range(chunk_num).
        return []

    lines_per_chunk = len(lines) // chunk_num
    transcript_chunks = []
    for n in range(chunk_num):
        start = n * lines_per_chunk
        # The final chunk absorbs whatever the integer division left over.
        end = len(lines) if n == chunk_num - 1 else (n + 1) * lines_per_chunk
        transcript_chunks.append(_join_transcript_lines(lines[start:end]))

    return transcript_chunks

def summarize(transcript):
    """Ask the global ``pipeline`` for a one-paragraph summary of ``transcript``.

    Returns the last message of the first generation result, i.e. the
    assistant reply as a dict with "role" and "content" keys.
    """
    instruction = "Please summarize the following text in one paragraph. 100 words. Do not add any information that is not in the text"
    chat = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": transcript},
    ]

    # Generation stops on either the tokenizer's EOS id or the "<|eot_id|>" id.
    tokenizer = pipeline.tokenizer
    stop_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generation_options = {
        "max_new_tokens": 256,
        "eos_token_id": stop_ids,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    results = pipeline(chat, **generation_options)

    conversation = results[0]["generated_text"]
    return conversation[-1]

In [3]:
def diarize(summary, transcript):
    """Ask the global ``pipeline`` to add speaker labels to ``transcript``.

    ``summary`` must be a mapping with a "content" key (the message returned
    by ``summarize``); its text is sent together with the transcript.
    Returns the last message of the first generation result.
    """
    # The literal is not a raw string, so the '\n\n' shown inside it reaches the
    # model as two real line breaks rather than as backslash-n characters.
    system_prompt = '''The following text represents an audio transcript between a medical student and a patient, with the summary included below.
                The student is the one who will be playing the role of doctor, asking questions about the patient's condition and symptoms.
                Please diarize the following transcript in order to indicate who speaks when, using the format in the following example, with the label "Student" for the medical student and "Patient" for the patient (not their names):

                Student: Hello, how are you today?

                Patient: I am feeling sick. 

                Student: Oh no, how long have you felt sick? 

                Patient: About two days. 

                Separate speaker turns by an extra new line, as above ('\n\n'). The lines do not always have to be alternating labels. There may be consecutive lines from one speaker. Do not use any speaker labels for the two participants other than "Student" and "Patient."

                The transcript may contain some instructions for the student, coming from a third-party speaker. Please label this segment "Instructions: ". 
                If there are no instructions in the transcript, disregard this label. 

                IMPORTANT: Do not remove or add any words to the transcript other than the speaker labels. Also, do not add or remove any punctuation or change any spellings.'''
    user_prompt = 'summary: ' + summary["content"] + '\n\ntranscript: ' + transcript
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Generation stops on either the tokenizer's EOS id or the "<|eot_id|>" id.
    tokenizer = pipeline.tokenizer
    stop_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generation_options = {
        "max_new_tokens": 10000,
        "eos_token_id": stop_ids,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    results = pipeline(chat, **generation_options)

    conversation = results[0]["generated_text"]
    return conversation[-1]
   

In [4]:
def diarize_nosum(transcript):
    """Ask the global ``pipeline`` to add speaker labels, without a summary.

    Only the transcript text is sent as the user message. Returns the last
    message of the first generation result.
    """
    # The literal is not a raw string, so the '\n\n' shown inside it reaches the
    # model as two real line breaks rather than as backslash-n characters.
    system_prompt = '''The following text represents an audio transcript between a medical student and a patient.
                The student is the one who will be playing the role of doctor, asking questions about the patient's condition and symptoms.
                Please diarize the following transcript in order to indicate who speaks when, using the format in the following example, with the label "Student" for the medical student and "Patient" for the patient (not their names):

                Student: Hello, how are you today?

                Patient: I am feeling sick. 

                Student: Oh no, how long have you felt sick? 

                Patient: About two days. 

                Separate speaker turns by an extra new line, as above ('\n\n'). The lines do not always have to be alternating labels. There may be consecutive lines from one speaker. Do not use any speaker labels for the two participants other than "Student" and "Patient."

                The transcript may contain some instructions for the student, coming from a third-party speaker. Please label this segment "Instructions: ". 
                If there are no instructions in the transcript, disregard this label. 

                IMPORTANT: Do not remove or add any words to the transcript other than the speaker labels. Also, do not add or remove any punctuation or change any spellings.'''
    user_prompt = 'transcript: ' + transcript
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Generation stops on either the tokenizer's EOS id or the "<|eot_id|>" id.
    tokenizer = pipeline.tokenizer
    stop_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generation_options = {
        "max_new_tokens": 10000,
        "eos_token_id": stop_ids,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    results = pipeline(chat, **generation_options)

    conversation = results[0]["generated_text"]
    return conversation[-1]

In [5]:
def save_file(diarization, transcript_id, path = '/archive/shared/sim_center/shared/annie/experiments/llama-70B-instruct/'):
    """Write a diarized transcript to ``<path><transcript_id>.txt``.

    The file starts with an ``ID: <transcript_id>`` header line, then a blank
    line, then ``diarization``. ``path`` is used as a plain string prefix, so
    it must already end with a path separator. An existing file is overwritten.
    """
    # Alternative output directory kept from an earlier run:
    # path = '/archive/shared/sim_center/shared/annie/claude-opus/'

    # Explicit UTF-8 so non-ASCII characters in the model output are written
    # the same way regardless of the kernel's locale.
    with open(path + transcript_id + ".txt", "w", encoding="utf-8") as outfile:
        outfile.write('ID: ' + transcript_id + '\n\n' + diarization)

In [6]:
def llama_diarize(transcript_id):
    """Run the read -> summarize -> diarize -> save sequence for one transcript.

    Prints the ID first so progress is visible in the cell output.
    """
    print(transcript_id)
    full_text = read_transcript_from_id(transcript_id)
    summary_message = summarize(full_text)
    labelled_message = diarize(summary_message, full_text)
    # Only the text of the reply is written to disk.
    save_file(labelled_message["content"], transcript_id)

In [7]:
# Transcript IDs to process, grouped in two batches of ten.
id_set1 = [
    '01_0542_298135',
    '02_0036_174595',
    '03_0028_174553',
    '04_0043_174686',
    '05_0033_174804',
    '06_0079_175106',
    '07_0068_174641',
    '08_0029_174576',
    '09_0029_174582',
    '10_0991_331330',
]

id_set2 = [
    '01_1080_366142',
    '02_1056_380177',
    '03_1500_380168',
    '04_1512_380182',
    '05_1066_380195',
    '06_1048_365209',
    '07_1111_380134',
    '08_1044_380133',
    '09_1039_380193',
    '10_1005_331402',
]

In [8]:
# temp for formatting
def remove_extraspace(path, ids):
    """Collapse runs of spaces to a single space in each ``<path><id>.txt`` file.

    Each file is rewritten in place. ``path`` is used as a plain string prefix,
    so it must already end with a path separator. Only the space character is
    affected; newlines and other whitespace are left as they are.

    Running the function again on an already-cleaned file leaves it unchanged:
    no leading space is added and no double spaces remain to collapse.
    """
    for transcript_id in ids:  # avoids shadowing the builtin `id`
        this_path = path + transcript_id + '.txt'
        with open(this_path, 'r') as infile:
            text = infile.read()
        # A single replace pass halves each run; repeat until none are left.
        while '  ' in text:
            text = text.replace('  ', ' ')
        with open(this_path, 'w') as outfile:
            outfile.write(text)

In [12]:
# Run the full read/summarize/diarize/save sequence on the first ID of set 1 only.
llama_diarize(id_set1[0])

01_0542_298135


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [14]:
# Clean up spacing in the file produced for the first ID. The directory has to
# be the one save_file() writes to by default, because llama_diarize() calls
# save_file() without passing a path.
remove_extraspace('/archive/shared/sim_center/shared/annie/experiments/llama-70B-instruct/', [id_set1[0]])

In [15]:
# Diarize the remaining IDs of set 1, then clean up spacing in their output files.
# The loop variable is not called `id`: at notebook scope that would replace the
# builtin id() for the rest of the kernel session.
for current_id in id_set1[1:]:
    llama_diarize(current_id)
# Same directory as save_file()'s default, which is where llama_diarize() writes.
remove_extraspace('/archive/shared/sim_center/shared/annie/experiments/llama-70B-instruct/', id_set1[1:])

02_0036_174595


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


03_0028_174553


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [10]:
# Diarize every ID of set 2. The loop variable is not called `id`: at notebook
# scope that would replace the builtin id() for the rest of the kernel session.
for current_id in id_set2:
    llama_diarize(current_id)

01_1080_366142


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
