In [1]:
from __future__ import print_function

import json
import random
import time
import urllib
import urllib.request

import boto3
import pandas as pd
from simpletransformers.classification import ClassificationModel

### The below block is used for obtaining an encounter's full audio clip, which you can upload to the bucket

In [None]:
from __future__ import absolute_import, division, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from google.cloud import bigquery
from google.cloud import storage
import predictionhealth as ph
from imp import reload
from pydub import AudioSegment
import numpy as np
import scipy.io.wavfile
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd
import librosa
import librosa.display as ld
import random

from tensorflow.keras import datasets, layers, models

### Running transcription jobs

In [2]:
# Request parameters for a medical transcription job. The keys are the ones start_job()
# forwards as keyword arguments to start_medical_transcription_job.
# NOTE(review): GenerateJobName() is defined in a later cell, so that cell has to be run
# before this one on a fresh kernel.
data = {
    "MedicalTranscriptionJobName": GenerateJobName(),
    "LanguageCode": "en-US",
    "MediaFormat": "wav",
    "Media": {
        "MediaFileUri": "https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/youtube_audio_4.wav"
    },
    "OutputBucketName": "test-transcribe-bartleby",
    "Specialty": "PRIMARYCARE",
    "Type": "CONVERSATION",
    "Settings": {
      "MaxSpeakerLabels": 3,
      "ShowSpeakerLabels": True
    }
}

In [1]:

# Client used (as a global) by start_job() and by the status-polling cell.
transcribe = boto3.client('transcribe')
job_name = data['MedicalTranscriptionJobName']  # read by the status-polling cell
job_uri = data['Media']['MediaFileUri']  # NOTE(review): not referenced by any other visible cell

# NOTE(review): start_job is defined in a later cell; run that cell first on a fresh kernel.
start_job(data)

In [6]:
def start_job(d):
    """Submit a medical transcription job through the global ``transcribe`` client.

    Args:
        d: Request dict. Only the keys listed below are forwarded; a missing
            key raises ``KeyError`` and extra keys are ignored.

    Returns:
        None. The service response is discarded.
    """
    forwarded_keys = (
        'MedicalTranscriptionJobName',
        'LanguageCode',
        'MediaFormat',
        'Media',
        'Type',
        'Specialty',
        'OutputBucketName',
        'Settings',
    )
    request = {key: d[key] for key in forwarded_keys}
    transcribe.start_medical_transcription_job(**request)

# Counter behind GenerateJobName(); re-running this cell resets it to 0.
transcription_number = 0


def GenerateJobName():
    """Return the next unique job name, ``'test-aws-accuracy-<n>'``.

    Increments the module-level ``transcription_number`` and returns the
    name built from the new value, so the first call yields
    ``'test-aws-accuracy-1'``.
    """
    # Without `global`, the augmented assignment makes the name local and
    # raises UnboundLocalError on the first call.
    global transcription_number
    transcription_number += 1
    # str() is required: concatenating str and int raises TypeError.
    return 'test-aws-accuracy-' + str(transcription_number)


In [4]:
# Poll every 5 seconds until the job reports COMPLETED or FAILED; the final
# response is left in `status` for the following cells.
status = transcribe.get_medical_transcription_job(MedicalTranscriptionJobName=job_name)
while status['MedicalTranscriptionJob']['TranscriptionJobStatus'] not in ('COMPLETED', 'FAILED'):
    print("Not ready yet...")
    time.sleep(5)
    status = transcribe.get_medical_transcription_job(MedicalTranscriptionJobName=job_name)


In [2]:
print(status)

In [3]:
# Download the finished transcript and save it locally. Both the HTTP response and the
# output file are context-managed so they are closed even if reading or writing fails.
with urllib.request.urlopen(status['MedicalTranscriptionJob']['Transcript']['TranscriptFileUri']) as transcript:
    transcript_as_string = transcript.read().decode('utf-8')
# The payload was decoded as UTF-8, so write it back with the same encoding rather than
# the platform default.
with open('returned_transcript.json', 'w', encoding='utf-8') as outfile:
    outfile.write(transcript_as_string)


# Preparing data, creating a model, validating the model, and overwriting transcripts

### Loading raw data

In [5]:
unstructured=[]

In [6]:
# Append the raw transcripts for encounters 0 through 8, in order, to `unstructured`.
for encounter_number in range(9):
    with open('Downloads/test-aws-accuracy-%d.json' % encounter_number) as json_file:
        unstructured.append(json.load(json_file))

### Breaking down the data based on whether it's a clinician or patient speaking

In [9]:
# (index into `unstructured`, clinician speaker index) for every encounter.
# Encounters 0-6 are used for training, 7 and 8 are held out for validation.
TRAIN_ENCOUNTERS = [(0, 1), (1, 0), (2, 1), (3, 0), (4, 1), (5, 0), (6, 1)]
VAL_ENCOUNTERS = [(7, 0), (8, 0)]

train = {'clinician': [], 'patient': []}
val = {'clinician': [], 'patient': []}

# Training entries are one token list per encounter and role.
for encounter_idx, clinician_index in TRAIN_ENCOUNTERS:
    prepared = prepare_encounter_data(unstructured[encounter_idx], clinician_index)
    train['clinician'].append(prepared['clinician'])
    train['patient'].append(prepared['patient'])

# Validation entries also carry the encounter number: (token list, encounter number).
for encounter_idx, clinician_index in VAL_ENCOUNTERS:
    prepared = prepare_encounter_data(unstructured[encounter_idx], clinician_index)
    val['clinician'].append((prepared['clinician'], encounter_idx))
    val['patient'].append((prepared['patient'], encounter_idx))

In [31]:
with open('Downloads/test-aws-accuracy-40.json') as json_file:
    uglies = json.load(json_file)
res = prepare_encounter_data(uglies, 0)
# create_training_data() iterates over a list of conversations (each a list of tokens)
# per role, so the single encounter is wrapped in a one-element list. Passing the flat
# token list would make it treat every word as a conversation of characters.
train = {
    'clinician': [res['clinician']],
    'patient': [res['patient']],
}

In [22]:
def prepare_encounter_data(data, clinician_index):
    """Split a diarized transcript into clinician and patient token lists.

    Args:
        data: Parsed transcript JSON. Reads ``data["results"]["items"]``
            (the text of each item is ``alternatives[0]["content"]``; only
            spoken words carry a ``"start_time"`` key) and the
            ``"speaker_label"`` of every entry under
            ``data["results"]["speaker_labels"]["segments"][*]["items"]``.
        clinician_index: Integer ``n`` such that ``'spk_n'`` is the clinician.

    Returns:
        ``{'clinician': [...], 'patient': [...]}``. The patient list is the
        concatenation of every other speaker's tokens, ``spk_0`` first.
    """
    results = data["results"]

    # One label per spoken word, in transcript order.
    speaker_labels = [
        labelled_word["speaker_label"]
        for segment in results["speaker_labels"]["segments"]
        for labelled_word in segment["items"]
    ]

    speaker_aggregation = {"spk_0": [], "spk_1": [], "spk_2": []}

    # Items without "start_time" (punctuation) have no label of their own; they are
    # attached to the speaker of the word they follow. Punctuation that precedes any
    # word goes to the first speaker.
    current_speaker = speaker_labels[0] if speaker_labels else None
    next_label = 0
    for item in results["items"]:
        if "start_time" in item:
            if next_label == len(speaker_labels):
                # More spoken words than labels: nothing left to align against.
                break
            current_speaker = speaker_labels[next_label]
            next_label += 1
        if current_speaker is None:
            continue
        speaker_aggregation.setdefault(current_speaker, []).append(
            item["alternatives"][0]["content"]
        )

    clinician_tag = 'spk_' + str(clinician_index)
    # Everyone who is not the clinician counts as the patient side. Speakers that
    # never talk contribute an empty list, so two- and three-speaker encounters are
    # handled by the same code path.
    patient_tokens = []
    for tag, tokens in speaker_aggregation.items():
        if tag != clinician_tag:
            patient_tokens += tokens

    return {
        'clinician': speaker_aggregation.get(clinician_tag, []),
        'patient': patient_tokens,
    }
    

In [4]:
train

In [5]:
val

### Formatting the data as necessary

In [28]:
def create_training_data(
    clin_examples, 
    pt_examples):
    """Build sentence-level training rows from per-encounter token lists.

    Args:
        clin_examples: List of conversations (each a list of tokens) spoken
            by the clinician.
        pt_examples: Same structure for the patient.

    Returns:
        List of ``[text, label, label_name]`` rows, where label 0 is
        ``'clinician'`` and label 1 is ``'patient'``. Tokens are joined with a
        leading space each; a row is closed at every ``.``, ``!`` or ``?``
        token, which is appended without a space.
    """
    sentence_endings = ('.', '!', '?')
    data = []

    labelled_groups = (
        (0, 'clinician', clin_examples),
        (1, 'patient', pt_examples),
    )
    for label, label_name, conversations in labelled_groups:
        for encounter_conversation in conversations:
            text = ''
            for token in encounter_conversation:
                if token in sentence_endings:
                    data.append([text + token, label, label_name])
                    text = ''
                else:
                    text += ' ' + token
            # Keep words that follow the last sentence-ending token instead of
            # silently discarding them.
            if text:
                data.append([text, label, label_name])

    return data

In [27]:
def create_validation_data(
    clin_examples, 
    pt_examples):
    """Build one validation row per (role, encounter).

    Args:
        clin_examples: List of ``(tokens, encounter_number)`` entries for the
            clinician.
        pt_examples: Same structure for the patient.

    Returns:
        List of ``[text, label, label_name, encounter_number]`` rows, where
        ``text`` is every token prefixed with a single space, label 0 is
        ``'clinician'`` and label 1 is ``'patient'``.
    """
    roles = (('clinician', clin_examples), ('patient', pt_examples))
    rows = []
    for label, (label_name, entries) in enumerate(roles):
        for entry in entries:
            joined = ''.join(' ' + word for word in entry[0])
            rows.append([joined, label, label_name, entry[1]])
    return rows

In [33]:
# Flatten the per-role dicts into lists of rows. `train` and `val` are rebound from dicts
# to lists here, so this cell cannot be re-run without first re-running a cell that
# rebuilds the dicts.
train = create_training_data(train['clinician'], train['patient'])
val = create_validation_data(val['clinician'], val['patient'])


In [6]:
train

In [7]:
val

## Balancing data

In [17]:
label_options = ['clinician', 'patient']

In [8]:
### Counting number of each label before balancing
# Rows are [text, label, label_name]; anything not named 'patient' counts as clinician.
pats = sum(1 for element in train if element[2] == 'patient')
clins = sum(1 for element in train if element[2] != 'patient')
print(pats)
print(clins)

In [20]:
train = balance_and_shuffle_dataset(train, label_options)

In [19]:
# Balances a list of [text, label, label_name] rows by randomly oversampling the less
# common labels until every label has as many rows as the most common one, then
# shuffles the result.
def balance_and_shuffle_dataset(samples, label_options):
    """Return a shuffled copy of ``samples`` with equally sized label groups.

    Args:
        samples: List of rows whose third element (index 2) is the label name.
        label_options: Label names to keep. Rows with any other label name
            are dropped.

    Returns:
        A new list of rows from ``samples``. Every label that occurs at least
        once is repeated until it has as many rows as the largest label group;
        each original row of a kept label appears at least once. Labels with
        no rows are skipped. ``samples`` itself is not modified.
    """
    # Row positions grouped by label, each group shuffled so that the rows chosen
    # for the partial repeat below are random.
    indices_by_label = []
    for label in label_options:
        label_indices = [idx for idx, row in enumerate(samples) if row[2] == label]
        random.shuffle(label_indices)
        indices_by_label.append(label_indices)

    target_len = max((len(group) for group in indices_by_label), default=0)

    all_indices = []
    for label_indices in indices_by_label:
        if not label_indices:
            # Nothing to oversample from; this also avoids dividing by zero.
            continue
        # Whole repeats of the group plus a partial repeat to hit target_len exactly.
        full_repeats, remainder = divmod(target_len, len(label_indices))
        all_indices.extend(label_indices * full_repeats)
        all_indices.extend(label_indices[:remainder])

    random.shuffle(all_indices)

    return [samples[index] for index in all_indices]

In [9]:
### counting labels after balancing
# Rows are [text, label, label_name]; anything not named 'patient' counts as clinician.
pats = sum(1 for element in train if element[2] == 'patient')
clins = sum(1 for element in train if element[2] != 'patient')
print(pats)
print(clins)

In [22]:
# Convert the row lists to DataFrames. Passing `columns=` to the constructor (instead of
# assigning `.columns` afterwards) also works when a row list is empty, where the
# assignment would fail with a length mismatch.
train = pd.DataFrame(train, columns=['text', 'labels', 'label_names'])
val = pd.DataFrame(val, columns=['text', 'labels', 'label_names', 'encounter_number'])

In [57]:
train

[]

In [58]:
val

[]

In [25]:
# Create a ClassificationModel
# Two output labels: the data-preparation functions in this notebook encode
# 0 = clinician and 1 = patient. Later cells use this object through the global MODEL.
MODEL = ClassificationModel(
    'bert', 
    'bert-base-cased', # can be a path to load a previously trained model
    num_labels=2,
    args={'reprocess_input_data': True, 'overwrite_output_dir': True},
    use_cuda=False) 


In [10]:
# Train the model
MODEL.train_model(train, output_dir='speaker_label_model')

In [49]:
# Load a previously saved checkpoint from the training output directory.
# NOTE(review): `model_loading` is not referenced by any later visible cell (predictions
# go through MODEL) — confirm whether this was meant to replace MODEL.
model_loading = ClassificationModel(
    "bert", "speaker_label_model/checkpoint-837-epoch-1", use_cuda=False
)

In [11]:
results = MODEL.predict(val['text'])

### In order to assess correct clinician vs patient determination, compare the results between actual clinician and actual patient transcripts for each encounter.
### To do this comparison, the patient score is subtracted from the clinician score in each row; the more-positive value for each encounter will be the predicted-clinician row.

In [30]:
results

(array([0, 0, 1, 1]), array([[ 1.0553558, -1.2158972],
        [ 1.4158463, -1.1864611],
        [-0.5651022,  1.1164746],
        [-0.5484368,  1.1450765]], dtype=float32))

In [31]:
# this is after changing training data length to sentences (previously used full encounters)
# Score per row = first output column minus second; the first half of the rows is
# printed on one line and the second half on the next.
modified_results = [row[0] - row[1] for row in results[1]]
print(modified_results[:len(modified_results) // 2])
print(modified_results[len(modified_results) // 2:])

[2.271253, 2.6023073]
[-1.6815768, -1.6935134]


In [14]:
val

## Method for converting AWS Transcript to PH API type

In [62]:
with open('<filepath>') as json_file:
    text_basis = json.load(json_file)

In [15]:
text_basis

In [None]:
# This is essentially the final format desired for a transcript
# (booleans are written as Python `False`; the JSON spelling `false` is a NameError here)
final_type = {
  "0_count": 0,
  "1_count": 0,
  "2_count": 0,
  "3_count": 0,
  "4_count": 0,
  "5_count": 0,
  "6_count": 0,
  "7_count": 0,
  "alternatives": [],
  "alternatives_lock": 0,
  "busy_num": 0,
  "completed_functions": [],
  "functions": [],
  "jobID": "",
  "sentences": [
    {
      "compliance_label": {
        "label": "",
        "labelIndex": 0,
        "labelVector": []
      },
      "compliance_label_lock": 0,
      "fam_label": {},
      "fam_label_lock": 0,
      "indexedConcepts": [],
      "originalSentence": "",
      "originalSentence_lock": 0,
      "sentenceID": "",
      "sentiment": {
        "label": "negative",
        "labelIndex": 0,
        "labelVector": []
      },
      "sentiment_lock": 0,
      "tokens": [
        {
          "acronymConfidence": 1,
          "asrSource": "GCPSpeechRecognition",
          "endTime": 1,
          "isAcronym": False,
          "isPunctuation": False,
          "isSpace": False,
          "originalString": "This",
          "speakerConfidence": 0,
          "speakerTag": "0",
          "startTime": 0,
          "textConfidence": 0
        }
      ]
    }
  ],
  "sentences_lock": 0,
  "status": "Completed",
  "uri": "",
  "uri_lock": 0,
  "uris": [],
  "uris_lock": 8
}

In [16]:
# This call will convert all labels properly for a given file
swapped = predict_and_swamp_for_api(text_basis)

In [17]:
print(json.dumps(swapped, indent=4))

In [46]:
# Serialize the relabelled transcript and write it to disk; the `with` block closes the
# file even if the write raises.
content = json.dumps(swapped)
with open("output_2_swapped.json", "w") as f:
    f.write(content)

In [41]:
def predict_and_swamp_for_api(basis):
    """Relabel every token in ``basis`` as ``'clinician'`` or ``'patient'``.

    Each speaker's concatenated text is scored with the global ``MODEL``; the
    speaker with the largest (first output - second output) score is taken to
    be the clinician and every other speaker becomes the patient.

    Args:
        basis: Transcript dict with ``basis['sentences'][*]['tokens'][*]``
            entries carrying ``'speakerTag'`` and ``'originalString'``.

    Returns:
        The same ``basis`` object, with each token's ``'speakerTag'``
        overwritten in place.
    """
    consolidated_by_speaker, _original_type = consolidate_sentences_by_speaker(basis['sentences'])

    ## preparing data for prediction
    # Only speakers that actually said something are scored: an unused speaker slot
    # has empty text and must not be able to win the clinician vote.
    candidates = [(tag, text) for tag, text in consolidated_by_speaker.items() if text.strip()]
    if not candidates:
        # Degenerate transcript with no text at all: score every slot.
        candidates = list(consolidated_by_speaker.items())
    speaker_tags = [tag for tag, _ in candidates]
    texts = [text for _, text in candidates]

    ## predicting
    raw_outputs = MODEL.predict(texts)[1]

    modified_results = [output[0] - output[1] for output in raw_outputs]
    print(modified_results)

    ## internalizing which current label is the clinician/patient
    clinician_label = speaker_tags[int(np.argmax(modified_results))]

    # reassign all token speaker labels
    for sentence in basis['sentences']:
        for token in sentence['tokens']:
            if token['speakerTag'] == clinician_label:
                token['speakerTag'] = 'clinician'
            else:
                token['speakerTag'] = 'patient'

    return basis

In [37]:
def consolidate_sentences_by_speaker(sentences):
    """Concatenate every token's text per speaker tag.

    The tag scheme is detected from the first token: ``'0'``/``'1'``/``'2'``
    is reported as ``'google'`` and ``'spk_0'``/``'spk_1'``/``'spk_2'`` as
    ``'amazon'``.

    Args:
        sentences: List of dicts with a ``'tokens'`` list; each token has
            ``'speakerTag'`` and ``'originalString'``.

    Returns:
        ``(consolidated, originalType)`` where ``consolidated`` maps each of
        the scheme's three tags to that speaker's token strings concatenated
        with no separator.

    Raises:
        ValueError: If there are no tokens, or the first token's tag belongs
            to neither scheme.
        KeyError: If a later token carries a tag outside the detected scheme.
    """
    tag_schemes = {
        'google': ['0', '1', '2'],
        'amazon': ['spk_0', 'spk_1', 'spk_2'],
    }

    first_token = next(
        (token for sentence in sentences for token in sentence['tokens']),
        None,
    )
    if first_token is None:
        raise ValueError('cannot detect the speaker tag scheme: no tokens found')

    originalType = ''
    for scheme, tags in tag_schemes.items():
        if first_token['speakerTag'] in tags:
            originalType = scheme
            break
    if not originalType:
        raise ValueError(
            'unrecognized speaker tag: %r' % (first_token['speakerTag'],)
        )

    consolidated = {tag: '' for tag in tag_schemes[originalType]}
    for sentence in sentences:
        for token in sentence['tokens']:
            consolidated[token['speakerTag']] += token['originalString']

    return consolidated, originalType

# Testing overall accuracy for prediction based on taskBoxes pre-labeled as 'clinician'/'patient' (not necessary for building a new model)

In [17]:
actual_id = '<encounter id>'
print(actual_id)
actual = ph.get_encounter(actual_id, include_audio=True, dataset='<dataset>')


In [18]:
print([tb.label for tb in actual.taskBoxes])

In [19]:
reload(ph)
print('getting audio clips')
actual.audioClips=ph.get_audio_clips([tb.audioID for tb in actual.taskBoxes if tb.audioID], bucket_name='<bucket name>')
actual.fullAudioClip=actual.get_full_encounter_audio_clip()
    

In [20]:
ac = actual.fullAudioClip
ac.play_audio()

### Determining clinician label

In [13]:
with open('<transcript path name>') as json_file:
    comparison = json.load(json_file)
comparison = prepare_comparison_data(comparison)
comparison = create_comparison_data(comparison['spk_0'], comparison['spk_1'], comparison['spk_2'])

In [14]:
def prepare_comparison_data(data):
    """Group a diarized transcript's tokens by speaker label.

    Args:
        data: Parsed transcript JSON. Reads ``data["results"]["items"]``
            (the text of each item is ``alternatives[0]["content"]``; only
            spoken words carry a ``"start_time"`` key) and the
            ``"speaker_label"`` of every entry under
            ``data["results"]["speaker_labels"]["segments"][*]["items"]``.

    Returns:
        Dict mapping ``'spk_0'``, ``'spk_1'`` and ``'spk_2'`` (plus any other
        label that occurs) to that speaker's tokens in transcript order.
    """
    results = data["results"]

    # One label per spoken word, in transcript order.
    speaker_labels = [
        labelled_word["speaker_label"]
        for segment in results["speaker_labels"]["segments"]
        for labelled_word in segment["items"]
    ]

    speaker_aggregation = {"spk_0": [], "spk_1": [], "spk_2": []}

    # Items without "start_time" (punctuation) have no label of their own; they are
    # attached to the speaker of the word they follow. Punctuation that precedes any
    # word goes to the first speaker.
    current_speaker = speaker_labels[0] if speaker_labels else None
    next_label = 0
    for item in results["items"]:
        if "start_time" in item:
            if next_label == len(speaker_labels):
                # More spoken words than labels: nothing left to align against.
                break
            current_speaker = speaker_labels[next_label]
            next_label += 1
        if current_speaker is None:
            continue
        speaker_aggregation.setdefault(current_speaker, []).append(
            item["alternatives"][0]["content"]
        )

    return speaker_aggregation

In [15]:
def create_comparison_data(
    spk_0, 
    spk_1,
    spk_2):
    """Build one ``[text, label, label_name]`` row per speaker.

    Args:
        spk_0, spk_1, spk_2: Token lists for the three speaker slots.

    Returns:
        Three rows, in speaker order. ``text`` is every token prefixed with a
        single space (empty string for an empty list), ``label`` is 0, 1 or 2
        and ``label_name`` is ``'spk_0'``, ``'spk_1'`` or ``'spk_2'``.
    """
    speakers = (('spk_0', spk_0), ('spk_1', spk_1), ('spk_2', spk_2))
    return [
        [''.join(' ' + word for word in words), label, label_name]
        for label, (label_name, words) in enumerate(speakers)
    ]

In [18]:
print(comparison['text'][0])

In [19]:
comparison = pd.DataFrame(comparison)
comparison.columns = ['text', 'labels', 'label_names']
print(comparison)
results = MODEL.predict(comparison['text'])
print(results)
print()
print()
# Clinician score minus patient score, the same measure the validation cell and
# predict_and_swamp_for_api use; adding the two outputs would not separate the classes.
modified_results = [result[0]-result[1] for result in results[1]]
print(modified_results[0:int(len(modified_results)/2)])
print(modified_results[int(len(modified_results)/2):])

In [20]:
with open('Downloads/test-aws-accuracy-40.json') as json_file:
    timestamps = json.load(json_file)
timestamps

In [21]:
timestamps

### Using clinician label to reassign labels

In [22]:
# Speaker label that the comparison step identified as the clinician.
CLINICIAN_SPEAKER = 'spk_1'


def _role_for(speaker_label):
    """Map a raw speaker label to 'clinician' or 'patient'.

    Labels that are already roles are returned unchanged, so running this cell
    a second time does not turn every 'clinician' into 'patient'.
    """
    if speaker_label in ('clinician', 'patient'):
        return speaker_label
    return 'clinician' if speaker_label == CLINICIAN_SPEAKER else 'patient'


# Relabel every segment and every word entry inside it, in place.
for segment in timestamps['results']['speaker_labels']['segments']:
    segment['speaker_label'] = _role_for(segment['speaker_label'])
    for subsegment in segment['items']:
        subsegment['speaker_label'] = _role_for(subsegment['speaker_label'])
timestamps

### Checking labels

In [23]:
proportion_taskboxes, proportion_time = find_accuracy(timestamps, actual)
print(proportion_taskboxes)
print(proportion_time)

In [7]:
def find_accuracy(transcript, encounter):
    """Compare transcript speaker labels against an encounter's labelled task boxes.

    Task boxes are laid end to end by their ``duration``; each one whose label
    is a usable role is compared with the transcript label returned by
    ``find_transcript_label`` for that box.

    Args:
        transcript: Transcript dict; its
            ``['results']['speaker_labels']['segments']`` list is searched.
        encounter: Object with a ``taskBoxes`` sequence whose items expose
            ``label`` and ``duration``.

    Returns:
        ``(proportion_of_boxes_correct, proportion_of_time_correct)``. Either
        value is ``nan`` when there is nothing to compare against.
    """
    ignored_labels = (None, 'None', 'both', 'Other', 'other', 'Both', 'ambient')

    running_duration = 0
    comparison_duration = 0
    correct_duration = 0
    correct = 0
    total = 0
    # Use the transcript that was passed in, not whatever global happens to exist.
    transcript_segments = transcript['results']['speaker_labels']['segments']

    taskboxes = encounter.taskBoxes
    labels = [tb.label for tb in taskboxes]
    print(labels)
    for taskbox, label in zip(taskboxes, labels):
        running_duration += taskbox.duration
        print(running_duration)

        if label in ignored_labels:
            continue

        # Looked up once and reused for both the comparison and the printout.
        predicted = find_transcript_label(
            transcript_segments, taskbox.duration, running_duration, True)
        if label == predicted:
            correct_duration += taskbox.duration
            correct += 1
        comparison_duration += taskbox.duration
        print(label)
        print(predicted)
        total += 1

    # Avoid ZeroDivisionError when no task box carried a usable label (or all
    # usable boxes had zero duration).
    proportion_taskboxes = correct / total if total else float('nan')
    proportion_time = (
        correct_duration / comparison_duration if comparison_duration else float('nan')
    )
    return proportion_taskboxes, proportion_time

In [8]:
def find_transcript_label(ts, duration, total_time, print_req):
    """Return the speaker label of the segment at the midpoint of a task box.

    Args:
        ts: Segments in time order, each with ``'start_time'``,
            ``'end_time'`` (convertible with ``float``) and ``'speaker_label'``.
        duration: Length of the task box.
        total_time: Time at which the task box ends.
        print_req: When true, print the midpoint being looked up.

    Returns:
        The label of the segment containing the midpoint
        ``total_time - duration/2``. If the midpoint falls in a gap before a
        segment (including before the first one), that upcoming segment's
        label. ``None`` if the midpoint lies after the last segment.
    """
    time_target = total_time-duration/2
    if print_req:
        print(time_target)
    for idx, segment in enumerate(ts):
        start = float(segment['start_time'])
        end = float(segment['end_time'])
        if start <= time_target <= end:
            return segment['speaker_label']
        # Gap before this segment. idx == 0 is handled explicitly: ts[idx-1] would
        # otherwise wrap around to the last segment.
        if time_target < start and (idx == 0 or time_target > float(ts[idx-1]['end_time'])):
            return segment['speaker_label']
    return None

## Testing speed of AWS Non-Medical Transcription (unrelated to anything above)

In [32]:
# Request for a standard (non-medical) transcription job. Every entry is a keyword
# argument of start_transcription_job, so the dict is unpacked straight into the call.
data = dict(
    TranscriptionJobName="test-non-medical-3",
    LanguageCode="en-US",
    MediaFormat="wav",
    Media={
        "MediaFileUri": "https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/demo.wav"
    },
    OutputBucketName="test-transcribe-bartleby",
    Settings={"MaxSpeakerLabels": 3, "ShowSpeakerLabels": True},
)

transcribe = boto3.client('transcribe')
job_name = data['TranscriptionJobName']
job_uri = data['Media']['MediaFileUri']

transcribe.start_transcription_job(**data)

{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-3',
  'TranscriptionJobStatus': 'IN_PROGRESS',
  'LanguageCode': 'en-US',
  'MediaFormat': 'wav',
  'Media': {'MediaFileUri': 'https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/demo.wav'},
  'StartTime': datetime.datetime(2020, 7, 31, 11, 23, 28, 477000, tzinfo=tzlocal()),
  'CreationTime': datetime.datetime(2020, 7, 31, 11, 23, 28, 454000, tzinfo=tzlocal()),
  'Settings': {'ShowSpeakerLabels': True, 'MaxSpeakerLabels': 3}},
 'ResponseMetadata': {'RequestId': 'dd8ff2ea-a765-4f1d-a72e-3715803ecfea',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 31 Jul 2020 16:23:27 GMT',
   'x-amzn-requestid': 'dd8ff2ea-a765-4f1d-a72e-3715803ecfea',
   'content-length': '364',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [34]:
# Poll every 5 seconds until the job reports COMPLETED or FAILED, then print the
# final response.
status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-3")
while status['TranscriptionJob']['TranscriptionJobStatus'] not in ('COMPLETED', 'FAILED'):
    print("Not ready yet...")
    time.sleep(5)
    status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-3")
print(status)

Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-3', 'TranscriptionJobStatus': 'COMPLETED', 'LanguageCode': 'en-US', 'MediaSampleRateHertz': 44100, 'MediaFormat': 'wav', 'Media': {'MediaFileUri': 'https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/demo.wav'}, 'Transcript': {'TranscriptFileUri': 'https://s3.us-east-2.amazonaws.com/test-transcribe-bartleby/test-non-medical-3.json'}, 'StartTime': datetime.datetime(2020, 7, 31, 11, 23, 28, 477000, tzinfo=tzlocal()), 'CreationTime': datetime.datetime(2020, 7, 31, 11, 23, 28, 454000, tzinfo=tzlocal()), 'CompletionTime': datetime.datetime(2020, 7, 31, 11, 25, 18, 432000, tzinfo=tzlocal()), 'Settings': {'ShowSpeakerLabels': True, 'MaxSpeakerLabels': 3, 

In [67]:
# Request for a standard (non-medical) transcription job. Every entry is a keyword
# argument of start_transcription_job, so the dict is unpacked straight into the call.
data = dict(
    TranscriptionJobName="test-non-medical-7",
    LanguageCode="en-US",
    MediaFormat="wav",
    Media={
        "MediaFileUri": "https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/download+(10).wav"
    },
    OutputBucketName="test-transcribe-bartleby",
    Settings={"MaxSpeakerLabels": 3, "ShowSpeakerLabels": True},
)

transcribe = boto3.client('transcribe')
job_name = data['TranscriptionJobName']
job_uri = data['Media']['MediaFileUri']

transcribe.start_transcription_job(**data)

{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-7',
  'TranscriptionJobStatus': 'IN_PROGRESS',
  'LanguageCode': 'en-US',
  'MediaFormat': 'wav',
  'Media': {'MediaFileUri': 'https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/download+(10).wav'},
  'StartTime': datetime.datetime(2020, 7, 31, 12, 37, 16, 572000, tzinfo=tzlocal()),
  'CreationTime': datetime.datetime(2020, 7, 31, 12, 37, 16, 544000, tzinfo=tzlocal()),
  'Settings': {'ShowSpeakerLabels': True, 'MaxSpeakerLabels': 3}},
 'ResponseMetadata': {'RequestId': '6d99f6cb-cfc4-4e01-ac14-38313a8222d5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 31 Jul 2020 17:37:16 GMT',
   'x-amzn-requestid': '6d99f6cb-cfc4-4e01-ac14-38313a8222d5',
   'content-length': '373',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [68]:
# Poll every 5 seconds until the job reports COMPLETED or FAILED, then print the
# final response.
status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-7")
while status['TranscriptionJob']['TranscriptionJobStatus'] not in ('COMPLETED', 'FAILED'):
    print("Not ready yet...")
    time.sleep(5)
    status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-7")
print(status)

Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-7', 'TranscriptionJobStatus': 'COMPLETED', 'LanguageCode': 'en-US', 'MediaSampleRateHertz': 44100, 'MediaFormat': 'wav', 'Media': {'MediaFileUri': 'https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/download+(10).wav'}, 'Transcrip

In [38]:
# Request for a standard (non-medical) transcription job. Every entry is a keyword
# argument of start_transcription_job, so the dict is unpacked straight into the call.
data = dict(
    TranscriptionJobName="test-non-medical-5",
    LanguageCode="en-US",
    MediaFormat="wav",
    Media={
        "MediaFileUri": "https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/download+(17).wav"
    },
    OutputBucketName="test-transcribe-bartleby",
    Settings={"MaxSpeakerLabels": 3, "ShowSpeakerLabels": True},
)

transcribe = boto3.client('transcribe')
job_name = data['TranscriptionJobName']
job_uri = data['Media']['MediaFileUri']

transcribe.start_transcription_job(**data)

{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-5',
  'TranscriptionJobStatus': 'IN_PROGRESS',
  'LanguageCode': 'en-US',
  'MediaFormat': 'wav',
  'Media': {'MediaFileUri': 'https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/download+(17).wav'},
  'StartTime': datetime.datetime(2020, 7, 31, 11, 30, 19, 469000, tzinfo=tzlocal()),
  'CreationTime': datetime.datetime(2020, 7, 31, 11, 30, 19, 447000, tzinfo=tzlocal()),
  'Settings': {'ShowSpeakerLabels': True, 'MaxSpeakerLabels': 3}},
 'ResponseMetadata': {'RequestId': '8444613d-829a-4d4e-bc9c-93f15bb413bf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 31 Jul 2020 16:30:18 GMT',
   'x-amzn-requestid': '8444613d-829a-4d4e-bc9c-93f15bb413bf',
   'content-length': '373',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [39]:
# Poll every 5 seconds until the job reports COMPLETED or FAILED, then print the
# final response.
status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-5")
while status['TranscriptionJob']['TranscriptionJobStatus'] not in ('COMPLETED', 'FAILED'):
    print("Not ready yet...")
    time.sleep(5)
    status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-5")
print(status)

Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-5', 'TranscriptionJobStatus': 'COMPLETED', 'LanguageCode': 'en-US', 'MediaSampleRateHe

In [100]:
# Request for a standard (non-medical) transcription job. Every entry is a keyword
# argument of start_transcription_job, so the dict is unpacked straight into the call.
data = dict(
    TranscriptionJobName="test-non-medical-8",
    LanguageCode="en-US",
    MediaFormat="mp3",
    Media={
        "MediaFileUri": "https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/fake-apppointment-2.mp3"
    },
    OutputBucketName="test-transcribe-bartleby",
    Settings={"MaxSpeakerLabels": 3, "ShowSpeakerLabels": True},
)

transcribe = boto3.client('transcribe')
job_name = data['TranscriptionJobName']
job_uri = data['Media']['MediaFileUri']

transcribe.start_transcription_job(**data)

{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-8',
  'TranscriptionJobStatus': 'IN_PROGRESS',
  'LanguageCode': 'en-US',
  'MediaFormat': 'mp3',
  'Media': {'MediaFileUri': 'https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/fake-apppointment-2.mp3'},
  'StartTime': datetime.datetime(2020, 7, 31, 15, 40, 27, 205000, tzinfo=tzlocal()),
  'CreationTime': datetime.datetime(2020, 7, 31, 15, 40, 27, 178000, tzinfo=tzlocal()),
  'Settings': {'ShowSpeakerLabels': True, 'MaxSpeakerLabels': 3}},
 'ResponseMetadata': {'RequestId': '2673b8f1-12dd-4da4-adb3-9a6e2a633a3c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 31 Jul 2020 20:40:27 GMT',
   'x-amzn-requestid': '2673b8f1-12dd-4da4-adb3-9a6e2a633a3c',
   'content-length': '379',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [102]:
# Poll every 5 seconds until the job reports COMPLETED or FAILED, then print the
# final response.
status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-8")
while status['TranscriptionJob']['TranscriptionJobStatus'] not in ('COMPLETED', 'FAILED'):
    print("Not ready yet...")
    time.sleep(5)
    status = transcribe.get_transcription_job(TranscriptionJobName="test-non-medical-8")
print(status)

Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'test-non-medical-8', 'TranscriptionJobStatus': 'COMPLETED', 'LanguageCode': 'en-US', 'MediaSampleRateHertz': 44100, 'MediaFormat': 'mp3', 'Media': {'MediaFileUri': 'https://test-transcribe-bartleby.s3.us-east-2.amazonaws.com/fake-apppointment-2.mp3'}, 'Transcript': {'TranscriptFileUri': 'https://s3.us-east-2.amazonaws.com/test-transcribe-bartleby/test-non-medical-8.json'}, 'StartTime': datetime.datetime(2020, 7, 31, 15, 40, 27, 205000, tzinfo=tzlocal()), 'CreationTime': datetime.datetime(2020, 7, 31, 15, 40, 27, 178000, tz