In [1]:
from moviepy.editor import VideoFileClip

# conver the video to an audio file

video_path = '../data-capture/TherapySessionRecordings/20230613_161520.mp4'
video = VideoFileClip(video_path)
audio = video.audio
audio_file_path = "../data-capture/TherapySessionRecordings/audio.wav"
audio.write_audiofile(audio_file_path)
video.close()

MoviePy - Writing audio in ../data-capture/TherapySessionRecordings/audio.wav


                                                                                                                       

MoviePy - Done.




In [9]:
import requests
import json
import os
from openai import AzureOpenAI
import math


with open("config.json", "r") as f:
    config = json.loads(f.read())

url = "https://usecase1hub1533117385.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"

headers = {
    "Ocp-Apim-Subscription-Key": config["SUBSCRIPTION_KEY"],
    "Accept": "application/json"
}

files = {
    "audio": open(audio_file_path, "rb"),
    "definition": (None, '{"locales":["en-US"], "diarization": {"maxSpeakers": 4,"enabled": true}}', 'application/json')
}

response = requests.post(url, headers=headers, files=files)

#transcribe the video with diarization enabled
r_json = response.json()


In [10]:
def convert_milliseconds_to_minute_timestamps(ms):
    seconds = ms/1000
    minutes = int(seconds//60)
    rem_seconds = int(math.floor(seconds - minutes * 60))
    return f'{minutes}:{rem_seconds:02}'
    
# add formatted timestamps for the video
for phrase in r_json["phrases"]:
    phrase["offsetSeconds"] = convert_milliseconds_to_minute_timestamps(phrase['offsetMilliseconds'])

In [28]:
# pass the transcribed text to chatgpt for analysis and chat

system = """
You will be given a json object that is an audio transcription of a therapy session with a patient with autism.
Your job is to answer questions about the data, e.g. at what time stamps did speaker 1 start and end conversations, 
or where did speaker 1 hesitate?

Return your output as a markdown, with bulleted lists where appropriate.

If you provide any relevant time stamps, always include the transcript for words or phrases that were spoken at those 
time stamps. Only refer to the "offsetSeconds" attributes for phrases when providing timestamps.
"""

endpoint = "https://usecase1hub1533117385.openai.azure.com/"
model_name = "gpt-4"
deployment = "gpt-4"

subscription_key = config["SUBSCRIPTION_KEY"]
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

messages=[
    {"role": "system", "content": system},
    {"role": "user", "content": [
            {
                "type": "text",
                "text": json.dumps(r_json["phrases"])
            }
        ],
    }
]


def interface_with_gpt(messages, user_query, append_response=False):
    messages.append({
        "role": "user",
        "content": user_query
    })
    
    response = client.chat.completions.create(
        messages=messages,
        max_tokens=4096,
        temperature=.9,
        top_p=.5,
        model=deployment
    )
    
    if append_response:
        messages.append({
            "role": "assistant",
            "content": response.choices[0].message.content
        })
    
    return response, messages



user_query = "Where does the patient (speaker 2) start and end conversations?"
user_query_2 = "Where does the patient (speaker 2) pause or repeat themselves?"

response, messages = interface_with_gpt(messages, user_query_2, append_response=True)
print(response.choices[0].message.content)

- **Pause:**
  - **Timestamp:** "0:09"
  - **Transcript:** "I'm shopping for the unsalted butter for Father's Day."
  - **Details:** There is a noticeable pause between "the" and "unsalted" (10800ms to 12480ms).

- **Repetition:**
  - **Timestamp:** "0:25"
  - **Transcript:** "Talk to you soon."
  - **Details:** This phrase is repeated by speaker 2 at "0:25" and then immediately echoed by speaker 1 at "0:25" and "0:26".


In [25]:
interface_with_gpt(messages, "refine your outp", append_response=True)

[{'speaker': 1,
  'offsetMilliseconds': 3920,
  'durationMilliseconds': 600,
  'text': 'Hi, Drew.',
  'words': [{'text': 'Hi,',
    'offsetMilliseconds': 3920,
    'durationMilliseconds': 240},
   {'text': 'Drew.', 'offsetMilliseconds': 4160, 'durationMilliseconds': 360}],
  'locale': 'en-US',
  'confidence': 0.8631109,
  'offsetSeconds': '0:03'},
 {'speaker': 1,
  'offsetMilliseconds': 5280,
  'durationMilliseconds': 480,
  'text': 'Hi.',
  'words': [{'text': 'Hi.',
    'offsetMilliseconds': 5280,
    'durationMilliseconds': 480}],
  'locale': 'en-US',
  'confidence': 0.8631109,
  'offsetSeconds': '0:05'},
 {'speaker': 1,
  'offsetMilliseconds': 7800,
  'durationMilliseconds': 1400,
  'text': 'What are you shopping for?',
  'words': [{'text': 'What',
    'offsetMilliseconds': 7800,
    'durationMilliseconds': 240},
   {'text': 'are', 'offsetMilliseconds': 8040, 'durationMilliseconds': 80},
   {'text': 'you', 'offsetMilliseconds': 8120, 'durationMilliseconds': 80},
   {'text': 'shoppin