In [1]:
from moviepy.editor import VideoFileClip

# Convert the recorded session video to a WAV audio file for transcription.

video_path = '../data-capture/TherapySessionRecordings/20230613_161520.mp4'
audio_file_path = "../data-capture/TherapySessionRecordings/audio.wav"

video = VideoFileClip(video_path)
try:
    audio = video.audio
    # A clip without an audio stream exposes `audio` as None; fail with a
    # clear message instead of an AttributeError on the next line.
    if audio is None:
        raise ValueError(f"No audio track found in {video_path}")
    audio.write_audiofile(audio_file_path)
finally:
    # Always release the underlying reader, even if extraction fails.
    video.close()

MoviePy - Writing audio in ../data-capture/TherapySessionRecordings/audio.wav


                                                                                                                       

MoviePy - Done.




In [40]:
import requests
import json
import os
from openai import AzureOpenAI
import math


# Load service settings (including the subscription key) from the local
# JSON config file so that no secret is hard-coded in the notebook.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

# Transcribe the extracted audio with speaker diarization enabled by posting
# it as multipart/form-data to the speech-to-text endpoint below.
url = "https://usecase1hub1533117385.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"

headers = {
    "Ocp-Apim-Subscription-Key": config["SUBSCRIPTION_KEY"],
    "Accept": "application/json"
}

# Open the audio inside a context manager so the handle is closed once the
# upload finishes (or fails) instead of leaking for the life of the kernel.
with open(audio_file_path, "rb") as audio_fh:
    files = {
        "audio": audio_fh,
        "definition": (None, '{"locales":["en-US"], "diarization": {"maxSpeakers": 4,"enabled": true}}', 'application/json')
    }

    response = requests.post(url, headers=headers, files=files)

# Surface HTTP errors here rather than as a confusing KeyError on
# r_json["phrases"] in a later cell.
response.raise_for_status()
r_json = response.json()


In [44]:
def convert_milliseconds_to_minute_timestamps(ms):
    """Format a millisecond offset as an ``m:ss`` timestamp string.

    Fractions of a second are dropped (floored). Minutes are not wrapped
    into hours, so an offset of 62 minutes renders as ``62:05``.

    Args:
        ms: Offset in milliseconds.

    Returns:
        A string of whole minutes, a colon, and zero-padded whole seconds.
    """
    whole_minutes, leftover_seconds = divmod(ms / 1000, 60)
    return f'{int(whole_minutes)}:{int(math.floor(leftover_seconds)):02}'
    
# Annotate the transcription in place: every phrase, and every word inside
# it, gets a human-readable "Timestamp" derived from its millisecond offset.
for phrase_entry in r_json["phrases"]:
    phrase_entry["Timestamp"] = convert_milliseconds_to_minute_timestamps(
        phrase_entry['offsetMilliseconds']
    )
    for word_entry in phrase_entry["words"]:
        word_entry["Timestamp"] = convert_milliseconds_to_minute_timestamps(
            word_entry['offsetMilliseconds']
        )

In [45]:
# System prompt for the chat model. It describes the transcript JSON the model
# will receive and fixes the answer format: markdown output, timestamps taken
# from the phrase-level "Timestamp" attribute and quoted together with the
# spoken text, and gaps reported in seconds.

system = """
You will be given a json object that is an audio transcription of a therapy session with a patient with autism.
Your job is to answer questions about the data, e.g. at what time stamps did speaker 1 start and end conversations, 
or where did speaker 1 hesitate?

Return your output as a markdown, with bulleted lists where appropriate.

If you provide any relevant time stamps, always include the transcript for words or phrases that were spoken at those 
time stamps. Only refer to the "Timestamp" attribute for phrases when providing timestamps. If you provide a duration gap,
convert the gap to seconds instead of milliseconds.
"""

# Azure OpenAI connection settings. The key is read from the same config
# entry that the transcription request uses.
endpoint = "https://usecase1hub1533117385.openai.azure.com/"
api_version = "2024-12-01-preview"
subscription_key = config["SUBSCRIPTION_KEY"]

# Model and deployment identifiers; `deployment` is the one passed to the
# chat-completions call.
model_name = "gpt-4"
deployment = "gpt-4"

client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version=api_version,
)

# Seed the conversation: the system prompt, followed by one user turn that
# carries the annotated transcript phrases serialized as JSON.
messages = [
    {"role": "system", "content": system},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": json.dumps(r_json["phrases"])},
        ],
    },
]


def interface_with_gpt(messages, user_query, append_response=False, *,
                       max_tokens=4096, temperature=.9, top_p=.5):
    """Send one user turn to the chat model and return its completion.

    The query is appended to ``messages`` in place before the request is
    made. If the request raises, that appended turn is removed again so a
    retry does not leave a duplicate, unanswered user message in the
    history. Uses the module-level ``client`` and ``deployment``.

    Args:
        messages: Conversation history (list of role/content dicts);
            mutated in place.
        user_query: Text of the new user turn.
        append_response: When true, also append the assistant's reply to
            ``messages`` so later calls see it as context.
        max_tokens: Completion length limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling cutoff forwarded to the API.

    Returns:
        A ``(response, messages)`` tuple: the raw completion object and the
        same (mutated) ``messages`` list that was passed in.

    Raises:
        Whatever ``client.chat.completions.create`` raises; ``messages`` is
        restored to its prior contents before the exception propagates.
    """
    messages.append({
        "role": "user",
        "content": user_query
    })

    try:
        response = client.chat.completions.create(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            model=deployment
        )
    except Exception:
        # Roll back the turn added above so the history stays consistent.
        messages.pop()
        raise

    if append_response:
        messages.append({
            "role": "assistant",
            "content": response.choices[0].message.content
        })

    return response, messages



# Two candidate questions about the patient's speech; only the second one is
# sent to the model in this cell.
user_query_2 = "Where does the patient (speaker 2) pause or repeat themselves?"
user_query = "Where does the patient (speaker 2) start and end conversations?"

# Ask the question and keep the model's answer in the history so that
# follow-up turns can refine it.
response, messages = interface_with_gpt(
    messages=messages,
    user_query=user_query_2,
    append_response=True,
)
print(response.choices[0].message.content)

Speaker 2 pauses or repeats themselves in the following instances:

- **Pause:** 
  - **Timestamp:** "0:10" to "0:12"
  - **Duration:** 2 seconds
  - **Transcript:** Between "the" and "unsalted" in the phrase "I'm shopping for the unsalted butter for Father's Day."

- **Repetition:**
  - **Timestamp:** "0:25"
  - **Transcript:** "Talk to you soon." This phrase is repeated by Speaker 2 and then by Speaker 1.


In [38]:
# Follow-up turn: narrow the previous answer to self-repetitions by the
# patient (Speaker 2), excluding phrases that Speaker 1 merely echoed back.
# The earlier wording named Speaker 1 in both roles, which contradicted the
# question being refined.
response, _ = interface_with_gpt(messages, 
                   "I was only looking for portions of the transcript where Speaker 2 repeated themselves, not where Speaker 1 echoed Speaker 2. Refine your output accordingly", 
                   append_response=True)

In [30]:
# Print the text of the first choice of the most recently assigned `response`.
print(response.choices[0].message.content)

Speaker 2 (the patient) does not repeat themselves in the provided transcript. There are no instances where Speaker 2 repeats a word or phrase by themselves. The repetition noted earlier was an echo by Speaker 1, not a self-repetition by Speaker 2.
