In [5]:
import pandas as pd
import numpy as np
import requests
import time
import os
import base64
import cv2


def process_video(video_path, seconds_per_frame=2):
    """Sample frames from a video at a fixed time interval and base64-encode them.

    One frame is taken for every ``seconds_per_frame`` seconds of footage,
    encoded as JPEG and returned as a base64 string.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        seconds_per_frame: Seconds of footage between two sampled frames.
            Intervals shorter than one source frame (including zero or
            negative values) are clamped so that sampling always advances
            by at least one frame per step.

    Returns:
        List of base64-encoded JPEG strings in playback order; empty when no
        frame could be read.
    """
    base64Frames = []

    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # int(fps * seconds_per_frame) is 0 whenever the interval is shorter
        # than one frame; a zero step would never advance and loop forever.
        frames_to_skip = max(1, int(fps * seconds_per_frame))

        # Sampling stops before the last reported frame index (total_frames - 1),
        # matching the original loop bound.
        for curr_frame in range(0, total_frames - 1, frames_to_skip):
            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
            success, frame = video.read()
            if not success:
                break
            encoded, buffer = cv2.imencode(".jpg", frame)
            if not encoded:
                # Skip frames the JPEG encoder rejects instead of storing junk.
                continue
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        # Always release the capture, even if decoding or encoding raised.
        video.release()

    print(f"Extracted {len(base64Frames)} frames")
    return base64Frames


# Sample the recorded session once every quarter second of footage.
video_path = "../data-capture/TherapySessionRecordings/20230613_161520.mp4"
b64frames = process_video(video_path, seconds_per_frame=0.25)

Extracted 119 frames


In [4]:
!pip install azure-identity

Collecting azure-identity
  Obtaining dependency information for azure-identity from https://files.pythonhosted.org/packages/3d/9f/1f9f3ef4f49729ee207a712a5971a9ca747f2ca47d9cbf13cf6953e3478a/azure_identity-1.21.0-py3-none-any.whl.metadata
  Downloading azure_identity-1.21.0-py3-none-any.whl.metadata (81 kB)
     ---------------------------------------- 0.0/81.3 kB ? eta -:--:--
     ---------------------------------------- 81.3/81.3 kB ? eta 0:00:00
Collecting azure-core>=1.31.0 (from azure-identity)
  Obtaining dependency information for azure-core>=1.31.0 from https://files.pythonhosted.org/packages/39/83/325bf5e02504dbd8b4faa98197a44cdf8a325ef259b48326a2b6f17f8383/azure_core-1.32.0-py3-none-any.whl.metadata
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting msal>=1.30.0 (from azure-identity)
  Obtaining dependency information for msal>=1.30.0 from https://files.pythonhosted.org/packages/93/5a/2e663ef56a5d89eba962941b267ebe5be8c5ea340a9929d286e2f5fac505/msa


[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
!pip install openai




[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Analyzing extracted images for indicators

In [6]:
import json

# Load the service settings (such as the subscription key) from the local JSON config file.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

In [7]:
import os
from openai import AzureOpenAI


# System prompt for the chat completion: frames the model as a behavioral analyst,
# lists the stimming patterns to look for, and asks for a JSON-only array of
# {"stimming_type", "summary"} items.
# NOTE(review): the prompt speaks of a "video recording", while the request in this
# notebook sends still frames as image_url parts -- confirm the wording is intended.
system = """
You are a behavioral analyst that focuses on therapy sessions for patients with autism.
You will be provided a video recording of a therapy session.
Your job is to analyze the video to determine if the patient displays any common "stimming"
patterns. Common examples of stimming patterns include:

Hand-flapping: Rapid movement of the hands, often seen when an individual is excited or agitated.
Rocking: Body rocking back and forth while sitting or standing.
Spinning: Turning in circles or spinning objects repetitively.
Echolalia: Repetitive vocal sounds or phrases, often repeated immediately after hearing them.
Tapping: Tapping hands or objects repeatedly.
Visual Stimming: Staring at lights, moving fingers in front of the eyes, or watching objects spin.
Chewing or Biting: Chewing on objects, clothing, or oneself.

Provide your output as an array of items whether each of these different patterns were, observed in the video and a summary of the patient's
particular behavior corresponding to this pattern if it was observed, e.g.:

[
  {
    "stimming_type": ...Example stimming pattern 1...
    "summary": ...Example summary 1...
  },
  {
    "stimming_type": ...Example stimming pattern 2...
    "summary": ...Example summary 2...
  }, ...
]

Only provide your output as json per the abovementioned format, and nothing more.
"""


# Azure OpenAI connection settings; the key is read from the loaded config
# rather than being written into the notebook.
api_version = "2024-12-01-preview"
endpoint = "https://usecase1hub1533117385.openai.azure.com/"
subscription_key = config["SUBSCRIPTION_KEY"]

# `deployment` is what the chat request passes as its model; `model_name` is
# not referenced by the cells visible here.
model_name = "gpt-4"
deployment = "gpt-4"

client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version=api_version,
)

# Only this slice of the sampled frames is sent to the model.
frames_to_analyze = b64frames[110:120]

# Chat payload: the system prompt followed by one user turn holding the frames.
messages = [
    {"role": "system", "content": system},
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                # The frames are JPEG-encoded, so the media type is image/jpeg.
                # The frame index used to be interpolated into the media type
                # ("image/jpg0", "image/jpg1", ...), producing malformed data URLs.
                "image_url": {
                    "url": f"data:image/jpeg;base64,{frame}",
                    "detail": "low",
                },
            }
            for frame in frames_to_analyze
        ],
    },
]

# Request options for the chat completion, gathered in one place before the call.
completion_options = {
    "model": deployment,
    "messages": messages,
    "max_tokens": 4096,
    "temperature": 0,
    "top_p": 1.0,
}
response = client.chat.completions.create(**completion_options)

# Show the text of the first choice returned by the service.
first_choice = response.choices[0]
print(first_choice.message.content)


[]



In [10]:
from moviepy.editor import VideoFileClip

audio_file_path = "../data-capture/TherapySessionRecordings/audio.wav"

# Write the audio track of the session video to a WAV file.
video = VideoFileClip(video_path)
try:
    audio = video.audio
    if audio is None:
        # Without this check a silent video fails with an opaque AttributeError.
        raise ValueError(f"No audio track found in {video_path}")
    audio.write_audiofile(audio_file_path)
finally:
    # Close the clip even when the export fails part-way through.
    video.close()

                                                                                                                       
chunk:   3%|██                                                              | 20/613 [01:29<44:22,  4.49s/it, now=None]
chunk:   5%|██▉                                                             | 28/613 [00:30<10:28,  1.08s/it, now=None][A

MoviePy - Writing audio in ../data-capture/TherapySessionRecordings/audio.wav




chunk:   0%|                                                                         | 0/613 [00:00<?, ?it/s, now=None][A[A

chunk:  69%|█████████████████████████████████████████▊                   | 420/613 [00:00<00:00, 3789.23it/s, now=None][A[A

                                                                                                                       [A[A
chunk:   3%|██                                                              | 20/613 [01:29<44:27,  4.50s/it, now=None]
chunk:   5%|██▉                                                             | 28/613 [00:30<10:32,  1.08s/it, now=None][A

MoviePy - Done.


In [11]:
import requests

url = "https://usecase1hub1533117385.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"

headers = {
    "Ocp-Apim-Subscription-Key": config["SUBSCRIPTION_KEY"],
    "Accept": "application/json"
}

# Transcription options sent as the JSON "definition" part of the multipart form:
# en-US locale, diarization enabled with at most 4 speakers.
definition = '{"locales":["en-US"], "diarization": {"maxSpeakers": 4,"enabled": true}}'

# Keep the audio handle inside a with-block so it is closed once the upload
# finishes; the bare open() used before was never closed.
with open(audio_file_path, "rb") as audio_file:
    files = {
        "audio": audio_file,
        "definition": (None, definition, 'application/json')
    }
    response = requests.post(url, headers=headers, files=files)

# Fail here on an HTTP error rather than later on a missing "phrases" key.
response.raise_for_status()

transcription = response.json()
# NOTE(review): `json` is kept as an alias because other cells read json["phrases"],
# but the name shadows the stdlib json module imported earlier in the notebook, so
# re-running the config cell after this one breaks. Prefer `transcription` in new code.
json = transcription


In [15]:
# Display the first entry of the "phrases" list from the transcription response.
json["phrases"][0]

{'speaker': 1,
 'offsetMilliseconds': 3920,
 'durationMilliseconds': 600,
 'text': 'Hi, Drew.',
 'words': [{'text': 'Hi,',
   'offsetMilliseconds': 3920,
   'durationMilliseconds': 240},
  {'text': 'Drew.', 'offsetMilliseconds': 4160, 'durationMilliseconds': 360}],
 'locale': 'en-US',
 'confidence': 0.8631109}