In [1]:
import json
import numpy as np
import pandas as pd
import re
import glob
from dotenv import load_dotenv
import boto3

from deepgram import (
    DeepgramClient,
    PrerecordedOptions,
    FileSource,
)

import os

load_dotenv() # Load the environment variables

API_KEY = os.getenv("DG_API_KEY")


cwd = os.getcwd() # Get the current working directory (cwd)
s3 = boto3.client('s3') # Create an S3 client
deepgram = DeepgramClient(API_KEY)

In [2]:
filenames = glob.glob('../pilot/*.scienceData.jsonl')
data = []
for filename in filenames:
    with open(filename) as f:
        for line in f:
            parsed = json.loads(line)
            if parsed['sampleId'] != 'missing':
                data.append(parsed)

sorted_data = sorted(data, key=lambda x: x['treatment']['name']+x['gameId'])

new_data = []
game_lookups = {}
game_index = 0
for i, row in enumerate(sorted_data):
    row["player"] = i
    if row["gameId"] not in game_lookups:
        game_lookups[row["gameId"]] = game_index
        row["game"] = game_index
        game_index += 1
    else:
        row["game"] = game_lookups[row["gameId"]]

    if len(row["speakerEvents"]) > 0: # if they joined the discussion only
        new_data.append(row)

data = new_data
index = pd.MultiIndex.from_tuples([(row['treatment']["name"], row['game'], int(row['position'])) for row in data], names=['treatment', 'game', 'position'])
len(data)

21

In [3]:
def getFileNames(participant, s3, s3_bucket, s3_region):
    # get S3 filenames for all videos recorded for this participant's group
    folder = f"deliberation/{participant['recordingsFolder']}"
    remote_paths = s3.list_objects_v2(Bucket=s3_bucket, Prefix=folder)['Contents']
    return remote_paths

def getMetadata(local_files):
    # get metadata for all files downloaded for this participant's group
    collector = []
    for filepath in local_files:
        
        basename = os.path.basename(filepath)

        # get start time from metadata
        output = ! ffprobe $filepath
        start_time = re.search(' start: (\d+\.\d+)', '\n'.join(output))
        start = start_time.group(1) if start_time else None

        if ("video") in basename:
            # get resolution for video
            resolution_cmd = f'ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=s=x:p=0 {filepath}'
            resolution_output = ! $resolution_cmd
            resolution = resolution_output[0].strip()
            resolution_x = int(resolution.split("x")[0])
            track_type = "video"
            
            # get start from first frame
            cmd = f'ffprobe -v 0 -show_entries packet=pts -of compact=p=0:nk=1 -read_intervals 0 -select_streams v {filepath} | head -1'
            output = ! $cmd
            first_frame_time = float(output[0])/1000

            # get time of last frame
            cmd = f'ffprobe -v 0 -show_entries packet=pts -of compact=p=0:nk=1 -read_intervals 999999 -select_streams v {filepath}  | tail -1 '
            output = ! $cmd
            last_frame_time = float(output[0])/1000

        else:
            resolution = np.nan
            resolution_x = np.nan
            track_type = "audio"
            first_frame_time = np.nan
            last_frame_time = np.nan

        collector.append({
            "file": filepath,
            "basename": basename,
            "track_type": track_type,
            "internal_start": float(start),
            "resolution": resolution,
            "resolution_x": resolution_x,
        })

    metadata = pd.DataFrame(collector)
    metadata = metadata.sort_values("internal_start").reset_index(drop=True)

    # which tracks go together?
    video_tracks = metadata[metadata['track_type']=="video"]
    audio_tracks = metadata[metadata['track_type']=="audio"]
    for group, (video_track_index, video_track) in enumerate(video_tracks.iterrows()):
        audio_track_index = (audio_tracks['internal_start'] - video_track['internal_start']).abs().idxmin()
        metadata.loc[video_track_index, 'track_group'] = str(int(group))
        metadata.loc[audio_track_index, 'track_group'] = str(int(group))

    internal_group_starts = metadata.groupby("track_group")['internal_start'].min() # internal time that the first track in the group starts
    metadata['group_internal_start'] = metadata['track_group'].map(internal_group_starts)
    metadata['group_internal_offset'] = metadata['internal_start'] - metadata['group_internal_start']

    return metadata

  start_time = re.search(' start: (\d+\.\d+)', '\n'.join(output))


In [4]:
def get_participant_recordings_data(participant):
    if participant['recordingIds'] == "missing":
        return None
    
    folder = participant['recordingsFolder']
    s3_bucket = participant['config']['videoStorage']['bucket']
    s3_region = participant['config']['videoStorage']['region']
    
    remote_paths = getFileNames(participant, s3, s3_bucket, s3_region)
    print("remote_paths", remote_paths)

    files = []
    for recordingId in participant['recordingIds']:
        files += [remote_path['Key'] for remote_path in remote_paths if recordingId in remote_path['Key']]

    print("files", files)

    os.makedirs(cwd + "/recordings/" + folder, exist_ok=True)
    local_files = []
    for file in files:
        path = "recordings/" + "/".join(file.split("/")[1:])
        if not path.endswith(".webm"): 
            path += ".webm"
        local_files.append(path)
        if not os.path.exists(path):
            print("downloading to path", path)
            s3.download_file(s3_bucket, file, path)

    metadata = getMetadata(local_files)
    return metadata

In [5]:

def transcribe(audio_file):
    print("transcribing", audio_file)
    
    with open(audio_file, "rb") as file:
        buffer_data = file.read()

    payload: FileSource = {
        "buffer": buffer_data,
    }

    #STEP 2: Configure Deepgram options for audio analysis
    options = PrerecordedOptions(
        model="nova-2",
        utterances=True,
    )

    # STEP 3: Call the transcribe_file method with the text payload and options
    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)

    return response

    

In [10]:
for i, row in get_participant_recordings_data(data[8]).iterrows():
    
    print(row)
    

remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80f54c-cam-audio-1724947348970', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 30, tzinfo=tzutc()), 'ETag': '"abe70f6a60c2a471e96b112a5e9a70ba-1"', 'Size': 2411356, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80f54c-cam-video-1724947348971', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 30, tzinfo=tzutc()), 'ETag': '"0cfc8fec3e8c4b7ba1cf21ed60906960-24"', 'Size': 121098544, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-99db951a-efbe-4935-abe1-ec4aa6cf3937-cam-audio-1724947348459', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 29, tzinfo=tzutc()), 'ETag': '"456f4e0ffefe84b651bdcb59e2055837-1"', 'Size': 2525125, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-99db951a-efbe-4935-abe1-ec4aa6cf393

In [13]:
def make_transcription_files(participant):
    recordings = get_participant_recordings_data(participant)
    position = participant['position']
    if recordings is None:
        return
    # print(recordings[recordings['track_type']=="audio"])

    if(len(recordings[recordings['track_type']=="audio"]['file']) > 0):
        for i, row in recordings[recordings['track_type']=="audio"].iterrows():
            print(row)
            audio_file = row['file']
            internal_start = row['internal_start']
            
            utterances_filepath = audio_file.replace(".webm", "-utterances.csv")
            if os.path.exists(utterances_filepath):
                print("skipping transcription of", audio_file)
                continue

            response = transcribe(audio_file)
            transcription_df = pd.DataFrame(response.to_dict()['results']['utterances'])

            # export utterances csv
            utterances = transcription_df[["start", "end", "confidence", "transcript"]]
            utterances['position'] = position
            utterances['start'] += internal_start # add the start time of the recording
            utterances['end'] += internal_start
            utterances.to_csv(utterances_filepath, index=False)

            # export words csv
            words_list = []
            for j, word_row in transcription_df.iterrows():
                words_list += word_row['words']

            words = pd.DataFrame(words_list)
            words['position'] = position
            words['start'] += internal_start
            words['end'] += internal_start
            words_filepath = audio_file.replace(".webm", "-words.csv")
            words.to_csv(words_filepath, index=False)


In [14]:
completed = []
for i, participant in enumerate(data):
    if participant['player'] in completed:
        continue
    print(i, participant['player'])

    make_transcription_files(participant)
    completed.append(participant['player'])


0 0
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_BGAB07/1724947313451-210e4d0e-38e7-4348-9f70-68d8110a122c-cam-audio-1724947314491', 'LastModified': datetime.datetime(2024, 8, 29, 16, 1, 56, tzinfo=tzutc()), 'ETag': '"89a063c93b25a8dd4e43c4b06d079eb9-1"', 'Size': 1014316, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_BGAB07/1724947313451-210e4d0e-38e7-4348-9f70-68d8110a122c-cam-video-1724947314494', 'LastModified': datetime.datetime(2024, 8, 29, 16, 1, 56, tzinfo=tzutc()), 'ETag': '"7e4803e7fe088380c8868c089bdcb938-12"', 'Size': 61066427, 'StorageClass': 'STANDARD'}]
files ['deliberation/20240829_1458_Pilot_BGAB07/1724947313451-210e4d0e-38e7-4348-9f70-68d8110a122c-cam-audio-1724947314491', 'deliberation/20240829_1458_Pilot_BGAB07/1724947313451-210e4d0e-38e7-4348-9f70-68d8110a122c-cam-video-1724947314494']
file                     recordings/20240829_1458_Pilot_BGAB07/17249473...
basename                 1724947313451-210e4d0e-38e7-4348-9f70-68d8110a.

  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


4 4
remote_paths [{'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-0591956c-a9d3-4480-82c4-ed298a17dec2-cam-audio-1723755710431', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 51, tzinfo=tzutc()), 'ETag': '"4982f1c28c7240c9e40482aa395c78e5-1"', 'Size': 2189670, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-0591956c-a9d3-4480-82c4-ed298a17dec2-cam-video-1723755710433', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 51, tzinfo=tzutc()), 'ETag': '"0b28647d91f42c5bc46c0b6aa5cf853c-6"', 'Size': 29061389, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-10933107-59d8-46eb-b78c-08f8f41bca7a-cam-audio-1723755716888', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 58, tzinfo=tzutc()), 'ETag': '"88b4499927024bdfeeacb7f3f5fcfe14-1"', 'Size': 2484298, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-10933107-59d8-46eb-b78c-08f8f41bc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240815_2018_miniPiAKNVYX/17237557...
basename                 1723755710029-0591956c-a9d3-4480-82c4-ed298a17...
track_type                                                           audio
internal_start                                                       0.385
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.385
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240815_2018_miniPiAKNVYX/1723755710029-0591956c-a9d3-4480-82c4-ed298a17dec2-cam-audio-1723755710431.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


5 5
remote_paths [{'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-0591956c-a9d3-4480-82c4-ed298a17dec2-cam-audio-1723755710431', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 51, tzinfo=tzutc()), 'ETag': '"4982f1c28c7240c9e40482aa395c78e5-1"', 'Size': 2189670, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-0591956c-a9d3-4480-82c4-ed298a17dec2-cam-video-1723755710433', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 51, tzinfo=tzutc()), 'ETag': '"0b28647d91f42c5bc46c0b6aa5cf853c-6"', 'Size': 29061389, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-10933107-59d8-46eb-b78c-08f8f41bca7a-cam-audio-1723755716888', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 58, tzinfo=tzutc()), 'ETag': '"88b4499927024bdfeeacb7f3f5fcfe14-1"', 'Size': 2484298, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiAKNVYX/1723755710029-10933107-59d8-46eb-b78c-08f8f41bc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240815_2018_miniPiAKNVYX/17237557...
basename                 1723755710029-10933107-59d8-46eb-b78c-08f8f41b...
track_type                                                           audio
internal_start                                                       6.853
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 6.853
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240815_2018_miniPiAKNVYX/1723755710029-10933107-59d8-46eb-b78c-08f8f41bca7a-cam-audio-1723755716888.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['e

6 6
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-38571f69-2e40-4f76-80aa-f653964cda61-cam-audio-1724947321287', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 2, tzinfo=tzutc()), 'ETag': '"1c241eb761c98bc4bcea4eede954878a-1"', 'Size': 2295177, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-38571f69-2e40-4f76-80aa-f653964cda61-cam-video-1724947321283', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 2, tzinfo=tzutc()), 'ETag': '"581e3d3090b32ecd3762243ca99ac842-31"', 'Size': 161115041, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-7a0f03de-2c43-4769-b438-b854a379732d-cam-audio-1724947321286', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 2, tzinfo=tzutc()), 'ETag': '"8657b613c436c51d9359910d171b3613-1"', 'Size': 2607181, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-7a0f03de-2c43-4769-b438-b854a37973

  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


7 7
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-38571f69-2e40-4f76-80aa-f653964cda61-cam-audio-1724947321287', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 2, tzinfo=tzutc()), 'ETag': '"1c241eb761c98bc4bcea4eede954878a-1"', 'Size': 2295177, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-38571f69-2e40-4f76-80aa-f653964cda61-cam-video-1724947321283', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 2, tzinfo=tzutc()), 'ETag': '"581e3d3090b32ecd3762243ca99ac842-31"', 'Size': 161115041, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-7a0f03de-2c43-4769-b438-b854a379732d-cam-audio-1724947321286', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 2, tzinfo=tzutc()), 'ETag': '"8657b613c436c51d9359910d171b3613-1"', 'Size': 2607181, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_WPJRZS/1724947320281-7a0f03de-2c43-4769-b438-b854a37973

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240829_1458_Pilot_WPJRZS/17249473...
basename                 1724947320281-38571f69-2e40-4f76-80aa-f653964c...
track_type                                                           audio
internal_start                                                       0.969
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.969
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240829_1458_Pilot_WPJRZS/1724947320281-38571f69-2e40-4f76-80aa-f653964cda61-cam-audio-1724947321287.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


8 8
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80f54c-cam-audio-1724947348970', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 30, tzinfo=tzutc()), 'ETag': '"abe70f6a60c2a471e96b112a5e9a70ba-1"', 'Size': 2411356, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80f54c-cam-video-1724947348971', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 30, tzinfo=tzutc()), 'ETag': '"0cfc8fec3e8c4b7ba1cf21ed60906960-24"', 'Size': 121098544, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-99db951a-efbe-4935-abe1-ec4aa6cf3937-cam-audio-1724947348459', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 29, tzinfo=tzutc()), 'ETag': '"456f4e0ffefe84b651bdcb59e2055837-1"', 'Size': 2525125, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-99db951a-efbe-4935-abe1-ec4aa6c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240829_1458_Pilot_KZMBPG/17249473...
basename                 1724947347486-99db951a-efbe-4935-abe1-ec4aa6cf...
track_type                                                           audio
internal_start                                                       0.981
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.981
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240829_1458_Pilot_KZMBPG/1724947347486-99db951a-efbe-4935-abe1-ec4aa6cf3937-cam-audio-1724947348459.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


9 9
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80f54c-cam-audio-1724947348970', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 30, tzinfo=tzutc()), 'ETag': '"abe70f6a60c2a471e96b112a5e9a70ba-1"', 'Size': 2411356, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80f54c-cam-video-1724947348971', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 30, tzinfo=tzutc()), 'ETag': '"0cfc8fec3e8c4b7ba1cf21ed60906960-24"', 'Size': 121098544, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-99db951a-efbe-4935-abe1-ec4aa6cf3937-cam-audio-1724947348459', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 29, tzinfo=tzutc()), 'ETag': '"456f4e0ffefe84b651bdcb59e2055837-1"', 'Size': 2525125, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_KZMBPG/1724947347486-99db951a-efbe-4935-abe1-ec4aa6c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240829_1458_Pilot_KZMBPG/17249473...
basename                 1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80...
track_type                                                           audio
internal_start                                                       1.465
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 1.465
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240829_1458_Pilot_KZMBPG/1724947347486-0d3257a4-4d34-4149-9a6b-5ed74d80f54c-cam-audio-1724947348970.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


10 10
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-0d8d20df-4dea-452d-adcf-677a5557faa3-cam-audio-1724947331740', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 13, tzinfo=tzutc()), 'ETag': '"1a6fda6907321a4165b046a276532aa6-1"', 'Size': 2468661, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-0d8d20df-4dea-452d-adcf-677a5557faa3-cam-video-1724947331743', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 13, tzinfo=tzutc()), 'ETag': '"f5af960b9c48f8c06f333533c0892c3c-32"', 'Size': 163188227, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-a5579efe-44ef-47f9-ae7a-9c3d6eaf1f97-cam-audio-1724947331741', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 13, tzinfo=tzutc()), 'ETag': '"b1fd0763f7cfc9ac860a048fd96db21b-1"', 'Size': 2429626, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-a5579efe-44ef-47f9-ae7a-9c3d6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240829_1458_Pilot_DJEZ7J/17249473...
basename                 1724947330766-a5579efe-44ef-47f9-ae7a-9c3d6eaf...
track_type                                                           audio
internal_start                                                       0.954
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.954
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240829_1458_Pilot_DJEZ7J/1724947330766-a5579efe-44ef-47f9-ae7a-9c3d6eaf1f97-cam-audio-1724947331741.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


11 11
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-0d8d20df-4dea-452d-adcf-677a5557faa3-cam-audio-1724947331740', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 13, tzinfo=tzutc()), 'ETag': '"1a6fda6907321a4165b046a276532aa6-1"', 'Size': 2468661, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-0d8d20df-4dea-452d-adcf-677a5557faa3-cam-video-1724947331743', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 13, tzinfo=tzutc()), 'ETag': '"f5af960b9c48f8c06f333533c0892c3c-32"', 'Size': 163188227, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-a5579efe-44ef-47f9-ae7a-9c3d6eaf1f97-cam-audio-1724947331741', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, 13, tzinfo=tzutc()), 'ETag': '"b1fd0763f7cfc9ac860a048fd96db21b-1"', 'Size': 2429626, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_DJEZ7J/1724947330766-a5579efe-44ef-47f9-ae7a-9c3d6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240829_1458_Pilot_DJEZ7J/17249473...
basename                 1724947330766-0d8d20df-4dea-452d-adcf-677a5557...
track_type                                                           audio
internal_start                                                        0.94
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                  0.94
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240829_1458_Pilot_DJEZ7J/1724947330766-0d8d20df-4dea-452d-adcf-677a5557faa3-cam-audio-1724947331740.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


12 12
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-167882d9-17b9-45e1-8bb3-378f06fe91dc-cam-audio-1724947319118', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, tzinfo=tzutc()), 'ETag': '"c3f87e67c342fe3fc133a0a1e675bc5f-1"', 'Size': 2258374, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-167882d9-17b9-45e1-8bb3-378f06fe91dc-cam-video-1724947319121', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, tzinfo=tzutc()), 'ETag': '"f219edfb16a6dc1a4810fae9b542b4eb-10"', 'Size': 47710571, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-35f93985-cd7b-4eba-ad1d-9599edbe5ada-cam-audio-1724947319122', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, tzinfo=tzutc()), 'ETag': '"86949806c6aefd49d992839a0650731f-1"', 'Size': 1869865, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-35f93985-cd7b-4eba-ad1d-9599edbe5ada-cam-v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240829_1458_Pilot_RPEK4Z/17249473...
basename                 1724947318124-167882d9-17b9-45e1-8bb3-378f06fe...
track_type                                                           audio
internal_start                                                       0.959
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.959
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240829_1458_Pilot_RPEK4Z/1724947318124-167882d9-17b9-45e1-8bb3-378f06fe91dc-cam-audio-1724947319118.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['e

13 13
remote_paths [{'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-167882d9-17b9-45e1-8bb3-378f06fe91dc-cam-audio-1724947319118', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, tzinfo=tzutc()), 'ETag': '"c3f87e67c342fe3fc133a0a1e675bc5f-1"', 'Size': 2258374, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-167882d9-17b9-45e1-8bb3-378f06fe91dc-cam-video-1724947319121', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, tzinfo=tzutc()), 'ETag': '"f219edfb16a6dc1a4810fae9b542b4eb-10"', 'Size': 47710571, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-35f93985-cd7b-4eba-ad1d-9599edbe5ada-cam-audio-1724947319122', 'LastModified': datetime.datetime(2024, 8, 29, 16, 2, tzinfo=tzutc()), 'ETag': '"86949806c6aefd49d992839a0650731f-1"', 'Size': 1869865, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240829_1458_Pilot_RPEK4Z/1724947318124-35f93985-cd7b-4eba-ad1d-9599edbe5ada-cam-v

  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


14 14
remote_paths [{'Key': 'deliberation/20240904_1519_Pilot_DPJ9MG/1725467538059-77e6eb12-c67d-49cf-8ac6-a44914aef3cd-cam-audio-1725467539069', 'LastModified': datetime.datetime(2024, 9, 4, 16, 32, 20, tzinfo=tzutc()), 'ETag': '"6e905595d6a21a8048a19476b4f095c8-1"', 'Size': 2101574, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_DPJ9MG/1725467538059-77e6eb12-c67d-49cf-8ac6-a44914aef3cd-cam-video-1725467539071', 'LastModified': datetime.datetime(2024, 9, 4, 16, 32, 20, tzinfo=tzutc()), 'ETag': '"5d3a408afd4d7a1381b8e1d20432080d-28"', 'Size': 141835116, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_DPJ9MG/1725467538059-fde34445-0d5c-4234-90d9-526649b83a25-cam-audio-1725467547832', 'LastModified': datetime.datetime(2024, 9, 4, 16, 32, 29, tzinfo=tzutc()), 'ETag': '"cd9225c38a1934e3ccc06fbf66df0848-1"', 'Size': 2285272, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_DPJ9MG/1725467538059-fde34445-0d5c-4234-90d9-52664

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240904_1519_Pilot_DPJ9MG/17254675...
basename                 1725467538059-77e6eb12-c67d-49cf-8ac6-a44914ae...
track_type                                                           audio
internal_start                                                       0.963
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.963
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240904_1519_Pilot_DPJ9MG/1725467538059-77e6eb12-c67d-49cf-8ac6-a44914aef3cd-cam-audio-1725467539069.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


15 15
remote_paths [{'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-11a8e072-8013-4a15-bd4d-79b94b00629b-cam-audio-1725467493719', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 34, tzinfo=tzutc()), 'ETag': '"fede48937c15d7c9ff80f2366b0e02f3-1"', 'Size': 2372973, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-11a8e072-8013-4a15-bd4d-79b94b00629b-cam-video-1725467493722', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 34, tzinfo=tzutc()), 'ETag': '"78155a83cac380a52c00c6e6af31b90f-31"', 'Size': 162216361, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-6e3575d7-6b43-44c2-8c50-f2863eef15e9-cam-audio-1725467493720', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 34, tzinfo=tzutc()), 'ETag': '"06924bb764a343182c596c842468b938-1"', 'Size': 2138878, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-6e3575d7-6b43-44c2-8c50-f2863

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240904_1519_Pilot_MXAFSF/17254674...
basename                 1725467492730-6e3575d7-6b43-44c2-8c50-f2863eef...
track_type                                                           audio
internal_start                                                       0.957
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.957
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240904_1519_Pilot_MXAFSF/1725467492730-6e3575d7-6b43-44c2-8c50-f2863eef15e9-cam-audio-1725467493720.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


16 16
remote_paths [{'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-11a8e072-8013-4a15-bd4d-79b94b00629b-cam-audio-1725467493719', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 34, tzinfo=tzutc()), 'ETag': '"fede48937c15d7c9ff80f2366b0e02f3-1"', 'Size': 2372973, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-11a8e072-8013-4a15-bd4d-79b94b00629b-cam-video-1725467493722', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 34, tzinfo=tzutc()), 'ETag': '"78155a83cac380a52c00c6e6af31b90f-31"', 'Size': 162216361, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-6e3575d7-6b43-44c2-8c50-f2863eef15e9-cam-audio-1725467493720', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 34, tzinfo=tzutc()), 'ETag': '"06924bb764a343182c596c842468b938-1"', 'Size': 2138878, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_MXAFSF/1725467492730-6e3575d7-6b43-44c2-8c50-f2863

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240904_1519_Pilot_MXAFSF/17254674...
basename                 1725467492730-11a8e072-8013-4a15-bd4d-79b94b00...
track_type                                                           audio
internal_start                                                       0.955
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.955
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240904_1519_Pilot_MXAFSF/1725467492730-11a8e072-8013-4a15-bd4d-79b94b00629b-cam-audio-1725467493719.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


17 17
remote_paths [{'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9dfd15cc-20be-4158-a847-290b4518db5a-cam-audio-1723755677706', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 18, tzinfo=tzutc()), 'ETag': '"60a7fee448f6325157e324fd329c8e06-1"', 'Size': 3428608, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9dfd15cc-20be-4158-a847-290b4518db5a-cam-video-1723755677708', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 18, tzinfo=tzutc()), 'ETag': '"a47aae3344b775361b4e93bf7ddd5ff7-15"', 'Size': 75233460, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9fd4b432-e039-453d-8380-5d6b369ed807-cam-audio-1723755677707', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 18, tzinfo=tzutc()), 'ETag': '"254b65a839c5ad3001c386e553424237-1"', 'Size': 3050717, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9fd4b432-e039-453d-8380-5d6b36

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240815_2018_miniPiZ4VQ1R/17237556...
basename                 1723755676733-9dfd15cc-20be-4158-a847-290b4518...
track_type                                                           audio
internal_start                                                       0.952
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.952
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240815_2018_miniPiZ4VQ1R/1723755676733-9dfd15cc-20be-4158-a847-290b4518db5a-cam-audio-1723755677706.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


18 18
remote_paths [{'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9dfd15cc-20be-4158-a847-290b4518db5a-cam-audio-1723755677706', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 18, tzinfo=tzutc()), 'ETag': '"60a7fee448f6325157e324fd329c8e06-1"', 'Size': 3428608, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9dfd15cc-20be-4158-a847-290b4518db5a-cam-video-1723755677708', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 18, tzinfo=tzutc()), 'ETag': '"a47aae3344b775361b4e93bf7ddd5ff7-15"', 'Size': 75233460, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9fd4b432-e039-453d-8380-5d6b369ed807-cam-audio-1723755677707', 'LastModified': datetime.datetime(2024, 8, 15, 21, 1, 18, tzinfo=tzutc()), 'ETag': '"254b65a839c5ad3001c386e553424237-1"', 'Size': 3050717, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240815_2018_miniPiZ4VQ1R/1723755676733-9fd4b432-e039-453d-8380-5d6b36

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240815_2018_miniPiZ4VQ1R/17237556...
basename                 1723755676733-9fd4b432-e039-453d-8380-5d6b369e...
track_type                                                           audio
internal_start                                                        0.96
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                  0.96
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240815_2018_miniPiZ4VQ1R/1723755676733-9fd4b432-e039-453d-8380-5d6b369ed807-cam-audio-1723755677707.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['e

19 19
remote_paths [{'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-16c2892f-2a60-4cc7-8eb5-ac14f83a333e-cam-audio-1725467481270', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 22, tzinfo=tzutc()), 'ETag': '"8e7869608ec7a9d75d82b19b9463d550-1"', 'Size': 2227249, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-16c2892f-2a60-4cc7-8eb5-ac14f83a333e-cam-video-1725467481271', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 22, tzinfo=tzutc()), 'ETag': '"e139e8fd7c7c231dcca83e5d76039f69-29"', 'Size': 150607204, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-2c8bfb44-f14a-4a27-8573-895fa346dcae-cam-audio-1725467481590', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 22, tzinfo=tzutc()), 'ETag': '"a194edc123e666be6d90e9c7e74a4099-1"', 'Size': 2287007, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-2c8bfb44-f14a-4a27-8573-895fa

  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)


20 20
remote_paths [{'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-16c2892f-2a60-4cc7-8eb5-ac14f83a333e-cam-audio-1725467481270', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 22, tzinfo=tzutc()), 'ETag': '"8e7869608ec7a9d75d82b19b9463d550-1"', 'Size': 2227249, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-16c2892f-2a60-4cc7-8eb5-ac14f83a333e-cam-video-1725467481271', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 22, tzinfo=tzutc()), 'ETag': '"e139e8fd7c7c231dcca83e5d76039f69-29"', 'Size': 150607204, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-2c8bfb44-f14a-4a27-8573-895fa346dcae-cam-audio-1725467481590', 'LastModified': datetime.datetime(2024, 9, 4, 16, 31, 22, tzinfo=tzutc()), 'ETag': '"a194edc123e666be6d90e9c7e74a4099-1"', 'Size': 2287007, 'StorageClass': 'STANDARD'}, {'Key': 'deliberation/20240904_1519_Pilot_8YRK6A/1725467480299-2c8bfb44-f14a-4a27-8573-895fa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['end'] += internal_start


file                     recordings/20240904_1519_Pilot_8YRK6A/17254674...
basename                 1725467480299-16c2892f-2a60-4cc7-8eb5-ac14f83a...
track_type                                                           audio
internal_start                                                       0.942
resolution                                                             NaN
resolution_x                                                           NaN
track_group                                                              0
group_internal_start                                                 0.942
group_internal_offset                                                  0.0
Name: 0, dtype: object
transcribing recordings/20240904_1519_Pilot_8YRK6A/1725467480299-16c2892f-2a60-4cc7-8eb5-ac14f83a333e-cam-audio-1725467481270.webm


  response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['start'] += internal_start # add the start time of the recording
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utterances['e

In [15]:
%debug

> [0;32m/Users/jamesphoughton/github/contact-theory-topic/env/lib/python3.12/site-packages/pandas/core/indexes/base.py[0m(3812)[0;36mget_loc[0;34m()[0m
[0;32m   3810 [0;31m            [0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3811 [0;31m                [0;32mraise[0m [0mInvalidIndexError[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 3812 [0;31m            [0;32mraise[0m [0mKeyError[0m[0;34m([0m[0mkey[0m[0;34m)[0m [0;32mfrom[0m [0merr[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3813 [0;31m        [0;32mexcept[0m [0mTypeError[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3814 [0;31m            [0;31m# If we have a listlike key, _check_indexing_error will raise[0m[0;34m[0m[0;34m[0m[0m
[0m


In [26]:
# Stitch together the individual transcription files

# for each of the folders in the recordings directory
# get all the files in the folder that end in -utterances.csv
for folder in os.listdir(cwd + "/recordings"):
    files = glob.glob(cwd + f"/recordings/{folder}/*-audio-*-utterances.csv")
    
    # load all the files into a dataframe
    if len(files) == 0:
        continue
    utterances = pd.concat([pd.read_csv(file) for file in files])

    # sort by start time
    utterances = utterances.sort_values("start")

    # save the dataframe to a csv
    utterances.to_csv(f"{cwd}/recordings/{folder}/group_transcript.csv", index=False)

    # merge consecutive utterances by the same speaker (indicated by "position" column)
    # reset the start and end times to the beginning and end of the merged utterance
    # combine the transcripts in order
    utterances['position'] = utterances['position'].astype(int)
    utterances['position_shifted'] = utterances['position'].shift(1)
    utterances['position_change'] = utterances['position'] != utterances['position_shifted']
    utterances['group'] = utterances['position_change'].cumsum()
    utterances['group_start'] = utterances.groupby('group')['start'].transform('first')
    utterances['group_end'] = utterances.groupby('group')['end'].transform('last')
    utterances['merged_transcript'] = utterances.groupby('group')['transcript'].transform(' '.join)
    utterances = utterances.drop_duplicates('group')
    utterances = utterances.drop(columns=['confidence', 'position_shifted', 'position_change', 'group', 'transcript', 'start', 'end'])
    utterances = utterances.rename(columns={'group_start': 'start', 'group_end': 'end', 'merged_transcript': 'transcript'})
    utterances["conversation_id"] = folder

    




    # save the dataframe to a csv
    utterances.to_csv(f"{cwd}/recordings/{folder}/group_transcript_merged_consecutives.csv", index=False)
    