In [52]:
# Install ffmpeg with conda: the video helpers defined later in this notebook
# (clip_video, resample_video, concat_videos, ...) shell out to the `ffmpeg` command.
!conda install -y ffmpeg

Retrieving notices: ...working... done
Channels:
 - defaults
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [47]:
import concurrent
import datetime
import os
import shlex
import shutil
import subprocess
import zipfile
from collections import namedtuple
from getpass import getpass
from itertools import repeat

import boto3
import pandas as pd
import requests


In [48]:
# --- Local working locations -------------------------------------------------
# DATA_DIR is the base that the relative download/metadata paths are joined onto.
DATA_DIR = os.getcwd()
VIDEO_DOWNLOAD_DIR = "v2/data/raw/gloss2pose/signs/"
POSE_DIR = "v2/data/raw/gloss2pose/poses/"
# Used both as the S3 key of the metadata CSV and as its path relative to DATA_DIR.
VIDEO_METADATA_FILE = "v2/data/metadata/video_metadata.csv"

# --- Source videos -----------------------------------------------------------
# Template for the raw video URL; {session} and {scene} are filled in per video.
UNFORMATTED_URL = "http://csr.bu.edu/ftp/asl/asllvd/asl-data2/quicktime/{session}/scene{scene}-camera1.mov"

# --- AWS resources -----------------------------------------------------------
S3_BUCKET = "genasl-avatar"  # replace with your bucket name
S3_LOOKUP_FOLDER = "v2/gloss2pose/lookup/"
CHECKPOINTS_DIR = "checkpoints/"
TABLE_NAME = "Pose_Data7"  # DynamoDB table name

# --- Work partitioning -------------------------------------------------------
# Rows are kept when session_scene_id % number_partitions == partition_id.
partition_id = 0
number_partitions = 1

# --- Video processing --------------------------------------------------------
# Frame rate passed to resample_video for every clip.
FRAME_RATE = 30

In [70]:
class VideoSegmentMetadata:
    """Plain record describing one annotated segment of a source video.

    Attributes:
        segment_id: identifier of the segment.
        start_frame: first frame of the segment.
        end_frame: last frame of the segment.
        gloss: gloss label attached to the segment.
    """

    def __init__(self, segment_id, start_frame, end_frame, gloss):
        self.segment_id, self.start_frame = segment_id, start_frame
        self.end_frame, self.gloss = end_frame, gloss

class VideoMetadata:
    """Plain record describing one source video and the segments to cut from it.

    Attributes:
        video_id: identifier of the video.
        url: location the video is downloaded from.
        session: session value used when building the URL and local filename.
        scene: scene value used when building the URL and local filename.
        segments_metadata: collection of per-segment metadata objects.
    """

    def __init__(self, video_id, url, session, scene, segments_metadata):
        self.video_id, self.url = video_id, url
        self.session, self.scene = session, scene
        self.segments_metadata = segments_metadata

def time_print(string):
    """Print ``string`` prefixed with the current local timestamp.

    Returns whatever ``print`` returns (``None``).
    """
    stamped = "{}: {}".format(datetime.datetime.now(), string)
    return print(stamped)

# taken from https://stackoverflow.com/questions/1855095/how-to-create-a-zip-archive-of-a-directory-in-python
def zip_dir(directory, zipped_filepath):
    """Zip every file found under ``directory`` into ``zipped_filepath``.

    Files are stored under the same path they were found at
    (``os.path.join(root, filename)``), so archive member names include the
    ``directory`` prefix as given.

    Args:
        directory: directory tree to archive (walked recursively).
        zipped_filepath: path of the zip file to create (overwritten if present).

    Returns:
        ``zipped_filepath``.
    """
    archive_realpath = os.path.realpath(zipped_filepath)
    # The context manager guarantees the archive is closed (and its central
    # directory written) even if adding a file fails.
    with zipfile.ZipFile(zipped_filepath, 'w', zipfile.ZIP_DEFLATED) as ziph:
        for root, _dirs, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                # When the archive lives inside `directory`, do not add the
                # half-written archive to itself.
                if os.path.realpath(filepath) == archive_realpath:
                    continue
                ziph.write(filepath)

    return zipped_filepath

def get_video_metadata(bucket, s3_filepath, download_filepath, partition, num_partitions):
    """Download the video metadata CSV from S3 and group it into per-video records.

    Only rows that belong to this partition
    (``session_scene_id % num_partitions == partition``), are not marked corrupt
    (``is_corrupt == 0``) and were signed by consultant "Liz" are kept.

    Args:
        bucket: object exposing ``download_file(key, local_path)`` (an S3 bucket).
        s3_filepath: key of the metadata CSV inside the bucket.
        download_filepath: local path the CSV is downloaded to and read from.
        partition: index of the partition to keep.
        num_partitions: total number of partitions.

    Returns:
        A list of ``VideoMetadata``, ordered by ``session_scene_id``; each one
        carries its ``VideoSegmentMetadata`` sorted by ``start_frame``.
    """
    print(s3_filepath)
    download_dir = os.path.dirname(download_filepath)
    if download_dir:
        # os.makedirs("") raises, so only create a directory when there is one.
        os.makedirs(download_dir, exist_ok=True)
    bucket.download_file(s3_filepath, download_filepath)

    # Read the file that was just downloaded. Reading `s3_filepath` only worked
    # when the S3 key happened to equal a valid path relative to the cwd.
    metadata = pd.read_csv(download_filepath)
    print(metadata)

    keep = (
        (metadata["session_scene_id"] % num_partitions == partition)
        & (metadata["is_corrupt"] == 0)       # remove corrupt segments
        & (metadata["Consultant"] == "Liz")   # keep only Liz videos
    )
    selected = metadata[keep]

    videos = []
    # groupby(sort=True) yields the videos in ascending session_scene_id order.
    for video_id, rows in selected.groupby("session_scene_id", sort=True):
        # Session/Scene are taken from the first row of the group.
        # NOTE(review): assumes they are constant per session_scene_id - confirm.
        session = rows["Session"].iloc[0]
        scene = rows["Scene"].iloc[0]
        segments = sorted(
            (
                VideoSegmentMetadata(
                    # int() yields plain Python ints rather than numpy scalars.
                    segment_id=int(segment_id),
                    start_frame=int(start),
                    end_frame=int(end),
                    gloss=gloss,
                )
                for segment_id, start, end, gloss in zip(
                    rows["id"], rows["Start"], rows["End"], rows["Gloss Variant"]
                )
            ),
            key=lambda segment: segment.start_frame,
        )
        videos.append(
            VideoMetadata(
                video_id,
                UNFORMATTED_URL.format(session=session, scene=scene),
                session,
                scene,
                segments,
            )
        )
    return videos


def process_video(video, checkpoint_video_id):
    """Download one source video, cut out its sign segments and publish them.

    For every segment in ``video.segments_metadata`` this clips the raw
    segment, resamples it to ``FRAME_RATE``, uploads both files to the S3
    bucket (under ``rawsign/`` and ``sign/`` inside ``S3_LOOKUP_FOLDER``) and
    writes one DynamoDB item per gloss variant mapping it to the segment id.
    After all segments are handled, the video id is stored as the new checkpoint.

    Relies on module-level ``bucket``, ``table``, ``s3`` and
    ``checkpoint_filepath`` being defined before it is called.

    Args:
        video: a ``VideoMetadata``-like object.
        checkpoint_video_id: id of the last fully processed video, or ``None``
            to process everything. Videos with ``video_id`` less than or equal
            to it are skipped.

    Returns:
        None.
    """
    # Resume support. This function is called once per video with the same
    # checkpoint value, so the gate has to be stateless: anything at or before
    # the checkpoint is skipped, anything after it is processed. The previous
    # if/elif chain raised "Checkpoint video_id ... not valid" for the first
    # video after the checkpoint, which made resuming impossible.
    if checkpoint_video_id is not None and video.video_id <= checkpoint_video_id:
        return

    time_print("Downloading {} with video_id {}".format(video.url, video.video_id))
    download_dir = os.path.join(DATA_DIR, VIDEO_DOWNLOAD_DIR)
    video_filepath = download_large_file(
        video.url,
        download_dir,
        "{}-{}.{}".format(
            video.session,
            video.scene,
            video.url.split(".")[-1]
        )
    )
    try:
        for segment in video.segments_metadata:
            time_print("Processing video segment {}".format(
                segment.segment_id
            ))
            temp_segment_filepath = os.path.join(
                download_dir,
                "temp-segment-{}.mov".format(segment.segment_id)
            )
            segment_filepath = os.path.join(
                download_dir,
                "sign-{}.mp4".format(segment.segment_id)
            )
            try:
                clip_video(
                    video_filepath,
                    temp_segment_filepath,
                    segment.start_frame,
                    segment.end_frame,
                )
                resample_video(
                    temp_segment_filepath,
                    segment_filepath,
                    FRAME_RATE
                )

                if os.path.exists(segment_filepath):
                    bucket.upload_file(
                        segment_filepath,
                        os.path.join(
                            S3_LOOKUP_FOLDER,
                            "sign/",
                            os.path.basename(segment_filepath)
                        )
                    )
                else:
                    # The ffmpeg helpers do not raise on failure, so make a
                    # missing output visible instead of skipping it silently.
                    time_print("No resampled clip produced for segment {}".format(
                        segment.segment_id
                    ))

                if os.path.exists(temp_segment_filepath):
                    bucket.upload_file(
                        temp_segment_filepath,
                        os.path.join(
                            S3_LOOKUP_FOLDER,
                            "rawsign/",
                            os.path.basename(temp_segment_filepath)
                        )
                    )

                # A gloss may list several variants separated by '/'; each one
                # is stored upper-cased with '+' and '#' markers removed.
                for gloss in segment.gloss.upper().split('/'):
                    gloss = gloss.replace('+', '').replace('#', '')
                    if not gloss:
                        # NOTE(review): presumably 'Gloss' is the table key, and
                        # DynamoDB rejects empty key values - confirm.
                        continue
                    table.put_item(
                        Item={
                            'Gloss': gloss,
                            'SignID': segment.segment_id
                        }
                    )
            finally:
                # Clean up per-segment files even when a step above failed;
                # either file may be missing if ffmpeg did not produce it.
                for path in (temp_segment_filepath, segment_filepath):
                    if os.path.exists(path):
                        os.remove(path)

        # Update checkpoint after processing entire video
        checkpoint = s3.Object(S3_BUCKET, checkpoint_filepath)
        checkpoint.put(Body=r'{}'.format(video.video_id))
    finally:
        if os.path.exists(video_filepath):
            os.remove(video_filepath)

In [73]:
##utils


from collections import namedtuple
import os
import zipfile
import shutil
from getpass import getpass
import datetime

import boto3
import requests
import pandas as pd


def run_bash_cmd(cmd, dir=None):
    """Run ``cmd`` through the shell and return the completed process.

    A non-zero exit status does not raise (callers treat the commands as
    best-effort), but it is reported on stdout together with the tail of the
    command's stderr so failures are no longer silent.

    Args:
        cmd: command line, passed to the shell as a single string.
        dir: working directory for the command, or ``None`` for the current one.

    Returns:
        The ``subprocess.CompletedProcess`` with ``returncode``, ``stdout`` and
        ``stderr`` (both decoded as text).
    """
    result = subprocess.run(
        cmd,
        shell=True,
        cwd=dir,
        capture_output=True,
        text=True,
        # Tool output is not guaranteed to be valid UTF-8; never fail on decode.
        errors="replace",
    )
    if result.returncode != 0:
        print("Command failed with exit code {}: {}\n{}".format(
            result.returncode, cmd, result.stderr[-2000:]
        ))
    return result

def zip_dir(directory, zipped_filepath):
    """Zip every file found under ``directory`` into ``zipped_filepath``.

    Files are stored under the same path they were found at
    (``os.path.join(root, filename)``), so archive member names include the
    ``directory`` prefix as given.

    Args:
        directory: directory tree to archive (walked recursively).
        zipped_filepath: path of the zip file to create (overwritten if present).

    Returns:
        ``zipped_filepath``.
    """
    archive_realpath = os.path.realpath(zipped_filepath)
    # The context manager guarantees the archive is closed (and its central
    # directory written) even if adding a file fails.
    with zipfile.ZipFile(zipped_filepath, 'w', zipfile.ZIP_DEFLATED) as ziph:
        for root, _dirs, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                # When the archive lives inside `directory`, do not add the
                # half-written archive to itself.
                if os.path.realpath(filepath) == archive_realpath:
                    continue
                ziph.write(filepath)

    return zipped_filepath

def clip_video(from_video_filepath, to_video_filepath, start_frame, end_frame):
    """
    Create a video clip starting at @start_frame and ending at @end_frame inclusive.

    The clip is cut with ffmpeg's ``trim`` filter, audio is dropped (``-an``)
    and an existing output file is overwritten (``-y``).

    Args:
        from_video_filepath: path of the source video.
        to_video_filepath: path the clip is written to.
        start_frame: index of the first frame to keep.
        end_frame: index of the last frame to keep.

    Returns:
        ``to_video_filepath``. ffmpeg failures do not raise here, so the file
        may be absent afterwards.
    """
    unformatted_cmd = "ffmpeg -i {from_path} -vf trim=start_frame={start_frame}:end_frame={end_frame} -y -an {to_path}"

    cmd = unformatted_cmd.format(
        # Quote the paths: the command goes through a shell, and an unquoted
        # path containing spaces or metacharacters would break or alter it.
        from_path=shlex.quote(str(from_video_filepath)),
        to_path=shlex.quote(str(to_video_filepath)),
        start_frame=start_frame,
        # trim's end_frame is the first frame that gets dropped, hence +1 to
        # make the caller's end_frame inclusive.
        end_frame=end_frame + 1,
    )

    run_bash_cmd(cmd)

    return to_video_filepath

def resample_video(from_video_filepath, to_video_filepath, frame_rate):
    """
    Resample a video to @frame_rate and write the result as a new H.264 video.

    An existing output file is overwritten (``-y``).

    Args:
        from_video_filepath: path of the source video.
        to_video_filepath: path the resampled video is written to.
        frame_rate: target frame rate passed to ffmpeg's ``fps`` filter.

    Returns:
        ``to_video_filepath``. ffmpeg failures do not raise here, so the file
        may be absent afterwards.
    """
    unformatted_cmd = "ffmpeg -i {from_path} -filter:v fps={frame_rate} -q:v 0 -vcodec h264  -y {to_path}"

    cmd = unformatted_cmd.format(
        # Quote the paths: the command goes through a shell, and an unquoted
        # path containing spaces or metacharacters would break or alter it.
        from_path=shlex.quote(str(from_video_filepath)),
        to_path=shlex.quote(str(to_video_filepath)),
        frame_rate=frame_rate,
    )

    run_bash_cmd(cmd)

    return to_video_filepath



# taken from https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests
def download_large_file(url, download_dir, filename):
    """Stream ``url`` to ``download_dir/filename`` without loading it into memory.

    Args:
        url: address to download.
        download_dir: directory to save into (created if missing).
        filename: name of the file inside ``download_dir``.

    Returns:
        The path of the downloaded file.

    Raises:
        requests.HTTPError: if the server answers with an error status. Nothing
            is written in that case, so an HTML error page is never saved as
            if it were the requested file.
    """
    os.makedirs(download_dir, exist_ok=True)
    local_filename = os.path.join(download_dir, filename)
    with requests.get(url, stream=True) as response:
        # Check the status before opening the target file so a failed request
        # leaves no bogus file behind.
        response.raise_for_status()
        with open(local_filename, 'wb') as file_obj:
            shutil.copyfileobj(response.raw, file_obj)

    return local_filename

def write_concat_input_file(video_filepaths, input_filepath):
    """Write one ``file '<path>'`` line per entry of ``video_filepaths``.

    The file at ``input_filepath`` is created or overwritten; the same path is
    returned.
    """
    with open(input_filepath, "w") as listing:
        listing.writelines(
            "file '{}'\n".format(video_path) for video_path in video_filepaths
        )

    return input_filepath

def download_pose_files(bucket, s3_lookup_folder, download_directory, pose_ids):
    """Download ``pose-<id>.mov`` for every id in ``pose_ids``.

    Each file is fetched from ``s3_lookup_folder`` in ``bucket`` and saved
    under ``download_directory``. Returns the local paths in the order of
    ``pose_ids``.
    """
    def _fetch(pose_id):
        # Same filename on both sides: S3 key suffix and local file name.
        name = "pose-{}.mov".format(pose_id)
        local_path = os.path.join(download_directory, name)
        bucket.download_file(os.path.join(s3_lookup_folder, name), local_path)
        return local_path

    return [_fetch(pose_id) for pose_id in pose_ids]

def images_to_video(image_filepaths, video_filepath, frame_rate=30, size="400x336"):
    """
    Create an H.264 video from an image sequence.

    Args:
        image_filepaths: value passed to ffmpeg's ``-i`` for the ``image2``
            demuxer, i.e. a single input pattern such as ``frames/%06d.png``
            rather than a list of paths.
        video_filepath: path of the video to write (overwritten if it exists).
        frame_rate: frame rate passed to ``-r``; defaults to 30.
        size: frame size passed to ``-s`` as ``WIDTHxHEIGHT``; defaults to
            ``400x336``.

    Returns:
        ``video_filepath``. ffmpeg failures do not raise here, so the file may
        be absent afterwards.
    """
    # -y matches the other ffmpeg helpers: without it ffmpeg refuses to
    # overwrite an existing output and a re-run would leave a stale video.
    unformatted_cmd = (
        "ffmpeg -r {frame_rate} -f image2 -s {size} -i {image_filepaths} "
        "-vcodec libx264 -crf 25 -pix_fmt yuv420p -y {video_filepath}"
    )

    cmd = unformatted_cmd.format(
        frame_rate=frame_rate,
        size=shlex.quote(str(size)),
        # Quote the paths: the command goes through a shell.
        image_filepaths=shlex.quote(str(image_filepaths)),
        video_filepath=shlex.quote(str(video_filepath)),
    )

    run_bash_cmd(cmd)

    return video_filepath


def video_to_jpg(video_filepath, jpg_directory):
    """
    Extract the frames of a video as numbered JPEG files.

    Frames are written to ``jpg_directory`` as ``video-%06d.jpg`` (the number
    is filled in by ffmpeg).

    Args:
        video_filepath: path of the source video.
        jpg_directory: directory the JPEG files are written to.

    Returns:
        ``jpg_directory``. ffmpeg failures do not raise here.
    """
    unformatted_cmd = "ffmpeg -i {} {} -hide_banner"

    cmd = unformatted_cmd.format(
        # Quote the paths: the command goes through a shell, and an unquoted
        # path containing spaces or metacharacters would break or alter it.
        shlex.quote(str(video_filepath)),
        shlex.quote(os.path.join(jpg_directory, "video-%06d.jpg"))
    )

    run_bash_cmd(cmd)

    return jpg_directory

def jpg_to_png(jpg_directory, png_directory):
    """
    Convert every ``*.jpg`` in ``jpg_directory`` to PNG with ``mogrify``.

    Args:
        jpg_directory: directory containing the JPEG files.
        png_directory: directory passed to mogrify's ``-path`` option, where
            the PNG files are written.

    Returns:
        ``png_directory``. mogrify failures do not raise here.
    """
    unformatted_cmd = "mogrify -format png -path {} {}"

    # Quote only the directory part and leave "*.jpg" bare so the wildcard is
    # still expanded. An empty directory is kept empty because quoting it
    # would turn the relative pattern "*.jpg" into "''/*.jpg", i.e. "/*.jpg".
    quoted_jpg_directory = shlex.quote(str(jpg_directory)) if jpg_directory else ""

    cmd = unformatted_cmd.format(
        shlex.quote(str(png_directory)),
        os.path.join(quoted_jpg_directory, "*.jpg")
    )

    run_bash_cmd(cmd)

    return png_directory

def concat_videos(video_filepaths, output_filepath):
    """
    Concatenate videos into one file with ffmpeg's concat demuxer (stream copy).

    A listing file named ``input.txt`` is written next to the first video and
    contains only the base names of the inputs.
    NOTE(review): this presumably requires all inputs to live in that same
    directory - confirm against callers.

    Args:
        video_filepaths: non-empty sequence of input video paths, in output order.
        output_filepath: path of the concatenated video (overwritten if present).

    Returns:
        ``output_filepath``. ffmpeg failures do not raise here.

    Raises:
        IndexError: if ``video_filepaths`` is empty.
    """
    input_filepath = os.path.join(
        os.path.dirname(video_filepaths[0]),
        "input.txt"
    )

    video_filenames = [os.path.basename(f) for f in video_filepaths]
    write_concat_input_file(video_filenames, input_filepath)

    unformatted_cmd = "ffmpeg -f concat -safe 0 -i {} -codec copy -y {}"

    cmd = unformatted_cmd.format(
        # Quote the paths: the command goes through a shell, and an unquoted
        # path containing spaces or metacharacters would break or alter it.
        shlex.quote(str(input_filepath)),
        shlex.quote(str(output_filepath))
    )

    run_bash_cmd(cmd)

    return output_filepath



In [74]:
# AWS handles used by process_video (it reads `bucket`, `table`, `s3` and
# `checkpoint_filepath` as globals, so this cell must run before the loop below).
s3 = boto3.resource('s3')
bucket = s3.Bucket(S3_BUCKET)

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(TABLE_NAME)


# If the bucket doesn't exist, create it
if not bucket.creation_date:
    region = boto3.session.Session().region_name
    create_bucket_kwargs = {"Bucket": S3_BUCKET}
    # us-east-1 is the default location and must not be sent as a
    # LocationConstraint; the same applies when no region is configured.
    if region and region != "us-east-1":
        create_bucket_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": region
        }
    bucket = s3.create_bucket(**create_bucket_kwargs)

# One checkpoint object per partition, holding the id of the last video that
# was processed completely.
checkpoint_filename = "partition-{}-of-{}.txt".format(
    partition_id,
    number_partitions
)

checkpoint_filepath = os.path.join(
    S3_LOOKUP_FOLDER,  # was misspelled `S3_LOOKUP_FOlDER` (NameError on a fresh kernel)
    CHECKPOINTS_DIR,
    checkpoint_filename
)

try:
    checkpoint_video_id = s3.Object(
        S3_BUCKET, checkpoint_filepath
    ).get()['Body'].read().decode('utf-8')
    checkpoint_video_id = int(checkpoint_video_id)
except s3.meta.client.exceptions.NoSuchKey:
    # No checkpoint yet: start from the first video.
    checkpoint_video_id = None

video_metadata_filepath = os.path.join(
        DATA_DIR,
        VIDEO_METADATA_FILE
    )

videos = get_video_metadata(
    bucket, VIDEO_METADATA_FILE, video_metadata_filepath,
    partition_id, number_partitions
)
for video in videos:
    process_video(video,checkpoint_video_id)

# with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
#     executor.map(process_video,videos,repeat(checkpoint_video_id))




v2/data/metadata/video_metadata.csv
                Main New Gloss.1               Gloss Variant Consultant  \
0                          False                       False      Tyler   
1                          False                      FALSE+      Brady   
2                          False                      FALSE+      Brady   
3                          False                      FALSE+      Brady   
4                          False                      FALSE+        Liz   
...                          ...                         ...        ...   
9758            ns-nat-SRI-LANKA            ns-nat-SRI-LANKA        Liz   
9759             ns-nat-THAILAND             ns-nat-THAILAND      Brady   
9760             ns-nat-THAILAND             ns-nat-THAILAND        Liz   
9761  ns-nat-TRINIDAD-AND-TOBAGO  ns-nat-TRINIDAD-AND-TOBAGO      Brady   
9762  ns-nat-TRINIDAD-AND-TOBAGO  ns-nat-TRINIDAD-AND-TOBAGO        Liz   

                   Session  Scene  Start   End    id  \
0      