### Install python modules
Install on SageMaker the necesary modules for this notebook to work

In [None]:
!/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip
!pip install pytube
!pip install seaborn
!pip install smart_open

### Import modules

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import random as rd
from adaptors import youtube2df, aws2df, upload_yt_file
import logging
from pytube import YouTube
import boto3
from botocore.exceptions import ClientError
import sys, os
from urllib.parse import urlparse
import re

### Functions to get data from S3

In [None]:
### TODO: Move to adaptaros.py and standarice this code.
def get_all_s3_objects(s3, **base_kwargs):
    """ Amplify the limit of AWS results to 1000+
    
    :param s3: Bucket to amplify the result limit
    :return: None
    """
    continuation_token = None
    while True:
        list_kwargs = dict(MaxKeys=1000, **base_kwargs)
        if continuation_token:
            list_kwargs['ContinuationToken'] = continuation_token
        response = s3.list_objects_v2(**list_kwargs)
        yield from response.get('Contents', [])
        if not response.get('IsTruncated'):  # At the end of the list?
            break
        continuation_token = response.get('NextContinuationToken')

def get_folder_list(bucket='awstranscribe-tests', key='transcribeOutputs/Files'):
    """ Get the name of the files inside an AWS S3 Bucket
    
    :param bucket: AWS S3 bucket name
    :param key: directory and name in bucket, defaults to transcribeOutputs/Files
    :return: List with the name of each object in the S3 key
    """
    ###
    #  Get the name of the files in a bucket. While bucket is the AWS S3 Bucket and key is the folder inside that bucket
    # it defaults to transcribeOutputs/Files
    ###
    s3 = boto3.client('s3')
    data_loc = []
    for obj in get_all_s3_objects(s3, Bucket=bucket, Prefix=key):
        names = 's3://{}/{}'.format(bucket, obj['Key'])
        data_loc.append(names)
    return data_loc


Define a YouTube video URL

In [None]:
url = 'https://youtu.be/eEOhx-u9Z2k'
#url = 'https://youtu.be/Mxv9AM397Y8'

# Get video object
yt = YouTube(url)

### Download videos from Youtube
We'll download videos from youtube, save them into an S3 bucket that triggers a transcribe job (with or without IPA vocabulary)

In [4]:
# Get the tittle and replace spaces with underscore
yt_title = yt.title.replace(' ', '_')
    
# Download the video to tmp folder and save the output name to file_name
file_name = yt.streams.first().download(output_path="/tmp" ,filename=yt_title)


In [None]:
# Returns True if file was uploaded, else False
upload_yt_file(file_name)

### Download captions from the YouTube video
From the said YouTube video we'll get the 'a.es' caption if exist 

In [None]:
if yt.captions.get_by_language_code('a.es'):
    caption = yt.captions.get_by_language_code('a.es')
    yt_caption = caption.generate_srt_captions()
    yt_title = yt.title.replace(' ', '_')
    with open(f'/tmp/{yt_title}.txt', 'a') as f:
        f.write(yt_caption)
else:
    print('This caption doesn\'t exist')
    print('You can use one of the following captions: \n', yt.captions.all())

### Upload the caption to S3

In [None]:
upload_yt_file(f'/tmp/{yt_title}.txt', object_name=f'levenshteinTests/ytCaptions/{yt_title}.txt')

In [None]:
def aws2df2(filepath, aws_path=True):
    print(filepath)
    if aws_path:
        print('Here')
        data = pd.read_json(filepath)
    else:
        # read the file
        with open(filepath) as f:
            data = json.load(f)
            
    # compress into tuples
    tuples = []
    for value in data['results']['items']:
        if value.get('start_time'):
            tuples.append((value.get('start_time'),
                           value.get('end_time'),
                           value['alternatives'][0].get('content')
                           ))

    # transform to df
    _df = pd.DataFrame(tuples, columns=['start', 'end', 'transcript'])

    return _df

In [None]:
aws2df('s3://awstranscribe-tests/levenshteinTests/RAW/utubeVideos/Un_show_FALLIDO_-_Mini_Especial_Rancagua__Edo_Caroe.json')


#print(get_folder_list(key='levenshteinTests/RAW/utubeVideos'))

In [13]:
youtube2df('s3://awstranscribe-tests/levenshteinTests/ytCaptions/Un_show_FALLIDO_-_Mini_Especial_Rancagua_|_Edo_Caroe.txt')

ValueError: invalid literal for int() with base 10: ''

# Troubleshooting different Functions
I'll use the next cells for debug

In [9]:
def _times(a_string, index):
    """ create internal function that splits time into 'start' and 'end' times
    
    :param a_string: String to split
    :param index: 0 for start 1 for end
    :return: return start or end depending on index
    """  
    
    # split string using re lib
    splits = re.split(r'[, :]', a_string)

    # cast to int each split
    splits = [int(a_split) for index, a_split in enumerate(splits) if index != 4]

    # calculate second start and end
    if index == 0:
        t = splits[0] * 3600 + splits[1] * 60 + splits[2] + splits[3]/1000
    elif index == 1:
        t = splits[4] * 3600 + splits[5] * 60 + splits[6] + splits[7]/1000
    else:
        t = -1

    return t

In [15]:
def youtube2df2(filepath, aws_path=True):
    if aws_path:
        fpath = urlparse(filepath)
        bucket = fpath.netloc
        key = fpath.path.lstrip('/')
        s3 = boto3.resource('s3')
        obj = s3.Object(bucket, key)
        file = obj.get()['Body'].read().decode('utf8')
    else:
        # read the file
        with open(filepath) as f:
            file = f.read()

    # split the file
    lines = file.splitlines()
    
    # compress into tuples of three, ignoring 4th
    # tuples = [index, time, transcript]
    [print(lines[i], _times(lines[i + 1], 0), _times(lines[i + 1], 1), lines[i + 2]) for i in range(0, len(lines), 4)]
    tps = [(lines[i], _times(lines[i + 1], 0), _times(lines[i + 1], 1), lines[i + 2]) for i in range(0, len(lines), 4)]
    print(tps)
    # convert to dataframe
    _df = pd.DataFrame(tps, columns=['orig_index', 'start', 'end', 'transcript'])

    return _df

In [16]:
youtube2df2('s3://awstranscribe-tests/levenshteinTests/ytCaptions/Un_show_FALLIDO_-_Mini_Especial_Rancagua_|_Edo_Caroe.txt')

1 0.0 5.43 la fallida actuación en rancagua 1
2 3.27 6.87 problema de este lugar web sin duda uno
3 5.43 9.09 de los shows más raros de la gira moto
4 6.87 15.99 al sur y probablemente uno de los
5 9.09 18.15 mejores pero bueno los shows de la gira
6 15.99 21.119 en moto no se hacen en grandes teatros
7 18.15 23.22 mi casino no se hacen en bares lugares
8 21.119 27.32 más pequeños improvisados a veces
9 23.22 30.33 rústicos hostiles porque no toma
10 27.32 32.189 entonces las cosas no siempre salen como
11 30.33 34.59 uno quiere pero el bar de rancagua la
12 32.189 36.97 cago qué manera de fallar todo en ese
13 34.59 40.049 lugar igual
14 36.97 40.049 [Música]
15 42.65 47.07 los carros seguir será vas a pagar un
16 45.21 47.73 remate no da risa tanto cuidado del
17 47.07 49.32 huerto
18 47.73 52.38 hay una iluminación fallaron las
19 49.32 54.96 pantallas falló el lugar fue allí yo
20 52.38 57.3 pero la concha de tu madre allí yo por
21 54.96 58.69 la mismísima media estación me calles

ValueError: invalid literal for int() with base 10: ''