### Install python modules
Install the Python modules this notebook needs in order to run on SageMaker.

In [None]:
!/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip
!pip install pytube
!pip install seaborn
!pip install smart_open

### Import modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import random as rd
from adaptors import youtube2df, aws2df, upload_yt_file
import logging
from pytube import YouTube
import boto3
from botocore.exceptions import ClientError
import sys, os
from urllib.parse import urlparse
import re

### Functions to get data from S3

In [None]:
### TODO: Move to adaptors.py and standardize this code.
def get_all_s3_objects(s3, **base_kwargs):
    """ Yield every object returned by ``s3.list_objects_v2``, across all pages

    Each ``list_objects_v2`` call returns a single page of results. This
    generator keeps requesting pages, passing along the continuation token of
    the previous response, until a response is no longer marked as truncated.

    :param s3: client exposing ``list_objects_v2`` (e.g. ``boto3.client('s3')``)
    :param base_kwargs: keyword arguments forwarded to every
        ``list_objects_v2`` call (e.g. ``Bucket``, ``Prefix``). ``MaxKeys``
        defaults to 1000 and may be overridden here.
    :return: generator yielding each entry of the responses' ``Contents`` lists
    """
    # Merge through a dict literal so a caller-supplied MaxKeys overrides the
    # default instead of raising "got multiple values for keyword argument".
    list_kwargs = {'MaxKeys': 1000, **base_kwargs}
    while True:
        response = s3.list_objects_v2(**list_kwargs)
        # A page without 'Contents' (e.g. empty prefix) simply yields nothing.
        yield from response.get('Contents', [])
        if not response.get('IsTruncated'):  # At the end of the list?
            break
        continuation_token = response.get('NextContinuationToken')
        if not continuation_token:
            # Truncated but no token to resume from: stop here rather than
            # requesting the first page again forever.
            break
        list_kwargs['ContinuationToken'] = continuation_token

def get_folder_list(bucket='awstranscribe-tests', key='transcribeOutputs/Files'):
    """ List the S3 URI of every object stored under a key prefix

    :param bucket: AWS S3 bucket name, defaults to awstranscribe-tests
    :param key: key prefix (folder) inside the bucket, defaults to transcribeOutputs/Files
    :return: List of 's3://<bucket>/<object key>' strings, one per listed object
    """
    client = boto3.client('s3')
    # get_all_s3_objects pages through the listing, so every matching object is included.
    listing = get_all_s3_objects(client, Bucket=bucket, Prefix=key)
    return ['s3://{}/{}'.format(bucket, entry['Key']) for entry in listing]


Define a YouTube video URL

In [None]:
# URL of the YouTube video used by the cells below.
url = 'https://youtu.be/eEOhx-u9Z2k'
# Alternative video; uncomment to use it instead of the one above.
#url = 'https://youtu.be/Mxv9AM397Y8'

# Build the pytube YouTube object for that URL; later cells read its title,
# streams and captions.
yt = YouTube(url)

### Download videos from Youtube
We'll download a video from YouTube and save it into an S3 bucket; the upload triggers a Transcribe job (with or without IPA vocabulary).

In [None]:
# Build the output name from the video title, with spaces replaced by underscores
yt_title = yt.title.replace(' ', '_')

# Download the first available stream into /tmp; download() returns the value
# kept in file_name, which the upload cell passes to upload_yt_file
file_name = (
    yt.streams
    .first()
    .download(output_path="/tmp", filename=yt_title)
)


In [None]:
# Upload the downloaded video (path held in file_name) using the upload_yt_file
# helper imported from adaptors.
# Per the original note, the helper returns True if the file was uploaded,
# else False — TODO confirm against adaptors.py.
upload_yt_file(file_name)

### Download captions from the YouTube video
From that YouTube video we'll get the 'a.es' caption, if it exists.

In [None]:
# Look the 'a.es' caption track up once and reuse the result.
caption = yt.captions.get_by_language_code('a.es')
if caption:
    yt_caption = caption.generate_srt_captions()
    yt_title = yt.title.replace(' ', '_')
    # Open with 'w' (not 'a') so re-running this cell overwrites the file
    # instead of appending a second copy of the captions, and write UTF-8
    # explicitly because the text contains accented characters.
    with open(f'/tmp/{yt_title}.txt', 'w', encoding='utf-8') as f:
        f.write(yt_caption)
else:
    # No such track: tell the user and list what pytube reports as available.
    print('This caption doesn\'t exist')
    print('You can use one of the following captions: \n', yt.captions.all())

### Upload the caption to S3

In [None]:
# Upload the caption file written to /tmp under the levenshteinTests/ytCaptions/
# prefix, keeping the same <title>.txt name.
# NOTE(review): the file only exists if the caption cell found the 'a.es' track —
# confirm how upload_yt_file behaves when the path is missing.
upload_yt_file(f'/tmp/{yt_title}.txt', object_name=f'levenshteinTests/ytCaptions/{yt_title}.txt')

# Test and compare Data

In [6]:
# Load the transcription JSON stored in S3 through the aws2df helper (from adaptors)
# and preview the first rows; the rendered output shows start, end and transcript columns.
aws_transcribe = aws2df('s3://awstranscribe-tests/levenshteinTests/RAW/utubeVideos/Un_show_FALLIDO_-_Mini_Especial_Rancagua__Edo_Caroe.json')
aws_transcribe.head()

Unnamed: 0,start,end,transcript
0,0.04,0.51,La
1,0.51,1.03,Fallida
2,1.03,1.73,actuación
3,1.74,1.92,en
4,1.92,3.08,Rancagua


In [5]:
# Load the YouTube caption file stored in S3 through the youtube2df helper (from adaptors)
# and preview the first rows; the rendered output shows orig_index, start, end and
# transcript columns.
yt_transcribe = youtube2df('s3://awstranscribe-tests/levenshteinTests/ytCaptions/Un_show_FALLIDO_-_Mini_Especial_Rancagua_|_Edo_Caroe.txt')
yt_transcribe.head()

Unnamed: 0,orig_index,start,end,transcript
0,1,0.0,5.43,la fallida actuación en rancagua 1
1,2,3.27,6.87,problema de este lugar web sin duda uno
2,3,5.43,9.09,de los shows más raros de la gira moto
3,4,6.87,15.99,al sur y probablemente uno de los
4,5,9.09,18.15,mejores pero bueno los shows de la gira


# Troubleshooting different Functions
I'll use the next cells for debugging.