In [1]:
# Installing pip dependencies.
!pip install pydub
!pip install crepe
!pip install tqdm
# Installing conda dependencies
!conda install -c conda-forge ffmpeg -y

In [1]:
import sagemaker as sage
from sagemaker import get_execution_role
from sagemaker import ModelPackage
import boto3

from datetime import datetime
import zipfile
import os
import json 
import uuid
import requests
import time
from pydub import AudioSegment

# Make the helper modules in the local src/ folder importable.
import sys
sys.path.append('src')
import processing_util
import audio_util

Go to https://aws.amazon.com/marketplace/pp/prodview-23n4vi2zw67we to get the ARN of the model package.

***YOU NEED TO REPLACE THE VALUE OF THE `modelpackage_arn` VARIABLE WITH YOUR OWN ARN***

In [3]:
# Execution role: obtained from the SageMaker SDK and passed to ModelPackage further down.
role = get_execution_role()
# S3 prefixes: every key this demo writes for the batch job lives under common_prefix.
common_prefix = "source_separation"
batch_inference_input_prefix = common_prefix + "/batch-inference-input-data"
# Sagemaker Session: used for uploads and for looking up the default bucket.
sagemaker_session = sage.Session()
# Arn for Quantphi Source Separator Model Package.
# NOTE: replace this value with the ARN of your own subscription before running.
modelpackage_arn = 'arn:aws:sagemaker:us-east-2:057799348421:model-package/source-separation-v11570291536-75ed8128ecee95e142ec4404d884ecad'

Add the following policies to the IAM role used by this notebook:

* AmazonTranscribeFullAccess
* AWSMarketplaceManageSubscriptions
* AmazonPollyFullAccess
* AmazonSageMakerFullAccess

### Choose Song for Input

Choose the song for the demo by setting the `input_song` variable in the cell below to one of the following songs:

* ```imagine-john_lennon```
* ```toosie_slide-drake```
* ```just_the_way_you_are-bruno_mars```
* ```love_yourself-justin_bieber```

In [7]:
# Song to run the demo on; it selects the sub-folder of source-separation-input/ to upload.
input_song = "love_yourself-justin_bieber"
batch_input_folder = "source-separation-input/" + input_song + "/"

# Upload the song's folder to S3 under the batch-inference input prefix. The returned
# location is later handed to transformer.transform() as the job input.
transform_input = sagemaker_session.upload_data(batch_input_folder, key_prefix=batch_inference_input_prefix)

# NOTE(review): fixed pause after the upload; its purpose is not evident from the code.
time.sleep(5)

### Creating the Model

In [9]:
def predict_wrapper(endpoint, session):
    """Build the predictor object used for an endpoint of this model.

    Both arguments are forwarded unchanged to ``sage.RealTimePredictor``; the
    content type is fixed to 'application/x-recordio-protobuf'.
    """
    predictor = sage.RealTimePredictor(
        endpoint,
        session,
        content_type='application/x-recordio-protobuf',
    )
    return predictor


# Model object backed by the subscribed model package; predict_wrapper is
# registered as its predictor factory.
model = ModelPackage(
    role=role,
    model_package_arn=modelpackage_arn,
    sagemaker_session=sagemaker_session,
    predictor_cls=predict_wrapper,
)

### Running the Batch Job


In [2]:
# Default bucket of the session; the transform output is written below it.
bucket = sagemaker_session.default_bucket()

# Create a transformer from the model package and run the batch job on the uploaded input.
# Output goes to s3://<bucket>/<common_prefix>/batch-transform-output.
transformer = model.transformer(1, 'ml.m4.xlarge', strategy='SingleRecord', output_path='s3://'+bucket+'/'+common_prefix+'/batch-transform-output')
transformer.transform(transform_input, content_type='application/x-recordio-protobuf')
# Wait for the transform job before continuing with the next cells.
transformer.wait()

# NOTE(review): fixed pause after the job; its purpose is not evident from the code.
time.sleep(5)
print("Batch Transform output saved to " + transformer.output_path)

### Processing the Batch Output

In [None]:
# Downloading files from s3.
# Every object below the batch-transform output prefix is saved locally as
# output-<i>.zip, numbered in the order S3 lists the keys.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(sagemaker_session.default_bucket())
prefix = "source_separation/batch-transform-output/"
i = 0
processing_util.clear_folder('source-separation-output/batch-transform-output')
for object_summary in my_bucket.objects.filter(Prefix=prefix):
    file_name = object_summary.key.split('/')[-1]
    if not file_name:
        # Keys ending in '/' are "folder" placeholders, not result files.
        continue
    i = i + 1
    print(file_name)
    # Download by the object's real key; rebuilding it as prefix + file_name
    # points at a non-existent object whenever the key is nested deeper.
    my_bucket.download_file(object_summary.key, 'source-separation-output/batch-transform-output/output-{}.zip'.format(i))

time.sleep(5)

In [None]:
# Unpack every downloaded archive into its own sub-folder of 'extracted',
# named after the part of the archive name before the first dot.
processing_util.clear_folder('source-separation-output/extracted')
for archive_name in os.listdir('source-separation-output/batch-transform-output'):
    print(archive_name)
    archive_stem = archive_name.split('.')[0]
    with zipfile.ZipFile('source-separation-output/batch-transform-output/' + archive_name, 'r') as archive:
        archive.extractall('source-separation-output/extracted/' + archive_stem + '/')

time.sleep(5)

In [None]:
# Separating the vocal files and the background sound files.
def _batch_number(folder_name):
    """Return the numeric suffix of an 'output-<n>' folder name, or -1 if there is none."""
    suffix = folder_name.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


processing_util.clear_folder('source-separation-output/vocals')
processing_util.clear_folder('source-separation-output/background')
# The folders are named output-1, output-2, ..., output-10, ... without zero padding.
# A plain sorted() would put output-10 before output-2 and scramble the batch order,
# so sort by the numeric suffix (the name itself only breaks ties).
extracted_folders = sorted(
    os.listdir('source-separation-output/extracted/'),
    key=lambda name: (_batch_number(name), name),
)
for i, folder in enumerate(extracted_folders):
    # Zero-padded sequence number so that later lexicographic sorts keep batch order.
    new_file_name = str(i).zfill(5) + ".wav"
    for file in os.listdir('source-separation-output/extracted/' + folder + '/output'):
        if "vocals" in file:
            os.rename('source-separation-output/extracted/' + folder + '/output/' + file, 'source-separation-output/vocals/vocals' + new_file_name)
        elif "accompaniment" in file:
            os.rename('source-separation-output/extracted/' + folder + '/output/' + file, 'source-separation-output/background/background' + new_file_name)

time.sleep(5)

### Transcribe the Vocal Files

In [None]:
# Upload the vocal files to S3 under the "transcribe-input" prefix; the transcription
# cell builds its media URIs from that same prefix.
local_vocals_folder = "source-separation-output/vocals/"
transcribe_input_prefix = "transcribe-input"

transcribe_input = sagemaker_session.upload_data(local_vocals_folder, key_prefix=transcribe_input_prefix)
print("Transcribe input uploaded to " + transcribe_input)

# NOTE(review): fixed pause after the upload; its purpose is not evident from the code.
time.sleep(10)

In [None]:
# Start a transcription job for each file. Add the transcription to finished_jobs once finished.
transcribe = boto3.client('transcribe')
output_bucket_name = "transcribe-output"
processing_util.clear_folder('transcribe-output')
# get_bucket_location reports a LocationConstraint of None for buckets in us-east-1;
# fall back to the region name so the URL below stays valid.
bucket_region = boto3.client('s3').get_bucket_location(Bucket=sagemaker_session.default_bucket())['LocationConstraint'] or 'us-east-1'
uri_prefix = "https://%s.s3.%s.amazonaws.com/transcribe-input/" % (sagemaker_session.default_bucket(), bucket_region)
finished_jobs = list()

for file in sorted(os.listdir(local_vocals_folder)):

    print("Transcribing: " + file)
    job_uri = uri_prefix + file
    transcribe.start_transcription_job(
        TranscriptionJobName=file,
        Media={'MediaFileUri': job_uri},
        MediaFormat='wav',
        LanguageCode='en-US'
    )
    # Poll until the job reaches a terminal state, pausing between requests so the
    # loop does not hammer the API.
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=file)
        job_status = status['TranscriptionJob']['TranscriptionJobStatus']
        if job_status in ['COMPLETED', 'FAILED']:
            break
        time.sleep(3)
    if job_status == 'FAILED':
        # A failed job has no transcript to fetch. Remove the job so a re-run can reuse
        # the name, then stop with the reason instead of a KeyError further down.
        failure_reason = status['TranscriptionJob'].get('FailureReason', 'unknown reason')
        transcribe.delete_transcription_job(TranscriptionJobName=file)
        raise RuntimeError("Transcription of %s failed: %s" % (file, failure_reason))
    api_data = requests.get(url=status['TranscriptionJob']['Transcript']['TranscriptFileUri'])
    api_data.raise_for_status()
    data = api_data.json()
    finished_jobs.append(data)
    dump_file_name = 'transcribe-output/transcription' + file.split(".")[0] + '.json'
    # Writing to json files for analysis purposes.
    with open(dump_file_name, 'w') as f:
        json.dump(data, f, indent=4)
    transcribe.delete_transcription_job(TranscriptionJobName=file)

finished_jobs.sort(key=lambda x : x['jobName'])

time.sleep(10)

### Processing the Transcribe Output

In [1]:
# Short words tend to be transcribed too short. So this manually extends them.
# NOTE(review): despite its name, extend_word_length_factor is not applied as a percentage:
# the stitching cell adds it directly to a short word's end_time (presumably in ms — confirm).
extend_word_length_factor = 200 # amount added to end_time of words shorter than the threshold below
word_under_x_ms_long = 500 # ms

In [35]:
# Patching the batches back together, generate transcription list from all the batches.
transcribe_output_folder = "transcribe-output/"
offset = 0 # Takes into account that batches are sequential.
transcription_list = list()
index = 0
for file in sorted(os.listdir(transcribe_output_folder)):
    # Read the batch inside a with-block so the file handle is closed right away.
    with open(transcribe_output_folder + file, "r", encoding="utf-8") as batch_file:
        transcription_batch = json.load(batch_file)
    for map_item in transcription_batch["results"]["items"]:
        transcribe_object = processing_util.TranscriptionItem(map_item, index, offset)
        # Skip punctuation
        if transcribe_object.is_word():
            # Increase word duration if very short
            if transcribe_object.duration() < word_under_x_ms_long:
                transcribe_object.end_time += extend_word_length_factor
            transcription_list.append(transcribe_object)
            index += 1

    # Each transcription file shifts the following items by another 30000
    # (presumably ms, i.e. one 30-second batch — confirm against the input splitting).
    offset += 30000

# Compile the entire song transcription into one file.
transcribed_song_folder = "song-transcription/"
processing_util.clear_folder(transcribed_song_folder)
with open(transcribed_song_folder + "transcribed_song.json", 'w') as outfile:
    json.dump([item.to_dict() for item in transcription_list], outfile, indent=4)

time.sleep(5)

### Giving Transcriptions to Amazon Polly

Amazon Polly is queried for each individual word to allow for easier control of timing and pitch.

In [32]:
def query_polly(polly_client, word, length, prefix, output_folder, bucket_name=None):
    """Start an asynchronous Polly synthesis task for a single word.

    Parameters
    ----------
    polly_client : object exposing ``start_speech_synthesis_task`` (a boto3 Polly client).
    word : text to synthesize; XML-escaped before it is embedded in the SSML.
    length : maximum duration in milliseconds, used for ``amazon:max-duration``.
    prefix : appended to 'polly-output/' to form the S3 key prefix of the result.
    output_folder : currently unused; kept so existing callers keep working.
    bucket_name : S3 bucket that receives the audio. Defaults to the default bucket
        of ``sagemaker_session``, which is where the download cell looks for the files.

    Returns
    -------
    The response of ``start_speech_synthesis_task``.
    """
    from xml.sax.saxutils import escape

    if bucket_name is None:
        # Use the same bucket the notebook later downloads from instead of a
        # hard-coded, account-specific bucket name.
        bucket_name = sagemaker_session.default_bucket()

    # Escape the word so characters such as '&' or '<' cannot break the SSML document.
    ssml = """<speak><prosody amazon:max-duration="{max_len}ms">{word}</prosody></speak>""".format(max_len=str(length), word=escape(str(word)))
    response = polly_client.start_speech_synthesis_task(VoiceId='Justin',
                OutputS3BucketName=bucket_name,
                OutputS3KeyPrefix='polly-output/' + str(prefix),
                OutputFormat='mp3',
                TextType = 'ssml',
                Text = ssml)
    return response


In [3]:
from tqdm import tqdm

polly_client = boto3.client('polly')
polly_output_folder = "polly-output/"


def _polly_tasks_pending(client):
    """Return True while any Polly synthesis task is still scheduled or in progress."""
    for state in ('scheduled', 'inProgress'):
        request = {'Status': state}
        while True:
            page = client.list_speech_synthesis_tasks(**request)
            if page.get('SynthesisTasks'):
                return True
            next_token = page.get('NextToken')
            if not next_token:
                break
            request['NextToken'] = next_token
    return False


print("Generating audio file for each word...")
for transcribe_object in tqdm(transcription_list):

    response = query_polly(polly_client, transcribe_object.content, transcribe_object.duration(), transcribe_object.index, polly_output_folder)

# Synthesis tasks run asynchronously. Keep the original grace period, then keep
# waiting (up to 10 more minutes) until no task is scheduled or in progress, so
# the download cell does not run before all audio files exist.
time.sleep(30)
polly_wait_deadline = time.time() + 600
while _polly_tasks_pending(polly_client):
    if time.time() >= polly_wait_deadline:
        print("Warning: some Polly synthesis tasks were still running after the wait timeout.")
        break
    time.sleep(5)

### Processing the Output from Amazon Polly

In [4]:
# Downloading files from s3.
# Every object below the Polly output prefix is copied into a local folder of the same name.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(sagemaker_session.default_bucket())
prefix = "polly-output/"
processing_util.clear_folder(prefix)


for object_summary in my_bucket.objects.filter(Prefix=prefix):
    file_name = object_summary.key.split('/')[-1]
    if not file_name:
        # Keys ending in '/' are "folder" placeholders; there is nothing to download.
        continue
    # Download by the object's real key; prefix + file_name is only correct for flat keys.
    my_bucket.download_file(object_summary.key, prefix + file_name)

time.sleep(5)
print("Files moved from s3 to repo.")

Files moved from s3 to repo.


### Generate the song


In [4]:
import os
import sys
sys.path.append('src')
import processing_util
import audio_util
from tqdm import tqdm
import json
import warnings
from IPython.utils import io

POLLY_OUTPUT_FOLDER = "polly-output/"
BACKGROUND_FOLDER = "source-separation-output/background/"
FINAL_OUTPUT_FOLDER = "final-output/"
SONG_TRANSCRIPTION_PATH = "song-transcription/transcribed_song.json"
BATCH_LENGTH = 30000 # ms
warnings.simplefilter("ignore")

# This cell does not otherwise create the final output folder, so make sure it
# exists before the first export writes into it.
os.makedirs(FINAL_OUTPUT_FOLDER, exist_ok=True)

# Generate Background: concatenate the background clips in sorted (batch) order.
background_mp3_files = [BACKGROUND_FOLDER + s for s in sorted(os.listdir(BACKGROUND_FOLDER))]
background_mp3 = audio_util.interpret_polly_output_file(background_mp3_files[0])
background_mp3_files.pop(0)
for fname in tqdm(background_mp3_files):
    background_mp3 += audio_util.interpret_polly_output_file(fname)
background_mp3.export(FINAL_OUTPUT_FOLDER + "background.mp3", format="mp3")

# Generate Vocals
polly_output = sorted(os.listdir(POLLY_OUTPUT_FOLDER))
# Read the transcription inside a with-block so the file handle is closed right away.
with open(SONG_TRANSCRIPTION_PATH, "r", encoding="utf-8") as transcription_file:
    song_transcription = json.load(transcription_file)

# zip() below stops at the shorter sequence; make a mismatch visible instead of
# silently dropping words or audio clips.
if len(song_transcription) != len(polly_output):
    print("Warning: {} transcribed words but {} Polly output files; extra entries are ignored.".format(len(song_transcription), len(polly_output)))

vocal_mp3 = audio_util.get_silence(1)
expected_start_time = 0

for transcription_item, mp3_file in tqdm(list(zip(song_transcription, polly_output))):
    # Pad with silence when the next word starts later than the audio assembled so far.
    if expected_start_time < transcription_item["start_time"]:
        vocal_mp3 += audio_util.get_silence(transcription_item["start_time"] - expected_start_time)
        expected_start_time = transcription_item["start_time"]

    # Each Polly file name is expected to start with the index of its word.
    assert(mp3_file.startswith(transcription_item["index"]))
    audio_clip = audio_util.interpret_polly_output_file(POLLY_OUTPUT_FOLDER + mp3_file)

    # Version 1.1: No pitch modification, more stable
    #     vocal_mp3 += audio_clip
    #     expected_start_time += len(audio_clip)
    # Version 2.2: Includes pitch modification
    corrected_audio_clip = None
    # Pitch correction is only attempted for words longer than 50 (same unit as the
    # transcription times); its output is captured so it does not flood the cell.
    if transcription_item["end_time"] - transcription_item["start_time"] > 50:
        with io.capture_output() as captured:
            corrected_audio_clip = audio_util.pitch_correction(audio_clip, transcription_item["start_time"], transcription_item["end_time"], "temp/")
    if corrected_audio_clip:
        vocal_mp3 += corrected_audio_clip
        expected_start_time += len(corrected_audio_clip)
    else:
        # Fall back to the unmodified clip when no corrected clip is available.
        vocal_mp3 += audio_clip
        expected_start_time += len(audio_clip)

vocal_mp3.export(FINAL_OUTPUT_FOLDER + "vocals.mp3", format="mp3")


print("Overlaying the vocals with the accompaniment and generating the final audio file...")
final_audio = background_mp3.overlay(vocal_mp3)
final_audio.export(FINAL_OUTPUT_FOLDER + "final_audio.mp3", format="mp3")
print("Done.")


### Listen to the Song Cover :)

In [4]:
# Show an audio player for the final mix written by the song-generation cell.
import IPython.display as ipd
ipd.Audio("final-output/final_audio.mp3")