In [5]:
# Installing pip dependencies.
!pip install pydub
!pip install crepe
!pip install tqdm
# Installing conda dependencies
!conda install -c conda-forge ffmpeg -y

Collecting pydub
  Downloading https://files.pythonhosted.org/packages/79/db/eaf620b73a1eec3c8c6f8f5b0b236a50f9da88ad57802154b7ba7664d0b8/pydub-0.23.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.23.1
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting crepe
  Downloading https://files.pythonhosted.org/packages/c8/74/1677b9369f233745b3dedf707ce26fb935c5c400379c45400df818f3a805/crepe-0.0.11.tar.gz
Collecting resampy<0.3.0,>=0.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/79/75/e22272b9c2185fc8f3af6ce37229708b45e8b855fd4bc38b4d6b040fff65/resampy-0.2.2.tar.gz (323kB)
[K     |████████████████████████████████| 327kB 4.3MB/s eta 0:00:01
Collecting hmmlearn<0.3.0,>=0.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/ff/7b/33f629a443a0671161c019e55c3f1b511c7e9fdce5ab8c8c3c33470eb939/hmmlearn-0.2.3-cp36-cp36m-manylinux1_x86_64.whl (363kB)
[K     |████████████████████████████████| 

bzip2-1.0.8          | 396 KB    | ##################################### | 100% 
krb5-1.17.1          | 1.5 MB    | ##################################### | 100% 
ffmpeg-4.2           | 80.2 MB   | ##################################### | 100% 
gnutls-3.6.5         | 2.1 MB    | ##################################### | 100% 
pykerberos-1.2.1     | 27 KB     | ##################################### | 100% 
nettle-3.4.1         | 5.7 MB    | ##################################### | 100% 
curl-7.69.1          | 137 KB    | ##################################### | 100% 
openh264-1.8.0       | 1.4 MB    | ##################################### | 100% 
python-3.6.7         | 34.5 MB   | ##################################### | 100% 
cryptography-2.8     | 628 KB    | ##################################### | 100% 
pycurl-7.43.0.5      | 69 KB     | ##################################### | 100% 
lame-3.100           | 498 KB    | ##################################### | 100% 
libcurl-7.69.1       | 573 K

In [2]:
import sagemaker as sage
from sagemaker import get_execution_role
from sagemaker import ModelPackage
import boto3

from datetime import datetime
import zipfile
import os
import json 
import uuid
import requests
import time
from tqdm import tqdm
from pydub import AudioSegment

# Installing src dependency.
import sys
sys.path.append('src')
import processing_util
import audio_util

Go to https://aws.amazon.com/marketplace/pp/prodview-23n4vi2zw67we to get the arn for the model package. 

***YOU NEED TO REPLACE THE STRING FOR THE ```modelpackage_arn``` VARIABLE WITH YOUR OWN***

In [3]:
# Execution role
role = get_execution_role()
# Sagemaker Session
sagemaker_session = sage.Session()
# S3
s3 = boto3.resource('s3')
bucket_name = sagemaker_session.default_bucket()
my_bucket = s3.Bucket(bucket_name)
bucket_client = boto3.client('s3')
# Arn for Quantphi Source Separator Model Package
modelpackage_arn = 'arn:aws:sagemaker:us-east-2:057799348421:model-package/source-separation-v11570291536-75ed8128ecee95e142ec4404d884ecad'

For the Corresponding IAM Role, add the following policies:

* AmazonTranscribeFullAccess
* AWSMarketplaceManageSubscriptions
* AmazonPollyFullAccess
* AmazonSageMakerFullAccess

### Choose Song for Input

Below here just choose what song you want to do the demo with by replacing the current song specified by the input_song variable with one of the songs below:

* ```imagine-john_lennon```
* ```toosie_slide-drake```
* ```just_the_way_you_are-bruno_mars```
* ```love_yourself-justin_bieber```
* ```savage-megan_thee_stallion```
* ```crazy_in_love-sofia-karlberg```

*Note that you can add a custom input by uploading an mp3 file to the ```archive/songs/``` directory and adding the name of the mp3 file to the ```input_song``` variable.*

You can also choose the corresponding voice id from Amazon Polly which will be used to cover the song:

* ```Joey```
* ```Joanna```
* ```Matthew```

In [16]:
# Configuration
input_song = "toosie_slide-drake"
voice_id = "Joey"

# Take the mp3 and split it into 30 second segments.  
input_song = input_song if input_song.endswith(".mp3") else input_song + ".mp3"
batch_input_folder = "source-separation-input"
audio_util.split_mp3("./archive/songs/" + input_song, batch_input_folder + "/")
# Save all segments in s3 bucket for batch. 
processing_util.clear_s3_folder(my_bucket, bucket_client, batch_input_folder)
transform_input = sagemaker_session.upload_data(batch_input_folder + "/", key_prefix=batch_input_folder)

time.sleep(5)

### Creating the Model

In [12]:
def predict_wrapper(endpoint, session):
    return sage.RealTimePredictor(endpoint, session, content_type='application/x-recordio-protobuf')

model = ModelPackage(role=role,
                     model_package_arn=modelpackage_arn,
                     sagemaker_session=sagemaker_session,
                     predictor_cls=predict_wrapper)

### Running the Batch Job


In [13]:
batch_output_folder = "source-separation-output"

processing_util.clear_s3_folder(my_bucket, bucket_client, batch_output_folder + "/")
transformer = model.transformer(1, 'ml.m4.xlarge', strategy='SingleRecord', output_path='s3://' + bucket_name + "/" + batch_output_folder)
transformer.transform(transform_input, content_type='application/x-recordio-protobuf')
transformer.wait()

time.sleep(5)
print("Batch Transform output saved to " + transformer.output_path)

....................[34mStarting the inference server with 4 workers.[0m
[34m[2020-04-22 19:23:20 +0000] [10] [INFO] Starting gunicorn 19.9.0[0m
[34m[2020-04-22 19:23:20 +0000] [10] [INFO] Listening at: unix:/tmp/gunicorn.sock (10)[0m
[34m[2020-04-22 19:23:20 +0000] [10] [INFO] Using worker: gevent[0m
[34m[2020-04-22 19:23:20 +0000] [14] [INFO] Booting worker with pid: 14[0m
[34m[2020-04-22 19:23:20 +0000] [15] [INFO] Booting worker with pid: 15[0m
[34m[2020-04-22 19:23:20 +0000] [16] [INFO] Booting worker with pid: 16[0m
[34m[2020-04-22 19:23:20 +0000] [18] [INFO] Booting worker with pid: 18[0m
[34mTesting...[0m
[35mTesting...[0m
[34m2020-04-22 19:23:43.680968: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA[0m
[34m169.254.255.130 - - [22/Apr/2020:19:23:43 +0000] "GET /ping HTTP/1.1" 200 1 "-" "Go-http-client/1.1"[0m
[34m169.254.255.130 - - [22/Apr/2020:19:23:43 +

[34m169.254.255.130 - - [22/Apr/2020:19:30:04 +0000] "POST /invocations HTTP/1.1" 200 4372785 "-" "Go-http-client/1.1"[0m
[35m169.254.255.130 - - [22/Apr/2020:19:30:04 +0000] "POST /invocations HTTP/1.1" 200 4372785 "-" "Go-http-client/1.1"[0m

Batch Transform output saved to s3://sagemaker-us-east-2-075178354542/source-separation-output


### Processing the Batch Output

In [15]:
# Downloading files from s3.
i = 0
processing_util.clear_folder(batch_output_folder + "/")
for object_summary in my_bucket.objects.filter(Prefix=batch_output_folder):
    i = i + 1
    file_name = object_summary.key.split('/')[-1]
    print(file_name)
    my_bucket.download_file(batch_output_folder + "/" + file_name, batch_output_folder + '/output-{}.zip'.format(i))
    
time.sleep(5)

input1.mp3.out
input2.mp3.out
input3.mp3.out
input4.mp3.out
input5.mp3.out
input6.mp3.out
input7.mp3.out
input8.mp3.out
input9.mp3.out


In [17]:
# Extracting files from zip files. 
extraction_folder = 'source-separation-output-extracted/'
processing_util.clear_folder(extraction_folder)
for file in os.listdir(batch_output_folder):
    print(file)
    with zipfile.ZipFile(batch_output_folder + "/" + file, 'r') as zip_ref:
        zip_ref.extractall(extraction_folder+file.split('.')[0]+'/')
        
time.sleep(5)

output-9.zip
output-7.zip
output-4.zip
output-8.zip
output-3.zip
output-1.zip
output-2.zip
output-6.zip
output-5.zip


In [19]:
# Separating the vocal files and the background sound files.
isolated_vocals_folder = "isolated-vocals/"
isolated_background_folder = "isolated-background/"

processing_util.clear_folder(isolated_vocals_folder)
processing_util.clear_folder(isolated_background_folder)
for i, folder in enumerate(sorted(os.listdir(extraction_folder))):
    for file in os.listdir(extraction_folder + folder + '/output'):
        new_file_name = str(i).zfill(5) + ".wav"
        if "vocals" in file:
            os.rename(extraction_folder + folder + '/output/' + file, isolated_vocals_folder + 'vocals' + new_file_name)
        elif "accompaniment" in file:
            os.rename(extraction_folder + folder + '/output/' + file, isolated_background_folder + 'background' + new_file_name)
            
time.sleep(5)

### Transcribe the Vocal Files

In [41]:
# Upload the Vocal files onto s3
transcribe_input_prefix = "transcribe-input"

processing_util.clear_s3_folder(my_bucket, bucket_client, transcribe_input_prefix + "/")
transcribe_input = sagemaker_session.upload_data(isolated_vocals_folder, key_prefix=transcribe_input_prefix)
print("Transcribe input uploaded to " + transcribe_input)

time.sleep(10)

Transcribe input uploaded to s3://sagemaker-us-east-2-075178354542/transcribe-input


In [42]:
# Start a transcription job for each file. Add the transcription to finsihed jobs once finished. 
transcribe = boto3.client('transcribe')
output_bucket_name = "transcribe-output"
processing_util.clear_folder('transcribe-output')
uri_prefix = "https://%s.s3.%s.amazonaws.com/transcribe-input/" % (sagemaker_session.default_bucket(), boto3.client('s3').get_bucket_location(Bucket=sagemaker_session.default_bucket())['LocationConstraint'])
finished_jobs = list()

for file in sorted(os.listdir(isolated_vocals_folder)):

    print("Transcribing: " + file)
    job_uri = uri_prefix + file
    transcribe.start_transcription_job(
        TranscriptionJobName=file,
        Media={'MediaFileUri': job_uri},
        MediaFormat='wav',
        LanguageCode='en-US'
    )
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=file)
        if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
            break
    time.sleep(3)
    api_data = requests.get(url=status['TranscriptionJob']['Transcript']['TranscriptFileUri'])
    data = api_data.json()
    finished_jobs.append(data)
    dump_file_name = 'transcribe-output/transcription' + file.split(".")[0] + '.json'
    # Writing to json files for analysis purposes.
    with open(dump_file_name, 'w') as f:
        json.dump(data, f, indent=4)
    transcribe.delete_transcription_job(TranscriptionJobName=file)
    
finished_jobs.sort(key=lambda x : x['jobName'])

time.sleep(10)

Transcribing: vocals00000.wav
Transcribing: vocals00001.wav
Transcribing: vocals00002.wav
Transcribing: vocals00003.wav
Transcribing: vocals00004.wav
Transcribing: vocals00005.wav
Transcribing: vocals00006.wav
Transcribing: vocals00007.wav
Transcribing: vocals00008.wav


### Processing the Transcribe Output

In [10]:
# Short words tend to be transcribed too short. So this manually extends them. 
extend_word_length_factor = 100 
word_under_x_ms_long = 200 

In [11]:
# Patching the batches back together, generate transcription list from all the batches. 
transcribe_output_folder = "transcribe-output/"
offset = 0 # Takes into account that batches are sequential.
transcription_list = list()
index = 0
for file in sorted(os.listdir(transcribe_output_folder)):
    transcription_batch = json.load(open(transcribe_output_folder + file, "r", encoding="utf-8"))
    for map_item in transcription_batch["results"]["items"]:
        transcribe_object = processing_util.TranscriptionItem(map_item, index, offset)
        # Skip punctuation
        if transcribe_object.is_word():
            # Increase word duration if very short
            if transcribe_object.duration() < word_under_x_ms_long:
                transcribe_object.end_time += extend_word_length_factor
            transcription_list.append(transcribe_object)
            index += 1

    offset += 30000
    
# Compile the entire song transcription into one file.
transcribed_song_folder = "song-transcription/"
processing_util.clear_folder(transcribed_song_folder)
with open(transcribed_song_folder + "transcribed_song.json", 'w') as outfile:
    json.dump([item.to_dict() for item in transcription_list], outfile, indent=4)

time.sleep(5)

### Giving Transcriptions to Amazon Polly

Amazon Polly is queried for each individual word to allow for easier control of timing and pitch.

In [5]:
def query_polly(polly_client, word, length, bucket_prefix, output_folder, pitch_mod, voice_id='Joey'):
    
    if pitch_mod > 12 or pitch_mod < -12:
        pitch_mod_factor = 0
    else:
        pitch_mod_factor = ((2**(1.0 * pitch_mod / 12.0)) - 1) * 100
        
    pitch_mod_factor = "+" + str(pitch_mod_factor) if pitch_mod_factor >= 0 else str(pitch_mod_factor)
    
    ssml = """<speak><prosody amazon:max-duration="{max_len}ms"><prosody pitch="{pitch_mod_factor}%">{word}</prosody></prosody></speak>""".format(max_len=str(length), word=word, pitch_mod_factor=pitch_mod_factor)          
    response = polly_client.start_speech_synthesis_task(VoiceId=voice_id,
                OutputS3BucketName=sagemaker_session.default_bucket(),
                OutputS3KeyPrefix=output_folder + bucket_prefix,
                OutputFormat='mp3', 
                TextType = 'ssml',
                Text = ssml)


Here we are giving the initial transcriptions to Amazon Polly in order to find the pitch of each individual word. 

In [17]:
print("Generating audio file for each word...")

polly_client = boto3.client('polly')
polly_output_folder = "polly-output-1/"
processing_util.clear_s3_folder(my_bucket, bucket_client, polly_output_folder)

for transcribe_object in tqdm(transcription_list):
    response = query_polly(polly_client, transcribe_object.content, transcribe_object.duration(), transcribe_object.index, polly_output_folder, 0, voice_id)
    
time.sleep(30)

Generating audio file for each word...


100%|██████████| 524/524 [00:52<00:00, 10.04it/s]


### Processing the Output from Amazon Polly

In [20]:
# Downloading files from s3.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(sagemaker_session.default_bucket())
prefix = "polly-output-1/"
processing_util.clear_folder(prefix)

for object_summary in my_bucket.objects.filter(Prefix=prefix):
    file_name = object_summary.key.split('/')[-1]
    my_bucket.download_file(prefix+ file_name, prefix + file_name)

time.sleep(5)
print("Files moved from s3 to repo.")

Files moved from s3 to repo.


### Pitch Correction

In [21]:
from IPython.utils import io

polly_output_folder = "polly-output-1/"
SONG_TRANSCRIPTION_PATH = "song-transcription/transcribed_song.json"


polly_client = boto3.client('polly')
polly_output_corrected_folder = "polly-output-corrected/"
processing_util.clear_s3_folder(my_bucket, bucket_client, polly_output_corrected_folder)

polly_output = sorted(os.listdir(polly_output_folder))
song_transcription = json.load(open(SONG_TRANSCRIPTION_PATH, "r", encoding="utf-8"))

for transcription_item, mp3_file in list(zip(song_transcription, polly_output)):
    
    assert(mp3_file.startswith(transcription_item["index"]))
    audio_clip = audio_util.interpret_polly_output_file(polly_output_folder + mp3_file)
    
    pitch_mod = 0
    if transcription_item["end_time"] - transcription_item["start_time"] > 50:
        with io.capture_output() as captured:
            pitch_mod = audio_util.pitch_difference(audio_clip, transcription_item["start_time"], transcription_item["end_time"], "temp/")
            pitch_mod = 0 if pitch_mod is None else pitch_mod
       
    print("Content: {content}, Pitch Correction: {pitch_mod}, Pitch Factor: {pitch_factor}".format(content=transcription_item["content"], pitch_mod=pitch_mod, pitch_factor=(2**(1.0 * pitch_mod / 12.0)) - 1))
    
    response = query_polly(polly_client, transcription_item["content"], transcription_item["end_time"] - transcription_item["start_time"], transcription_item["index"], polly_output_corrected_folder, pitch_mod, voice_id)


Content: Oh, Pitch Correction: 19, Pitch Factor: 1.996614153753363
Content: black, Pitch Correction: 15, Pitch Factor: 1.378414230005442
Content: leather, Pitch Correction: 10, Pitch Factor: 0.7817974362806785
Content: blood, Pitch Correction: 11, Pitch Factor: 0.8877486253633868
Content: No, Pitch Correction: 9, Pitch Factor: 0.681792830507429
Content: seafood, Pitch Correction: 10, Pitch Factor: 0.7817974362806785
Content: buckles, Pitch Correction: -4, Pitch Factor: -0.2062994740159002
Content: on, Pitch Correction: 13, Pitch Factor: 1.1189261887185906
Content: a, Pitch Correction: -3, Pitch Factor: -0.1591035847462855
Content: jacket, Pitch Correction: 2, Pitch Factor: 0.12246204830937302
Content: It's, Pitch Correction: 8, Pitch Factor: 0.5874010519681994
Content: a, Pitch Correction: 9, Pitch Factor: 0.681792830507429
Content: leak, Pitch Correction: 8, Pitch Factor: 0.5874010519681994
Content: Shit, Pitch Correction: -26, Pitch Factor: -0.7772753204649152
Content: Nike, Pitch Co

Content: show, Pitch Correction: 8, Pitch Factor: 0.5874010519681994
Content: you, Pitch Correction: 3, Pitch Factor: 0.18920711500272103
Content: how, Pitch Correction: 6, Pitch Factor: 0.41421356237309515
Content: to, Pitch Correction: -4, Pitch Factor: -0.2062994740159002
Content: get, Pitch Correction: 23, Pitch Factor: 2.775497250726774
Content: it, Pitch Correction: 1, Pitch Factor: 0.05946309435929531
Content: go, Pitch Correction: 8, Pitch Factor: 0.5874010519681994
Content: right, Pitch Correction: 14, Pitch Factor: 1.244924096618746
Content: for, Pitch Correction: 6, Pitch Factor: 0.41421356237309515
Content: uh, Pitch Correction: 5, Pitch Factor: 0.33483985417003437
Content: that, Pitch Correction: 2, Pitch Factor: 0.12246204830937302
Content: that, Pitch Correction: 4, Pitch Factor: 0.2599210498948732
Content: foot, Pitch Correction: -23, Pitch Factor: -0.7351342264101761
Content: right, Pitch Correction: -12, Pitch Factor: -0.5
Content: foot, Pitch Correction: -25, Pitch F

Content: leaky, Pitch Correction: 4, Pitch Factor: 0.2599210498948732
Content: ship, Pitch Correction: -38, Pitch Factor: -0.8886376602324576
Content: Nike, Pitch Correction: 9, Pitch Factor: 0.681792830507429
Content: cross, Pitch Correction: 14, Pitch Factor: 1.244924096618746
Content: body, Pitch Correction: 11, Pitch Factor: 0.8877486253633868
Content: got, Pitch Correction: 11, Pitch Factor: 0.8877486253633868
Content: a, Pitch Correction: 11, Pitch Factor: 0.8877486253633868
Content: piece, Pitch Correction: 16, Pitch Factor: 1.5198420997897464
Content: and, Pitch Correction: 14, Pitch Factor: 1.244924096618746
Content: got, Pitch Correction: 18, Pitch Factor: 1.8284271247461903
Content: a, Pitch Correction: 6, Pitch Factor: 0.41421356237309515
Content: dance, Pitch Correction: 30, Pitch Factor: 4.656854249492381
Content: But, Pitch Correction: 11, Pitch Factor: 0.8877486253633868
Content: it's, Pitch Correction: -25, Pitch Factor: -0.7640314218295766
Content: really, Pitch Corre

Content: if, Pitch Correction: -24, Pitch Factor: -0.75
Content: I'm, Pitch Correction: 11, Pitch Factor: 0.8877486253633868
Content: over, Pitch Correction: 6, Pitch Factor: 0.41421356237309515
Content: shake, Pitch Correction: 27, Pitch Factor: 3.756828460010884
Content: each, Pitch Correction: 1, Pitch Factor: 0.05946309435929531
Content: other, Pitch Correction: -4, Pitch Factor: -0.2062994740159002
Content: due, Pitch Correction: 4, Pitch Factor: 0.2599210498948732
Content: to, Pitch Correction: -1, Pitch Factor: -0.05612568731830647
Content: shit, Pitch Correction: 5, Pitch Factor: 0.33483985417003437
Content: himself, Pitch Correction: 6, Pitch Factor: 0.41421356237309515
Content: Your, Pitch Correction: 25, Pitch Factor: 3.2378523774371812
Content: solo, Pitch Correction: 13, Pitch Factor: 1.1189261887185906
Content: figures, Pitch Correction: 27, Pitch Factor: 3.756828460010884
Content: on, Pitch Correction: 10, Pitch Factor: 0.7817974362806785
Content: a, Pitch Correction: 2,

Content: with, Pitch Correction: -1, Pitch Factor: -0.05612568731830647
Content: me, Pitch Correction: 4, Pitch Factor: 0.2599210498948732
Content: No, Pitch Correction: 10, Pitch Factor: 0.7817974362806785
Content: I, Pitch Correction: -1, Pitch Factor: -0.05612568731830647
Content: could, Pitch Correction: -3, Pitch Factor: -0.1591035847462855
Content: guess, Pitch Correction: 8, Pitch Factor: 0.5874010519681994
Content: I'm, Pitch Correction: 9, Pitch Factor: 0.681792830507429
Content: Michael, Pitch Correction: 0, Pitch Factor: 0.0
Content: J, Pitch Correction: 12, Pitch Factor: 1.0
Content: I, Pitch Correction: 1, Pitch Factor: 0.05946309435929531
Content: could, Pitch Correction: 1, Pitch Factor: 0.05946309435929531
Content: get, Pitch Correction: 6, Pitch Factor: 0.41421356237309515
Content: your, Pitch Correction: 6, Pitch Factor: 0.41421356237309515
Content: satisfied, Pitch Correction: 7, Pitch Factor: 0.4983070768766815
Content: You, Pitch Correction: 6, Pitch Factor: 0.4142

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

### Processing Corrected Output from Amazon Polly

In [22]:
# Downloading files from s3.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(sagemaker_session.default_bucket())
processing_util.clear_folder(polly_output_corrected_folder)

for object_summary in my_bucket.objects.filter(Prefix=polly_output_corrected_folder):
    file_name = object_summary.key.split('/')[-1]
    my_bucket.download_file(polly_output_corrected_folder+ file_name, polly_output_corrected_folder + file_name)

time.sleep(5)
print("Files moved from s3 to repo.")

Files moved from s3 to repo.


### Generate the song


In [26]:
# Version 1:
# POLLY_OUTPUT_FOLDER = polly_output_folder
# Version 2:
POLLY_OUTPUT_FOLDER = polly_output_corrected_folder
BACKGROUND_FOLDER = "isolated-background/"
FINAL_OUTPUT_FOLDER = "final-output/"
SONG_TRANSCRIPTION_PATH = "song-transcription/transcribed_song.json"
BATCH_LENGTH = 30000 # m

# Generate Background
background_mp3_files = [BACKGROUND_FOLDER + s for s in sorted(os.listdir(BACKGROUND_FOLDER))]
background_mp3 = audio_util.interpret_polly_output_file(background_mp3_files[0])
background_mp3_files.pop(0)
for fname in tqdm(background_mp3_files):
    background_mp3 += audio_util.interpret_polly_output_file(fname)
background_mp3.export(FINAL_OUTPUT_FOLDER + "background.mp3", format="mp3")

# Generate Vocals
polly_output = sorted(os.listdir(POLLY_OUTPUT_FOLDER))
song_transcription = json.load(open(SONG_TRANSCRIPTION_PATH, "r", encoding="utf-8"))

vocal_mp3 = audio_util.get_silence(1)
expected_start_time = 0

for transcription_item, mp3_file in tqdm(list(zip(song_transcription, polly_output))):
    if expected_start_time < transcription_item["start_time"]:
        vocal_mp3 += audio_util.get_silence(transcription_item["start_time"] - expected_start_time)
        expected_start_time = transcription_item["start_time"]

    assert(mp3_file.startswith(transcription_item["index"]))
    audio_clip = audio_util.interpret_polly_output_file(POLLY_OUTPUT_FOLDER + mp3_file)

# Version 1.1: No pitch modification, more stable
    vocal_mp3 += audio_clip
    expected_start_time += len(audio_clip)

vocal_mp3.export(FINAL_OUTPUT_FOLDER + "vocals.mp3", format="mp3")


print("Overlaying the vocals with the accompaniment and generating the final audio file...")
final_audio = background_mp3.overlay(vocal_mp3)
final_audio.export(FINAL_OUTPUT_FOLDER + "final_audio.mp3", format="mp3")
print("Done.")


100%|██████████| 8/8 [00:01<00:00,  5.51it/s]
100%|██████████| 499/499 [00:53<00:00,  9.41it/s]


Overlaying the vocals with the accompaniment and generating the final audio file...
Done.


### Listen to the Song Cover :)

In [None]:
import IPython.display as ipd
ipd.Audio("final-output/final_audio.mp3")