In [89]:
!git pull

remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 4 (delta 2), reused 4 (delta 2), pack-reused 0[K
Unpacking objects: 100% (4/4), done.
From https://github.com/basilwong/awstest1
   a4d6946..a1eba0a  master     -> origin/master
Updating a4d6946..a1eba0a
Fast-forward
 src/processing_util.py | 3 [32m++[m[31m-[m
 1 file changed, 2 insertions(+), 1 deletion(-)


### Add Dependencies

In [95]:
import sagemaker as sage
from sagemaker import get_execution_role

import zipfile
import os

from sagemaker import ModelPackage

# some_file.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.append('src')

!pip install pydub

import audio_util
import processing_util

[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Execution role
role = get_execution_role()
# S3 prefixes
common_prefix = "source_separation"
batch_inference_input_prefix = common_prefix + "/batch-inference-input-data"
# Sagemaker Session
sagemaker_session = sage.Session()
# Arn for Source Separator Model Package
modelpackage_arn = 'arn:aws:sagemaker:us-east-2:057799348421:model-package/source-separation-v11570291536-75ed8128ecee95e142ec4404d884ecad'



### Creating the Model

In [3]:
from sagemaker import ModelPackage

def predict_wrapper(endpoint, session):
    return sage.RealTimePredictor(endpoint, session, content_type='application/x-recordio-protobuf')

model = ModelPackage(role=role,
                     model_package_arn=modelpackage_arn,
                     sagemaker_session=sagemaker_session,
                     predictor_cls=predict_wrapper)

### Running the Batch Job

Note that if the initial audio file is longer than around 30 seconds, it is too large for the model. The split_mp3() method in  src.audio_util works around this by splitting an mp3 file into 30 second segments. 

In [None]:
audio_util.split_mp3("../songs/drake-toosie_slide.mp3", "../source-separation-input/")

In [4]:
batch_input_folder = "source-separation-input"


transform_input = sagemaker_session.upload_data(batch_input_folder, key_prefix=batch_inference_input_prefix)
print("Transform input uploaded to " + transform_input)

Transform input uploaded to s3://sagemaker-us-east-2-075178354542/source_separation/batch-inference-input-data


In [5]:
import json 
import uuid

bucket = sagemaker_session.default_bucket()

transformer = model.transformer(1, 'ml.m4.xlarge', strategy='SingleRecord', output_path='s3://'+bucket+'/'+common_prefix+'/batch-transform-output')
transformer.transform(transform_input, content_type='application/x-recordio-protobuf')
transformer.wait()

print("Batch Transform output saved to " + transformer.output_path)

....................[34mStarting the inference server with 4 workers.[0m
[34m[2020-04-13 04:48:15 +0000] [10] [INFO] Starting gunicorn 19.9.0[0m
[34m[2020-04-13 04:48:15 +0000] [10] [INFO] Listening at: unix:/tmp/gunicorn.sock (10)[0m
[34m[2020-04-13 04:48:15 +0000] [10] [INFO] Using worker: gevent[0m
[34m[2020-04-13 04:48:15 +0000] [14] [INFO] Booting worker with pid: 14[0m
[34m[2020-04-13 04:48:15 +0000] [15] [INFO] Booting worker with pid: 15[0m
[34m[2020-04-13 04:48:15 +0000] [16] [INFO] Booting worker with pid: 16[0m
[34m[2020-04-13 04:48:15 +0000] [17] [INFO] Booting worker with pid: 17[0m
[34mTesting...[0m
[35mTesting...[0m
[34m2020-04-13 04:48:39.079016: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA[0m
[34m169.254.255.130 - - [13/Apr/2020:04:48:39 +0000] "GET /ping HTTP/1.1" 200 1 "-" "Go-http-client/1.1"[0m
[34m169.254.255.130 - - [13/Apr/2020:04:48:39 +

[34m['audio_file_1586753595.1332912.mp3_vocals.wav', 'audio_file_1586753595.1332912.mp3_accompaniment.wav'][0m
[35m['audio_file_1586753595.1332912.mp3_vocals.wav', 'audio_file_1586753595.1332912.mp3_accompaniment.wav'][0m
[34m169.254.255.130 - - [13/Apr/2020:04:54:01 +0000] "POST /invocations HTTP/1.1" 200 19459102 "-" "Go-http-client/1.1"[0m
[34mInput path : /tmp/audio_file_1586753641.1667993.mp3[0m
[34mProducing source estimates for input mixture file /tmp/audio_file_1586753641.1667993.mp3[0m
[34mTesting...[0m
[35m169.254.255.130 - - [13/Apr/2020:04:54:01 +0000] "POST /invocations HTTP/1.1" 200 19459102 "-" "Go-http-client/1.1"[0m
[35mInput path : /tmp/audio_file_1586753641.1667993.mp3[0m
[35mProducing source estimates for input mixture file /tmp/audio_file_1586753641.1667993.mp3[0m
[35mTesting...[0m
[34mNum of variables64[0m
[34mPre-trained model restored for song prediction[0m
[35mNum of variables64[0m
[35mPre-trained model restored for song prediction[0

### Processing the Batch Output

In [142]:
import boto3
# Downloading files from s3.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(sagemaker_session.default_bucket())
prefix = "source_separation/batch-transform-output/"
i = 0
audio_util.clear_folder('source-separation-output/batch-transform-output')
for object_summary in my_bucket.objects.filter(Prefix=prefix):
    i = i + 1
    file_name = object_summary.key.split('/')[-1]
    print(file_name)
    my_bucket.download_file(prefix+ file_name, 'source-separation-output/batch-transform-output/output-{}.zip'.format(i))

input1.mp3.out
input2.mp3.out
input3.mp3.out
input4.mp3.out
input5.mp3.out
input6.mp3.out
input7.mp3.out
input8.mp3.out
input9.mp3.out


In [148]:
# Extracting files from zip files. 
audio_util.clear_folder('source-separation-output/extracted')
for file in os.listdir('source-separation-output/batch-transform-output'):
    print(file)
    with zipfile.ZipFile('source-separation-output/batch-transform-output/'+file, 'r') as zip_ref:
        zip_ref.extractall('source-separation-output/extracted/'+file.split('.')[0]+'/')

output-3.zip
output-6.zip
output-8.zip
output-9.zip
output-1.zip
output-4.zip
output-7.zip
output-2.zip
output-5.zip


In [150]:
import os
# Separating the vocal files and the background sound files.
audio_util.clear_folder('source-separation-output/vocals')
audio_util.clear_folder('source-separation-output/background')
for i, folder in enumerate(sorted(os.listdir('source-separation-output/extracted/'))):
    for file in os.listdir('source-separation-output/extracted/' + folder + '/output'):
        new_file_name = str(i).zfill(5) + ".wav"
        if "vocals" in file:
            os.rename('source-separation-output/extracted/' + folder + '/output/' + file, 'source-separation-output/vocals/vocals' + new_file_name)
        elif "accompaniment" in file:
            os.rename('source-separation-output/extracted/' + folder + '/output/' + file, 'source-separation-output/background/background' + new_file_name)

### Transcribe the Vocal Files

In [151]:
# Upload the Vocal files onto s3
local_vocals_folder = "source-separation-output/vocals/"
transcribe_input_prefix = "transcribe-input"

transcribe_input = sagemaker_session.upload_data(local_vocals_folder, key_prefix=transcribe_input_prefix)
print("Transcribe input uploaded to " + transcribe_input)

Transcribe input uploaded to s3://sagemaker-us-east-2-075178354542/transcribe-input


In [None]:
# Start a transcription job for each file. Add the transcription to finsihed jobs once finished. 

import boto3
from datetime import datetime

transcribe = boto3.client('transcribe')
output_bucket_name = "transcribe-output"
audio_util.clear_folder('transcribe-output')
uri_prefix = "https://sagemaker-us-east-2-075178354542.s3.us-east-2.amazonaws.com/transcribe-input/"
finished_jobs = list()

for file in sorted(os.listdir(local_vocals_folder)):

    print("Transcribing: " + file)
    job_uri = uri_prefix + file
    transcribe.start_transcription_job(
        TranscriptionJobName=file,
        Media={'MediaFileUri': job_uri},
        MediaFormat='wav',
        LanguageCode='en-US'
    )
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=file)
        if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
            break
    
    api_data = requests.get(url=status['TranscriptionJob']['Transcript']['TranscriptFileUri'])
    data = api_data.json()
    finished_jobs.append(data)
    dump_file_name = 'transcribe-output/transcription' + file.split(".")[0] + '.json'
    # Writing to json files for analysis purposes.
    with open(dump_file_name, 'w') as f:
        json.dump(data, f, indent=4)
    transcribe.delete_transcription_job(TranscriptionJobName=file)
    
finished_jobs.sort(key=lambda x : x['jobName'])

Transcribing: vocals00000.wav
Transcribing: vocals00001.wav


In [158]:
transcribe.delete_transcription_job(TranscriptionJobName="vocals00000.wav")

{'ResponseMetadata': {'RequestId': '20153835-a1cb-4d8b-b31b-fc5194cf9a11',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 13 Apr 2020 23:21:58 GMT',
   'x-amzn-requestid': '20153835-a1cb-4d8b-b31b-fc5194cf9a11',
   'content-length': '0',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

### Giving Transcriptions to Amazon Polly

In [206]:
def query_polly(polly_client, word, length, prefix, output_folder):
    
    ssml = """<speak><prosody amazon:max-duration="{max_len}ms">{word}</prosody></speak>""".format(max_len=str(length), word=word)          
    response = polly_client.start_speech_synthesis_task(VoiceId='Justin',
                OutputS3BucketName='sagemaker-us-east-2-075178354542',
                OutputS3KeyPrefix='polly-output/' + prefix,
                OutputFormat='mp3', 
                TextType = 'ssml',
                Text = ssml)



In [205]:
polly_client = boto3.client('polly')

transcribe_output_folder = "transcribe-output/"
polly_output_folder = "polly-output/"
audio_util.clear_folder(polly_output_folder)
silence_dict = {"length" : 0}
index = 0

for file in sorted(os.listdir(transcribe_output_folder)):
    
    transcription_batch = json.load(open(transcribe_output_folder + file, "r", encoding="utf-8"))

    expected_start_time = 0 # milliseconds
    
    for map_item in transcription_batch["results"]["items"]:
        
        transcribe_object = processing_util.TranscriptionItem(map_item)
        
        # Skip punctuation
        if not transcribe_object.is_word():
            continue
            
        if expected_start_time != transcribe_object.start_time:
            silence_dict["length"] = transcribe_object.start_time - expected_start_time
            with open(polly_output_folder + str(index).zfill(5) + ".json", 'w') as outfile:
                json.dump(silence_dict, outfile)
            expected_start_time = transcribe_object.start_time
            index += 1


        response = query_polly(polly_client, transcribe_object.content, transcribe_object.duration(), str(index).zfill(5), polly_output_folder)
        print("Polly Queried for: " + transcribe_object.content)    
        index += 1

<speak><prosody amazon:max-duration="430.0ms">Oh</prosody></speak>
Polly Queried for: Oh
<speak><prosody amazon:max-duration="460.0ms">black</prosody></speak>
Polly Queried for: black
<speak><prosody amazon:max-duration="370.0ms">leather</prosody></speak>
Polly Queried for: leather
<speak><prosody amazon:max-duration="400.0ms">blood</prosody></speak>
Polly Queried for: blood
<speak><prosody amazon:max-duration="250.0ms">No</prosody></speak>
Polly Queried for: No
<speak><prosody amazon:max-duration="780.0ms">seafood</prosody></speak>
Polly Queried for: seafood
<speak><prosody amazon:max-duration="560.0ms">buckles</prosody></speak>
Polly Queried for: buckles
<speak><prosody amazon:max-duration="170.0ms">on</prosody></speak>
Polly Queried for: on
<speak><prosody amazon:max-duration="110.0ms">a</prosody></speak>
Polly Queried for: a
<speak><prosody amazon:max-duration="440.0ms">jacket</prosody></speak>
Polly Queried for: jacket
<speak><prosody amazon:max-duration="180.0ms">It's</prosody></

Polly Queried for: wanna
<speak><prosody amazon:max-duration="480.0ms">dance</prosody></speak>
Polly Queried for: dance
<speak><prosody amazon:max-duration="150.0ms">with</prosody></speak>
Polly Queried for: with
<speak><prosody amazon:max-duration="540.0ms">me</prosody></speak>
Polly Queried for: me
<speak><prosody amazon:max-duration="560.0ms">No</prosody></speak>
Polly Queried for: No
<speak><prosody amazon:max-duration="150.0ms">I</prosody></speak>
Polly Queried for: I
<speak><prosody amazon:max-duration="220.0ms">could</prosody></speak>
Polly Queried for: could
<speak><prosody amazon:max-duration="230.0ms">guess</prosody></speak>
Polly Queried for: guess
<speak><prosody amazon:max-duration="140.0ms">I'm</prosody></speak>
Polly Queried for: I'm
<speak><prosody amazon:max-duration="280.0ms">Michael</prosody></speak>
Polly Queried for: Michael
<speak><prosody amazon:max-duration="610.0ms">J</prosody></speak>
Polly Queried for: J
<speak><prosody amazon:max-duration="720.0ms">Son</pros

Polly Queried for: being
<speak><prosody amazon:max-duration="570.0ms">mistaken</prosody></speak>
Polly Queried for: mistaken
<speak><prosody amazon:max-duration="140.0ms">for</prosody></speak>
Polly Queried for: for
<speak><prosody amazon:max-duration="370.0ms">other</prosody></speak>
Polly Queried for: other
<speak><prosody amazon:max-duration="420.0ms">people</prosody></speak>
Polly Queried for: people
<speak><prosody amazon:max-duration="150.0ms">would</prosody></speak>
Polly Queried for: would
<speak><prosody amazon:max-duration="160.0ms">at</prosody></speak>
Polly Queried for: at
<speak><prosody amazon:max-duration="90.0ms">a</prosody></speak>
Polly Queried for: a
<speak><prosody amazon:max-duration="470.0ms">love</prosody></speak>
Polly Queried for: love
<speak><prosody amazon:max-duration="180.0ms">out</prosody></speak>
Polly Queried for: out
<speak><prosody amazon:max-duration="70.0ms">of</prosody></speak>
Polly Queried for: of
<speak><prosody amazon:max-duration="330.0ms">tro

Polly Queried for: foot
<speak><prosody amazon:max-duration="670.0ms">Basically</prosody></speak>
Polly Queried for: Basically
<speak><prosody amazon:max-duration="80.0ms">I'm</prosody></speak>
Polly Queried for: I'm
<speak><prosody amazon:max-duration="500.0ms">saying</prosody></speak>
Polly Queried for: saying
<speak><prosody amazon:max-duration="260.0ms">Either</prosody></speak>
Polly Queried for: Either
<speak><prosody amazon:max-duration="250.0ms">way</prosody></speak>
Polly Queried for: way
<speak><prosody amazon:max-duration="140.0ms">we</prosody></speak>
Polly Queried for: we
<speak><prosody amazon:max-duration="290.0ms">about</prosody></speak>
Polly Queried for: about
<speak><prosody amazon:max-duration="200.0ms">so</prosody></speak>
Polly Queried for: so
<speak><prosody amazon:max-duration="530.0ms">I</prosody></speak>
Polly Queried for: I
<speak><prosody amazon:max-duration="340.0ms">can</prosody></speak>
Polly Queried for: can
<speak><prosody amazon:max-duration="160.0ms">l

Polly Queried for: other
<speak><prosody amazon:max-duration="240.0ms">due</prosody></speak>
Polly Queried for: due
<speak><prosody amazon:max-duration="90.0ms">to</prosody></speak>
Polly Queried for: to
<speak><prosody amazon:max-duration="270.0ms">shit</prosody></speak>
Polly Queried for: shit
<speak><prosody amazon:max-duration="480.0ms">himself</prosody></speak>
Polly Queried for: himself
<speak><prosody amazon:max-duration="260.0ms">Your</prosody></speak>
Polly Queried for: Your
<speak><prosody amazon:max-duration="920.0ms">solo</prosody></speak>
Polly Queried for: solo
<speak><prosody amazon:max-duration="410.0ms">figures</prosody></speak>
Polly Queried for: figures
<speak><prosody amazon:max-duration="160.0ms">on</prosody></speak>
Polly Queried for: on
<speak><prosody amazon:max-duration="150.0ms">a</prosody></speak>
Polly Queried for: a
<speak><prosody amazon:max-duration="340.0ms">goal</prosody></speak>
Polly Queried for: goal
<speak><prosody amazon:max-duration="800.0ms">O'Fa

Polly Queried for: I
<speak><prosody amazon:max-duration="220.0ms">could</prosody></speak>
Polly Queried for: could
<speak><prosody amazon:max-duration="250.0ms">guess</prosody></speak>
Polly Queried for: guess
<speak><prosody amazon:max-duration="130.0ms">I'm</prosody></speak>
Polly Queried for: I'm
<speak><prosody amazon:max-duration="360.0ms">Michael</prosody></speak>
Polly Queried for: Michael
<speak><prosody amazon:max-duration="310.0ms">Chang</prosody></speak>
Polly Queried for: Chang
<speak><prosody amazon:max-duration="180.0ms">I</prosody></speak>
Polly Queried for: I
<speak><prosody amazon:max-duration="210.0ms">could</prosody></speak>
Polly Queried for: could
<speak><prosody amazon:max-duration="440.0ms">get</prosody></speak>
Polly Queried for: get
<speak><prosody amazon:max-duration="310.0ms">up</prosody></speak>
Polly Queried for: up
<speak><prosody amazon:max-duration="370.0ms">Hey</prosody></speak>
Polly Queried for: Hey
<speak><prosody amazon:max-duration="640.0ms">it's<

### Processing the Output from Amazon Polly

In [207]:
import boto3
# Downloading files from s3.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(sagemaker_session.default_bucket())
prefix = "polly-output/"

for object_summary in my_bucket.objects.filter(Prefix=prefix):
    file_name = object_summary.key.split('/')[-1]
    my_bucket.download_file(prefix+ file_name, prefix + file_name)
    
print("Files moved from s3 to repo.")

Files moved from s3 to repo.


In [193]:
print(index)

1036


Mixing Audio:

https://stackoverflow.com/questions/7629873/how-do-i-mix-audio-files-using-python

Pitch Modulation:

https://stackoverflow.com/questions/38923438/does-pydub-support-pitch-modulation

