# Word2Wave: generate audio samples from a text prompt 📝 --> 🎶

This notebook is a playground for [Word2Wave](https://github.com/ilaria-manco/word2wave), a simple method that uses [WaveGAN](https://arxiv.org/abs/1802.04208) to generate audio samples from text prompts by optimising audio-text similarity based on [COALA](https://arxiv.org/abs/2006.08386) embeddings.

Author: [@Ilaria__Manco](https://twitter.com/ilaria__manco)

Before you start, make sure you select a (free) GPU to work with: `Runtime > Change runtime type > Hardware accelerator > GPU`. Then check that this step worked by executing the cell below.

In [None]:
# List the GPUs attached to this runtime, to confirm that a GPU accelerator was selected.
!nvidia-smi -L

In [None]:
#@title Mount your drive
# Mount Google Drive under /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

### Set up

In [None]:
#@markdown Install Word2Wave, import necessary packages

!git clone https://github.com/ilaria-manco/word2wave
%cd /content/word2wave/
!pip3  install -r requirements.txt

# Imports
import os
import random
import pickle
import json
import logging
import librosa
import torch
import numpy as np
import torchaudio
from torchaudio import transforms as T

from IPython import display as ipd
import matplotlib.pyplot as plt
from google.colab import output, files

In [None]:
#@markdown Download model from Pollinations IPFS
# Create the download folder; -p makes this a no-op when it already exists.
!mkdir -p /content/models
# -N only re-downloads when the remote file is newer than the local copy; -P sets the target folder.
# Only the gan_fs_loop_3 checkpoint is fetched here.
!wget -N https://pollinations.ai/ipfs/QmfRSNhj4z8bmVzrhcUTeSLd3PPBGrUJJB1w5tML1dtjB5/gan_fs_loop_3.tar -P /content/models

In [None]:
#@markdown Copy the pre-trained WaveGAN and COALA weights from drive
drive_path = "/content/model" #@param {type:"string"}

!cp -r -v {drive_path}"wavegan" "/content/word2wave/wavegan"

# if the pre-trained coala models are in a gdrive folder, copy them here
!cp -r -v {drive_path}"coala" "/content/word2wave/coala/"
!mv "/content/word2wave/coala/coala/" "/content/word2wave/coala/models/"

# othertwise, download them
!wget https://raw.githubusercontent.com/xavierfav/coala/master/saved_models/dual_e_c/audio_encoder_epoch_200.pt
!wget https://raw.githubusercontent.com/xavierfav/coala/master/saved_models/dual_e_c/tag_encoder_epoch_200.pt



!mkdir "/content/output/"
!mkdir "/content/output/audio/"
!mkdir "/content/output/latents/"

In [None]:
#@title Define some helper functions

def sample_noise(size, latent_dim):
  """Draw `size` latent vectors of length `latent_dim` from a standard normal.

  Returns:
    A float32 CPU tensor of shape (size, latent_dim).
  """
  # Allocate first, then fill in place with N(0, 1) samples.
  return torch.empty(size, latent_dim, dtype=torch.float32).normal_()


def latent_space_interpolation(model, n_samples=10, source=None, target=None):
  """Generate audio along a straight line between two latent vectors.

  Args:
    model: callable (e.g. the WaveGAN generator) applied to the stacked batch
      of interpolated latents.
    n_samples: number of evenly spaced points on the line, endpoints included.
    source: latent tensor at which the interpolation starts; drawn at random
      when None.
    target: latent tensor at which the interpolation ends; drawn at random
      when None.

  Returns:
    Whatever `model` returns for the batch of `n_samples` latents, ordered
    from `source` (first item) to `target` (last item).

  Raises:
    ValueError: if `n_samples` is smaller than 1.
  """
  if n_samples < 1:
    raise ValueError("n_samples must be at least 1")

  # Fill in whichever endpoint is missing; previously passing only one of the
  # two crashed on arithmetic with None.
  if source is None and target is None:
    random_samples = sample_noise(2, 100)
    source, target = random_samples[0], random_samples[1]
  elif source is None:
    source = torch.randn_like(target)
  elif target is None:
    target = torch.randn_like(source)

  # Run on the device the model lives on. Fall back to CUDA when available
  # (the previous hard-coded behaviour) for callables without parameters.
  try:
    device = next(model.parameters()).device
  except (AttributeError, StopIteration, TypeError):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  with torch.no_grad():
    source = source.to(device)
    target = target.to(device)
    # alpha = 0 -> source, alpha = 1 -> target (the weights used to be swapped,
    # which played the interpolation backwards).
    interpolated_z = torch.stack([
        (1.0 - alpha) * source + alpha * target
        for alpha in np.linspace(0.0, 1.0, n_samples).tolist()
    ])
    generated_audio = model(interpolated_z)
  return generated_audio


def save_audio(audio_to_save, name=None, save_dir="/content/output/audio/", sample_rate=16000):
    """Write an audio signal to `<save_dir>/<name>.wav`.

    Args:
        audio_to_save: audio samples as a NumPy array or torch tensor.
            Singleton dimensions are dropped; multi-channel audio must be
            shaped (samples, channels).
        name: file name without extension. Defaults to the notebook-level
            `text` prompt, which is what the function always used before.
        save_dir: destination folder, created when missing.
        sample_rate: sampling rate in Hz written to the WAV header.

    Returns:
        The path of the written file.
    """
    # librosa.output.write_wav was removed in librosa 0.8; scipy's WAV writer
    # is the function it wrapped. Imported locally to keep the cell self-contained.
    from scipy.io import wavfile

    if name is None:
        name = text  # notebook-level prompt
    if hasattr(audio_to_save, "detach"):
        # torch tensor: detach from the graph and move to host memory first
        audio_to_save = audio_to_save.detach().cpu().numpy()
    samples = np.squeeze(np.asarray(audio_to_save))

    # The previous target folder (/content/audio/) was never created anywhere.
    os.makedirs(save_dir, exist_ok=True)
    wav_path = os.path.join(save_dir, name + ".wav")
    wavfile.write(wav_path, sample_rate, samples)
    return wav_path


def check_text_input(text, model=None):
  """Check a text prompt against the vocabulary known to the model.

  Args:
    text: the prompt to validate.
    model: object exposing `tokenize_text(text)`, which must return a triple
      whose second and third items are the in-vocabulary and out-of-vocabulary
      words. Defaults to the notebook-level `word2wave` model.

  Raises:
    NameError: if no `model` is given and `word2wave` has not been created yet.
    ValueError: if none of the words in `text` is in the vocabulary.
  """
  if model is None:
    model = globals().get("word2wave")
    if model is None:
      raise NameError("name 'word2wave' is not defined: load the model before checking a prompt")

  _, words_in_dict, words_not_in_dict = model.tokenize_text(text)
  if not words_in_dict:
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError("All the words in the text prompt are out-of-vocabulary, please try with another prompt")
  if words_not_in_dict:
    missing_words = ", ".join(words_not_in_dict)
    logging.info("Out-of-vocabulary words found, ignoring: \"{}\"".format(missing_words))
  logging.info("Making sounds to match the following text: {}".format(" ".join(words_in_dict)))

In [None]:
#@title Settings
# Name of the WaveGAN checkpoint; inserted into the path /content/word2wave/wavegan/<model_name>.tar when the model is loaded.
model_name = 'gan_fs_loop_3' #@param ["gan_drum","gan_fs_loop_3", "gan_fs_loop_4"] {type:"string"}
# NOTE(review): not referenced by any of the visible cells.
audio_save_freq =  10 #@param {type:"number"}
# Learning rate passed to the Adam optimiser in the training cell.
learning_rate =  0.02#@param {type:"number"}
# Upper bound on the number of optimisation steps.
training_steps = 10000 #@param {type:"number"}
# Training stops early as soon as the loss falls below this value.
threshold = 0.11 #@param {type:"number"}
# When True, the training cell prints the loss every 100 steps.
verbose = True #@param {type:"boolean"}

## Generate audio from text

In [None]:
#@markdown

# This cell used `widgets` and `id2tag` before the cell that defines them, so it
# failed on a fresh kernel. Import and load them here to make it self-contained.
import json
import ipywidgets as widgets

with open('/content/word2wave/coala/id2token_top_1000.json', 'rb') as vocab_file:
  id2tag = json.load(vocab_file)

# Dropdown listing every tag in the vocabulary (the values of the id -> tag mapping).
available_tags = widgets.Dropdown(options=list(id2tag.values()), value='laughing')
available_tags

In [None]:
#@title Input a text prompt
 
#@markdown This can be an arbitrary combination of words from those available in the dropdown menu above (run the cell first to see the menu)
text = "firework" #@param {type:"string"}
 
import ipywidgets as widgets
import json
 
# Use a context manager so the vocabulary file is closed after loading.
with open('/content/word2wave/coala/id2token_top_1000.json', 'rb') as vocab_file:
  id2tag = json.load(vocab_file)
 
# check_text_input() needs the `word2wave` model, which is only created in a
# later cell. Calling it unconditionally raised a NameError on a fresh kernel.
if "word2wave" in globals():
  check_text_input(text)
else:
  print("Prompt stored. Load the model, then re-run this cell to validate the prompt.")

In [None]:
#@markdown Load pre-trained WaveGAN and generate an audio sample from a random latent vector drawn from a Gaussian distribution
 
from word2wave import Word2Wave
 
class Config():
  """Minimal settings object handed to Word2Wave."""

  def __init__(self):
    # Name of the COALA model variant.
    self.coala_model_name = "dual_e_c"
    # Checkpoint selected through `model_name` in the settings cell.
    self.wavegan_path = "/content/word2wave/wavegan/{}.tar".format(model_name)
 
config = Config()
word2wave = Word2Wave(config).cuda()
 
# The model now exists, so the prompt can be validated before any audio is generated.
check_text_input(text)
 
original_audio, loss = word2wave(text)
 
# Drop singleton dimensions before playback, as the "Listen to generated audio"
# cell does; IPython's Audio only accepts 1D or 2D arrays.
ipd.Audio(original_audio.squeeze().detach().cpu().numpy(), rate=16000)

In [None]:
#@title Run training cycle

# Freeze every parameter except `latents` and the generator weights. Only
# `word2wave.latents` is handed to the optimiser below.
for name, param in word2wave.named_parameters():
  if name != "latents" and "generator" not in name:
    param.requires_grad = False

optimizer = torch.optim.Adam(
    params=[word2wave.latents],
    lr=learning_rate,
    betas=(0.9, 0.999),
)

# int(): a Colab "number" form field may hand back a float.
for i in range(int(training_steps)):
    audio, loss = word2wave(text)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() works for both 0-dim and single-element losses, whereas
    # `.numpy()[0]` raises an IndexError on a 0-dim tensor.
    loss_value = loss.item()

    if verbose and i % 100 == 0:
        print(f'Step {i} || Loss: {loss_value:.6f}')

    # Stop early once the loss falls below the configured threshold.
    if loss_value < threshold:
        break

In [None]:
#@title Listen to generated audio { run: "auto" }

save_output = True #@param {type:"boolean"}
save_dir = '/content/output/' #@param {type:"string"}

# Move the last generated audio and the optimised latents to host memory as NumPy arrays.
generated_audio = audio.squeeze().detach().cpu().numpy()
latents = word2wave.latents.detach().cpu().numpy()

if save_output:
  audio_path = os.path.join(save_dir, "audio", "{}.npy".format(text))
  latents_path = os.path.join(save_dir, "latents", "{}.npy".format(text))
  # Create the target folders on demand, so a custom `save_dir` works too
  # (the set-up cell only creates them under /content/output/).
  for out_path in (audio_path, latents_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
  np.save(audio_path, generated_audio)
  np.save(latents_path, latents)

ipd.Audio(generated_audio, rate=16000)

In [None]:
#@title Interpolate between samples { run: "auto" }

source_name = "firework" #@param {type:"string"}
target_name = "wobble" #@param {type:"string"}

# Load the saved latents of both prompts; each must have been generated and
# saved by the "Listen to generated audio" cell beforehand.
source_z, target_z = [
    torch.tensor(np.load("/content/output/latents/{}.npy".format(latent_name)))
    for latent_name in (source_name, target_name)
]

# Generate 20 interpolated clips, then flatten them into a single waveform for playback.
interpolation = (
    latent_space_interpolation(word2wave.generator, n_samples=20, source=source_z, target=target_z)
    .squeeze(1)
    .flatten()
    .detach()
    .cpu()
    .numpy()
)
ipd.Audio(interpolation, rate=16000)

In [None]:
#@title Soundtrack your story (WIP) { run: "auto" }

#@markdown Type some text in here, the model will check whether it recognises any of the words (it might not recognise any, its vocabulary is super small!), interpret them and turn them into sounds. It will then play back the final piece "soundtracking" your story, joke, random text. This uses latent space interpolation like the cell above, but with more samples. (Warning: it'll probably sound weird 🤷)

# Kept in its own variable so this cell no longer overwrites the `text` prompt
# that the generation and saving cells read.
story_text = "Once upon a time..." #@param {type:"string"}

# TODO: generate a sound for every recognised word and concatenate all audio.
# Placeholder loop: walks the story word by word without producing anything yet.
for story_word in story_text.split():
  pass