# Install dependancies


In [None]:
!pip install torchtext==0.7
!pip install -q torch==1.6.0 torchvision
!pip install unidecode==1.0.22
!pip install tensorboardX
 import torch
 torch.cuda.empty_cache()

In [1]:
import sys

!{sys.executable} -m pip install numpy==1.13.3 tensorflow==1.15 inflect==0.2.5 librosa==0.6.0 scipy==1.0.0 tensorboardX==1.1 Unidecode==1.0.22 pillow 

!git clone https://github.com/karkirowle/flowtron.git
%cd flowtron
!git submodule init
!git submodule update
%cd tacotron2
!git submodule update --init
%cd ..


!ls
# This is ported from https://github.com/yhgon/mellotron/blob/master/inference_colab.ipynb
!wget -N  -q https://raw.githubusercontent.com/yhgon/colab_utils/master/gfile.py
!python gfile.py -u 'https://drive.google.com/open?id=1KhJcPawFgmfvwV7tQAOeC253rYstLrs8' -f 'flowtron_libritts.pt'
!python gfile.py -u 'https://drive.google.com/open?id=1Cjd6dK_eFz6DE0PKXKgKxrzTUqzzUDW-' -f 'flowtron_ljs.pt'

!python gfile.py -u 'https://drive.google.com/open?id=1Rm5rV5XaWWiUbIpg5385l5sh68z2bVOE' -f 'waveglow_256channels_v4.pt'




Collecting numpy==1.13.3
[?25l  Downloading https://files.pythonhosted.org/packages/bf/2d/005e45738ab07a26e621c9c12dc97381f372e06678adf7dc3356a69b5960/numpy-1.13.3.zip (5.0MB)
[K     |████████████████████████████████| 5.0MB 2.2MB/s 
[?25hCollecting tensorflow==1.15
[?25l  Downloading https://files.pythonhosted.org/packages/92/2b/e3af15221da9ff323521565fa3324b0d7c7c5b1d7a8ca66984c8d59cb0ce/tensorflow-1.15.0-cp37-cp37m-manylinux2010_x86_64.whl (412.3MB)
[K     |████████████████████████████████| 412.3MB 41kB/s 
[?25hCollecting inflect==0.2.5
[?25l  Downloading https://files.pythonhosted.org/packages/66/15/2d176749884cbeda0c92e0d09e1303ff53a973eb3c6bb2136803b9d962c9/inflect-0.2.5-py2.py3-none-any.whl (58kB)
[K     |████████████████████████████████| 61kB 7.9MB/s 
[?25hCollecting librosa==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/6b/f4/422bfbefd581f74354ef05176aa48558c548243c87e359d91512d4b65523/librosa-0.6.0.tar.gz (1.5MB)
[K     |██████████████████████████

# Make alterations to Flowtron files

### Alter the train.py file with the following code
```
###########################################################################
#
#  Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
###########################################################################
import argparse
import json
import os
import torch
from torch.utils.data import DataLoader
import ast

from flowtron import FlowtronLoss
from flowtron import Flowtron
from data import Data, DataCollate
from flowtron_logger import FlowtronLogger


#=====START: ADDED FOR DISTRIBUTED======
from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor
from torch.utils.data.distributed import DistributedSampler
#=====END:   ADDED FOR DISTRIBUTED======

def update_params(config, params):
    for param in params:
        print(param)
        k, v = param.split("=")
        try:
            v = ast.literal_eval(v)
        except:
            pass

        k_split = k.split('.')
        if len(k_split) > 1:
            parent_k = k_split[0]
            cur_param = ['.'.join(k_split[1:])+"="+str(v)]
            update_params(config[parent_k], cur_param)
        elif k in config and len(k_split) == 1:
            config[k] = v
        else:
            print("{}, {} params not updated".format(k, v))


def prepare_dataloaders(data_config, n_gpus, batch_size):
    # Get data, data loaders and 1ollate function ready
    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(data_config['training_files'],
                    **dict((k, v) for k, v in data_config.items()
                    if k not in ignore_keys))

    valset = Data(data_config['validation_files'],
                  **dict((k, v) for k, v in data_config.items()
                  if k not in ignore_keys), speaker_ids=trainset.speaker_ids)

    collate_fn = DataCollate()

    train_sampler, shuffle = None, True
    if n_gpus > 1:
        train_sampler, shuffle = DistributedSampler(trainset), False

    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True,
                              collate_fn=collate_fn)

    return train_loader, valset, collate_fn


def warmstart(checkpoint_path, model, include_layers=None):
    print("Warm starting model", checkpoint_path)
    pretrained_dict = torch.load(checkpoint_path, map_location='cpu')
    if 'model' in pretrained_dict:
        pretrained_dict = pretrained_dict['model'].state_dict()
    else:
        pretrained_dict = pretrained_dict['state_dict']

    if include_layers is not None:
        pretrained_dict = {k: v for k, v in pretrained_dict.items()
                           if any(l in k for l in include_layers)}

    model_dict = model.state_dict()
    pretrained_dict = {k: v for k, v in pretrained_dict.items()
                       if k in model_dict}

    if pretrained_dict['speaker_embedding.weight'].shape != model_dict['speaker_embedding.weight'].shape:
        del pretrained_dict['speaker_embedding.weight']

    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    return model


def load_checkpoint(checkpoint_path, model, optimizer, ignore_layers=[]):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    #iteration = checkpoint_dict['iteration']
    #model_dict = checkpoint_dict['model'].state_dict()
    if 'state_dict' in checkpoint_dict:
        print("active state")
        state_dict = checkpoint_dict['state_dict']
        model.load_state_dict(state_dict)
    elif 'model' in checkpoint_dict:
        print("active model")
        model = checkpoint_dict['model'].cuda()
    else:
        assert False, "cannot load model!"
    if len(ignore_layers) > 0:
        state_dict = {k: v for k, v in state_dict.items()
                      if k not in ignore_layers}
        dummy_dict = model.state_dict()
        dummy_dict.update(state_dict)
        state_dict = dummy_dict
    else:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])

    model.load_state_dict(state_dict)
    print("Loaded checkpoint '{}' (iteration )" .format(
          checkpoint_path))
    return model, optimizer


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
          iteration, filepath))
    model_for_saving = Flowtron(**model_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)


def compute_validation_loss(model, criterion, valset, collate_fn, batch_size,
                            n_gpus):                     
    model.eval()
    print("eval")  
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if n_gpus > 1 else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)
        print("next iter")
        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            print("next 1")
            mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch
            print("next 2")
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(), text.cuda()
            print("next 3")
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()
            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens)

            loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob),
                             gate_target, out_lens)

            if n_gpus > 1:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        print("end")
        val_loss = val_loss / (i + 1)

    print("here")
    print("Mean {}\nLogVar {}\nProb {}".format(mean, log_var, prob))
    model.train()
    return val_loss, attn, gate_pred, gate_target


def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path,
          ignore_layers, include_layers, warmstart_checkpoint_path,
          with_tensorboard, fp16_run):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, model_config['n_components'] > 1,
                             model_config['use_gate_layer'])
    model = Flowtron(**model_config).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        logger = FlowtronLogger(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()

            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens)
            loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob),
                             gate_target, out_lens)

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if (iteration % iters_per_checkpoint == 0):
                print("checkpoint time")
                #val_loss, attns, gate_pred, gate_target = compute_validation_loss(
                  #  model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    #print("Validation loss {}: {:9f}  ".format(iteration, val_loss))
                    #if with_tensorboard:
                    #    logger.log_validation(
                    #        val_loss, attns, gate_pred, gate_target, iteration)

                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-p', '--params', nargs='+', default=[])
    args = parser.parse_args()
    args.rank = 0

    # Parse configs.  Globals nicer in this case
    with open(args.config) as f:
        data = f.read()

    global config
    config = json.loads(data)
    update_params(config, args.params)
    print(config)

    train_config = config["train_config"]
    global data_config
    data_config = config["data_config"]
    global dist_config
    dist_config = config["dist_config"]
    global model_config
    model_config = config["model_config"]

    # Make sure the launcher sets `RANK` and `WORLD_SIZE`.
    rank = int(os.getenv('RANK', '0'))
    n_gpus = int(os.getenv("WORLD_SIZE", '1'))
    print('> got rank {} and world size {} ...'.format(rank, n_gpus))

    if n_gpus == 1 and rank != 0:
        raise Exception("Doing single GPU training on rank > 0")

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    train(n_gpus, rank, **train_config)
```

### Alter the Inferennce.py file with the following code
```
###############################################################################
#
#  Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
###############################################################################
import matplotlib
matplotlib.use("Agg")
import matplotlib.pylab as plt

import os
import argparse
import json
import sys
import numpy as np
import torch


from flowtron import Flowtron
from torch.utils.data import DataLoader
from data import Data
from train import update_params

sys.path.insert(0, "tacotron2")
sys.path.insert(0, "tacotron2/waveglow")
from glow import WaveGlow
from scipy.io.wavfile import write


def infer(flowtron_path, waveglow_path, output_dir, text, speaker_id, n_frames,
          sigma, gate_threshold, seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # load waveglow
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    waveglow.cuda().half()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()

    # load flowtron
    model = Flowtron(**model_config).cuda()
    state_dict = torch.load(flowtron_path, map_location='cpu')['model'].cuda()
    #model.load_state_dict(state_dict)

    model.eval()
    print("Loaded checkpoint '{}')" .format(flowtron_path))

    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))
    speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()
    text = trainset.get_text(text).cuda()
    speaker_vecs = speaker_vecs[None]
    text = text[None]

    with torch.no_grad():
        residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
        mels, attentions = model.infer(
            residual, speaker_vecs, text, gate_threshold=gate_threshold)

    for k in range(len(attentions)):
        attention = torch.cat(attentions[k]).cpu().numpy()
        fig, axes = plt.subplots(1, 2, figsize=(16, 4))
        axes[0].imshow(mels[0].cpu().numpy(), origin='bottom', aspect='auto')
        axes[1].imshow(attention[:, 0].transpose(), origin='bottom', aspect='auto')
        fig.savefig(os.path.join(output_dir, 'sid{}_sigma{}_attnlayer{}.png'.format(speaker_id, sigma, k)))
        plt.close("all")

    audio = waveglow.infer(mels.half(), sigma=0.8).float()
    audio = audio.cpu().numpy()[0]
    # normalize audio for now
    audio = audio / np.abs(audio).max()
    print(audio.shape)

    write(os.path.join(output_dir, 'sid{}_sigma{}.wav'.format(speaker_id, sigma)),
          data_config['sampling_rate'], audio)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-p', '--params', nargs='+', default=[])
    parser.add_argument('-f', '--flowtron_path',
                        help='Path to flowtron state dict', type=str)
    parser.add_argument('-w', '--waveglow_path',
                        help='Path to waveglow state dict', type=str)
    parser.add_argument('-t', '--text', help='Text to synthesize', type=str)
    parser.add_argument('-i', '--id', help='Speaker id', type=int)
    parser.add_argument('-n', '--n_frames', help='Number of frames',
                        default=400, type=int)
    parser.add_argument('-o', "--output_dir", default="results/")
    parser.add_argument("-s", "--sigma", default=0.5, type=float)
    parser.add_argument("-g", "--gate", default=0.5, type=float)
    parser.add_argument("--seed", default=1234, type=int)
    args = parser.parse_args()

    # Parse configs.  Globals nicer in this case
    with open(args.config) as f:
        data = f.read()

    global config
    config = json.loads(data)
    update_params(config, args.params)

    data_config = config["data_config"]
    global model_config
    model_config = config["model_config"]

    # Make directory if it doesn't exist
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
        os.chmod(args.output_dir, 0o775)

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    infer(args.flowtron_path, args.waveglow_path, args.output_dir, args.text,
          args.id, args.n_frames, args.sigma, args.gate, args.seed)
```


# Download Training and Validation files
 
Down files from https://drive.google.com/drive/folders/1QNu4Wx07oRJ9xQ6EJRYZWgoKSBw37QVS?usp=sharing and upload to files in colab

In [None]:
# add a new emotests directory
! mkdir -p emotests

# Add files to emotests directory

In [None]:
!tar -xvf train.tar.xz -C emotests

train/
train/sam_neutral_227.wav
train/jenie_disgust_150.wav
train/sam_amused_202.wav
train/sam_sleepiness_236.wav
train/jenie_neutral_151.wav
train/jen_amused_089.wav
train/bea_amused_014.wav
train/bea_neutral_058.wav
train/sam_amused_205.wav
train/bea_amused_015.wav
train/josh_amused_174.wav
train/bea_disgust_035.wav
train/bea_sleepiness_074.wav
train/sam_sleepiness_231.wav
train/jen_anger_119.wav
train/jenie_sleepiness_161.wav
train/sam_sleepiness_239.wav
train/jen_anger_115.wav
train/jenie_sleepiness_167.wav
train/bea_neutral_048.wav
train/bea_sleepiness_068.wav
train/josh_neutral_181.wav
train/josh_amused_172.wav
train/bea_sleepiness_071.wav
train/jenie_sleepiness_164.wav
train/bea_amused_013.wav
train/josh_amused_175.wav
train/jen_amused_086.wav
train/sam_disgust_212.wav
train/sam_amused_204.wav
train/josh_sleepiness_198.wav
train/josh_sleepiness_192.wav
train/bea_amused_002.wav
train/jen_amused_084.wav
train/jen_amused_077.wav
train/josh_sleepiness_193.wav
train/jen_anger_118.wa

In [None]:
!tar -xvf /content/test.tar.xz -C /content/emotests

test/
test/jenie_sleepiness_501.wav
test/jen_anger_532.wav
test/josh_amused_305.wav
test/jen_amused_472.wav
test/bea_neutral_362.wav
test/jenie_disgust_560.wav
test/jenie_sleepiness_504.wav
test/bea_anger_334.wav
test/josh_neutral_306.wav
test/jen_anger_528.wav
test/bea_neutral_363.wav
test/josh_sleepiness_306.wav
test/jen_amused_474.wav
test/jen_amused_473.wav
test/jenie_sleepiness_503.wav
test/josh_neutral_308.wav
test/josh_amused_306.wav
test/jenie_neutral_448.wav
test/josh_sleepiness_308.wav
test/bea_amused_304.wav
test/bea_neutral_360.wav
test/bea_disgust_332.wav
test/jen_amused_476.wav
test/bea_disgust_334.wav
test/josh_amused_307.wav
test/bea_disgust_336.wav
test/jenie_disgust_559.wav
test/bea_disgust_333.wav
test/bea_sleepiness_501.wav
test/josh_amused_308.wav
test/jen_anger_531.wav
test/bea_neutral_364.wav
test/josh_sleepiness_307.wav
test/bea_amused_306.wav
test/bea_anger_336.wav
test/bea_sleepiness_503.wav
test/jenie_neutral_445.wav
test/jenie_neutral_447.wav
test/josh_neutr

## NOTE:
If the training files do not download properly use the following method

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!tar -xvf /content/drive/MyDrive/filelists/train.tar.xz -C /content/emotests

# Begin Training


In [None]:
!python /content/flowtron/train.py -c config.json -p train_config.ignore_layers=["speaker_embedding.weight"] train_config.checkpoint_path="/content/flowtron/flowtron_ljs.pt"

train_config.ignore_layers=[speaker_embedding.weight]
ignore_layers=[speaker_embedding.weight]
train_config.checkpoint_path=/content/flowtron/flowtron_ljs.pt
checkpoint_path=/content/flowtron/flowtron_ljs.pt
{'train_config': {'output_directory': 'outdir', 'epochs': 10000000, 'learning_rate': 0.0001, 'weight_decay': 1e-06, 'sigma': 1.0, 'iters_per_checkpoint': 500, 'batch_size': 12, 'seed': 1234, 'checkpoint_path': '/content/flowtron/flowtron_ljs.pt', 'ignore_layers': '[speaker_embedding.weight]', 'include_layers': ['speaker', 'encoder', 'embedding'], 'warmstart_checkpoint_path': '', 'with_tensorboard': True, 'fp16_run': False}, 'data_config': {'training_files': '/content/all_train_fin.txt', 'validation_files': '/content/all_val.txt', 'text_cleaners': ['flowtron_cleaners'], 'p_arpabet': 0.5, 'cmudict_path': 'data/cmudict_dictionary', 'sampling_rate': 22050, 'filter_length': 1024, 'hop_length': 256, 'win_length': 1024, 'mel_fmin': 0.0, 'mel_fmax': 8000.0, 'max_wav_value': 32768.0}, 'dist

# Load Tensorboard to view training metrics

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/flowtron/outdir/logs

# Inference
Alter the text for any input text, Make sure you alter the model file to the step count you would like to test. eg content/flowtron/outdir/model_2500 for the checkpoint at 2500 steps. 

In [None]:
!python inference.py -c config.json -f /content/flowtron/outdir/model_500 -w /content/flowtron/waveglow_256channels_v4.pt -t "How are you?" -i 0

python3: can't open file 'inference.py': [Errno 2] No such file or directory
