In [1]:
import os
import sys

# Put the parent directory on the import path so the project-local imports in
# the cells below can resolve (presumably the project modules live one level
# up from this notebook -- confirm).
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate "../" entries onto sys.path.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
# Load the autoreload extension; mode 2 re-imports modules before each cell
# executes so edits to project source files are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Export the dataset location as an environment variable for this kernel.
# NOTE(review): absolute, machine-specific path; presumably read by the
# data-loading code -- confirm, and adjust when running elsewhere.
%env DATASET_PATH=/shared/g-luo/vctk

env: DATASET_PATH=/shared/g-luo/vctk


In [3]:
from hyper_params import *
from train import load_data
from data_utils import VCTK, Collate

from models.tacotron2 import Tacotron2, Tacotron2Loss
from models.wav2vec_asr import Wav2VecASR, Wav2VecASRLoss
from models.wav2vec_id import Wav2VecID, Wav2VecIDLoss

from multitask import AccentedMultiTaskNetwork, Task

import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers

import torch
from torch.utils.data import DataLoader, random_split

from metrics import SoftmaxAccuracy

In [10]:
# Hyper-parameter bundles consumed by the cells below.

# Training settings: validation size and batch size.
tp = TrainingParams(
    val_size=0.1,
    batch_size=4,
)

# Audio settings: sample rate plus filter/window/hop lengths.
dp = DataParams(
    sample_rate=16000,
    filter_length=800,
    win_length=800,
    hop_length=200,
)

# Multi-task network settings: input width 1024 and a single hidden size of 13.
mp = MultiTaskParams(
    in_dim=1024,
    hidden_dim=[13],
)

In [5]:
# TTS task: a Tacotron2 model built from default TacotronParams, paired with
# its loss and per-task optimisation settings.
# NOTE(review): tts_task is not handed to the network in the cells visible here.
tacotron = Tacotron2(TacotronParams())
tacotron_loss = Tacotron2Loss()
tts_task = Task(
    name='TTS',
    model=tacotron,
    loss=tacotron_loss,
    loss_weight=0.5,
    learning_rate=1e-3,
    weight_decay=1e-6,
    metrics=[],
)

In [6]:
# ASR task: a Wav2VecASR model built from default Wav2VecASRParams, paired
# with its loss and per-task optimisation settings (no weight decay, no metrics).
# NOTE(review): asr_task is not handed to the network in the cells visible here.
asr = Wav2VecASR(Wav2VecASRParams())
asr_loss = Wav2VecASRLoss()
asr_task = Task(
    name='ASR',
    model=asr,
    loss=asr_loss,
    loss_weight=1,
    learning_rate=1e-5,
    weight_decay=0,
    metrics=[],
)

In [7]:
# ID task: a Wav2VecID model built from default Wav2VecIDParams, paired with
# its loss; SoftmaxAccuracy is the only metric attached to this task.
accent_id = Wav2VecID(Wav2VecIDParams())
accent_id_loss = Wav2VecIDLoss()
accent_id_task = Task(
    name='ID',
    model=accent_id,
    loss=accent_id_loss,
    loss_weight=1,
    learning_rate=1e-5,
    weight_decay=1e-6,
    metrics=[SoftmaxAccuracy()],
)

In [11]:
# Build the multi-task network with only the ID task. As the last expression
# in the cell, its module tree is displayed; the instance itself is not kept.
AccentedMultiTaskNetwork(
    params=mp,
    tasks=[accent_id_task],
)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AccentedMultiTaskNetwork(
  (bottleneck): Sequential(
    (0): Linear(in_features=1024, out_features=13, bias=True)
    (1): ReLU()
    (2): Linear(in_features=13, out_features=13, bias=True)
  )
  (models): ModuleList(
    (0): Wav2VecID(
      (network): Sequential(
        (0): Linear(in_features=13, out_features=13, bias=True)
      )
    )
  )
  (wav2vec_model): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureExtractor(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 

In [12]:
# Load the network from the saved checkpoint, supplying the same params and
# task list as keyword arguments.
model = AccentedMultiTaskNetwork.load_from_checkpoint(
    "../runs/freeze_feat_extractor.ckpt",
    params=mp,
    tasks=[accent_id_task],
)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Build the train and validation loaders from the training and data params.
# Only val_loader is used by the later cells visible here.
train_loader, val_loader = load_data(
    tp,
    dp,
)

INFO: Loading Audio Lengths
Number of samples:  37372


In [24]:
# Log metrics as CSV files under ./eval/freeze_feat_extract.
logger = pl_loggers.CSVLogger("./eval/freeze_feat_extract")

# Device selection is stated once, through accelerator/devices.
# The earlier call passed both `gpus=1` and `devices=6`, which disagree
# (one GPU vs. six). A single GPU is requested here. To target a specific
# physical card, restrict CUDA_VISIBLE_DEVICES before starting the kernel.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    logger=logger,
    gradient_clip_val=tp.grad_clip_thresh,
    max_epochs=30,
    accumulate_grad_batches=16,
    log_every_n_steps=5,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [25]:
# Run validation of the checkpointed model on the validation loader; the
# returned list of metric dicts is displayed as the cell's output.
trainer.validate(
    model=model,
    dataloaders=val_loader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [6]
  rank_zero_warn(


Validating: 0it [00:00, ?it/s]

  tensor = as_tensor(value)


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'Accuracy_on_ID': 0.9710997939109802,
 'val_loss': 0.08757246285676956,
 'val_loss_ID': 0.08757246285676956}
--------------------------------------------------------------------------------


[{'val_loss_ID': 0.08757246285676956,
  'val_loss': 0.08757246285676956,
  'Accuracy_on_ID': 0.9710997939109802}]

In [None]:
# share 