<a href="https://colab.research.google.com/github/YihanCao123/SoundAction/blob/two_tower_training/Train_Two_Tower_on_Clotho.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! wget https://zenodo.org/record/3490684/files/clotho_captions_evaluation.csv
! wget https://zenodo.org/record/3490684/files/clotho_captions_development.csv

In [None]:
! rm -rf /content/SoundAction
! git clone --branch two_tower_training https://github.com/YihanCao123/SoundAction

In [3]:
# Mount Google Drive so files stored under MyDrive are reachable from /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
! rm -rf /content/Clotho-Workspace
! mkdir /content/Clotho-Workspace
! mkdir /content/Clotho-Workspace/features
! cp /content/drive/MyDrive/Clotho/waveform.h5 /content/Clotho-Workspace/features

In [None]:
! pip install torchlibrosa
! pip install transformers

In [None]:
! wget https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth

In [98]:
# generate data with 'clotho_data_generator.py'
! wget https://zenodo.org/record/3490684/files/clotho_audio_development.7z
! sudo apt-get install p7zip
! 7za x /content/clotho_audio_development.7z

! rm -rf /content/Clotho-Final
! mkdir /content/Clotho-Final

DATASET_DIR="/content/development"
WORKSPACE="/content/Clotho-Final"

! python3 /content/SoundAction/data/clotho_data_generator.py pack_audio_files_to_hdf5 --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE

Validating...
1/20251
2001/20251
4001/20251
6001/20251
8001/20251
10001/20251
12001/20251
14001/20251
16001/20251
18001/20251
20001/20251
Write hdf5 to /content/Clotho-Test/features/waveform.h5
Time 2427.265 s


In [1]:
import numpy as np
import argparse
import logging
import os
import time
import sys

sys.path.append('/content/SoundAction')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from model.utils import move_data_to_device

from params import train_config
from data.utils import create_folder, create_logging, get_filename
from data.data_loader import AudioDataset, TrainSampler, EvaluateSampler, collate_fn
from model.models import ConcatCLS
from model.losses import get_loss_func
from model.evaluate import Eva

In [2]:
# Parameters
#
# Notebook counterpart of the flags passed to run_model.py on the command line:
# --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --holdout_fold=1 --model_type="Transfer_Cnn14" --pretrained_checkpoint_path=$PRETRAINED_CHECKPOINT_PATH 
# --loss_type=clip_nll --augmentation='none' --learning_rate=1e-4 --batch_size=32 --resume_iteration=0 --stop_iteration=50000 --cuda
# Note that the learning rate used below (1e-5) differs from the 1e-4 in the command line above.

DATASET_DIR="/content/development"
WORKSPACE="/content/Clotho-Test"
PRETRAINED_CHECKPOINT_PATH = "/content/Cnn14_mAP=0.431.pth"

# Hyper-parameters gathered in one dict so they can be tuned in a single place.
args = {'holdout_fold': 1,
        'stop_iteration': 50000,
        'learning_rate': 1e-5,
        'batch_size': 32,
        'model_type': "Transfer_Cnn14",
        'freeze_base': False,
        'loss_type': 'clip_nll',
        'augmentation': 'none',
        'resume_iteration': 0,
        'lr_step_size': 13000}

# Unpack into module-level names used by this cell and by the training cell.
dataset_dir = DATASET_DIR
workspace = WORKSPACE
holdout_fold = args['holdout_fold']
model_type = args['model_type']
pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
# NOTE(review): this value is not passed to the model below, which reads
# train_config.freeze_base instead -- confirm which of the two is intended.
freeze_base = args['freeze_base']
loss_type = args['loss_type']
augmentation = args['augmentation']
learning_rate = args['learning_rate']
batch_size = args['batch_size']
resume_iteration = args['resume_iteration']
stop_iteration = args["stop_iteration"]
# Fall back to CPU when no GPU runtime is attached.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_workers = 1

# NOTE(review): the training cell builds its own nn.BCELoss and does not use loss_func.
loss_func = get_loss_func(loss_type)
# An empty/None checkpoint path disables loading of pretrained weights.
pretrain = bool(pretrained_checkpoint_path)

# Packed waveforms read by both samplers.
hdf5_path = os.path.join(workspace, 'features', 'waveform.h5')

checkpoints_dir = os.path.join(workspace, 'checkpoints')
create_folder(checkpoints_dir)

# Model
Model = ConcatCLS # This could be Model = Transfer_Cnn14() in our case, however, here for easy implementation, we will still use this.

model = Model(train_config.sample_rate, train_config.window_size, train_config.hop_size, train_config.mel_bins,
train_config.fmin, train_config.fmax, train_config.classes_num, train_config.freeze_base)

if pretrain:
    print("Load pretrained model from {}".format(pretrained_checkpoint_path))
    model.load_from_pretrain(pretrained_checkpoint_path)

if resume_iteration:
    # NOTE(review): the training cell of this notebook saves checkpoints named
    # '<timestamp>_<iteration>_<accuracy>_iterations.pth' with keys 'iteration',
    # 'two_tower_state_dict' and 'cnn14_state_dict'. This branch expects
    # '<iteration>_iterations.pth' with a 'model' key, so it cannot load those files
    # as-is -- confirm which checkpoint format is meant to be resumable.
    resume_checkpoint_path = os.path.join(checkpoints_dir, '{}_iterations.pth'.format(resume_iteration))
    print("Load resume model from {}".format(resume_checkpoint_path))
    # map_location='cpu': the model is still on the CPU here (it is moved to `device`
    # further down), and this lets a checkpoint saved on a GPU load on a CPU-only runtime.
    resume_checkpoint = torch.load(resume_checkpoint_path, map_location='cpu')
    model.load_state_dict(resume_checkpoint['model'])
    iteration = resume_checkpoint['iteration']
else:
    iteration = 0

# Data
dataset = AudioDataset()

# Generator: batch samplers for the training and validation splits of the chosen fold.
train_sampler = TrainSampler(
    hdf5_path=hdf5_path,
    holdout_fold=holdout_fold,
    batch_size=batch_size
)

validate_sampler = EvaluateSampler(
    hdf5_path=hdf5_path,
    holdout_fold=holdout_fold,
    batch_size=batch_size
)

# Data Loader: both loaders share the dataset and differ only in their batch sampler.
train_loader = torch.utils.data.DataLoader(dataset=dataset,
    batch_sampler=train_sampler, collate_fn=collate_fn,
    num_workers=num_workers, pin_memory=True
)

validate_loader = torch.utils.data.DataLoader(dataset=dataset,
    batch_sampler=validate_sampler, collate_fn=collate_fn,
    num_workers=num_workers, pin_memory=True
)

if 'cuda' in device:
    model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999),
    eps=1e-08, weight_decay=0., amsgrad=True
)
# loss


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load pretrained model from /content/Cnn14_mAP=0.431.pth


In [3]:
! rm -r /content/Clotho-Test/checkpoints/*

rm: cannot remove '/content/Clotho-Test/checkpoints/*': No such file or directory


In [None]:
# Training loop: optimise the model with binary cross-entropy, log the batch loss every
# 200 iterations, evaluate every 1000 iterations, and save a checkpoint whenever the
# validation accuracy beats the best value seen so far.
loss = nn.BCELoss()
# Evaluator
evaluator = Eva(model=model)
# scheduler: halves the learning rate every `lr_step_size` optimizer steps
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args['lr_step_size'], gamma=0.5)

train_begin_time = time.time()

# Minimum validation accuracy a model must exceed to be saved; raised after every save.
acc_save_threshold = 0.73

# Running loss sum / batch count, restarted every 1000 iterations.
total_loss = 0
n = 0

# Train
print('Start Training')
model.train()
for batch_data_dict in train_loader:
    # Move data to GPU
    for key in batch_data_dict.keys():
        batch_data_dict[key] = move_data_to_device(batch_data_dict[key], device)

    optimizer.zero_grad()
    # Train
    # print([element for element in batch_data_dict['caption']])
    batch_output_dict = model(batch_data_dict['waveform'], list(batch_data_dict['caption']))

    output_loss = loss(batch_output_dict, batch_data_dict['target'])
    # Accumulate a detached copy: adding the raw loss tensor would chain the autograd
    # history of every batch into `total_loss` and keep it alive between iterations.
    detached_loss = output_loss.detach()
    n += 1
    total_loss += detached_loss
    if iteration % 1000 == 0 and iteration > 0:
        total_loss = detached_loss
        n = 1
    if iteration % 200 == 0 and iteration > 0:
        print('Iteration Number: {} Loss: {} LR: {}'.format(iteration, float(output_loss), scheduler.get_last_lr()))

    # Backward
    output_loss.backward()
    optimizer.step()
    scheduler.step()

    # Evaluate AND Save Model
    if iteration % 1000 == 0:
        print("-----------------------------------------------")
        print("Evaluating in iteration: {}".format(iteration))

        train_fin_time = time.time()
        statistics =  evaluator.evaluate(validate_loader)
        print("Validate accuracy: {:.3f}".format(statistics['accuracy']))
        # Eva's implementation is not visible here; if it switches the model to eval
        # mode, this restores training mode. It is a no-op otherwise.
        model.train()

        if statistics['accuracy'] > acc_save_threshold:
            acc_save_threshold = statistics['accuracy']
            # NOTE(review): the key names look swapped relative to their contents --
            # 'two_tower_state_dict' holds only model.audio_encoder while
            # 'cnn14_state_dict' holds the full model. Kept unchanged so existing
            # checkpoint consumers keep working; confirm before renaming.
            checkpoint = {
                'iteration': iteration,
                'two_tower_state_dict': model.audio_encoder.state_dict(),
                'cnn14_state_dict': model.state_dict()
            }
            # File name: <timestamp mod 100000>_<iteration>_<accuracy>_iterations.pth
            checkpoint_path = os.path.join(checkpoints_dir, '{}_{}_{}_iterations.pth'.format(int(time.time() % 100000), iteration, acc_save_threshold))
            torch.save(checkpoint, checkpoint_path)
            print('Model saved to {}'.format(checkpoint_path))

    # Stop (>= so that a run whose starting iteration is already past the limit still ends)
    if iteration >= stop_iteration:
        break

    iteration += 1

Start Training
-----------------------------------------------
Evaluating in iteration: 0
Mean: 0.5187724232673645 Standard Deviation: 0.015822427347302437 Range: [0.4290657639503479, 0.5863184928894043]
Validate accuracy: 0.000
Iteration Number: 200 Loss: 0.4925774931907654 LR: [1e-05]
Iteration Number: 400 Loss: 0.5175431966781616 LR: [1e-05]
Iteration Number: 600 Loss: 0.49262410402297974 LR: [1e-05]
Iteration Number: 800 Loss: 0.39231938123703003 LR: [1e-05]
Iteration Number: 1000 Loss: 0.407196044921875 LR: [1e-05]
-----------------------------------------------
Evaluating in iteration: 1000
Mean: 0.7143572568893433 Standard Deviation: 0.26683133840560913 Range: [0.012398933060467243, 0.9914548397064209]
Validate accuracy: 0.624
Iteration Number: 1200 Loss: 0.4288156032562256 LR: [1e-05]
Iteration Number: 1400 Loss: 0.3864608705043793 LR: [1e-05]
Iteration Number: 1600 Loss: 0.16219082474708557 LR: [1e-05]
Iteration Number: 1800 Loss: 0.17413458228111267 LR: [1e-05]
Iteration Numb

In [None]:
!cp -R /content/Clotho-Test/checkpoints/* /content/drive/MyDrive/Clotho/checkpoints

In [123]:
# This can be use to train as well
DATASET_DIR="/content/development"
WORKSPACE="/content/Clotho-Test"
PRETRAINED_CHECKPOINT_PATH = "/content/Cnn14_mAP=0.431.pth"

! python3 /content/SoundAction/run_model.py train --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --holdout_fold=1 --model_type="Transfer_Cnn14" --pretrained_checkpoint_path=$PRETRAINED_CHECKPOINT_PATH --loss_type=clip_nll --augmentation='none' --learning_rate=1e-4 --batch_size=32 --resume_iteration=0 --stop_iteration=50000 --cuda
#! python3 /content/SoundAction/test_run_model.py train --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --holdout_fold=1 --model_type="Transfer_Cnn14" --pretrained_checkpoint_path=$PRETRAINED_CHECKPOINT_PATH --loss_type=clip_nll --augmentation='none' --learning_rate=1e-3 --batch_size=32 --resume_iteration=0 --stop_iteration=50000 --cuda

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Load pretrained model from /content/Cnn14_mAP=0.431.pth
Start Training
Iteration Number: 100 Loss: 0.6827074885368347
Iteration Number: 200 Loss: 0.5312267