In [1]:
import os
import random
import warnings
from sklearn.model_selection import train_test_split

import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torchaudio
from transformers import AutoFeatureExtractor,WhisperForAudioClassification

import warnings
# Silence all Python warnings for the rest of the notebook session.
warnings.filterwarnings(action='ignore')
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Run-wide hyperparameters, looked up by key throughout the notebook.
CFG = dict(
    SR=16000,            # sampling rate handed to the feature extractor
    SEED=42,             # seed for RNGs and the train/valid split
    BATCH_SIZE=2,        # per-iteration DataLoader batch size
    TOTAL_BATCH_SIZE=8,  # effective batch size; TOTAL_BATCH_SIZE / BATCH_SIZE gives the accumulation steps
    EPOCHS=1,            # number of passes over the training loader
    LR=1e-4,             # optimizer learning rate
)
# Hugging Face checkpoint identifier used for both the feature extractor and the model.
MODEL_NAME = "openai/whisper-small"

In [3]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED``, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU; safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune (and therefore vary) its algorithm
    # choice between runs, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seed

In [4]:
df = pd.read_csv('./speech_data/train.csv')
# Re-root the relative audio paths under ./speech_data.
# regex=False makes the match literal: on pandas versions where str.replace
# defaults to regex=True, the '.' in './t' would match ANY character and could
# rewrite unintended parts of a path (e.g. 'n/t' inside './train/t...').
df['path'] = df['path'].str.replace('./t', './speech_data/t', regex=False)
# 80/20 train/validation split; the seed keeps the split reproducible.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=CFG['SEED'])
# Re-index from 0 so the datasets can look rows up with idx in [0, len).
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

In [5]:
class CustomDataSet(Dataset):
    """Map-style dataset that loads audio files and turns them into model input features.

    Args:
        file_list: Indexable collection of audio file paths; ``file_list[idx]``
            is passed to ``torchaudio.load``.
        labels: Indexable collection of labels aligned with ``file_list``, or
            ``None`` for unlabeled data.
        processor: Feature extractor called with the waveform and
            ``sampling_rate``; its result must expose ``input_features``.
    """

    def __init__(self, file_list, labels, processor):
        self.file_list = file_list
        self.labels = labels
        self.processor = processor

    def __len__(self):
        """Return the number of audio files."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load one file and return its features (and label when labels are set).

        Returns:
            ``(features, label)`` when ``labels`` is not ``None``, otherwise
            ``features`` alone.
        """
        waveform, sample_rate = torchaudio.load(self.file_list[idx])
        # Only the first channel is used.
        waveform = waveform[0]
        # The processor is told the audio is at CFG['SR'], so make that true:
        # resample files recorded at any other rate instead of mislabeling them.
        if sample_rate != CFG['SR']:
            waveform = torchaudio.functional.resample(waveform, sample_rate, CFG['SR'])
        features = self.processor(
            waveform, sampling_rate=CFG['SR'], return_tensors="pt"
        ).input_features[0]
        if self.labels is None:
            return features
        return features, self.labels[idx]

In [6]:
# Feature extractor matching the pretrained checkpoint.
processor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# Wrap each split's path/label columns in a dataset.
train_dataset = CustomDataSet(file_list=train_df['path'], labels=train_df['label'], processor=processor)
valid_dataset = CustomDataSet(file_list=valid_df['path'], labels=valid_df['label'], processor=processor)

# Shuffle only the training data; validation keeps its original order.
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=0)
valid_loader = DataLoader(valid_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [7]:
class BaseModel(torch.nn.Module):
    """Pretrained Whisper audio classifier with its head replaced by a fresh linear layer.

    The backbone's own ``classifier`` is swapped for ``nn.Identity`` so that
    ``output.logits`` carries the backbone's pre-classifier features, which are
    then fed to ``self.classifier``.

    Args:
        num_classes: Number of output classes of the new head (default 6).

    Note:
        ``forward`` returns a plain tensor, not an object with a ``.logits``
        attribute.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.model = WhisperForAudioClassification.from_pretrained(MODEL_NAME)
        # Width of the features entering the original classifier; read from the
        # checkpoint config instead of hard-coding 256 so other checkpoints work.
        feature_dim = self.model.config.classifier_proj_size
        self.model.classifier = nn.Identity()
        self.classifier = nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        """Return class scores of shape ``(..., num_classes)`` for input features ``x``."""
        output = self.model(x)
        output = self.classifier(output.logits)
        return output

In [8]:
def validation(model, valid_loader, creterion):
    """Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    Args:
        model: Module whose call result exposes ``.logits``.
        valid_loader: Iterable yielding ``(x, y)`` batches.
        creterion: Loss callable applied as ``creterion(logits, y)``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        fraction of correctly predicted samples as a Python float. For an empty
        loader the result is ``(nan, 0.0)``.
    """
    model.eval()
    batch_losses = []
    total, correct = 0, 0

    with torch.no_grad():
        for x, y in tqdm(iter(valid_loader)):
            x = x.to(device)
            y = y.flatten().to(device)

            output = model(x).logits
            loss = creterion(output, y)
            batch_losses.append(loss.item())

            _, predicted = torch.max(output, 1)
            total += y.size(0)
            # .item() keeps the running count a plain int, so accuracy is a
            # float rather than a 0-dim tensor.
            correct += predicted.eq(y).sum().item()

    # Guard against an empty loader (no batches -> total == 0).
    accuracy = correct / total if total else 0.0
    avg_loss = np.mean(batch_losses) if batch_losses else float('nan')

    return avg_loss, accuracy

def train(model, train_loader, valid_loader, optimizer, scheduler):
    """Train ``model`` with gradient accumulation and checkpoint the best epoch.

    Gradients from ``TOTAL_BATCH_SIZE / BATCH_SIZE`` consecutive micro-batches
    are accumulated before each optimizer step. After every epoch the model is
    validated; whenever validation accuracy improves, the state dict is saved
    to ``./ckp/best_model_score.pt``.

    Args:
        model: Module whose call result exposes ``.logits``.
        train_loader: Iterable yielding ``(x, y)`` training batches.
        valid_loader: Iterable yielding ``(x, y)`` validation batches.
        optimizer: Optimizer over the model's parameters.
        scheduler: Optional scheduler stepped with the validation accuracy
            once per epoch; pass ``None`` to disable.
    """
    # At least 1, otherwise the modulo below would divide by zero when
    # TOTAL_BATCH_SIZE < BATCH_SIZE.
    accumulation_step = max(1, int(CFG['TOTAL_BATCH_SIZE'] / CFG['BATCH_SIZE']))
    model.to(device)
    creterion = nn.CrossEntropyLoss().to(device)

    best_acc = 0
    checkpoint_path = './ckp/best_model_score.pt'
    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    for epoch in range(1, CFG['EPOCHS']+1):
        train_loss = []
        model.train()
        # Zero once per accumulation window, NOT once per micro-batch: zeroing
        # every iteration would throw away the gradients being accumulated.
        optimizer.zero_grad()
        has_pending_grads = False
        for i, (x, y) in enumerate(tqdm(train_loader)):
            x = x.to(device)
            y = y.flatten().to(device)

            output = model(x).logits
            loss = creterion(output, y)

            # Scale so the summed gradients equal the gradient of the mean loss
            # over the whole accumulation window.
            (loss / accumulation_step).backward()
            has_pending_grads = True

            if (i+1) % accumulation_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                has_pending_grads = False

            # Log the unscaled loss so reported values stay comparable.
            train_loss.append(loss.item())

        # Apply gradients left over from a final, incomplete accumulation window.
        if has_pending_grads:
            optimizer.step()
            optimizer.zero_grad()

        avg_loss = np.mean(train_loss)
        valid_loss, valid_acc = validation(model, valid_loader, creterion)

        if scheduler is not None:
            scheduler.step(valid_acc)

        if valid_acc > best_acc:
            best_acc = valid_acc
            torch.save(model.state_dict(), checkpoint_path)
            print('model_save !')
        print(f'epoch:[{epoch}] train loss:[{avg_loss:.5f}] valid_loss:[{valid_loss:.5f}] valid_acc:[{valid_acc:.5f}]')

    print(f'best_acc:{best_acc:.5f}')


In [9]:
# Load the pretrained checkpoint with a 6-label classification head and move it to the target device.
model = WhisperForAudioClassification.from_pretrained(MODEL_NAME, num_labels=6).to(device)
#model.freeze_encoder()


Some weights of the model checkpoint at openai/whisper-small were not used when initializing WhisperForAudioClassification: ['model.decoder.layers.6.self_attn_layer_norm.bias', 'model.decoder.layers.9.self_attn.v_proj.bias', 'model.decoder.layers.8.final_layer_norm.weight', 'model.decoder.layers.7.final_layer_norm.weight', 'model.decoder.layers.10.encoder_attn.out_proj.weight', 'model.decoder.layers.4.encoder_attn.v_proj.weight', 'model.decoder.layers.7.fc2.weight', 'model.decoder.layers.2.final_layer_norm.weight', 'model.decoder.layers.10.encoder_attn.v_proj.weight', 'model.decoder.layers.5.encoder_attn.k_proj.weight', 'model.decoder.layers.8.self_attn.out_proj.bias', 'model.decoder.layers.0.self_attn.out_proj.bias', 'model.decoder.layers.8.encoder_attn_layer_norm.bias', 'model.decoder.layers.7.encoder_attn.q_proj.weight', 'model.decoder.layers.6.fc2.weight', 'model.decoder.layers.8.self_attn.q_proj.bias', 'model.decoder.layers.7.self_attn.out_proj.bias', 'model.decoder.layers.2.encod

In [10]:
# Show, for every parameter, whether it will receive gradients.
for param_name, parameter in model.named_parameters():
    print(param_name, parameter.requires_grad)

encoder.conv1.weight True
encoder.conv1.bias True
encoder.conv2.weight True
encoder.conv2.bias True
encoder.embed_positions.weight True
encoder.layers.0.self_attn.k_proj.weight True
encoder.layers.0.self_attn.v_proj.weight True
encoder.layers.0.self_attn.v_proj.bias True
encoder.layers.0.self_attn.q_proj.weight True
encoder.layers.0.self_attn.q_proj.bias True
encoder.layers.0.self_attn.out_proj.weight True
encoder.layers.0.self_attn.out_proj.bias True
encoder.layers.0.self_attn_layer_norm.weight True
encoder.layers.0.self_attn_layer_norm.bias True
encoder.layers.0.fc1.weight True
encoder.layers.0.fc1.bias True
encoder.layers.0.fc2.weight True
encoder.layers.0.fc2.bias True
encoder.layers.0.final_layer_norm.weight True
encoder.layers.0.final_layer_norm.bias True
encoder.layers.1.self_attn.k_proj.weight True
encoder.layers.1.self_attn.v_proj.weight True
encoder.layers.1.self_attn.v_proj.bias True
encoder.layers.1.self_attn.q_proj.weight True
encoder.layers.1.self_attn.q_proj.bias True
en

In [11]:
# AdamW over all model parameters at the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LR'])
# Halve the learning rate when the monitored metric (mode='max') stops improving for 2 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=2,
    verbose=True,
)
train(
    model,
    train_loader,
    valid_loader,
    optimizer,
    scheduler,
)

  0%|          | 0/2000 [00:00<?, ?it/s]

KeyboardInterrupt: 