In [1]:
from IPython.display import clear_output

!pip install pytorch_lightning transformers

clear_output()

In [10]:
import pytorch_lightning as pl
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from scipy.stats import spearmanr
import torchmetrics
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.metrics import recall_score
from sklearn.preprocessing import LabelEncoder
import librosa
import pickle

tqdm.pandas()

In [3]:
import os

class RequestsDataset(torch.utils.data.Dataset):
    @classmethod
    def get_le(cls,df,target='상황'):
        df[target] = df[target].apply(lambda x: x.lower().strip())
        le = LabelEncoder()
        le.fit(df[target])
        return le

    def get_labels(self):
        return self.labels

    def __init__(self,df, data_path,target='상황',max_sec=10,sr=16000, le = None,truncate=True,test=False):
        self.test = test
        self.truncate = truncate
        self.files = df['wav_id'].apply(lambda x: os.path.join(data_path, f'{x}.wav')).copy()

        df[target] = df[target].apply(lambda x: x.lower().strip())
        if le is None:
            self.le = LabelEncoder()
            self.labels = self.le.fit_transform(df[target].values)
        else:
            self.le = le
            self.labels = self.le.transform(df[target].values)
        self.maxlen = max_sec * sr
        print('Loading and processing audio')
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained('kresnik/wav2vec2-large-xlsr-korean')

        self.audio_files = []
        for file_path in self.files:
            if os.path.exists(file_path):
                audio = librosa.load(file_path,sr=sr)[0]
                audio_processed = self.processor(audio, sampling_rate=sr, return_tensors="pt", padding=True).input_values.squeeze(0)
                self.audio_files.append(audio_processed)
            else:
                print(f"File {file_path} does not exist.")
        self.files['audio'] = self.audio_files


    def __len__(self):
        return len(self.files['audio'])

    def __getitem__(self, idx):
        audio = self.files['audio'][idx]
        if not self.truncate:
            return audio, self.labels[idx]
        if (audio.shape[0] > self.maxlen):
            start = np.random.randint(audio.shape[0] - self.maxlen)
            audio = audio[start:start+self.maxlen]
        else:
            audio = torch.cat((audio, torch.zeros(self.maxlen - audio.shape[0])))
        if not self.test:
            return audio, self.labels[idx]
        else:
            return audio

In [11]:
class AudioModel(pl.LightningModule):
    def __init__(self,num_classes, ckpt='kresnik/wav2vec2-large-xlsr-korean'):
        super().__init__()
        self.model = Wav2Vec2Model.from_pretrained(ckpt)
        self.model.feature_extractor._freeze_parameters()
        self.layer_weights = torch.nn.Parameter(torch.ones(25))
        self.linear = torch.nn.Linear(1024*2, num_classes)
        self.dropout = torch.nn.Dropout(0.2)
        self.preds = []
        self.labels = []

    def compute_features(self, x):
        x = self.model(input_values=x, output_hidden_states=True).hidden_states
        x = torch.stack(x,dim=1)
        weights = torch.nn.functional.softmax(self.layer_weights, dim=-1)
        mean_x = x.mean(dim = 2)
        std_x = x.std(dim = 2)
        x = torch.cat((mean_x, std_x), dim=-1)
        x = (x * weights.view(-1,25,1)).sum(dim=1)
        return x

    def forward(self, x):
        x = self.compute_features(x)
        x = self.dropout(x)
        x = self.linear(x)
        x = torch.softmax(x,dim=-1)
        return x

    def training_step(self, batch,batch_idx):
        x,y = batch
        logits = self.forward(x)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits,y)
        self.log('train_loss', loss,sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x,y = batch
        logits = self.forward(x)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits,y)
        self.log('val_loss', loss,sync_dist=True)
        logits = torch.sigmoid(logits)
        preds = logits.argmax(dim=-1).detach().cpu().numpy()
        self.preds.append(preds)
        self.labels.append(y.detach().cpu().numpy())
        return loss

    def on_validation_epoch_end(self):
        self.preds = np.concatenate(self.preds)
        self.labels = np.concatenate(self.labels)
        self.log('val_recall', recall_score(self.labels,self.preds,average='macro'), sync_dist=True)
        self.preds = []
        self.labels = []

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-5)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': torch.optim.lr_scheduler.LinearLR(optimizer, 0.01, 1,total_iters=100),
                'interval': 'step',
            },
            'monitor': 'val_recall',
            'interval': 'epoch'
        }

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
csv3_path = '/content/drive/MyDrive/의현/감정 분류를 위한 대화 음성 데이터셋/4차년도.csv'
wav_path = '/content/drive/MyDrive/의현/감정 분류를 위한 대화 음성 데이터셋/4차년도.zip'
!mkdir ./wav/
!cp -r "$csv3_path" ./
!cp -r "$wav_path" ./wav/
drive.flush_and_unmount()

In [6]:
import os
import shutil
from sys import platform
from glob import glob
!unzip './wav/4차년도.zip' -d file/
clear_output()

In [7]:
csv3_data_path = './4차년도.csv'
csv3 = pd.read_csv(csv3_data_path, encoding = 'CP949')
data_path = './file/'

In [8]:

from sklearn.model_selection import train_test_split

train_size = 0.80

train, val = train_test_split(csv3, train_size = train_size, stratify=csv3['상황'], random_state=77)

In [12]:
import os

if __name__ == '__main__':
    train_df = train
    dev_df = val
    model = AudioModel(5)
    train_dataset = RequestsDataset(train_df, data_path, max_sec = 10)
    le = train_dataset.le
    dev_dataset = RequestsDataset(dev_df, data_path, max_sec = 10,le=le)
    checkpoint_callback = ModelCheckpoint(dirpath='com_ckpts',monitor='val_recall',save_top_k=1,mode='max')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, num_workers=4, shuffle=True)
    dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=8, num_workers=4, shuffle=False)
    logger = pl.loggers.TensorBoardLogger(save_dir='logs/')
    trainer = pl.Trainer(
        devices= 'auto',
        accelerator='gpu',
        max_epochs=10,
        logger=logger,
        callbacks=[checkpoint_callback],
        precision=16
    )

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading and processing audio


Downloading (…)rocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

File ./file/5e3161c65807b852d9e032af.wav does not exist.
File ./file/5e2ad4145807b852d9e020d9.wav does not exist.
File ./file/5e32924e5807b852d9e03894.wav does not exist.
File ./file/5e3292825807b852d9e0389a.wav does not exist.
File ./file/5e33a9d35807b852d9e050f4.wav does not exist.
File ./file/5e298c085807b852d9e01a12.wav does not exist.
File ./file/5e2ad43e5807b852d9e020dc.wav does not exist.
File ./file/5e2998b85807b852d9e01b02.wav does not exist.
File ./file/5e33638b5807b852d9e04aeb.wav does not exist.
File ./file/5e298bc45807b852d9e01a10.wav does not exist.
File ./file/5e298b9f5807b852d9e01a0f.wav does not exist.
File ./file/5e298bdc5807b852d9e01a11.wav does not exist.
File ./file/5e2979c25807b852d9e018d5.wav does not exist.
File ./file/5e31622f5807b852d9e032ba.wav does not exist.
File ./file/5e3292655807b852d9e03896.wav does not exist.
Loading and processing audio
File ./file/5e315dca5807b852d9e03275.wav does not exist.


  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [13]:
trainer.fit(model, train_loader, dev_loader)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type          | Params
------------------------------------------
0 | model   | Wav2Vec2Model | 315 M 
1 | linear  | Linear        | 10.2 K
2 | dropout | Dropout       | 0     
------------------------------------------
311 M     Trainable params
4.2 M     Non-trainable params
315 M     Total params
1,261.796 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [14]:
checkpoint_path = trainer.checkpoint_callback.best_model_path
checkpoint_path

'/content/com_ckpts/epoch=0-step=1459.ckpt'

In [None]:
trainer.test(ckpt_path=checkpoint_path, dataloaders=[dev_loader])

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/com_ckpts/epoch=2-step=4377.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/com_ckpts/epoch=2-step=4377.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.33353325724601746, 'test_loss': 1.2093791961669922}]

In [15]:
from google.colab import drive
drive.mount('/content/drive')

import shutil

checkpoint_path = trainer.checkpoint_callback.best_model_path
drive_path = "/content/drive/MyDrive/의현/speech_best.ckpt"

shutil.copy(checkpoint_path, drive_path)
drive.flush_and_unmount()

Mounted at /content/drive
