In [1]:
# Server only: uncomment the next line to switch into the notebooks directory.
# %cd notebooks

In [2]:
import warnings
# Suppress every Python warning for the rest of the session. This keeps cell
# output compact, but it also hides deprecation notices raised by the
# libraries used below.
warnings.filterwarnings('ignore')

## Config

In [3]:
# Training audio directory (listed to build the sample ids) and the JSON file
# whose contents decide each sample's label. The trailing slash matters: the
# audio paths are later built by plain string concatenation.
train_audio_dir = '../data/train_audio/'
train_json_file = '../data/train.json'

# Directory holding the test audio files.
test_audio_dir = '../data/test_audio/'

In [4]:
# Checkpoint identifier passed to `from_pretrained` for both the audio
# processor and the encoder.
model_id = "openai/whisper-medium"

In [5]:
# Passed to both DataLoaders as `num_workers` and `batch_size`.
num_workers = 0
batch_size = 16

In [6]:
# Number of stratified cross-validation folds, and training epochs per fold.
n_splits = 5
num_epochs = 1

In [7]:
# Learning rates for the two Adam parameter groups: the pretrained encoder
# and the classification head.
encoder_lr = 1e-5
head_lr = 3e-4

## Data

In [8]:
import os
import json
import torch
import torchaudio
import numpy as np
import pandas as pd
from transformers import AutoProcessor
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold

### Read

In [9]:
# Load the training annotations. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open(train_json_file, encoding='utf-8') as file:
    train_data = json.load(file)

In [10]:
# Build the training table from the audio files present on disk.
#  * only `.opus` files are kept, because `audio_path` below always appends
#    that extension (stray entries such as `.ipynb_checkpoints` would
#    otherwise produce paths that do not exist);
#  * `os.path.splitext` strips the extension without mangling names that
#    contain no dot;
#  * sorting makes the row order independent of the arbitrary order returned
#    by `os.listdir`.
train_ids = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(train_audio_dir)
    if file_name.endswith('.opus')
)
train_df = pd.DataFrame({'id': train_ids})
# A sample is labelled 1 when its id appears in `train_data`, otherwise 0.
train_df['label'] = train_df['id'].apply(lambda sample_id: sample_id in train_data).astype(int)
train_df['audio_path'] = train_audio_dir + train_df['id'] + '.opus'

### Split

In [11]:
# Shuffle before splitting so folds do not simply follow the file listing
# order; the fixed seed keeps the fold assignment reproducible across runs.
splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

### Dataset

In [12]:
class AudioClassificationDataset(Dataset):
    """Map-style dataset that turns audio files into processor input features.

    Each item is a dict holding ``"input_features"`` (the processor output
    with its leading batch axis removed) and, when labels were supplied,
    ``"labels"`` as a scalar ``torch.long`` tensor.
    """

    # Shared by all instances; loaded once, when the class body is executed.
    processor = AutoProcessor.from_pretrained(model_id)

    def __init__(self, audio_paths, labels=None, sampling_rate=16_000):
        """Store the sample list.

        Args:
            audio_paths: Indexable sequence of audio file paths.
            labels: Optional indexable sequence of integer class labels
                aligned with ``audio_paths``; omit it for inference.
            sampling_rate: Rate in Hz that audio is resampled to before
                feature extraction.
        """
        self.audio_paths = audio_paths
        self.labels = labels
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of audio files."""
        return len(self.audio_paths)

    def __getitem__(self, idx):
        """Load, resample and featurize the audio file at position ``idx``."""
        audio_path = self.audio_paths[idx]
        waveform, sr = torchaudio.load(audio_path)
        if sr != self.sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sampling_rate)

        # torchaudio.load returns (channels, frames) by default. Averaging over
        # the channel axis yields a 1-D mono signal for any channel count; the
        # former `squeeze(0)` left multi-channel audio 2-D, which the processor
        # would treat as a batch. For mono input the result is unchanged.
        waveform = waveform.mean(dim=0)

        inputs = self.processor(
            waveform.numpy(),
            sampling_rate=self.sampling_rate,
            return_tensors="pt"
        )

        # Drop the batch axis the processor adds for a single example.
        inputs_dict = {"input_features": inputs.input_features.squeeze(0)}
        if self.labels is not None:
            inputs_dict["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)

        return inputs_dict

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

## Model

In [13]:
import torch
import torch.nn as nn
from transformers import AutoModelForSpeechSeq2Seq

In [14]:
class AudioClassificationModel(nn.Module):
    """Pretrained speech encoder followed by a dropout + linear head with 2 outputs."""

    def __init__(self):
        super().__init__()
        # Load the full seq2seq checkpoint but keep only its encoder.
        pretrained = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
        self.encoder = pretrained.model.encoder
        # The encoder's final layer norm tells us the hidden size.
        hidden_size = self.encoder.layer_norm.normalized_shape[0]
        self.head = nn.Sequential(nn.Dropout(0.2), nn.Linear(hidden_size, 2))

    def forward(self, input_features, labels=None):
        """Return ``{"loss", "logits"}`` for a batch of input features.

        ``loss`` is the cross-entropy between the logits and ``labels``, or
        ``None`` when no labels are passed.
        """
        # (batch, seq_len, hidden)
        encoded = self.encoder(input_features).last_hidden_state

        # Average over time (global pooling), then classify.
        logits = self.head(torch.mean(encoded, dim=1))

        if labels is None:
            loss = None
        else:
            loss = nn.functional.cross_entropy(logits, labels)

        return {"loss": loss, "logits": logits}

## Train

In [None]:
import os
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
import torch

os.makedirs('../models', exist_ok=True)


def _run_epoch(model, loader, desc, device, dtype, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, macro_f1)``.

    Args:
        model: Module whose forward returns a dict with ``'loss'`` and ``'logits'``.
        loader: DataLoader yielding dicts with ``'input_features'`` and ``'labels'``.
        desc: Text shown on the progress bar.
        device: Device the batches are moved to.
        dtype: Floating-point dtype the input features are cast to.
        optimizer: When given, the model is put in train mode and updated
            after every batch. When ``None``, the model is put in eval mode
            and gradients are disabled.

    Returns:
        The sample-weighted mean loss and the macro-averaged F1 score.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum, n_samples = 0.0, 0
    all_preds, all_targets = [], []

    progress_bar = tqdm(loader, desc=desc)
    with torch.set_grad_enabled(is_training):
        for batch in progress_bar:
            input_features = batch['input_features'].to(device=device, dtype=dtype)
            labels = batch['labels'].to(device=device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(input_features, labels=labels)
            loss = outputs['loss']
            if is_training:
                loss.backward()
                optimizer.step()

            # Weight each batch by its size so a smaller final batch does not
            # skew the epoch average.
            batch_loss = loss.item()
            batch_len = labels.size(0)
            loss_sum += batch_loss * batch_len
            n_samples += batch_len

            all_preds.extend(outputs['logits'].argmax(dim=1).detach().cpu().numpy())
            all_targets.extend(labels.cpu().numpy())

            progress_bar.set_postfix(loss=batch_loss)

    if n_samples == 0:
        raise ValueError(f"{desc}: the data loader produced no batches")

    return loss_sum / n_samples, f1_score(all_targets, all_preds, average='macro')


# Save
models = []
fit_results = []
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.float32
# Pinned host memory only speeds up host-to-GPU copies; skip it on CPU runs.
pin_memory = device == 'cuda'

# Train
for i, (train_index, valid_index) in enumerate(splitter.split(train_df, train_df['label'])):
    print(f"Split: {i + 1}", end="\n\n")

    train_audio_paths_split, train_labels_split = train_df['audio_path'].values[train_index], train_df['label'].values[train_index]
    valid_audio_paths_split, valid_labels_split = train_df['audio_path'].values[valid_index], train_df['label'].values[valid_index]

    train_dataset = AudioClassificationDataset(train_audio_paths_split, train_labels_split)
    valid_dataset = AudioClassificationDataset(valid_audio_paths_split, valid_labels_split)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        drop_last=True,
        pin_memory=pin_memory
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False,
        pin_memory=pin_memory
    )

    # A fresh model and optimizer per fold; the pretrained encoder is tuned
    # with a smaller learning rate than the newly initialised head.
    model = AudioClassificationModel()
    model.to(device=device, dtype=dtype)
    optimizer = torch.optim.Adam([
        {'params': model.encoder.parameters(), 'lr': encoder_lr},
        {'params': model.head.parameters(), 'lr': head_lr}
    ])

    # Per-epoch metric history for this fold.
    fit_result = {
        'train_losses': [],
        'train_f1': [],
        'valid_losses': [],
        'valid_f1': []
    }

    for epoch in range(1, num_epochs + 1):
        # === TRAIN ===
        train_loss, train_f1 = _run_epoch(
            model, train_loader, f"Epoch {epoch} [Train]", device, dtype, optimizer=optimizer
        )
        fit_result['train_losses'].append(train_loss)
        fit_result['train_f1'].append(train_f1)
        print(f"Train loss: {train_loss:.4f} | F1 (macro): {train_f1:.4f}")

        # === VALID ===
        valid_loss, valid_f1 = _run_epoch(
            model, valid_loader, f"Epoch {epoch} [Valid]", device, dtype
        )
        fit_result['valid_losses'].append(valid_loss)
        fit_result['valid_f1'].append(valid_f1)
        print(f"Valid loss: {valid_loss:.4f} | F1 (macro): {valid_f1:.4f}\n")

    fit_results.append(fit_result)
    models.append(model)

    path = f'../models/split_{i + 1}.pt'
    torch.save(model.state_dict(), path)

    # NOTE(review): this stops after the first fold, so only split 1 is trained
    # and saved. Remove the `break` to train all `n_splits` folds.
    break


Split: 1



config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]