In [3]:
# Two-Stage Audio Autoencoder Demo
# Демонстрация обучения и реконструкции аудиосигнала
import os
os.chdir("/Users/talabaev/Desktop/two_stage_autoencoder_project")
print("Текущая папка:", os.getcwd())
print("Файлы в проекте:", os.listdir())


Текущая папка: /Users/talabaev/Desktop/two_stage_autoencoder_project
Файлы в проекте: ['demo_autoencoder.ipynb', '.DS_Store', 'config.py', 'models.py', 'requirements.txt', 'checkpoints', 'README.md', '.gitignore', 'utils.py', '.venv', 'train.py', '.ipynb_checkpoints', '.git', 'main.py', 'audio_data', 'data.py']


In [4]:
import torch
import torchaudio
from models import TwoStageAudioAutoencoder
from config import TrainConfig
from utils import load_model_from_checkpoint
import IPython.display as ipd
import os
os.listdir("checkpoints")


['checkpoint_epoch_1.pth',
 'checkpoint_epoch_2.pth',
 'checkpoint_epoch_3.pth',
 'checkpoint_epoch_7.pth',
 'checkpoint_epoch_6.pth',
 'checkpoint_epoch_4.pth',
 'checkpoint_epoch_5.pth',
 'checkpoint_epoch_8.pth',
 'checkpoint_epoch_9.pth',
 'checkpoint_epoch_10.pth']

In [4]:
audio_path = "audio_data/test/sample1.wav"
waveform, sr = torchaudio.load(audio_path)

target_sr = 16000


if sr != target_sr:
    resampler = torchaudio.transforms.Resample(sr, target_sr)
    waveform = resampler(waveform)
    sr = target_sr

print("Sample rate after resampling:", sr)
print("Shape:", waveform.shape)


Sample rate after resampling: 16000
Shape: torch.Size([1, 32064])


In [9]:
import torchaudio
import torch

audio_path = "audio_data/test/sample1.wav"
waveform, sr = torchaudio.load(audio_path)

target_sr = 16000

if sr != target_sr:
    resampler = torchaudio.transforms.Resample(sr, target_sr)
    waveform = resampler(waveform)
    sr = target_sr

print("Sample rate after resampling:", sr)
print("Waveform shape:", waveform.shape)


Sample rate after resampling: 16000
Waveform shape: torch.Size([1, 32064])




In [10]:
num_samples = int(16000 * 1.0)

if waveform.shape[1] < num_samples:
    waveform = torch.nn.functional.pad(
        waveform, (0, num_samples - waveform.shape[1])
    )
else:
    waveform = waveform[:, :num_samples]

x = waveform.view(1, -1)
print("Final waveform shape:", x.shape)


Final waveform shape: torch.Size([1, 16000])


In [6]:
from config import TrainConfig
from models import TwoStageAudioAutoencoder
from utils import load_model_from_checkpoint

cfg = TrainConfig(
    epochs=10,
    batch_size=32,
    lr=0.001,
    latent_dim_1=256,   # как при обучении checkpoint_epoch_10.pth
    latent_dim_2=64,    # как при обучении checkpoint_epoch_10.pth
    log_interval=20,
    sample_rate=16000,
    max_duration_sec=1.0,   # как при обучении checkpoint_epoch_10.pth
    audio_root="audio_data",
)

num_samples = int(cfg.sample_rate * cfg.max_duration_sec)

model = TwoStageAudioAutoencoder(
    input_dim=num_samples,
    latent_dim_1=cfg.latent_dim_1,
    latent_dim_2=cfg.latent_dim_2,
)

last_epoch = load_model_from_checkpoint(
    model, "checkpoints/checkpoint_epoch_10.pth", "cpu"
)
print("Loaded checkpoint from epoch", last_epoch)

model.eval()


Loaded checkpoint from epoch 10


TwoStageAudioAutoencoder(
  (stage1): AutoencoderStage1(
    (encoder): Sequential(
      (0): Linear(in_features=16000, out_features=1024, bias=True)
      (1): ReLU(inplace=True)
      (2): Linear(in_features=1024, out_features=512, bias=True)
      (3): ReLU(inplace=True)
      (4): Linear(in_features=512, out_features=256, bias=True)
    )
    (decoder): Sequential(
      (0): Linear(in_features=256, out_features=512, bias=True)
      (1): ReLU(inplace=True)
      (2): Linear(in_features=512, out_features=1024, bias=True)
      (3): ReLU(inplace=True)
      (4): Linear(in_features=1024, out_features=16000, bias=True)
      (5): Tanh()
    )
  )
  (stage2): AutoencoderStage2(
    (encoder): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU(inplace=True)
      (2): Linear(in_features=256, out_features=64, bias=True)
    )
    (decoder): Sequential(
      (0): Linear(in_features=64, out_features=256, bias=True)
      (1): ReLU(inplace=True)
  

In [11]:
with torch.no_grad():
    out = model(x)  # x у тебя уже есть из предыдущей ячейки
    reconstructed = out["x_recon_stage2"].view(1, -1)

torchaudio.save(
    "notebook_reconstructed.wav",
    reconstructed,
    cfg.sample_rate
)

print("Сохранено: notebook_reconstructed.wav")


Сохранено: notebook_reconstructed.wav




In [7]:
from config import TrainConfig
from models import TwoStageAudioAutoencoder
from utils import load_model_from_checkpoint

cfg = TrainConfig(
    epochs=10,
    batch_size=32,
    lr=0.001,
    latent_dim_1=256,   # как при обучении checkpoint_epoch_10.pth
    latent_dim_2=64,    # как при обучении checkpoint_epoch_10.pth
    log_interval=20,
    sample_rate=16000,
    max_duration_sec=1.0,   # как при обучении checkpoint_epoch_10.pth
    audio_root="audio_data",
)

num_samples = int(cfg.sample_rate * cfg.max_duration_sec)
print("num_samples:", num_samples)

model = TwoStageAudioAutoencoder(
    input_dim=num_samples,
    latent_dim_1=cfg.latent_dim_1,
    latent_dim_2=cfg.latent_dim_2,
)

last_epoch = load_model_from_checkpoint(
    model, "checkpoints/checkpoint_epoch_10.pth", "cpu"
)
print("Loaded checkpoint from epoch", last_epoch)

model.eval()


num_samples: 16000
Loaded checkpoint from epoch 10


TwoStageAudioAutoencoder(
  (stage1): AutoencoderStage1(
    (encoder): Sequential(
      (0): Linear(in_features=16000, out_features=1024, bias=True)
      (1): ReLU(inplace=True)
      (2): Linear(in_features=1024, out_features=512, bias=True)
      (3): ReLU(inplace=True)
      (4): Linear(in_features=512, out_features=256, bias=True)
    )
    (decoder): Sequential(
      (0): Linear(in_features=256, out_features=512, bias=True)
      (1): ReLU(inplace=True)
      (2): Linear(in_features=512, out_features=1024, bias=True)
      (3): ReLU(inplace=True)
      (4): Linear(in_features=1024, out_features=16000, bias=True)
      (5): Tanh()
    )
  )
  (stage2): AutoencoderStage2(
    (encoder): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU(inplace=True)
      (2): Linear(in_features=256, out_features=64, bias=True)
    )
    (decoder): Sequential(
      (0): Linear(in_features=64, out_features=256, bias=True)
      (1): ReLU(inplace=True)
  

In [12]:
import IPython.display as ipd

print("Оригинал:")
ipd.Audio(audio_path)


Оригинал:


In [13]:
print("Реконструкция:")
ipd.Audio("notebook_reconstructed.wav")


Реконструкция:
