In [1]:
import os
import sys
import numpy as np
import pandas as pd
import librosa
import librosa.display
import soundfile
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

In [3]:
def get_audio(audio_path, start_time=0, duration_sec=10):
    wav, sr = librosa.load(audio_path, sr=22050)
    
    length = sr*duration_sec
    y = np.zeros(length)
    if wav.shape[0] < length:
        y[:len(wav)] = wav
    else:
        y = wav[start_time:start_time+length]
    
    return y

def get_spec_from_audio(audio_path, start_time=0, duration_sec=10):
    y = get_audio(audio_path, start_time, duration_sec)
    
    window_size = 1024
    window = np.hanning(window_size)
    stft  = librosa.core.spectrum.stft(y, n_fft=window_size, hop_length=512, window=window)
    out = 2 * np.abs(stft) / np.sum(window)
    
    return out

def get_sample_list(audio_path):
    sample_nums = []
    for f in os.listdir(audio_path):
        if '.wav' in f:
            sample_nums.append(f.strip('.wav'))
    return sample_nums

def save_spec(audio_path):
    sample_nums = get_sample_list(audio_path)
    for num in sample_nums:
        spec = get_spec_from_audio(os.path.join(audio_path, num + '.wav'))
        fig = plt.Figure()
        canvas = FigureCanvas(fig)
        ax = fig.add_subplot(111)
        p = librosa.display.specshow(librosa.amplitude_to_db(spec, ref=np.max), ax=ax, y_axis='log', x_axis='time')
        fig.savefig(os.path.join(audio_path, num + '.png'))

In [18]:
audio_root = '/media/daftpunk2/home/yoonjin'
exp_name = '2nd_inference_save_' + '200_800'

In [21]:
for d in os.listdir(os.path.join(audio_root, exp_name)):
    print('Start save ...' + d)
    save_spec(os.path.join(audio_root, exp_name, d, 'interpolates'))
    save_spec(os.path.join(audio_root, exp_name, d, 'optimized'))
    save_spec(os.path.join(audio_root, exp_name, d, 'target'))
            

Start save ...Someone_groan_in_the_background-000799e_799iter.pth
Start save ...Person_whistle-000799e_799iter.pth
Start save ...A_man_gives_a_speech-000799e_799iter.pth
Start save ...Cat_is_whistling-000799e_799iter.pth
Start save ...Someone_grunt_in_the_background-000799e_799iter.pth
Start save ...A_chime_bell_rings_musically-000799e_799iter.pth
Start save ...A_bell_sounds_loudly_and_then_fades_away-000799e_799iter.pth


In [25]:
# 원본 데이터 spec 저장
audio_path = '/media/daftpunk2/home/jakeoneijk/221008_audio_caps/audiocaps_audio_dataset/train'
df = pd.read_csv(f'./train.csv')

f_dict = {}

for f in os.listdir(audio_path):
    f_dict[f.split(']_')[0].strip('[')] = os.path.join(audio_path, f)

# 프롬프트 y_id 뽑기 start time 찾기. 왤케 비효율적으로 코드 짜는걸까 나란 녀석...
for d in os.listdir('./'):
    if not os.path.isdir(d) or d == 'caps_full' or d == '.ipynb_checkpoints':
        continue
    
    if len(os.listdir(d)) == 6:
        continue
    
    # if 'A_man' not in d:
    #     continue
    
    print('Save ...' + d)
    y_id = (os.listdir(os.path.join(d, 'test'))[1]).strip('_mel.npy')
    start_time = df.loc[df['youtube_id'] == y_id]['start_time']
    spec = get_spec_from_audio(f_dict[y_id], start_time=start_time)
    fig = plt.Figure()
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    p = librosa.display.specshow(librosa.amplitude_to_db(spec, ref=np.max), ax=ax, y_axis='log', x_axis='time')
    fig.savefig(os.path.join(d, y_id + '.png'))
    
    audio = get_audio(f_dict[y_id], start_time=start_time)
    soundfile.write(os.path.join(d, y_id + '.wav'), audio, 22050, 'PCM_24')

Save ...A_man_gives_a_speech
Save ...Cat_is_whistling


KeyError: 'ame_list.pk'