#### parametryzacja wymagana do przeprowadzenia odpowiednich kalkulacji

In [22]:
# Number of samples per analysis frame; also handed to librosa as frame_length.
frame_size = 512
# Number of samples per sub-window (pYIN win_length and the entropy segments).
window_size = 128
# Directory the recordings are listed and loaded from.
directory = "./data/znormalizowane/"

#### ekstrakcja cech

In [60]:
def divide_into_frames(samples, frame_size):
    """Split ``samples`` into consecutive, non-overlapping frames.

    Args:
        samples: Sliceable sequence of audio samples.
        frame_size: Number of samples per frame.

    Returns:
        A list of ``float32`` arrays. Every frame holds ``frame_size``
        samples except possibly the last one, which holds the remainder.
    """
    frame_starts = range(0, len(samples), frame_size)
    return [
        np.asarray(samples[start:start + frame_size], dtype=np.float32)
        for start in frame_starts
    ]


def calculate_zcr(frames):
    """Count the zero crossings of every frame.

    Args:
        frames: Iterable of per-frame sample arrays.

    Returns:
        A list with one crossing count per frame, in input order.
    """
    counts = []
    for chunk in frames:
        crossings = librosa.zero_crossings(chunk)
        counts.append(np.sum(crossings))
    return counts

def calculate_f0(samples):
    """Estimate the fundamental frequency of ``samples`` with pYIN.

    The search range is C2..C7. The analysis uses the module-level
    ``frame_size`` as frame length and ``window_size`` as window length.

    Args:
        samples: Audio signal passed to ``librosa.pyin``.

    Returns:
        A tuple ``(times, f0, voiced_flag)``: the time stamp of every
        estimate, the estimated frequencies and the per-estimate voiced flags.
    """
    # This is the hop pYIN falls back to when none is given. It is spelled out
    # so the time axis below is built with the same spacing; times_like would
    # otherwise assume its own default hop of 512 samples.
    hop_length = frame_size // 4
    f0, voiced_flag, _voiced_probs = librosa.pyin(
        samples,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        frame_length=frame_size,
        win_length=window_size,
        hop_length=hop_length,
    )
    times = librosa.times_like(f0, hop_length=hop_length)
    return (times, f0, voiced_flag)





def calculate_rms(samples):
    """Return the RMS level of ``samples`` as computed by librosa.

    Both the frame length and the hop are the module-level ``frame_size``,
    so successive frames start ``frame_size`` samples apart.
    """
    return librosa.feature.rms(
        y=samples,
        frame_length=frame_size,
        hop_length=frame_size,
    )


def calculate_ste(volumes):
    """Return the short-time energy, i.e. ``volumes`` squared element-wise."""
    return pow(volumes, 2)


def calculate_sr(volume, zcr, volume_threshold=0.02, zcr_threshold=50):
    """Flag frames that are both quiet and low in zero crossings.

    Args:
        volume: Per-frame volume (scalar or array).
        zcr: Per-frame zero-crossing counts.
        volume_threshold: A frame counts as quiet when its volume is
            strictly below this value.
        zcr_threshold: A frame counts as low-crossing when its count is
            strictly below this value.

    Returns:
        An integer array holding 1 where both conditions hold and 0 elsewhere.
    """
    is_quiet = volume < volume_threshold
    has_few_crossings = np.array(zcr) < zcr_threshold
    # The leading 1 turns the boolean product into integers.
    return 1 * is_quiet * has_few_crossings


def _low_ste_ratio(stes, frames_per_window):
    """Average, over windows, the share of frames whose STE is below half the window mean."""
    ste = np.ravel(stes)
    if ste.size == 0:
        return 0.0
    ratios = []
    for start in range(0, ste.size, frames_per_window):
        window = ste[start:start + frames_per_window]
        if window.size < frames_per_window <= ste.size:
            # Incomplete tail: take the reference mean from the last full window.
            reference = ste[-frames_per_window:]
        else:
            reference = window
        ratios.append(np.mean(window < 0.5 * np.mean(reference)))
    return np.mean(ratios)


def _high_zcr_ratio(zcrs):
    """Return the share of frames whose ZCR exceeds 1.5 times the clip mean."""
    zcr = np.asarray(zcrs, dtype=float)
    if zcr.size == 0:
        return 0.0
    return np.mean(zcr > 1.5 * np.mean(zcr))


def _energy_entropy(frames, segment_size):
    """Return the base-2 entropy of the normalized per-segment energies."""
    energies = []
    for frame in frames:
        for start in range(0, len(frame), segment_size):
            segment = frame[start:start + segment_size]
            energies.append(np.mean(segment ** 2))
    energies = np.asarray(energies)
    total = np.sum(energies)
    if total <= 0:
        # No energy at all: there is no distribution to measure.
        return 0.0
    probabilities = energies / total
    # Zero-energy segments contribute nothing (0 * log 0 is taken as 0);
    # leaving them in would turn the whole sum into NaN.
    probabilities = probabilities[probabilities > 0]
    return -np.sum(probabilities * np.log2(probabilities))


def calculate_features(samples, frame_size, sample_rate=22050):
    """Compute frame-level features and clip-level statistics for a signal.

    Args:
        samples: Audio signal.
        frame_size: Frame length used to cut ``samples`` for the ZCR and
            entropy computations. The RMS and pYIN helpers read the
            module-level ``frame_size`` instead.
        sample_rate: Sampling rate of ``samples``; it sets how many frames
            make up the one-second windows of the low-STE ratio.

    Returns:
        A tuple ``(features, values)``. ``features`` maps a feature name to a
        column array of shape ``(-1, 1)``. ``values`` maps a statistic name
        to a scalar:
        - ``VSTD``: standard deviation of the volume divided by its maximum.
        - ``VDR``: (max - min) of the volume divided by its maximum.
        - ``ZSTD``: standard deviation of the zero-crossing counts.
        - ``HZCRR``: share of frames whose ZCR is above 1.5x the mean ZCR.
        - ``entropy``: entropy of the normalized segment energies.
        - ``Low Ste Ratio``: mean over one-second windows of the share of
          frames whose STE is below half the window's mean STE.
    """
    frames = divide_into_frames(samples, frame_size)

    volumes = calculate_rms(samples)
    stes = calculate_ste(volumes)
    zcrs = calculate_zcr(frames)
    _, f0, voiced_flag = calculate_f0(samples)

    # The RMS track and the ZCR list can differ by one frame (the RMS helper
    # may emit an extra frame when the length is a multiple of the frame
    # size); trim both to the shared length so they broadcast.
    n_common = min(np.shape(volumes)[-1], len(zcrs))
    sr = calculate_sr(volumes[..., :n_common], zcrs[:n_common])

    peak = np.max(volumes)
    VSTD = np.std(volumes) / peak
    VDR = (peak - np.min(volumes)) / peak

    # STE holds one value per frame, so the one-second window is expressed
    # in frames rather than in samples.
    frames_per_window = max(1, int(round(sample_rate / frame_size)))
    LSTER = _low_ste_ratio(stes, frames_per_window)

    entropy = _energy_entropy(frames, window_size)
    HZCRR = _high_zcr_ratio(zcrs)

    features = {
        "volume": np.array(volumes).reshape(-1, 1),
        "short time energy": np.array(stes).reshape(-1, 1),
        "zero crossing rate": np.array(zcrs).reshape(-1, 1),
        "fundamental frequency": np.array(f0).reshape(-1, 1),
        "voiced flag": np.array(voiced_flag).reshape(-1, 1),
        "silent ratio": np.array(sr).reshape(-1, 1),
    }
    values = {
        "VSTD": VSTD,
        "VDR": VDR,
        "ZSTD": np.std(zcrs),
        "HZCRR": HZCRR,
        "entropy": entropy,
        "Low Ste Ratio": LSTER,
    }
    return features, values

#### wykresy

In [47]:
def plot_volume_time(duration, volumes):
    """Draw ``volumes`` against an evenly spaced time axis on a new figure.

    Args:
        duration: Value of the last point on the time axis, in seconds.
        volumes: Sequence of volume values to plot.
    """
    time_axis = np.linspace(0, duration, len(volumes))
    plt.figure(figsize=(20, 4))
    plt.plot(time_axis, volumes)
    plt.xlabel('Time (seconds)')
    plt.ylabel('Volume')
    plt.title('Volume over time')


def plot_features(samples, features, duration, filename):
    """Plot the waveform, frame-level features and the f0 track of a clip.

    The first axis shows the waveform with voiced frames shaded green. The
    following axes show the remaining features (all but the last one left
    after removing the silent ratio, f0 and voiced flag) with silent frames
    shaded red. The last axis shows a log-frequency spectrogram with the f0
    estimate on top. Finally an audio player for ``filename`` is displayed.

    Args:
        samples: Audio signal.
        features: Mapping of feature name to per-frame data; must contain
            "silent ratio", "fundamental frequency" and "voiced flag".
            It is not modified.
        duration: Clip length in seconds.
        filename: Path of the audio file handed to the audio player.
    """
    # Work on a shallow copy so the pops below do not strip keys from the
    # caller's dictionary.
    features = dict(features)
    n = len(features)
    fig, ax = plt.subplots(n-2, 1)
    fig.set_size_inches(30, 4*n-8)

    sr = features.pop("silent ratio")
    f0 = features.pop("fundamental frequency")
    voiced = features.pop("voiced flag")

    times = np.linspace(0, duration, len(samples))
    ax[0].plot(times, samples, linewidth = 1/2)
    ax[0].set_title('Amplitude over time')
    ax[0].set_ylabel('Amplitude')
    last_sample = len(samples) - 1
    for j in np.where(voiced)[0]:
        start = np.int64(j*len(samples)/len(voiced))
        # The end of the final frame maps to len(samples), one past the last
        # valid index, so clamp it.
        end = min(np.int64((j+1)*len(samples)/len(voiced)), last_sample)
        ax[0].axvspan(times[start], times[end], color='green', alpha=0.3)

    for i, feature in enumerate(list(features.items())[:-1]):
        i = i+1
        name, data = feature
        times = np.linspace(0, duration, len(data))
        ax[i].plot(times, data, linewidth = 1)
        ax[i].set_title(name)
        ax[i].set_ylabel(name)
        for j in range(len(sr)-1):
            if sr[j] == 1:
                ax[i].axvspan(times[np.int64(j*len(data)/len(sr))], times[np.int64((j+1)*len(data)/len(sr))], color='red', alpha=0.5)
    plt.xlabel('Time [seconds]')

    times = np.linspace(0, duration, len(f0))
    D = librosa.amplitude_to_db(np.abs(librosa.stft(samples)), ref=np.max)
    img = librosa.display.specshow(D, x_axis='time', y_axis='log', ax=ax[-1])
    ax[-1].set(title='pYIN fundamental frequency estimation')
    fig.colorbar(img, ax=ax[-1], format="%+2.f dB")
    ax[-1].plot(times, f0, label='f0', color='cyan', linewidth=3)
    fig.legend().set_visible(False)
    # NOTE: sample_rate is not a parameter; it is read from module scope.
    IPython.display.display(Audio(filename, rate = sample_rate, autoplay=True))

def plot_features_df(row):
    """Plot one data-frame row: waveform, frame-level features and f0 track.

    The first axis shows the waveform with voiced frames shaded green. The
    following axes show the remaining features (all but the last one left
    after removing the silent ratio, f0 and voiced flag) with silent frames
    shaded red. The last axis shows a log-frequency spectrogram with the f0
    estimate on top. The clip-level statistics are printed and an audio
    player for the row's file is displayed.

    Args:
        row: Record exposing the feature columns, ``samples``, ``duration``,
            ``filename``, ``sample_rate`` and the clip-level statistics.
    """
    features = {
        "volume": row.volume,
        "short time energy": row['short time energy'],
        "zero crossing rate": row['zero crossing rate'],
        "fundamental frequency": row['fundamental frequency'],
        "voiced flag": row['voiced flag'],
        "silent ratio": row['silent ratio'],
    }
    n = len(features)
    fig, ax = plt.subplots(n-2, 1)
    fig.set_size_inches(30, 4*n-8)

    sr = features.pop("silent ratio")
    f0 = features.pop("fundamental frequency")
    voiced = features.pop("voiced flag")

    times = np.linspace(0, row.duration, len(row.samples))
    ax[0].plot(times, row.samples, linewidth = 1/2)
    ax[0].set_title('Amplitude over time')
    ax[0].set_ylabel('Amplitude')
    last_sample = len(row.samples) - 1
    for j in np.where(voiced)[0]:
        # Frame j spans [j, j+1) in frame units. Starting at j-1 would shift
        # every span by one frame and, for j == 0, produce a negative index
        # that wraps to the end of the clip.
        start = np.int64(j*len(row.samples)/len(voiced))
        # The end of the final frame maps one past the last valid index.
        end = min(np.int64((j+1)*len(row.samples)/len(voiced)), last_sample)
        ax[0].axvspan(times[start], times[end], color='green', alpha=0.3)

    for i, feature in enumerate(list(features.items())[:-1]):
        i = i+1
        name, data = feature
        times = np.linspace(0, row.duration, len(data))
        ax[i].plot(times, data, linewidth = 1)
        ax[i].set_title(name)
        ax[i].set_ylabel(name)
        for j in range(len(sr)-1):
            if sr[j] == [1]:
                ax[i].axvspan(times[np.int64(j*len(data)/len(sr))], times[np.int64((j+1)*len(data)/len(sr))], color='red', alpha=0.5)
    plt.xlabel('Time [seconds]')

    times = np.linspace(0, row.duration, len(f0))
    D = librosa.amplitude_to_db(np.abs(librosa.stft(np.array(row.samples))), ref=np.max)
    img = librosa.display.specshow(D, x_axis='time', y_axis='log', ax=ax[-1])
    ax[-1].set(title='pYIN fundamental frequency estimation')
    ax[-1].plot(times, f0, label='f0', color='cyan', linewidth=3)
    plt.legend('' , frameon=False)
    print(f"Values for the whole clip:\n\t name = {row['filename'].split('/')[-1]} \n\t duration = {row.duration}\n\t Volume Standard Deviation = {row.VSTD}\n\t	Volume dynamic range = {row.VDR}\n\t standard deviation of ZCR = {row.ZSTD}	\n\t High zero crossing rate = {row.HZCRR} \n\t entropy = {row.entropy} \n\t Low Ste Ratio = {row['Low Ste Ratio']}")
    IPython.display.display(Audio(row.filename, rate = row.sample_rate, autoplay=True))

### wczytanie wszystkich danych do ramki

In [None]:
# Load every file in the directory and compute its features and statistics.
directory = "./data/znormalizowane/"
list_features, list_values = [], []
for f in os.listdir(directory):
    filename = f"{directory}{f}"
    samples, sample_rate = librosa.load(filename)
    duration = len(samples) / sample_rate
    features, values = calculate_features(samples, frame_size)
    # Keep the raw signal and the file metadata next to the frame features.
    features.update(
        samples=samples,
        filename=filename,
        duration=duration,
        sample_rate=sample_rate,
    )
    # The file name is the key the two tables are later joined on.
    values['filename'] = filename
    list_features.append(features)
    list_values.append(values)
list_features

In [68]:
# One table of frame-level features, one of clip-level statistics.
df1 = pd.DataFrame(list_features)
df2 = pd.DataFrame(list_values)
# Join both tables column-wise on the file name, then turn it back into a column.
df = pd.concat(
    [table.set_index("filename") for table in (df1, df2)],
    axis=1,
    join="inner",
).reset_index()

#### zapis danych do json

In [70]:
# Write the combined table to a JSON file.
df.to_json("./data/output/processed_male.json")

### muzyka

In [None]:
# Load every music file (as mono) and compute its features and statistics.
directory = "./data/muzyka/znormalizowane/"
list_features, list_values = [], []
for f in os.listdir(directory):
    filename = f"{directory}{f}"
    samples, sample_rate = librosa.load(filename, mono=True)
    duration = len(samples) / sample_rate
    features, values = calculate_features(samples, frame_size)
    # Keep the raw signal and the file metadata next to the frame features.
    features.update(
        samples=samples,
        filename=filename,
        duration=duration,
        sample_rate=sample_rate,
    )
    # The file name is the key the two tables are later joined on.
    values['filename'] = filename
    list_features.append(features)
    list_values.append(values)
list_features

In [66]:
# One table of frame-level features, one of clip-level statistics.
df1 = pd.DataFrame(list_features)
df2 = pd.DataFrame(list_values)
# Join both tables column-wise on the file name, then turn it back into a column.
df_music = pd.concat(
    [table.set_index("filename") for table in (df1, df2)],
    axis=1,
    join="inner",
).reset_index()
# Write the combined music table to a JSON file.
df_music.to_json("./data/output/processed_music.json")

### głos kobiecy

In [None]:
# Load every female-voice file (as mono) and compute its features and statistics.
directory = "./data/głos koleżanki/znormalizowane/"
list_features = []
list_values = []
for f in os.listdir(directory):
    filename = directory+f
    samples, sample_rate = librosa.load(filename, mono = True)
    duration = len(samples) / sample_rate
    features, values = calculate_features(samples, frame_size)
    # Keep the raw signal and the file metadata next to the frame features.
    features['samples'] = samples
    features['filename'] = filename
    features['duration'] = duration
    features['sample_rate'] = sample_rate
    # The file name is the key the two tables are joined on below.
    values['filename'] = filename
    list_features.append(features)
    list_values.append(values)
df1 = pd.DataFrame(list_features)
df2 = pd.DataFrame(list_values)
df_female = pd.concat([df1.set_index("filename"), df2.set_index('filename')], axis = 1, join = 'inner').reset_index(drop=False)
# Save under the female file name; writing to processed_music.json here
# would overwrite the music data set with the female-voice data.
df_female.to_json("./data/output/processed_female.json")

In [49]:
# Write the female-voice table to its own JSON file.
df_female.to_json('./data/output/processed_female.json')