In [1]:
import os

import librosa
import numpy as np


In [2]:
def get_chroma_stft(y, sr):
    """
        Summarise the chromagram of a signal by its mean and variance.

        The chromagram comes from ``librosa.feature.chroma_stft``; both
        statistics are taken over every value of the returned array.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        sr: sampling rate belonging to ``y``
        @return: tuple ``(mean, variance)``
    """
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    return np.mean(chroma), np.var(chroma)


def get_rms(y):
    """
        Summarise the RMS feature of a signal by its mean and variance.

        The feature comes from ``librosa.feature.rms``; both statistics are
        taken over every value of the returned array.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        @return: tuple ``(mean, variance)``
    """
    energy = librosa.feature.rms(y=y)
    stats = (np.mean(energy), np.var(energy))
    return stats


def get_spectral_centroid(y, sr):
    """
        Summarise the spectral centroid of a signal by its mean and variance.

        The feature comes from ``librosa.feature.spectral_centroid``; both
        statistics are taken over every value of the returned array.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        sr: sampling rate belonging to ``y``
        @return: tuple ``(mean, variance)``
    """
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    return np.mean(centroid), np.var(centroid)


def get_spectral_bandwidth(y, sr):
    """
        Summarise the spectral bandwidth of a signal by its mean and variance.

        The feature comes from ``librosa.feature.spectral_bandwidth``; both
        statistics are taken over every value of the returned array.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        sr: sampling rate belonging to ``y``
        @return: tuple ``(mean, variance)``
    """
    bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    bw_mean, bw_var = np.mean(bandwidth), np.var(bandwidth)
    return bw_mean, bw_var


def get_rolloff(y, sr):
    """
        Summarise the spectral roll-off of a signal by its mean and variance.

        The feature comes from ``librosa.feature.spectral_rolloff`` (called
        with its default settings); both statistics are taken over every
        value of the returned array.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        sr: sampling rate belonging to ``y``
        @return: tuple ``(mean, variance)``
    """
    rolloff_frames = librosa.feature.spectral_rolloff(y=y, sr=sr)
    return np.mean(rolloff_frames), np.var(rolloff_frames)


def get_zero_crossing_rate(y):
    """
        Summarise the zero-crossing rate of a signal by its mean and variance.

        The feature comes from ``librosa.feature.zero_crossing_rate``; both
        statistics are taken over every value of the returned array.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        @return: tuple ``(mean, variance)``
    """
    crossing_rate = librosa.feature.zero_crossing_rate(y)
    zcr_mean = np.mean(crossing_rate)
    zcr_var = np.var(crossing_rate)
    return zcr_mean, zcr_var


def get_harmony_and_perceptual(y):
    """
        Mean and variance of the two components produced by ``librosa.effects.hpss``.

        ``hpss`` splits the signal into a harmonic part and a percussive part
        (the "perceptual" in this function's name refers to the percussive
        part). Statistics are computed directly on the two returned signals.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        @return: tuple ``(harmonic mean, harmonic variance,
                 percussive mean, percussive variance)``
    """
    harmonic, percussive = librosa.effects.hpss(y)
    harmonic_stats = (np.mean(harmonic), np.var(harmonic))
    percussive_stats = (np.mean(percussive), np.var(percussive))
    return (*harmonic_stats, *percussive_stats)


def get_tempo(y, sr):
    """
        Estimate the global tempo of a signal with ``librosa.beat.beat_track``.

        Depending on the installed librosa release, ``beat_track`` hands the
        tempo back either as a plain scalar or as an ndarray (one entry per
        channel, so a one-element array for mono input). A single-element
        result is collapsed to a scalar so the value can be placed directly
        into a flat feature vector; a multi-entry result is returned as is.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        sr: sampling rate belonging to ``y``
        @return: the tempo reported by ``beat_track`` — a scalar for a single
                 estimate, otherwise the unmodified array of estimates
    """
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo_values = np.asarray(tempo)
    if tempo_values.size == 1:
        # Covers both the 0-d (scalar) and the shape-(1,) case.
        return tempo_values.reshape(-1)[0]
    return tempo


def get_mfcc(y, sr, n_mfcc=20):
    """
        Per-coefficient mean and variance of the MFCCs of a signal.

        The MFCC matrix comes from ``librosa.feature.mfcc``; each row of the
        result (one row per coefficient) is reduced to its mean and variance.

        y: audio samples (the notebook passes the array from ``librosa.load``)
        sr: sampling rate belonging to ``y``
        n_mfcc: number of coefficients requested from librosa (default 20,
                the value this function used before it became a parameter)
        @return: tuple ``(means, variances)`` of two lists, each holding one
                 entry per row of the MFCC matrix
    """
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Reduce row by row so every coefficient gets its own statistics.
    mfcc_means = [np.mean(coefficient) for coefficient in mfccs]
    mfcc_vars = [np.var(coefficient) for coefficient in mfccs]
    return mfcc_means, mfcc_vars


In [5]:
# Quick check of get_mfcc on one sample file, loaded with librosa.load's default settings.
y, sr = librosa.load(r"./resources/1.wav")
# Prints the (means, variances) pair of lists returned by get_mfcc.
print(get_mfcc(y, sr))

([-179.10237, 67.31532, 24.43042, 25.300352, 7.930529, 12.166368, 5.7244244, 7.8207207, 3.811854, 8.40122, -0.074752845, 6.248459, 0.5148195, 1.3991385, -1.4270134, 0.66210747, -2.3945432, 3.2063277, 0.79292667, 3.2554183], [7365.68, 655.66284, 452.33438, 229.57414, 143.94186, 131.85965, 78.1151, 68.787125, 90.55072, 69.79385, 62.77555, 67.177925, 46.247295, 53.219196, 54.427555, 69.26596, 74.788025, 70.095146, 76.1646, 67.58579])


In [3]:
def get_eigenvector(path):
    """
        Build the feature vector of an audio file.

        The file is loaded with ``librosa.load`` and passed through
        ``librosa.effects.trim`` before any feature is computed. The vector
        is laid out as:

            * the shape of the trimmed signal (one entry for a 1-D signal),
            * mean and variance of: chroma STFT, RMS, spectral centroid,
              spectral bandwidth, spectral roll-off, zero-crossing rate,
            * mean and variance of the harmonic part, then of the percussive
              part,
            * the tempo,
            * for every MFCC coefficient returned by ``get_mfcc``: its mean
              followed by its variance.

        path: path of the audio file
        @return: list with the feature vector of the audio file
    """
    y, sr = librosa.load(path)
    y, _ = librosa.effects.trim(y)
    eigenvector = [
        *y.shape,
        *get_chroma_stft(y, sr),
        *get_rms(y),
        *get_spectral_centroid(y, sr),
        *get_spectral_bandwidth(y, sr),
        *get_rolloff(y, sr),
        *get_zero_crossing_rate(y),
        *get_harmony_and_perceptual(y),
        get_tempo(y, sr),
    ]
    mfcc_means, mfcc_vars = get_mfcc(y, sr)
    # Interleave mean/variance per coefficient. Iterating over the returned
    # lists (instead of a fixed count) keeps this in step with get_mfcc.
    for coeff_mean, coeff_var in zip(mfcc_means, mfcc_vars):
        eigenvector.append(coeff_mean)
        eigenvector.append(coeff_var)
    return eigenvector

In [4]:
# Build the feature vector for the sample file and print it one value per line.
path = r"./resources/1.wav"
eigenvector = get_eigenvector(path)
for e in eigenvector:
    print(e)

6225509
0.3785841
0.09234533
0.0752176
0.0007694788
2793.2194789359064
757913.8247103085
2943.622874690518
150635.3963277575
6359.840397081877
2937035.399465197
0.0987455267655222
0.003776289346881683
-1.6962967e-05
0.0038723024
4.378385e-05
0.0010846063
129.19921875
-179.10237
7365.68
67.31532
655.66284
24.43042
452.33438
25.300352
229.57414
7.930529
143.94186
12.166368
131.85965
5.7244244
78.1151
7.8207207
68.787125
3.811854
90.55072
8.40122
69.79385
-0.074752845
62.77555
6.248459
67.177925
0.5148195
46.247295
1.3991385
53.219196
-1.4270134
54.427555
0.66210747
69.26596
-2.3945432
74.788025
3.2063277
70.095146
0.79292667
76.1646
3.2554183
67.58579
