# Pipeline Evaluation

We noticed that the output of the drum_extraction function varies slightly from run to run, which in turn affects the dataframe produced by the drum_to_frame function. It is impossible to provide manually created labels for every possible outcome of the drum_to_frame function. Therefore, we decided to pre-process 2 songs, convert them into dataframes using the drum_to_frame function, and manually transcribe them for evaluation.

To avoid potential copyright issues, we have also hidden the song names and labeled the songs song_1 and song_2.

In [1]:
# To begin evaluation, we first need to load the dataframes from pickles.
# Each pickle holds one pre-processed song; evaluate() reads its `audio_clip`
# column, and the reports further down read its `*_T` ground-truth label columns.
import pandas as pd
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
df_song_1=pd.read_pickle('../model_development/song_1.pkl')
df_song_2=pd.read_pickle('../model_development/song_2.pkl')

In [2]:
# Load the pre-trained Keras model once. evaluate() reads this notebook-level
# `model` variable when it makes predictions, so this cell must run first.
from tensorflow import keras
model = keras.models.load_model('../inference/pretrained_models/annoteators/complete_network.h5')

In [3]:
# Define a function that converts each clip in df.audio_clip into a mel spectrogram and makes the prediction
import librosa
import numpy as np

def evaluate(df, song_sampling_rate):
    """Predict the drum hits for every audio clip in *df*.

    Each clip in ``df.audio_clip`` is converted into a mel spectrogram
    (128 mel bands, ``fmax=8000``) and passed to the notebook-level ``model``.
    The raw model outputs are rounded to 0/1. When every output of a row
    rounds to 0, the class with the highest raw output is set to 1 instead,
    so each row ends up with at least one predicted hit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``audio_clip`` column. The input is not modified.
        NOTE(review): the clips presumably all have the same length, since
        their spectrograms are stacked into one array -- confirm.
    song_sampling_rate : int
        Sampling rate of the clips, forwarded to librosa as ``sr``.

    Returns
    -------
    pandas.DataFrame
        The rows of *df* in their original order, re-indexed 0..n-1, with the
        prediction columns ``SD, HH, KD, RC, TT, CC`` appended.
    """
    drum_hits = ['SD', 'HH', 'KD', 'RC', 'TT', 'CC']

    # Predictions come back in row order, so align them with the clips by
    # position. Dropping the caller's index (instead of merging on its values)
    # keeps every row even when df was filtered, re-ordered, has a named
    # index, or already contains a column called 'index'.
    clips = df.reset_index(drop=True)

    if clips.empty:
        # Nothing to predict: return the frame with empty prediction columns
        # rather than failing while reshaping an empty spectrogram array.
        no_rows = pd.DataFrame(np.zeros((0, len(drum_hits))), columns=drum_hits)
        return clips.merge(no_rows, left_index=True, right_index=True)

    spectrograms = [
        librosa.feature.melspectrogram(y=clip, sr=song_sampling_rate,
                                       n_mels=128, fmax=8000)
        for clip in clips.audio_clip
    ]
    # Append a trailing channel axis: (clips, mels, frames) -> (clips, mels, frames, 1).
    X = np.array(spectrograms)[..., np.newaxis]

    pred_raw = model.predict(X)
    pred = np.round(pred_raw)

    # Rows where nothing crossed the rounding threshold fall back to the
    # single most likely class.
    silent_rows = np.flatnonzero(pred.sum(axis=1) == 0)
    pred[silent_rows, pred_raw[silent_rows].argmax(axis=1)] = 1

    prediction = pd.DataFrame(pred, columns=drum_hits)
    return clips.merge(prediction, left_index=True, right_index=True)

In [4]:
# Function to compute the exact and partial match scores.
# "Exact match" means the prediction must match all of the true labels exactly, regardless of whether
# they are multi-labeled or single-labeled.
# "Partial match" means the prediction only needs to match at least one of the true labels.

def customized_metric(df):
    """Print the exact-match and partial-match rates of the predictions in *df*.

    For every row, the set of true drum hits is read from the columns
    ``SD_T, HH_T, KD_T, RC_T, TT_T, CC_T`` and the set of predicted hits from
    ``SD, HH, KD, RC, TT, CC``; a hit is active when its column holds 1.

    * exact match   -- the predicted set equals the true set.
    * partial match -- the predicted set shares at least one hit with the
      true set.

    Both rates are printed rounded to two decimals (``nan`` for an empty
    frame). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the twelve 0/1 indicator columns listed above.

    Returns
    -------
    None
    """
    drum_hits = ['SD', 'HH', 'KD', 'RC', 'TT', 'CC']
    truth_columns = [f'{hit}_T' for hit in drum_hits]

    def active_hits(flags):
        # Build the label set directly from the indicators. This avoids the
        # chained in-place ``replace`` (which does not write back to the frame
        # under pandas Copy-on-Write) and needs no ``remove(0)``, which raised
        # KeyError for a row with all six hits set.
        return {hit for hit, flag in zip(drum_hits, flags) if flag == 1}

    true_sets = [active_hits(row) for row in df[truth_columns].to_numpy()]
    pred_sets = [active_hits(row) for row in df[drum_hits].to_numpy()]

    total = len(true_sets)
    exact_count = sum(true == pred for true, pred in zip(true_sets, pred_sets))
    partial_count = sum(bool(true & pred) for true, pred in zip(true_sets, pred_sets))

    # Plain fractions instead of ``value_counts()[True]``, which raised
    # KeyError whenever no row matched at all.
    exact = exact_count / total if total else float('nan')
    partial = partial_count / total if total else float('nan')

    print(f"exact match: {round(exact, 2)}")
    print(f"partial match: {round(partial, 2)}")

    return None

In [5]:
# Score the model on song_1: per-class precision / recall / F1 from sklearn's
# classification_report, followed by our own exact / partial match scores.
from sklearn.metrics import classification_report

labels = ['SD', 'HH', 'KD', 'RC', 'TT', 'CC']
truth_columns = [f'{label}_T' for label in labels]

pred_df = evaluate(df_song_1, 44100)

print('song_1')
song_report = classification_report(
    pred_df[truth_columns],   # manually transcribed labels
    pred_df[labels],          # model predictions
    target_names=labels,
    zero_division=0,          # report 0 instead of warning for classes with no samples
)
print(song_report)

print('song_1')
customized_metric(pred_df)

song_1
              precision    recall  f1-score   support

          SD       0.50      0.49      0.50        59
          HH       0.90      0.60      0.72       316
          KD       0.94      0.74      0.83       164
          RC       0.00      0.00      0.00         1
          TT       0.55      0.60      0.57        40
          CC       0.00      0.00      0.00         3

   micro avg       0.82      0.63      0.71       583
   macro avg       0.48      0.41      0.44       583
weighted avg       0.84      0.63      0.71       583
 samples avg       0.82      0.68      0.72       583

song_1
exact match: 0.52
partial match: 0.83


In [6]:
# Repeat the same evaluation for song_2: the classification_report table
# first, then the exact / partial match scores.

labels = ['SD', 'HH', 'KD', 'RC', 'TT', 'CC']
truth_columns = [f'{label}_T' for label in labels]

pred_df = evaluate(df_song_2, 44100)

print('song_2')
song_report = classification_report(
    pred_df[truth_columns],   # manually transcribed labels
    pred_df[labels],          # model predictions
    target_names=labels,
    zero_division=0,          # report 0 instead of warning for classes with no samples
)
print(song_report)

print('song_2')
customized_metric(pred_df)

song_2
              precision    recall  f1-score   support

          SD       0.88      0.69      0.77       212
          HH       0.61      0.14      0.23       241
          KD       0.91      0.89      0.90       313
          RC       0.00      0.00      0.00         0
          TT       0.07      0.29      0.11        14
          CC       0.00      0.00      0.00       233

   micro avg       0.79      0.46      0.58      1013
   macro avg       0.41      0.33      0.34      1013
weighted avg       0.61      0.46      0.50      1013
 samples avg       0.80      0.47      0.58      1013

song_2
exact match: 0.13
partial match: 0.81
