In [61]:
import pandas as pd
import numpy as np
import librosa
import math
import csv
import matplotlib.pyplot as plt

In [62]:
# Load the development and evaluation tables from CSV into DataFrames.
# Only the CSV files are read here; no audio is opened in this cell
# (audio loading happens in extract_audio_features()).
# Both names are module-level globals that main() reads directly.
# NOTE(review): `eval` shadows Python's built-in eval() for the rest of the
# notebook; the name is kept because main() refers to it.
dev = pd.read_csv('./datasets/development.csv')
eval = pd.read_csv('./datasets/evaluation.csv')

In [63]:
def clean_age(ds):
    """Keep rows with an age strictly between 14 and 91 and floor the ages.

    Rows whose ``age`` is missing fail both comparisons and are therefore
    discarded together with the out-of-range ones.

    Parameters
    ----------
    ds : pandas.DataFrame
        Table containing a numeric ``age`` column.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame whose ``age`` column holds whole years
        (``int64``). The input frame is left untouched.
    """
    in_range = (ds['age'] > 14) & (ds['age'] < 91)
    # Take an explicit copy: writing a column into a filtered slice is
    # chained assignment and raises pandas' SettingWithCopyWarning.
    ds = ds.loc[in_range].copy()
    # Vectorised floor instead of a per-row math.floor lambda.
    ds['age'] = np.floor(ds['age']).astype('int64')
    return ds

In [64]:
def _parse_tempo(value):
    """Convert one raw ``tempo`` cell to ``float``.

    String cells may be wrapped in square brackets, which are stripped before
    conversion; any other cell (numbers, NaN) goes through ``float`` as is.
    """
    if isinstance(value, str):
        return float(value.strip().lstrip('[').rstrip(']'))
    return float(value)


def clean_dataset(ds_dev,ds_eval):
    """Merge the development and evaluation tables into one cleaned frame.

    Steps: filter the development ages with ``clean_age``, stack development
    rows on top of evaluation rows, prefix every ``path`` with ``datasets/``,
    drop the ``sampling_rate``, ``min_pitch``, ``max_pitch`` and ``Id``
    columns, convert ``tempo`` to float and fix the ``'famale'`` typo in
    ``gender``.

    Parameters
    ----------
    ds_dev : pandas.DataFrame
        Development table; must contain an ``age`` column.
    ds_eval : pandas.DataFrame
        Evaluation table.

    Returns
    -------
    tuple
        ``(ds, ds_dev, ds_eval, ds_mask)`` where ``ds`` is the merged frame
        with a fresh 0..n-1 index, ``ds_dev`` is the age-filtered development
        table, ``ds_eval`` is the evaluation table exactly as received and
        ``ds_mask`` is a boolean Series, aligned with ``ds``, that is True
        where ``age`` is present.
    """
    ds_dev = clean_age(ds_dev)

    # Concatenation of dev and eval datasets (dev rows first).
    ds = pd.concat([ds_dev, ds_eval], sort=False)
    print('Length of data sets: dev, eval, concat')
    print((ds_dev.shape), (ds_eval.shape), (ds.shape))

    # Reset the index BEFORE building the mask so that the mask shares the
    # index of the returned frame and can be used directly with ds.loc[...].
    ds = ds.reset_index(drop=True)
    # Rows with a known age are flagged True.
    # NOTE(review): presumably these are exactly the development rows,
    # assuming the evaluation table carries no age values - confirm.
    ds_mask = ~ds["age"].isna()

    # Make the audio paths relative to the notebook's working directory.
    ds['path'] = 'datasets/' + ds['path']
    ds = ds.drop(columns=['sampling_rate','min_pitch', 'max_pitch', 'Id'])

    # Tempo column: parse cell by cell so NaN / already-numeric cells in an
    # object column no longer crash the conversion.
    if not pd.api.types.is_float_dtype(ds['tempo']):
        ds['tempo'] = ds['tempo'].apply(_parse_tempo).astype(float)

    # Fix the misspelled gender label present in the data.
    ds.loc[ds['gender'] == 'famale', 'gender'] = 'female'

    return ds, ds_dev,ds_eval,ds_mask

In [65]:
def extract_audio_features(file_audio):
    """Compute summary audio descriptors for a single audio file.

    The file is loaded with ``librosa.load`` using its default arguments.
    The returned dict contains, in this order:

    * ``mfcc_mean_k`` / ``mfcc_std_k`` for each of the 40 MFCC rows,
    * ``mel_mean_k`` / ``mel_std_k`` for each of the 20 rows of the
      dB-scaled mel spectrogram,
    * ``chroma_mean_k`` / ``chroma_std_k`` for each chroma row,
    * ``rolloff``, ``spectral_bandwidth`` and ``duration``.

    Every mean/std is taken along axis 1 of the corresponding feature matrix
    and the ``k`` suffix is 1-based.

    Parameters
    ----------
    file_audio : path accepted by ``librosa.load``

    Returns
    -------
    dict
        Feature name -> scalar value.
    """
    signal, rate = librosa.load(file_audio)

    # Per-coefficient statistics of the MFCC matrix.
    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
    mfcc_avg = np.mean(mfcc_matrix, axis=1)
    mfcc_dev = np.std(mfcc_matrix, axis=1)

    # Mel spectrogram converted to dB relative to its own maximum.
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=20),
        ref=np.max,
    )
    mel_avg = np.mean(mel_db, axis=1)
    mel_dev = np.std(mel_db, axis=1)

    # Chroma statistics.
    chroma_matrix = librosa.feature.chroma_stft(y=signal, sr=rate)
    chroma_avg = np.mean(chroma_matrix, axis=1)
    chroma_dev = np.std(chroma_matrix, axis=1)

    # Insertion order matters: it becomes the column order downstream.
    features = {}

    for i, (avg, dev) in enumerate(zip(mfcc_avg, mfcc_dev)):
        features[f'mfcc_mean_{i+1}'] = avg
        features[f'mfcc_std_{i+1}'] = dev

    for i, (avg, dev) in enumerate(zip(mel_avg, mel_dev)):
        features[f'mel_mean_{i+1}'] = avg
        features[f'mel_std_{i+1}'] = dev

    for i, (avg, dev) in enumerate(zip(chroma_avg, chroma_dev)):
        features[f'chroma_mean_{i+1}'] = avg
        features[f'chroma_std_{i+1}'] = dev

    # Global scalar descriptors, averaged over all frames.
    features['rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=signal, sr=rate))
    features['spectral_bandwidth'] = np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=rate))
    features['duration'] = librosa.get_duration(y=signal, sr=rate)

    return features

In [66]:
def test_extract(path):
    f = extract_audio_features(path)
    return pd.DataFrame([f])

In [67]:
def extract_features(df):
    """Append the audio features of every row's ``path`` as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``path`` column holding audio file paths.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by one column per
        feature returned by ``extract_audio_features``. ``df`` is not
        modified and its index is preserved.
    """
    feature_list = df['path'].apply(extract_audio_features)

    # Reuse df's index: a default RangeIndex would make the column-wise
    # concat align on mismatching labels and pad both sides with NaN rows
    # whenever df is not indexed 0..n-1.
    features_df = pd.DataFrame(feature_list.to_list(), index=df.index)

    return pd.concat([df, features_df], axis=1)

In [68]:
def clean_path(df):
    """Return a copy of ``df`` without its ``path`` column.

    The input frame is not modified; a ``KeyError`` is raised by pandas if
    the column is absent.
    """
    return df.drop(columns=['path'])
    

In [69]:
def hot_encoding(df,to_encode):
    """One-hot encode the listed columns with ``pandas.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table (not modified).
    to_encode : list of str
        Names of the columns to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame; its shape is also printed.
    """
    encoded = pd.get_dummies(df, columns=to_encode)
    print("Shape: dumm ", encoded.shape)
    return encoded

In [70]:
def outliers_detection(df_dumm,df_mask, n=15, n_std=3):
    """Drop development rows that are outliers on many columns at once.

    For every column, limits are computed on the development rows only as
    ``mean +/- n_std * std``. A development row is removed when more than
    ``n`` of its values lie above the upper limits, or more than ``n`` lie
    below the lower limits. Evaluation rows are never removed.

    Parameters
    ----------
    df_dumm : pandas.DataFrame
        Fully numeric table holding development and evaluation rows.
    df_mask : array-like of bool
        One flag per row of ``df_dumm``; True marks a development row.
    n : int, default 15
        Number of out-of-limit columns a row must exceed to be dropped.
    n_std : float, default 3
        Width of the accepted band, in standard deviations.

    Returns
    -------
    tuple
        ``(df_dumm, df_dev, df_mask)``: the filtered table with development
        rows first, then evaluation rows, and a fresh 0..n-1 index; the kept
        development rows with their original index labels; and a boolean
        NumPy array aligned row by row with the returned ``df_dumm``.
    """
    # Pass the shape positionally: the `newshape` keyword of np.reshape is
    # deprecated in recent NumPy releases.
    df_mask = np.reshape(np.asarray(df_mask), df_dumm.shape[0])
    df_dev = df_dumm.loc[df_mask, :]
    df_eval = df_dumm.loc[~df_mask, :]

    print('Dev iniziale:', df_dev.shape)
    print('Eval iniziale:', df_eval.shape)

    # Per-column acceptance band, estimated on the development rows only.
    center = df_dev.mean(axis=0)
    spread = n_std * df_dev.std(axis=0)
    upper_limit = center + spread
    lower_limit = center - spread

    # Rows exceeding the band on more than n columns, on either side.
    too_high = (df_dev > upper_limit).sum(axis=1) > n
    print('Righe droppate up:', int(too_high.sum()))

    too_low = (df_dev < lower_limit).sum(axis=1) > n
    print('Righe droppate down:', int(too_low.sum()))

    print('Mask iniziale',len(df_mask))

    df_dev = df_dev.loc[~(too_high | too_low)]

    # Rebuild the mask from the final layout (dev rows, then eval rows)
    # instead of deleting entries by index label: the result is aligned with
    # the returned frame whatever the input index or row order was.
    df_mask = np.concatenate((
        np.ones(len(df_dev), dtype=bool),
        np.zeros(len(df_eval), dtype=bool),
    ))

    print('Dev e mask finale', df_dev.shape, len(df_mask))

    df_dumm = pd.concat([df_dev, df_eval], sort=False)
    print('Dumm final:', df_dumm.shape)

    df_dumm = df_dumm.reset_index(drop=True)

    return df_dumm, df_dev, df_mask



In [71]:
#cleaning, preprocessing, set
def main():
    """Run the whole preprocessing pipeline and persist its outputs.

    Reads the module-level ``dev`` and ``eval`` tables, then: cleans and
    merges them, extracts the audio features, drops the ``path`` column,
    one-hot encodes ``ethnicity`` and ``gender`` and removes outlier
    development rows. Four CSV files are written to ``./data_completi``
    (the directory is created when missing).

    Returns
    -------
    tuple
        ``(df_dumm, mask, df_dev, df_eval)``: the full processed table, a
        1-D boolean array flagging its development rows, the processed
        development rows, and the evaluation table as returned by
        ``clean_dataset``.
    """
    import os  # local import: only needed to create the output directory

    df,df_dev,df_eval,df_mask=clean_dataset(dev,eval)

    df= extract_features(df)

    df=clean_path(df)

    df_dumm=hot_encoding(df,["ethnicity","gender"])

    df_dumm, df_dev, df_mask = outliers_detection(df_dumm,df_mask)

    # to_csv does not create missing parent directories.
    os.makedirs('./data_completi', exist_ok=True)

    df_dumm.to_csv('./data_completi/df_tot.csv', index=False, header=True)
    df_dev.to_csv('./data_completi/dev.csv', index=False, header=True)
    # NOTE(review): df_eval is still the table returned by clean_dataset,
    # i.e. without extracted features or encoding, unlike df_dev above -
    # confirm this is what eval.csv is meant to contain.
    df_eval.to_csv('./data_completi/eval.csv', index=False, header=True)
    pd.DataFrame(df_mask).to_csv('./data_completi/mask.csv', index=False)

    # Flatten to 1-D; the shape is passed positionally because the
    # `newshape` keyword of np.reshape is deprecated in recent NumPy.
    mask = np.reshape(np.asarray(df_mask), df_dumm.shape[0])

    return df_dumm, mask, df_dev, df_eval

Code for generating a bar chart (log-scaled counts) of the distribution of audio file durations, in seconds. The cell below is currently commented out.

In [72]:
# list_duration = []
# for i in range(1,2900):
#     d = test_extract(f'./datasets/audios_development/{i}.wav')['duration']
#     list_duration.append(d)

# s = pd.DataFrame([i[0].round() for i in list_duration], columns=['duration'])
# a = s.groupby('duration')['duration'].count()
# a.plot(kind='bar', figsize=(10, 6));
# plt.yscale("log")
# plt.ylabel('Count [log scale]');
# plt.xlabel('Duration [s]')
# ticks = plt.gca().get_xticks()  
# tens_ticks = [tick for tick in ticks if tick % 10 == 0]  
# plt.xticks(tens_ticks, rotation=0)  
