-
Notifications
You must be signed in to change notification settings - Fork 1
/
mfcc_classifying_heartbeat_features_mfcc_.py
364 lines (283 loc) · 12.1 KB
/
mfcc_classifying_heartbeat_features_mfcc_.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# -*- coding: utf-8 -*-
"""MFCC_Classifying_heartbeat_features_MFCC .ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1TWrCBVgA5ZTuZtmjfNVyGrr6eQjhS2D9
Details and Resources of the project in the doc:
https://docs.google.com/document/d/1F-KYQ5nRnDAUVDUrEbTFYGn7TEVsT8EByuM9xJLT4gA/edit
"""
# Commented out IPython magic to ensure Python compatibility.
import warnings # To ignore any warnings
# Globally silence all warnings (librosa/pandas deprecation chatter in Colab).
warnings.filterwarnings("ignore")
# %matplotlib inline
# NOTE(review): %pylab used to inject numpy as `np` into the namespace; with
# the magic disabled, the script needs an explicit `import numpy as np`.
# %pylab inline
import glob
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# %config InlineBackend.figure_format = 'retina'
from google.colab import drive
# Mount Google Drive so the dataset under INPUT_DIR is readable.
drive.mount("/content/drive")
# Root folder of the heartbeat-sound dataset on the mounted Drive.
INPUT_DIR = '/content/drive/MyDrive/AI ML Things/University of Turku Research Internship'
# Disabled scratch cell: earlier path wiring based on a `base_path` variable.
'''
set_a_path=base_path+'/set_a'
set_a_metadata_path=base_path+'/set_a.csv'
set_b_path=base_path+'/set_b'
set_b_metadata_path=base_path+'/set_b.csv'
set_a_metadata = pd.read_csv(set_a_metadata_path)
'''
dataset_path = INPUT_DIR+'/set_a'  # directory containing the set-a WAV files
metadata = pd.read_csv(INPUT_DIR+'/set_a.csv')  # set-a metadata table
SAMPLE_RATE = 16000  # Hz, sample rate used for feature extraction below
# seconds
MAX_SOUND_CLIP_DURATION=12  # clips are padded/truncated to this many seconds
"""#Explorer data"""
set_a=pd.read_csv(INPUT_DIR+"/set_a.csv")
set_a.head()
set_a_timing=pd.read_csv(INPUT_DIR+"/set_a_timing.csv")
set_a_timing.head()
set_b=pd.read_csv(INPUT_DIR+"/set_b.csv")
set_b.head()
#merging both set-a and set-b
frames = [set_a, set_b]
train_ab=pd.concat(frames)
train_ab.describe()
#checking for duplicates
nb_classes=train_ab.label.unique()
print("Number of training examples=", train_ab.shape[0], " Number of classes=", len(train_ab.label.unique()))
print (nb_classes)
"""Note: 'nan' indicate unclassified and unlabel test files"""
# visualize data distribution by category
category_group = train_ab.groupby(['label','dataset']).count()
plot = category_group.unstack().reindex(category_group.unstack().sum(axis=1).sort_values().index)\
.plot(kind='bar', stacked=True, title="Number of Audio Samples per Category", figsize=(16,5))
plot.set_xlabel("Category")
plot.set_ylabel("Samples Count");
print('Min samples per category = ', min(train_ab.label.value_counts()))
print('Max samples per category = ', max(train_ab.label.value_counts()))
print('Minimum samples per category = ', min(train_ab.label.value_counts()))
print('Maximum samples per category = ', max(train_ab.label.value_counts()))
"""##Exploring each category individually"""
"""#Extracting Features of Data in Audio Domain
##Sound Feature: MFCCs
"""
# Checking an example generate mfccs from a audio file
example_file=INPUT_DIR+"/set_a/normal__201106111136.wav"
#y, sr = librosa.load(sample_file, offset=7, duration=7)
y, sr = librosa.load(example_file)
mfccs = librosa.feature.mfcc(y=y, sr=sr)
print (mfccs)
mfccs.shape
# Get more components
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
mfccs.shape
"""#Loading Data"""
print("Number of training examples=", train_ab.shape[0], " Number of classes=", len(train_ab.label.unique()))
def audio_norm(data, eps=1e-4):
    """Min-max normalize an audio array, centered around zero.

    Maps the input range [min, max] to approximately [-0.5, 0.5).

    Parameters
    ----------
    data : np.ndarray
        Raw audio samples.
    eps : float, optional
        Small constant added to the denominator to avoid division by zero
        on constant (silent) input. Defaults to 1e-4, matching the
        original hard-coded 0.0001.

    Returns
    -------
    np.ndarray
        Normalized samples.
    """
    max_data = np.max(data)
    min_data = np.min(data)
    return (data - min_data) / (max_data - min_data + eps) - 0.5
# get audio data without padding: highest-quality audio at native length
def load_file_data_without_change(folder, file_names, duration=3, sr=16000):
    """Extract mean MFCC features from WAV files at their native length.

    Parameters
    ----------
    folder : str
        Directory prefix (must end with '/'), concatenated with each name.
    file_names : list[str]
        WAV file names to load.
    duration, sr : int
        Kept for interface compatibility; not used on this no-padding path.

    Returns
    -------
    list[np.ndarray]
        One (40, 1) array per successfully loaded file.

    Fixes vs. the original: the error path referenced the undefined name
    `file` (NameError), and a stale/undefined `mfccs` was appended for
    files that failed to load; failed files are now skipped.
    """
    data = []
    for file_name in file_names:
        sound_file = folder + file_name
        print("load file ", sound_file)
        try:
            # use kaiser_fast technique for faster extraction
            X, sr = librosa.load(sound_file, res_type='kaiser_fast')
            # Mean over time frames -> one 40-dim summary vector per clip.
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0)
        except Exception as e:
            print("Error encountered while parsing file: ", file_name)
            continue
        data.append(np.array(mfccs).reshape([-1, 1]))
    return data
# get audio data with fixed padding; may also chop off the end of long files
def load_file_data(folder, file_names, duration=12, sr=16000):
    """Extract mean MFCC features with every clip forced to `duration` seconds.

    Long clips are truncated by librosa at load time (duration=); short
    clips are zero-padded to ``sr * duration`` samples before MFCC
    extraction so all inputs share one length.

    Parameters
    ----------
    folder : str
        Directory prefix (must end with '/'), concatenated with each name.
    file_names : list[str]
        WAV file names to load.
    duration : int
        Target clip length in seconds.
    sr : int
        Target sample rate for loading.

    Returns
    -------
    list[np.ndarray]
        One (40, 1) array per successfully loaded file.

    Fixes vs. the original: (1) the padded signal is actually used for
    MFCC extraction (the original padded into `y` but computed MFCCs from
    the unpadded `X`, making the padding dead code); (2) the error path no
    longer references the undefined name `file`; (3) failed files are
    skipped instead of appending a stale feature.
    """
    input_length = sr * duration
    data = []
    for file_name in file_names:
        sound_file = folder + file_name
        print("load file ", sound_file)
        try:
            # use kaiser_fast technique for faster extraction
            X, sr = librosa.load(sound_file, sr=sr, duration=duration, res_type='kaiser_fast')
            dur = librosa.get_duration(y=X, sr=sr)
            # pad audio file to the same duration
            if round(dur) < duration:
                print("fixing audio length :", file_name)
                X = librosa.util.fix_length(X, size=input_length)
            # extract mfcc feature from (possibly padded) data
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0)
        except Exception as e:
            print("Error encountered while parsing file: ", file_name)
            continue
        data.append(np.array(mfccs).reshape([-1, 1]))
    return data
# load dataset-a, keep them separate for testing purpose
# Label scheme used below: artifact=0, extrahls=1, murmur=1, normal=2,
# unlabelled test files=-1.
import os, fnmatch
A_folder=INPUT_DIR+'/set_a/'
# set-a
A_artifact_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'artifact*.wav')
A_artifact_sounds = load_file_data(folder=A_folder,file_names=A_artifact_files, duration=MAX_SOUND_CLIP_DURATION)
A_artifact_labels = [0 for items in A_artifact_files]
# NOTE(review): labels are sometimes derived from the *_files list and
# sometimes from the *_sounds list; these only have equal lengths when
# every file loads successfully — confirm load_file_data returns one
# feature per input file before relying on label/feature alignment.
A_normal_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'normal*.wav')
A_normal_sounds = load_file_data(folder=A_folder,file_names=A_normal_files, duration=MAX_SOUND_CLIP_DURATION)
A_normal_labels = [2 for items in A_normal_sounds]
A_extrahls_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'extrahls*.wav')
A_extrahls_sounds = load_file_data(folder=A_folder,file_names=A_extrahls_files, duration=MAX_SOUND_CLIP_DURATION)
A_extrahls_labels = [1 for items in A_extrahls_sounds]
A_murmur_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'murmur*.wav')
A_murmur_sounds = load_file_data(folder=A_folder,file_names=A_murmur_files, duration=MAX_SOUND_CLIP_DURATION)
A_murmur_labels = [1 for items in A_murmur_files]
# test files
A_unlabelledtest_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'Aunlabelledtest*.wav')
A_unlabelledtest_sounds = load_file_data(folder=A_folder,file_names=A_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION)
A_unlabelledtest_labels = [-1 for items in A_unlabelledtest_sounds]
print ("loaded dataset-a")
# Commented out IPython magic to ensure Python compatibility.
# %%time
# # load dataset-b, keep them separate for testing purpose
# B_folder=INPUT_DIR+'/set_b/'
# # set-b
# B_normal_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'normal*.wav') # include noisy files
# B_normal_sounds = load_file_data(folder=B_folder,file_names=B_normal_files, duration=MAX_SOUND_CLIP_DURATION)
# B_normal_labels = [2 for items in B_normal_sounds]
#
# B_murmur_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'murmur*.wav') # include noisy files
# B_murmur_sounds = load_file_data(folder=B_folder,file_names=B_murmur_files, duration=MAX_SOUND_CLIP_DURATION)
# B_murmur_labels = [1 for items in B_murmur_files]
#
# B_extrastole_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'extrastole*.wav')
# B_extrastole_sounds = load_file_data(folder=B_folder,file_names=B_extrastole_files, duration=MAX_SOUND_CLIP_DURATION)
# B_extrastole_labels = [1 for items in B_extrastole_files]
#
# #test files
# B_unlabelledtest_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'Bunlabelledtest*.wav')
# B_unlabelledtest_sounds = load_file_data(folder=B_folder,file_names=B_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION)
# B_unlabelledtest_labels = [-1 for items in B_unlabelledtest_sounds]
# print ("loaded dataset-b")
#combine set-a and set-b
# NOTE(review): the set-b loading cell above is commented out, so the B_*
# variables are undefined unless that cell is re-enabled. The original code
# always crashed here with NameError; fall back to set-a-only data instead.
try:
    x_data = np.concatenate((A_artifact_sounds, A_normal_sounds, A_extrahls_sounds, A_murmur_sounds,
                             B_normal_sounds, B_murmur_sounds, B_extrastole_sounds))
    y_data = np.concatenate((A_artifact_labels, A_normal_labels, A_extrahls_labels, A_murmur_labels,
                             B_normal_labels, B_murmur_labels, B_extrastole_labels))
    test_x = np.concatenate((A_unlabelledtest_sounds, B_unlabelledtest_sounds))
    test_y = np.concatenate((A_unlabelledtest_labels, B_unlabelledtest_labels))
except NameError:
    # set-b not loaded; use set-a only
    x_data = np.concatenate((A_artifact_sounds, A_normal_sounds, A_extrahls_sounds, A_murmur_sounds))
    y_data = np.concatenate((A_artifact_labels, A_normal_labels, A_extrahls_labels, A_murmur_labels))
    test_x = np.asarray(A_unlabelledtest_sounds)
    test_y = np.asarray(A_unlabelledtest_labels)
print ("combined training data record: ",len(y_data), len(test_y))
# NOTE(review): the triple-quoted blocks below are disabled scratch cells
# (evaluated as no-op string expressions). Kept for reference: a
# train/validation/test split via train_test_split, and array-shape
# diagnostics for x_data / y_data.
'''
# shuffle - whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None.
# random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.
seed = 1000
# split data into Train, Validation and Test
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size=0.9, random_state=seed, shuffle=True)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=seed, shuffle=True)
'''
'''
print ("label shape: ", y_data.shape)
print ("data size of the array: : %s" % y_data.size)
print ("length of one array element in bytes: ", y_data.itemsize)
print ("total bytes consumed by the elements of the array: ", y_data.nbytes)
print (y_data[1])
print ("")
print ("audio data shape: ", x_data.shape)
print ("data size of the array: : %s" % x_data.size)
print ("length of one array element in bytes: ", x_data.itemsize)
print ("total bytes consumed by the elements of the array: ", x_data.nbytes)
#print (x_data[1])
print ("")
print ("training data shape: ", x_train.shape)
print ("training label shape: ", y_train.shape)
print ("")
print ("validation data shape: ", x_val.shape)
print ("validation label shape: ", y_val.shape)
print ("")
print ("test data shape: ", x_test.shape)
print ("test label shape: ", y_test.shape)
'''
# Persist the extracted feature array for later reuse.
np.save('x_data.npy', x_data)
# The original line here was a bare `pwd` — an IPython line magic that is a
# NameError in plain Python. Print the working directory explicitly instead.
print(os.getcwd())
x_data.shape
# NOTE(review): everything below consists of disabled scratch cells —
# commented-out lines and no-op triple-quoted strings from earlier
# experiments (waveform plotting with `lr`/librosa, a padded-MFCC
# extract_features variant, and DataFrame label wrangling). None of it
# executes; kept verbatim for reference.
#df_1 = pd.read_csv(set_a_metadata_path)
#df_1.head()
'''
audio_files_set_a = glob(base_path+'/set_a/*.wav')
len(audio_files_set_a)
'''
'''
audio, sfreq = lr.load(audio_files_set_a[0])
time = np.arange(0,len(audio))/sfreq
time
'''
'''
fig, ax = plt.subplots()
ax.plot(time,audio)
'''
'''
audio, sfreq = lr.load(audio_files_set_a[1])
time = np.arange(0,len(audio))/sfreq
fig, ax = plt.subplots()
ax.plot(time,audio)
'''
'''
import glob
os.chdir(dataset_path)
MAX_LEN_MFCC = []
for file in glob.glob("*.wav"):
#print(file)
#extract_features(file)
audio,sample_rate = lr.load(file, res_type='kaiser_fast')
mfccs = lr.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
#print(mfccs.shape[1])
MAX_LEN_MFCC.append(mfccs.shape[1])
print(max(MAX_LEN_MFCC))
'''
'''
max_pad_len = 388
def extract_features(file_name):
try:
audio, sample_rate = lr.load(file_name, res_type='kaiser_fast')
mfccs = lr.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
pad_width = max_pad_len - mfccs.shape[1]
mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
except Exception as e:
print("Error encountered while parsing file: ", file_name)
return None
return mfccs
'''
'''
for index, row in metadata.iterrows():
file_name = os.path.join(os.path.abspath(dataset_path),str(row['fname']))
class_label = row['fname']
data = extract_features(file_name)
features.append([data,class_label])
'''
'''
DATAX = pd.DataFrame(features, columns=['feature','class_label'])
print('Finished feature extraction from ', len(DATAX), ' files')
'''
'''
str(DATAX['class_label'][0]).split('__')[0]
LABEL = []
for i in range(len(DATAX)):
str(DATAX['class_label'][i]).split('__')[0]
LABEL.append(str(DATAX['class_label'][i]).split('__')[0])
'''
'''
DATAX['LABEL'] = LABEL
NEW_DATAX = DATAX.drop(columns=['class_label'])
#NEW_DATAX.to_csv(r'NEW_DATAX.csv')
NEW_DATAX.head(10)
'''