In [68]:
import tensorflow as tf
import numpy as np
import os
import librosa
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [69]:
def load_and_process_audio(file_path):
    """Return the decibel-scaled Mel spectrogram of a single audio file.

    The file is read with ``librosa.load`` at a requested sample rate of
    22050 Hz, converted to a 128-band Mel spectrogram (FFT size 2048,
    hop length 512) and finally passed through ``librosa.power_to_db``
    with ``ref=np.max``.

    Args:
        file_path: Location of the audio file; handed to ``librosa.load`` as is.

    Returns:
        The result of ``librosa.power_to_db`` for the file's Mel spectrogram.
    """
    waveform, rate = librosa.load(file_path, sr=22050)
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=rate,
        n_fft=2048,
        hop_length=512,
        n_mels=128,
    )
    return librosa.power_to_db(mel_power, ref=np.max)

In [70]:
# Directory that get_audio_data() walks recursively; the name of the folder
# holding each file is used as that file's genre label.
root_dir = "../Data"
# NOTE(review): audio_data is not referenced by any other cell shown in this
# notebook -- confirm it is unused before removing it.
audio_data = []
# Fitted later on the per-clip genre names to turn them into integer class ids.
label_encoder = LabelEncoder()

In [71]:
def get_audio_data(path_name, extensions=None):
    """Collect every file below ``path_name`` together with its genre label.

    The genre of a file is the name of the directory that directly contains
    it. Directories and file names are visited in sorted order so the result
    is identical on every run and every filesystem (``os.walk`` on its own
    returns entries in an arbitrary, OS-dependent order).

    Args:
        path_name: Root directory to walk recursively.
        extensions: Optional file-name suffix (``".wav"``) or iterable of
            suffixes. When given, only files whose name ends with one of
            them (case-insensitive) are returned. ``None`` (the default)
            keeps every file.

    Returns:
        A ``(paths, genres)`` tuple of two equally long lists: the full path
        of each file and the genre label at the same index. Both lists are
        empty when ``path_name`` contains no (matching) files or does not
        exist.
    """
    if extensions is not None:
        if isinstance(extensions, str):
            extensions = (extensions,)
        # str.endswith accepts a tuple of candidate suffixes.
        extensions = tuple(ext.lower() for ext in extensions)

    paths, genres = [], []
    for root, dirs, files in os.walk(path_name):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        genre = os.path.basename(root)
        for name in sorted(files):
            if extensions is not None and not name.lower().endswith(extensions):
                continue
            paths.append(os.path.join(root, name))
            genres.append(genre)
    return paths, genres

In [72]:
# Walk root_dir and collect every file path together with its genre label
# (the name of the folder that contains the file).
paths, gens = get_audio_data(root_dir)

In [91]:
def split_songs(audio_paths, genres, max_length=78, window=0.06, overlap=0.3):
    """Cut each track into overlapping clips and convert them to dB Mel spectrograms.

    Every track is loaded with ``librosa.load`` (no ``sr`` is passed, so
    librosa's default sample rate applies) and sliced into clips whose length
    is ``window`` times the track length; consecutive clips overlap by the
    ``overlap`` fraction. Each full-length clip becomes a 128-band Mel
    spectrogram in decibels (``ref=np.max``) that is padded or truncated
    along the time axis to exactly ``max_length`` frames.

    Args:
        audio_paths: Sized sequence of audio file paths.
        genres: Genre label for each entry of ``audio_paths`` (same order).
        max_length: Number of time frames every returned spectrogram has.
        window: Clip length as a fraction of the track length, in ``(0, 1]``.
        overlap: Fraction of a clip shared with the next clip, in ``[0, 1)``.

    Returns:
        ``(spectrograms, clip_genres)``: a list of 2-D arrays with
        ``max_length`` columns and a list holding the genre of the track each
        clip came from, index-aligned with the spectrogram list.

    Raises:
        ValueError: If ``window`` or ``overlap`` lies outside its valid range.
    """
    if not 0 < window <= 1:
        raise ValueError(f"window must be in (0, 1], got {window!r}")
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must be in [0, 1), got {overlap!r}")

    split_spects_mel_db = []
    split_genres = []

    for path, genre in tqdm(zip(audio_paths, genres), total=len(audio_paths), desc='Processing Audio Files'):
        audio, sample_rate = librosa.load(path)
        audio_shape = audio.shape[0]
        chunk = int(audio_shape * window)
        offset = int(chunk * (1 - overlap))

        # An empty or very short track yields a zero-length clip or a zero
        # step; range() would raise on step 0, so skip the track instead.
        if chunk <= 0 or offset <= 0:
            continue

        # Only start positions that leave room for a complete clip are
        # visited, so no partial tail clip is ever produced.
        for start in range(0, audio_shape - chunk + 1, offset):
            sample = audio[start:start + chunk]

            mel_spec = librosa.feature.melspectrogram(y=sample, sr=sample_rate, n_fft=2048, hop_length=512, n_mels=128)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

            # Force a uniform width of max_length frames.
            n_frames = mel_spec_db.shape[1]
            if n_frames < max_length:
                # With ref=np.max, 0 dB is the LOUDEST value in the clip, so
                # padding with zeros would append full-energy frames. Pad with
                # the clip's quietest value so the padding reads as silence.
                mel_spec_db = np.pad(
                    mel_spec_db,
                    pad_width=((0, 0), (0, max_length - n_frames)),
                    mode='constant',
                    constant_values=mel_spec_db.min(),
                )
            else:
                mel_spec_db = mel_spec_db[:, :max_length]

            split_spects_mel_db.append(mel_spec_db)
            split_genres.append(genre)

    return split_spects_mel_db, split_genres

In [92]:
# Cut every track into overlapping clips and turn each clip into a dB Mel
# spectrogram; gens2 carries one genre label per clip, index-aligned with spects.
spects, gens2 = split_songs(paths, gens)

Processing Audio Files: 100%|██████████| 1000/1000 [01:52<00:00,  8.90it/s]


In [93]:
# Total number of spectrogram clips returned by split_songs.
l = len(spects)

In [94]:
# 90 % of the clips are earmarked for training, the remainder for testing.
tr = 0.90 * l
ts = l - tr
# The same two amounts expressed in batches of 50 clips.
tr1, ts1 = tr / 50, ts / 50
print(tr, ts, tr1, ts1)

20700.0 2300.0 414.0 46.0


In [95]:
# Stack the per-clip genre names and spectrograms into NumPy arrays.
g1 = np.array(gens2)
s1 = np.array(spects)
# Turn the genre names into integer class ids.
genres_encoded = label_encoder.fit_transform(g1)
dataset = tf.data.Dataset.from_tensor_slices((s1, genres_encoded))
# Shuffle with a buffer as large as the dataset, but only ONCE. The train and
# test subsets are carved out of this dataset with take()/skip(); with the
# default reshuffle_each_iteration=True every new iteration draws a new order,
# so clips would migrate between the train and test subsets (data leakage).
# The fixed seed additionally makes the split reproducible across runs.
dataset = dataset.shuffle(len(s1), seed=42, reshuffle_each_iteration=False)

In [96]:
# Input pipeline: cache the elements, group them into batches of 50 and keep
# up to 25 batches prefetched.
final_data = dataset.cache().batch(50).prefetch(25)

# The first int(tr1) batches form the training set; the following int(ts1)
# batches form the test set.
n_train_batches = int(tr1)
n_test_batches = int(ts1)
train = final_data.take(n_train_batches)
test = final_data.skip(n_train_batches).take(n_test_batches)

In [97]:
# Pull the first training batch as NumPy arrays for a quick sanity check.
samples, labels = next(train.as_numpy_iterator())

In [98]:
# Shape of a single spectrogram taken from the first training batch.
samples[0].shape

(128, 78)

In [99]:
# Input geometry of one example.
spectrogram_height = 128
spectrogram_width = 78
num_channels = 1  # a spectrogram is a single 2-D plane of values, not a multi-channel colour image
num_classes = 10  # size of the softmax output layer

keras_layers = tf.keras.layers

# Three Conv2D(3x3, ReLU) + MaxPooling2D(2x2) stages with 16, 32 and 64
# filters, followed by Flatten -> Dense(128, ReLU) -> Dense(num_classes).
model = tf.keras.Sequential()
model.add(keras_layers.Conv2D(
    16, (3, 3), activation='relu',
    input_shape=(spectrogram_height, spectrogram_width, num_channels),
))
model.add(keras_layers.MaxPooling2D((2, 2)))
model.add(keras_layers.Conv2D(32, (3, 3), activation='relu'))
model.add(keras_layers.MaxPooling2D((2, 2)))
model.add(keras_layers.Conv2D(64, (3, 3), activation='relu'))
model.add(keras_layers.MaxPooling2D((2, 2)))
model.add(keras_layers.Flatten())
model.add(keras_layers.Dense(128, activation='relu'))
# Softmax (not sigmoid): each clip carries exactly one genre label, so the
# output is a single probability distribution over the classes.
model.add(keras_layers.Dense(num_classes, activation='softmax'))

In [100]:
# Adam optimiser; the sparse categorical cross-entropy loss works directly on
# integer class ids, and accuracy is tracked as the metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 15 epochs on the `train` batches with the `test` batches as
# validation data; the object returned by fit() is kept in `hist`.
hist = model.fit(train, epochs=15, validation_data=test)

In [16]:
# Save the trained model under the path 'genre_categorization'.
model.save('genre_categorization')

INFO:tensorflow:Assets written to: genre_categorization\assets


INFO:tensorflow:Assets written to: genre_categorization\assets
