In [1]:
import csv
import os
import torch
import torchaudio
import librosa
from cnn_model import CNNModel

from datapreprocessing import AudioProcessor
from torch.utils.data import Dataset



The test passed successfully.


In [2]:
# Input file to classify. The commented-out lines are alternative inputs.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not
# run elsewhere until this is pointed at a local file.
# audio_file_path = "/Users/zainhazzouri/projects/Bachelor_Thesis/Data/Kaggle/music_wav/bartok.wav"
audio_file_path = "/Users/zainhazzouri/Desktop/egp1.mp3"
# audio_file_path = "/Users/zainhazzouri/projects/Bachelor_Thesis/Data/Kaggle/music_wav/bagpipe.wav"

SAMPLE_RATE = 22050  # sample rate of the audio file
bit_depth = 16  # bit depth of the audio file
hop_length = 512
n_mfcc = 20  # number of MFCCs features
n_fft = 1024  # window size (a stray trailing comma previously made this the tuple (1024,))
n_mels = 256  # number of mel bands to generate
win_length = None  # window length



In [3]:
# Pick the compute device: CUDA first, then Apple-silicon MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
# is_available() checks that an MPS device can actually be used at runtime;
# is_built() only reports that this PyTorch build was compiled with MPS support.
elif torch.backends.mps.is_available():
    device = "mps"  # if it doesn't work try device = torch.device('mps')
else:
    device = "cpu"
print(f"Using {device}")


Using mps


In [4]:
# Instantiate the network on the selected device and restore the trained weights.
model = CNNModel().to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on a
# different device type (e.g. CUDA) can still be loaded on this machine.
model.load_state_dict(
    torch.load("CNNModel_speech_music_discrimination.pth", map_location=device)
)
model.eval()  # switch to evaluation mode; some parts are turned off because inference doesn't need them

CNNModel(
  (Maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv1): conv_block(
    (conv): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
    )
  )
  (Conv2): conv_block(
    (conv): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
  

In [5]:
def split_waveform(waveform, sample_rate):
    """Cut a waveform into consecutive, non-overlapping one-second chunks.

    Args:
        waveform: array/tensor whose last axis holds the samples. It is sliced
            as ``waveform[:, start:end]``, so it is assumed to be 2-D
            (channels, samples) — TODO confirm against callers.
        sample_rate: number of samples per second; also the chunk length.

    Returns:
        A list of slices, each ``sample_rate`` samples long. Samples left over
        after the last whole second are dropped, so an input shorter than one
        second yields an empty list.
    """
    samples_per_chunk = sample_rate  # 1 second segments
    whole_chunks = waveform.shape[-1] // samples_per_chunk

    return [
        waveform[:, k * samples_per_chunk:(k + 1) * samples_per_chunk]
        for k in range(whole_chunks)
    ]

In [8]:
# Build the AudioProcessor for the configured input file. classify_audio_file_segments
# relies on its resample / mix-down / pad / cut helpers and on its n_mfcc and device attributes.
audio_processor = AudioProcessor(audio_file_path)


In [9]:
def classify_audio_file_segments(audio_file_path, audio_processor):
    """Classify an audio file one second at a time.

    The file is loaded, cut into one-second segments with ``split_waveform``,
    each segment is conditioned with the ``audio_processor`` helpers, converted
    to MFCC features and passed through the module-level ``model``.

    Args:
        audio_file_path: path of the audio file. ``.mp3`` files are decoded with
            librosa at ``SAMPLE_RATE``; every other extension is read with
            ``torchaudio.load`` at the file's own rate.
        audio_processor: object providing ``_resample_if_necessary``,
            ``_mix_down_if_necessary``, ``_right_pad_if_necessary``,
            ``_cut_if_necessary`` and the ``n_mfcc`` / ``device`` attributes.

    Returns:
        A list with one predicted class index (int) per whole second of audio.
        The list is empty when the file is shorter than one second.
    """
    file_ext = os.path.splitext(audio_file_path)[1].lower()  # file extension

    if file_ext == '.mp3':
        waveform, sample_rate = librosa.load(audio_file_path, sr=SAMPLE_RATE)
        # Add a leading axis so the array has the 2-D layout split_waveform slices.
        waveform = torch.from_numpy(waveform).unsqueeze(0)
    else:
        waveform, sample_rate = torchaudio.load(audio_file_path)

    segments = split_waveform(waveform, sample_rate)

    # The transform depends only on loop-invariant values, so build it once.
    # NOTE(review): it is configured with the *loaded* sample rate although the
    # segments go through _resample_if_necessary first; if that helper changes the
    # rate, the MFCC should use the processor's target rate instead — confirm.
    mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=audio_processor.n_mfcc)

    segment_classifications = []

    # Inference only: disable autograd so no graph is recorded per segment.
    with torch.no_grad():
        for segment in segments:
            # Apply the sequence of operations
            segment = audio_processor._resample_if_necessary(segment, sample_rate)
            segment = audio_processor._mix_down_if_necessary(segment)
            segment = audio_processor._right_pad_if_necessary(segment)
            segment = audio_processor._cut_if_necessary(segment)

            # Compute the MFCC features and add a batch axis for the model.
            # NOTE(review): features are moved to audio_processor.device while the
            # global model lives on the notebook's `device`; presumably these
            # match — verify.
            mfcc = mfcc_transform(segment)
            mfcc = mfcc.to(audio_processor.device).unsqueeze(0)
            output = model(mfcc)
            # Index of the highest score along dim 1 is the predicted class.
            _, predicted_class = torch.max(output, 1)
            segment_classifications.append(predicted_class.item())

    return segment_classifications

In [10]:

def format_time(seconds):
    """Format a duration in seconds as ``HH:MM:SS``.

    Args:
        seconds: duration in seconds, int or float. Fractional seconds are
            truncated; without this, a float input (e.g. from a fractional
            segment duration) would make the ``d`` format code raise ValueError.

    Returns:
        A zero-padded ``"HH:MM:SS"`` string.
    """
    whole_seconds = int(seconds)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


def generate_classification_table(classification_results, segment_duration=1):
    """Collapse per-segment labels into rows of contiguous, equally-labelled spans.

    Args:
        classification_results: sequence of labels, one per segment, in time order.
        segment_duration: length of one segment in seconds (default 1).

    Returns:
        A list of ``[start_time, end_time, label]`` rows, where the times are
        strings produced by ``format_time``. Each row covers a maximal run of
        consecutive segments that share a label. An empty input yields an empty
        table instead of raising IndexError.
    """
    # Nothing to summarise (e.g. the audio was shorter than one segment).
    if len(classification_results) == 0:
        return []

    table = []
    current_label = classification_results[0]
    start_time = 0

    for i, label in enumerate(classification_results[1:], 1):
        if label != current_label:
            # The run ended just before segment i: emit it and start a new one.
            table.append([format_time(start_time), format_time(i * segment_duration), current_label])
            start_time = i * segment_duration
            current_label = label

    # Emit the final run, which extends to the end of the recording.
    table.append([format_time(start_time), format_time(len(classification_results) * segment_duration), current_label])

    return table


def save_classification_table_to_csv(table, output_file):
    """Write the classification table to ``output_file`` as CSV.

    Args:
        table: iterable of rows; each row is written as one CSV record.
        output_file: destination path. An existing file is overwritten.

    The first record is the header ``Start Time, End Time, Class``.
    """
    header = ['Start Time', 'End Time', 'Class']
    # newline='' lets the csv module control the line endings itself.
    with open(output_file, 'w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(table)

# Classify the configured file second by second, then merge equal neighbours into rows.
classification_results = classify_audio_file_segments(audio_file_path, audio_processor)
table = generate_classification_table(classification_results)

# Persist the rows next to the notebook.
save_classification_table_to_csv(table, "classification_table.csv")

# Echo the same rows as a plain-text table.
print("Start Time | End Time | Class")
print("-" * 28)
for row in table:
    start, end, label = row
    print(f"{start} | {end} | {label}")


# Class ids: music_wav = 0, speech_wav = 1, Mix_wav = 2, silence_wav = 3




Start Time | End Time | Class
----------------------------
00:00:00 | 00:00:08 | 1
00:00:08 | 00:00:11 | 0
00:00:11 | 00:00:12 | 2
00:00:12 | 00:00:13 | 0
00:00:13 | 00:00:14 | 2
00:00:14 | 00:00:21 | 0
00:00:21 | 00:00:23 | 2
00:00:23 | 00:00:25 | 0
00:00:25 | 00:00:26 | 2
00:00:26 | 00:00:27 | 0
00:00:27 | 00:00:29 | 2
00:00:29 | 00:00:30 | 0
00:00:30 | 00:00:32 | 2
00:00:32 | 00:00:34 | 0
00:00:34 | 00:00:35 | 2
00:00:35 | 00:00:39 | 0
00:00:39 | 00:00:40 | 2
00:00:40 | 00:00:41 | 1
00:00:41 | 00:00:43 | 0
00:00:43 | 00:00:45 | 2
00:00:45 | 00:00:52 | 1
00:00:52 | 00:00:54 | 2
00:00:54 | 00:01:03 | 1
00:01:03 | 00:01:04 | 2
00:01:04 | 00:01:44 | 1
00:01:44 | 00:01:45 | 0
00:01:45 | 00:01:50 | 1
00:01:50 | 00:01:51 | 0
00:01:51 | 00:01:58 | 1
00:01:58 | 00:01:59 | 2
00:01:59 | 00:02:00 | 1
00:02:00 | 00:02:02 | 2
00:02:02 | 00:02:09 | 1
00:02:09 | 00:02:10 | 0
00:02:10 | 00:02:14 | 1
00:02:14 | 00:02:15 | 2
00:02:15 | 00:02:40 | 1
00:02:40 | 00:02:41 | 2
00:02:41 | 00:02:55 | 1
00:02

In [11]:
# Re-create the processor and run the whole classification again.
# NOTE(review): this repeats the work of the earlier AudioProcessor and classification
# cells; it rebinds audio_processor, classification_results and table.
audio_processor = AudioProcessor(audio_file_path)

classification_results = classify_audio_file_segments(audio_file_path, audio_processor)
table = generate_classification_table(classification_results)
# Prints the raw list of [start, end, label] rows rather than a formatted table.
print(table)



[['00:00:00', '00:00:08', 1], ['00:00:08', '00:00:11', 0], ['00:00:11', '00:00:12', 2], ['00:00:12', '00:00:13', 0], ['00:00:13', '00:00:14', 2], ['00:00:14', '00:00:21', 0], ['00:00:21', '00:00:23', 2], ['00:00:23', '00:00:25', 0], ['00:00:25', '00:00:26', 2], ['00:00:26', '00:00:27', 0], ['00:00:27', '00:00:29', 2], ['00:00:29', '00:00:30', 0], ['00:00:30', '00:00:32', 2], ['00:00:32', '00:00:34', 0], ['00:00:34', '00:00:35', 2], ['00:00:35', '00:00:39', 0], ['00:00:39', '00:00:40', 2], ['00:00:40', '00:00:41', 1], ['00:00:41', '00:00:43', 0], ['00:00:43', '00:00:45', 2], ['00:00:45', '00:00:52', 1], ['00:00:52', '00:00:54', 2], ['00:00:54', '00:01:03', 1], ['00:01:03', '00:01:04', 2], ['00:01:04', '00:01:44', 1], ['00:01:44', '00:01:45', 0], ['00:01:45', '00:01:50', 1], ['00:01:50', '00:01:51', 0], ['00:01:51', '00:01:58', 1], ['00:01:58', '00:01:59', 2], ['00:01:59', '00:02:00', 1], ['00:02:00', '00:02:02', 2], ['00:02:02', '00:02:09', 1], ['00:02:09', '00:02:10', 0], ['00:02:10', 