In [12]:
import os
import pandas as pd
import numpy as np
import torch
import torchaudio
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from torchaudio.transforms import MelSpectrogram, Resample

from PIL import Image

In [21]:
def preemphasis_filter(waveform, coeff=0.97):
    """Apply a first-order pre-emphasis filter along the last (time) axis.

    Computes ``y[t] = x[t] - coeff * x[t - 1]`` for ``t >= 1`` and leaves the
    first sample untouched (``y[0] = x[0]``).

    Args:
        waveform: Tensor whose last dimension is time, e.g. ``(channels, time)``.
            1-D ``(time,)`` and batched ``(..., time)`` inputs are accepted too.
        coeff: Pre-emphasis coefficient applied to the previous sample.

    Returns:
        A tensor with the same shape as ``waveform``.
    """
    # Ellipsis indexing + dim=-1 keep the filter on the time axis regardless
    # of how many leading (batch/channel) dimensions the input has.
    first_sample = waveform[..., :1]
    emphasized = waveform[..., 1:] - coeff * waveform[..., :-1]
    return torch.cat((first_sample, emphasized), dim=-1)

def _save_mel_image(mel_transform, segment, image_file):
    """Render one waveform segment as a 224x224 'jet' Mel-spectrogram PNG.

    Args:
        mel_transform: Callable turning a waveform tensor into a Mel
            spectrogram (a ``MelSpectrogram`` instance in this notebook).
        segment: Mono waveform tensor of shape ``(1, num_samples)``.
        image_file: Destination path of the PNG file.
    """
    # squeeze() drops the singleton channel axis before handing off to numpy.
    mel_spec = mel_transform(segment).squeeze().numpy()

    # NOTE(review): matplotlib colormaps expect floats in [0, 1] and clip
    # anything outside that range. The Mel power values are not normalised
    # here, so large values saturate. Left as-is so generated images stay
    # identical to earlier runs -- confirm whether dB scaling/normalisation
    # was intended.
    mel_spec_jet = cm.jet(mel_spec)

    # RGBA floats in [0, 1] -> 8-bit image, resized to the 224x224 model input.
    image = Image.fromarray((mel_spec_jet * 255).astype(np.uint8))
    image = image.resize((224, 224), Image.LANCZOS)
    image.save(image_file)


def save_melspectrograms_for_folder(audio_folder, csv_file_path, output_folder, sample_rate=16000):
    """Convert every audio file listed in a CSV into Mel-spectrogram PNGs.

    Each file is resampled to ``sample_rate``, mixed down to mono and passed
    through ``preemphasis_filter``. It is then cut into 1-second pieces:

    * shorter than 1 second: the clip is looped up to exactly 1 second and
      saved as ``<name>.png``;
    * 1 second or longer: every complete 1-second segment ``i`` is saved as
      ``<i>_<name>.png``; a trailing partial segment is discarded.

    ``<name>`` is the CSV entry without its file extension.

    Args:
        audio_folder: Directory that the CSV's ``file_path`` entries are
            relative to.
        csv_file_path: CSV file with a ``file_path`` column.
        output_folder: Directory receiving the PNG files (created if missing).
        sample_rate: Target sampling rate in Hz; also the number of samples
            in one 1-second segment.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    df = pd.read_csv(csv_file_path)

    target_length = sample_rate  # number of samples in one 1-second segment

    # Build the transforms once instead of once per file/segment.
    mel_transform = MelSpectrogram(sample_rate)
    resamplers = {}  # source sample rate -> Resample transform

    for audio_file in df['file_path']:
        audio_path = os.path.join(audio_folder, audio_file)
        waveform, sr = torchaudio.load(audio_path)

        # An empty recording has nothing to render (and would otherwise
        # cause a division by zero when looping it below).
        if waveform.size(1) == 0:
            print(f"Skipping empty audio file: {audio_path}")
            continue

        # Resample to the target rate
        if sr != sample_rate:
            if sr not in resamplers:
                resamplers[sr] = Resample(sr, sample_rate)
            waveform = resamplers[sr](waveform)

        # Convert the audio to monochannel
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Apply pre-emphasis filter
        waveform = preemphasis_filter(waveform)

        num_samples = waveform.size(1)
        stem = os.path.splitext(audio_file)[0]

        if num_samples < target_length:
            # Loop the clip just often enough to cover 1 second (ceiling
            # division on sample counts), then trim to exactly 1 second.
            loops = -(-target_length // num_samples)
            looped = waveform.repeat(1, loops)[:, :target_length]
            _save_mel_image(mel_transform, looped,
                            os.path.join(output_folder, stem + '.png'))
        else:
            # Clips of exactly 1 second land here as well and yield a single
            # segment with index 0.
            for i in range(num_samples // target_length):
                segment = waveform[:, i * target_length:(i + 1) * target_length]
                _save_mel_image(mel_transform, segment,
                                os.path.join(output_folder, str(i) + '_' + stem + '.png'))

In [22]:
# Example usage: render the audio clips listed in train.csv as Mel-spectrogram
# images. The paths below are specific to the author's machine.
csv_file_path = r'C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train.csv'
audio_folder = r'C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train'
output_folder = r'C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images'

save_melspectrograms_for_folder(
    audio_folder=audio_folder,
    csv_file_path=csv_file_path,
    output_folder=output_folder,
)



C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\WW04_segment1_0_NonCanonical.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\WW04_segment1_11_Canonical.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\0_WW04_segment1_14_Canonical.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\0_WW04_segment1_16_Canonical.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\WW04_segment1_17_NonCanonical.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\0_WW04_segment1_18_NonCanonical.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\0_WW04_segment1_20_LaughCry.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\WW04_segment1_22_Canonical.png
C:\Users\arunps\OneDrive\Projects\Scripts\Python\VisionInfantNet\train_images\WW04_segment1_23_Canonical.png
C:\U