In [1]:
import os

def count_files_in_directory(directory):
    return len([name for name in os.listdir(directory) if os.path.isfile(os.path.join(directory, name))])

# Check input file

In [2]:
directory_path = '/kaggle/input/transform-and-pre-progress-for-audio/Data/Images/not/'
file_count = count_files_in_directory(directory_path)
print(f"Number of files in '{directory_path}': {file_count}")

Number of files in '/kaggle/input/transform-and-pre-progress-for-audio/Data/Images/not/': 2629


In [3]:
directory_path = '/kaggle/input/transform-and-pre-progress-for-audio/Data/Images/scream/'
file_count = count_files_in_directory(directory_path)
print(f"Number of files in '{directory_path}': {file_count}")

Number of files in '/kaggle/input/transform-and-pre-progress-for-audio/Data/Images/scream/': 908


# We import the needpackage

In [4]:
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torchvision import datasets, transforms

# Now we make the dataloader

## Define datalo0ader by folder

In [5]:
data_path = '/kaggle/input/transform-and-pre-progress-for-audio/Data/Images/' #looking in subfolder train

scream_dataset = datasets.ImageFolder(
    root=data_path,
    transform=transforms.Compose([transforms.Resize((64,862)),
                                  transforms.ToTensor()])
)

len(scream_dataset)

3537

## Check label meaning

In [6]:
class_map=scream_dataset.class_to_idx

print("\nClass category and index of the images: {}\n".format(class_map))


Class category and index of the images: {'not': 0, 'scream': 1}



# Define trainset and testset

In [7]:
#split data to test and train
train_size = int(0.8 * len(scream_dataset)) # We use 80% as train
test_size = len(scream_dataset) - train_size
scream_train_dataset, scream_test_dataset = torch.utils.data.random_split(scream_dataset, [train_size, test_size])

print("Training size:", len(scream_train_dataset))
print("Testing size:",len(scream_test_dataset))

Training size: 2829
Testing size: 708


## Check the sample count in trainset

In [8]:
from collections import Counter

# labels in training set
train_classes = [label for _, label in scream_train_dataset]
Counter(train_classes)

Counter({0: 2099, 1: 730})

## Define trainloader and testloader

In [9]:
train_dataloader = torch.utils.data.DataLoader(
    scream_train_dataset,
    batch_size=64,
    num_workers=2,
    shuffle=True
)

test_dataloader = torch.utils.data.DataLoader(
    scream_test_dataset,
    batch_size=64,
    num_workers=2,
    shuffle=True
)

## View a example image shape

In [10]:
td = train_dataloader.dataset[0][0]
td.shape

torch.Size([3, 64, 862])

# Import model

We use resnbet34 for example

In [11]:
from torchvision.models import resnet34
import torch
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

# Updated model loading with weights
model = resnet34()

# Updating the fully connected layer and the first convolutional layer
model.fc = nn.Linear(512, 2)
model.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)

model = model.to(device)

Using cuda device


# Define loss function and optimizer

In [12]:
# cost function used to determine best parameters
cost = torch.nn.CrossEntropyLoss()

# used to create optimal parameters
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0001)

# Define Train and Test function

In [13]:
# Create the training function

def train(dataloader, model, loss, optimizer):
    model.train()
    size = len(dataloader.dataset)
    correct = 0  # Counter for correct predictions
    total = 0  # Counter for total examples

    for batch, (X, Y) in enumerate(dataloader):

        X, Y = X.to(device), Y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = cost(pred, Y)
        loss.backward()
        optimizer.step()

        # Compute accuracy
        _, predicted = pred.max(1)
        total += Y.size(0)
        correct += predicted.eq(Y).sum().item()

        if batch % 10 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f'loss: {loss:>7f}  [{current:>5d}/{size:>5d}]  Train Accuracy: {(100 * correct / total):.2f}%')


# Create the validation/test function

def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for batch, (X, Y) in enumerate(dataloader):
            X, Y = X.to(device), Y.to(device)
            pred = model(X)

            test_loss += cost(pred, Y).item()
            correct += (pred.argmax(1)==Y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size

    print(f'\nTest Error:\nacc: {(100*correct):>0.1f}%, avg loss: {test_loss:>8f}\n')
    return test_loss

# Train the model

In [14]:
# Define the early stopping parameters
early_stopping_patience = 15  # Number of epochs to wait before early stopping
best_loss = torch.inf
wait = 0  # Counter for patience
epochs = 150

best_model_weights = None

# Training loop
for t in range(epochs):
    print(f'Epoch {t + 1}\n-------------------------------')
    train(train_dataloader, model, cost, optimizer)
    test_loss = test(test_dataloader, model)

    # Check if the test loss has improved
    if test_loss < best_loss:
        best_loss = test_loss
        wait = 0  # Reset patience

        # Save the best model weights
        best_model_weights = model.state_dict()
    else:
        wait += 1

    if wait >= early_stopping_patience:
        print("Early stopping triggered. No improvement in test loss for {} epochs.".format(early_stopping_patience))
        break  # Stop training

# Restore the best model weights
if best_model_weights is not None:
    model.load_state_dict(best_model_weights)

print('Done!')

Epoch 1
-------------------------------
loss: 1.300108  [    0/ 2829]  Train Accuracy: 26.56%
loss: 0.555992  [  640/ 2829]  Train Accuracy: 69.74%
loss: 0.485033  [ 1280/ 2829]  Train Accuracy: 75.67%
loss: 0.380712  [ 1920/ 2829]  Train Accuracy: 77.32%
loss: 0.491013  [ 2560/ 2829]  Train Accuracy: 77.74%

Test Error:
acc: 83.5%, avg loss: 0.007317

Epoch 2
-------------------------------
loss: 0.345843  [    0/ 2829]  Train Accuracy: 82.81%
loss: 0.554268  [  640/ 2829]  Train Accuracy: 81.11%
loss: 0.354082  [ 1280/ 2829]  Train Accuracy: 83.18%
loss: 0.412931  [ 1920/ 2829]  Train Accuracy: 83.27%
loss: 0.227363  [ 2560/ 2829]  Train Accuracy: 83.54%

Test Error:
acc: 82.3%, avg loss: 0.011509

Epoch 3
-------------------------------
loss: 0.454768  [    0/ 2829]  Train Accuracy: 85.94%
loss: 0.442653  [  640/ 2829]  Train Accuracy: 82.39%
loss: 0.256347  [ 1280/ 2829]  Train Accuracy: 83.93%
loss: 0.544520  [ 1920/ 2829]  Train Accuracy: 84.83%
loss: 0.327128  [ 2560/ 2829]  Tra

# Save model

In [15]:
import torch
from datetime import datetime

# Get the current timestamp in the desired format
timestamp = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")

# Define the file name with the timestamp
file_name = f"Resnet34_Model_{timestamp}.pt"

# Save the entire model (including architecture and weights)
torch.save(model, file_name)

# Print the saved file name
print(f"Model saved as {file_name}")

Model saved as Resnet34_Model_2023-12-19--06-43-19.pt


# Let's see how the model can be used!

## Define output direction

In [16]:
import os

# Directory path
directory = "/kaggle/working/transform-and-pre-progress-for-audio/Data/TestImages"

# Check if the directory already exists
if not os.path.exists(directory):
    # Create the directory
    os.makedirs(directory)
    print(f"Directory '{directory}' created")
else:
    print(f"Directory '{directory}' already exists")

    # Folder names to create
folders = ["Screaming", "NotScreaming"]

# Create each folder
for folder in folders:
    directory_path = os.path.join(directory, folder)

    # Check if the directory already exists
    if not os.path.exists(directory_path):
        # Create the directory
        os.makedirs(directory_path)
        print(f"Directory '{directory_path}' created")
    else:
        print(f"Directory '{directory_path}' already exists")

Directory '/kaggle/working/transform-and-pre-progress-for-audio/Data/TestImages' created
Directory '/kaggle/working/transform-and-pre-progress-for-audio/Data/TestImages/Screaming' created
Directory '/kaggle/working/transform-and-pre-progress-for-audio/Data/TestImages/NotScreaming' created


## Define the function apply to audio

In [17]:
import os
import torchaudio
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt

def pad_waveform(waveform, target_length):
    num_channels, current_length = waveform.shape

    if current_length < target_length:
        # Calculate the amount of padding needed
        padding = target_length - current_length
        # Pad the waveform with zeros on the right side
        waveform = torch.nn.functional.pad(waveform, (0, padding))

    return waveform

# Define a function to transform audio data into images
def transform_data_to_image(audio, sample_rate, label, i):
    # Pad waveform to a consistent length of 44100 samples
    audio = pad_waveform(audio, 441000)

    spectrogram_tensor = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_mels=64, n_fft=1024)(audio)[0] + 1e-10

    # Save the spectrogram as an image
    image_path = f'/kaggle/working/transform-and-pre-progress-for-audio/Data/TestImages/{label}/audio_img{i}.png'

    plt.imsave(image_path, spectrogram_tensor.log2().numpy(), cmap='viridis')
    return image_path

# Define the image transformation pipeline
transform = transforms.Compose([
    transforms.Resize((64, 862)),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x[:3, :, :])
])

# Transform and predict the wav for positive

Note: Thus we use the train set as example show, you can use any wave audio within 10s

In [18]:
import pandas as pd

# Define the folder containing WAV files
folder_path = '/kaggle/input/human-screaming-detection-dataset/Screaming'  # Replace with the path to your folder
label = 'Screaming'  # Label for the images

# Create an empty list to store data
predictions_data = []

# Iterate through WAV files in the folder
for i, filename in enumerate(os.listdir(folder_path)):
    if filename.endswith('.wav'):
        # Load the audio
        audio, sample_rate = torchaudio.load(os.path.join(folder_path, filename))

        # Transform audio to an image and save it
        image_path = transform_data_to_image(audio, sample_rate, label, i)

        # Load the saved image and apply transformations
        image = Image.open(image_path)
        image = transform(image).unsqueeze(0)  # Add batch dimension

        # Make predictions using the model
        model.eval()
        with torch.no_grad():
            outputs = model(image.to(device))

        predict = outputs.argmax(dim=1).cpu().detach().numpy().ravel()[0]

        # Store the filename and prediction in the DataFrame
        predictions_data.append({'Filename': filename, 'Prediction': predict})

# Create a DataFrame from the list of data
scream_predictions_df = pd.DataFrame(predictions_data)

# Display the DataFrame with predictions
scream_predictions_df

Unnamed: 0,Filename,Prediction
0,nIFbKv1qjfw_out.wav,1
1,d4v3_z0ISrM_out.wav,0
2,9AZZncb_yek_out.wav,1
3,IdenFdkeASo_out.wav,1
4,LY90s5AgkWM_out.wav,1
...,...,...
857,nmbLZtYRoBs_out.wav,1
858,OtTKt5--3jo_out.wav,0
859,_zzhHu7HwZc_out.wav,0
860,ZPAY71_lrEk_out.wav,1


In [19]:
scream_predictions_df['Prediction'].value_counts()

Prediction
0    567
1    295
Name: count, dtype: int64

# Transform and predict the wav for negative

In [20]:
# Define the folder containing WAV files
folder_path = '/kaggle/input/human-screaming-detection-dataset/NotScreaming'  # Replace with the path to your folder
label = 'NotScreaming'  # Label for the images
import pandas as pd

# Create an empty list to store data
predictions_data = []

# Iterate through WAV files in the folder
for i, filename in enumerate(os.listdir(folder_path)):
    if filename.endswith('.wav'):
        # Load the audio
        audio, sample_rate = torchaudio.load(os.path.join(folder_path, filename))

        # Transform audio to an image and save it
        image_path = transform_data_to_image(audio, sample_rate, label, i)

        # Load the saved image and apply transformations
        image = Image.open(image_path)
        image = transform(image).unsqueeze(0)  # Add batch dimension

        # Make predictions using the model
        model.eval()
        with torch.no_grad():
            outputs = model(image.to(device))

        predict = outputs.argmax(dim=1).cpu().detach().numpy().ravel()[0]

        # Store the filename and prediction in the DataFrame
        predictions_data.append({'Filename': filename, 'Prediction': predict})

# Create a DataFrame from the list of data
not_scream_predictions_df = pd.DataFrame(predictions_data)

# Display the DataFrame with predictions
not_scream_predictions_df

Unnamed: 0,Filename,Prediction
0,lcgKlZ1xFGk_out.wav,0
1,OqXsRZMZQDY_out.wav,0
2,cRQeAJabx0c_out.wav,0
3,87IhSY4r2DY_out.wav,0
4,BI4fjQBZliY_out.wav,0
...,...,...
2626,Obc79LFQ05Q_out.wav,0
2627,bc39DoAFVe0_out.wav,0
2628,7K4kLUy6Kqo_out.wav,0
2629,pk0xBpBk0s0_out.wav,0


In [21]:
not_scream_predictions_df['Prediction'].value_counts()

Prediction
0    2623
1       8
Name: count, dtype: int64