In [1]:
# audio libraries
import librosa
import librosa.display as lplt
import IPython

# import matplotlib to be able to display graphs
import matplotlib.pyplot as plt

# transform .wav into .csv
import csv
import os
import numpy as np
import pandas as pd

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# model
import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential

2024-12-16 14:27:21.172926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734355641.191630   35877 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734355641.197698   35877 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 14:27:21.217516: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, ClassLabel
import torch
import librosa
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Directories and parameters
data_dir = "data/"  # Replace with the path to your dataset
SAMPLE_RATE = 16000  # Wave2Vec2.0 expects 16 kHz audio

# Selected class folders to use
selected_classes = [
    "BowheadWhale",
    "Beluga_WhiteWhale",
    "SouthernRightWhale",
    "NorthernRightWhale",
    "Short_Finned(Pacific)PilotWhale",
    "Long_FinnedPilotWhale",
    "HumpbackWhale",
    "KillerWhale",
    "SpermWhale",
    "BottlenoseDolphin",
]  # Add or remove class folder names as needed

# Map class labels to IDs
LABELS = selected_classes  # Only selected classes
LABEL2ID = {label: idx for idx, label in enumerate(LABELS)}
ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}

# Function to preprocess audio
def preprocess_audio(file_path):
    audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    if len(audio) > SAMPLE_RATE:
        audio = audio[:SAMPLE_RATE]  # Trim to 1 second
    else:
        padding = SAMPLE_RATE - len(audio)
        audio = np.pad(audio, (0, padding))  # Pad if less than 1 second
    return audio

# Function to load dataset from selected class folders
def load_data(data_folder, selected_classes):
    audio_files = []
    labels = []
    for label in selected_classes:
        label_folder = os.path.join(data_folder, label)
        if os.path.exists(label_folder):
            for file_name in os.listdir(label_folder):
                file_path = os.path.join(label_folder, file_name)
                if file_path.lower().endswith((".wav", ".mp3")):  # Ensure only audio files
                    audio_files.append(file_path)
                    labels.append(LABEL2ID[label])
        else:
            print(f"Warning: Folder '{label}' does not exist in {data_folder}. Skipping...")
    return Dataset.from_dict({"audio": audio_files, "label": labels})

# Load dataset from all selected folders
full_dataset = load_data(data_dir, selected_classes)

# Convert labels to ClassLabel type
def convert_labels_to_classlabel(dataset, num_classes, labels):
    # Create a ClassLabel object
    class_label = ClassLabel(num_classes=num_classes, names=labels)
    
    # Apply the ClassLabel transformation
    dataset = dataset.cast_column("label", class_label)
    return dataset

# Convert the label column in the dataset
full_dataset = convert_labels_to_classlabel(
    full_dataset, num_classes=len(selected_classes), labels=selected_classes
)

# Now split the dataset
train_test_split = full_dataset.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)
train_data = train_test_split["train"]
val_data = train_test_split["test"]

# Convert to Hugging Face datasets
train_dataset = Dataset.from_dict({"audio": train_data["audio"], "label": train_data["label"]})
val_dataset = Dataset.from_dict({"audio": val_data["audio"], "label": val_data["label"]})

# Feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# Preprocess dataset for Wav2Vec2.0
def preprocess(batch):
    audio = preprocess_audio(batch["audio"])
    inputs = feature_extractor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
    batch["input_values"] = inputs.input_values[0]
    return batch

train_dataset = train_dataset.map(preprocess, remove_columns=["audio"])
val_dataset = val_dataset.map(preprocess, remove_columns=["audio"])

# Convert to torch tensors
def to_torch(batch):
    batch["input_values"] = torch.tensor(batch["input_values"], dtype=torch.float32)
    batch["labels"] = torch.tensor(batch["label"], dtype=torch.long)
    return batch

train_dataset = train_dataset.map(to_torch)
val_dataset = val_dataset.map(to_torch)

# Prepare dataset for Hugging Face Trainer
dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})

# Load pre-trained Wav2Vec2.0 model for classification
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(LABELS),
    label2id=LABEL2ID,
    id2label=ID2LABEL,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Define the Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=feature_extractor,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./marine_species_model_Wav2Vec")
feature_extractor.save_pretrained("./marine_species_model_Wav2Vec")


Casting the dataset: 100%|██████████| 147406/147406 [00:00<00:00, 4496444.21 examples/s]
Map: 100%|██████████| 117924/117924 [05:31<00:00, 355.52 examples/s]
Map:  58%|█████▊    | 16998/29482 [00:49<00:22, 563.31 examples/s]

: 

In [7]:
full_dataset["label"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
