# Classifies the Project CETI sperm whale audio by coda type

<html><a href="https://colab.research.google.com/github/autumnjohnson/ceti_audio/blob/main/load_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a></html>


Hugging Face: <https://huggingface.co/datasets/autumnjohnson/ceti_audio>

GitHub: <https://github.com/autumnjohnson/ceti_audio>

## Install dependencies and import libraries

In [20]:
!pip install matplotlib torch pydub datasets
import io
import torch
from datasets import load_dataset, Audio
import pandas as pd
import warnings
from huggingface_hub import login
import random
import requests
import matplotlib.pyplot as plt
from collections import Counter

import numpy as np
import numpy.core.multiarray as multi

from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn import datasets, metrics, svm
from sklearn.model_selection import train_test_split

import soundfile as sf
import IPython.display as ipd
from collections import Counter
import gzip


from IPython.display import Audio as iAudio
from npc_gzip.compressors.base import BaseCompressor
from npc_gzip.compressors.bz2_compressor import Bz2Compressor
from npc_gzip.compressors.gzip_compressor import GZipCompressor
from npc_gzip.knn_classifier import KnnClassifier

In [22]:
# Gzip-based compressor instance; passed to `fit_model` below to build the classifier.
compressor = GZipCompressor()

In [39]:
def fit_model(comp: BaseCompressor, train_audio: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd") -> KnnClassifier:
    """
    Builds a compressor-backed Knn classifier from the training data
    and returns it.

    Arguments:
        comp (BaseCompressor): Compressor handed to the classifier.
        train_audio (np.ndarray): Training inputs as a numpy array.
        train_labels (np.ndarray): Training labels as a numpy array.
        distance_metric (str): Name of the distance metric forwarded to
                               `KnnClassifier`. Defaults to "ncd".

    Returns:
        KnnClassifier: Classifier constructed from the given compressor,
                       training inputs and training labels.
    """
    return KnnClassifier(
        compressor=comp,
        training_inputs=train_audio,
        training_labels=train_labels,
        distance_metric=distance_metric,
    )

## Load dataset

### Download dataset

In [40]:
# Fetch the dataset from the Hugging Face Hub; `get_data` below reads its
# 'train' and 'test' splits.
dataset = load_dataset("autumnjohnson/ceti_audio")

In [41]:
# Never commit access tokens to a notebook: read the token from the HF_TOKEN
# environment variable instead. When the variable is unset, `token` is None and
# `login` falls back to prompting for the token interactively.
# NOTE(review): the token that used to be hardcoded in this cell should be
# treated as compromised and revoked in the Hugging Face account settings.
import os

login(token=os.environ.get("HF_TOKEN"))

### Keep audio as raw bytes (no decoding)

In [42]:
# decode=False leaves each audio entry undecoded; `get_data` below reads the
# encoded file contents from the entry's 'bytes' field.
# NOTE(review): because nothing is decoded, sampling_rate=16000 presumably has
# no effect on those bytes (no resampling happens) — TODO confirm.
dataset = dataset.cast_column("audio", Audio(decode=False, sampling_rate=16000))

### Prepare train/test sets

In [38]:
def get_data(dataset) -> tuple:
    """
    Extracts the raw audio bytes and coda-type labels of the Project CETI
    sperm whale vocalizations from the train and test splits.

    Arguments:
        dataset: Mapping with 'train' and 'test' splits. Each split must
                 expose an 'audio' column whose items are dicts holding the
                 encoded audio under 'bytes', and a 'coda_type' column
                 holding the labels.

    Returns:
        tuple: `((train_audio, train_labels), (test_audio, test_labels))`.
               Every element is a 1-D numpy array. The audio arrays have
               dtype=object and hold one `bytes` object per clip.
    """

    def _split_to_arrays(split) -> tuple:
        # Converts one split into an (audio, labels) pair of numpy arrays.
        clips = [item['bytes'] for item in split['audio']]
        # dtype=object is required here: without it numpy builds a fixed-width
        # byte-string array, which silently drops trailing b'\x00' bytes when
        # an element is read back and pads every clip to the longest one.
        audio = np.array(clips, dtype=object)
        labels = np.array(split['coda_type'])
        return (audio, labels)

    train = _split_to_arrays(dataset['train'])
    test = _split_to_arrays(dataset['test'])

    return (train, test)

In [43]:
# Unpack the (audio, labels) pairs for the train and test splits into the
# four arrays used by the rest of the notebook.
((train_audio, train_labels), (test_audio, test_labels)) = get_data(dataset)

## Train model

In [44]:
# Build the classifier from the gzip compressor and the training split,
# using fit_model's default distance metric ("ncd").
model = fit_model(compressor, train_audio, train_labels)

## Generate predictions

In [49]:
# Fraction of the training set that `sample_test_text` is compared against.
# A value less than 1 saves time at the expense of worse predictions.
sampling_percentage = 1
# Passed to `model.predict` as `top_k` (presumably the number of nearest
# neighbours returned per sample — TODO confirm).
top_k = 1

# Seed NumPy's global generator so the shuffle below is identical on every
# Restart & Run All.
# NOTE(review): this presumably also pins any sampling npc_gzip performs through
# `np.random` when sampling_percentage < 1 — TODO confirm.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# The whole test set is evaluated, just visited in a random order. The same
# index array is applied to inputs and labels so they stay aligned.
# (`sample_test_text` holds audio bytes; the name is kept because later cells use it.)
random_indicies = np.random.permutation(len(test_audio))
sample_test_text = test_audio[random_indicies]
sample_test_labels = test_labels[random_indicies]

In [37]:
# Classify the shuffled test samples. `labels` holds the predictions that are
# flattened and scored in the following cells.
distances, labels, similar_samples = model.predict(
    sample_test_text,
    top_k,
    sampling_percentage=sampling_percentage,
)

In [46]:
# Collapse the predictions to a 1-D array so they line up element-wise with
# `sample_test_labels`. `flatten()` already returns a 1-D copy.
flattened_labels = labels.flatten()

## Display results

### Print classification report

In [48]:
# Compare the predicted coda types with the true labels.
# NOTE(review): zero_division=np.nan is presumably meant to report metrics whose
# denominator is zero as NaN instead of warning; it needs a scikit-learn version
# that accepts np.nan for this argument — TODO confirm the minimum version.
print(classification_report(sample_test_labels, flattened_labels, zero_division=np.nan))

### Print confusion matrix

In [50]:
# Build the confusion matrix of true vs. predicted coda types.
disp = metrics.ConfusionMatrixDisplay.from_predictions(
    sample_test_labels,
    flattened_labels,
)

# Emit the raw counts as text first, then render the titled figure.
print(f"Confusion matrix:\n{disp.confusion_matrix}")
disp.figure_.suptitle("Confusion Matrix")
plt.show()