In [1]:
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm


def reshape_square(arr, k=8):
    """Zero-pad a 12-row matrix and fold it into a square, 3-channel array.

    The columns of ``arr`` are copied into the leading columns of a
    ``(12, k * k)`` matrix of zeros. That matrix is then viewed as
    ``(3, 2k, 2k)`` and all of its axes are reversed, giving ``(2k, 2k, 3)``.
    With the default ``k=8`` a 12x50 input becomes a 16x16x3 array.

    Parameters
    ----------
    arr : numpy.ndarray
        Matrix with 12 rows and at most ``k * k`` columns.
    k : int
        Half the side length of the square output.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(2 * k, 2 * k, 3)``.
    """
    side = 2 * k
    padded = np.zeros((12, k * k))
    width = arr.shape[1]
    # Columns beyond ``width`` stay zero and act as padding.
    padded[:, :width] = arr
    # np.transpose without axes reverses the axis order: (3, s, s) -> (s, s, 3).
    return np.transpose(padded.reshape(3, side, side))


def cens_per_sec(sample_rate, target):
    """Return a hop length (in samples) yielding roughly ``target`` frames per second.

    The ideal hop ``sample_rate / target`` is rounded down to a whole multiple
    of ``2 ** 6`` (64). The result is 0 when ``sample_rate`` is smaller than
    ``target * 64``.

    NOTE(review): the multiple-of-64 constraint presumably comes from the
    chroma/CQT routine this value is passed to as ``hop_length`` -- confirm.
    """
    granularity = 2 ** 6
    whole_chunks = sample_rate // (target * granularity)
    return whole_chunks * granularity


def extract_features(path, cens_sr=10):
    """Load an audio file and return its CENS chromagram as a batch of squares.

    The audio is loaded with ``librosa.load`` and converted to a CENS
    chromagram whose hop length comes from ``cens_per_sec``. The chromagram is
    cut into consecutive, non-overlapping windows of 50 frames (roughly five
    seconds at the default ``cens_sr=10``), and each window is passed through
    ``reshape_square``. Trailing frames that do not fill a whole window are
    discarded.

    Parameters
    ----------
    path : str or path-like
        Audio file to load.
    cens_sr : int
        Target number of chroma frames per second.

    Returns
    -------
    numpy.ndarray
        One ``reshape_square`` output per full window, stacked along axis 0.
        If the audio is shorter than one window the batch is empty (its first
        dimension is 0) instead of raising.
    """
    # Window length in frames. It is fixed, so it only corresponds to about
    # five seconds when ``cens_sr`` is left at 10.
    frames_per_slice = 50

    data, sample_rate = librosa.load(path)
    # Pass audio and sample rate by keyword: librosa >= 0.10 made these
    # arguments keyword-only, and older releases accept keywords as well.
    cens = librosa.feature.chroma_cens(
        y=data, sr=sample_rate, hop_length=cens_per_sec(sample_rate, cens_sr)
    )

    # Integer division keeps every complete window, including a final window
    # that ends exactly on the last frame.
    n_slices = cens.shape[1] // frames_per_slice
    if n_slices == 0:
        # Derive the per-window output shape from reshape_square itself so the
        # empty batch has the same trailing dimensions as a non-empty one.
        empty_square = reshape_square(np.zeros((cens.shape[0], 0)))
        return np.empty((0,) + empty_square.shape)

    usable = cens[:, : n_slices * frames_per_slice]
    # (bins, frames) -> (bins, windows, 50) -> (windows, bins, 50)
    windows = usable.reshape(cens.shape[0], n_slices, frames_per_slice)
    windows = np.transpose(windows, [1, 0, 2])
    return np.array([reshape_square(window) for window in windows])


def predict_layer(model, data):
    """Return the activations of ``model``'s second-to-last layer for ``data``.

    A new Keras model is built that shares ``model``'s input but stops at the
    layer with index ``len(model.layers) - 2``; ``data`` is then run through
    that truncated model with ``predict``.
    """
    penultimate_index = len(model.layers) - 2
    penultimate = model.get_layer(index=penultimate_index)
    extractor = tf.keras.Model(inputs=model.input, outputs=penultimate.output)
    return extractor.predict(data)


def extract_dataframe(model, site, audio_id, base="../data/input/train_soundscapes"):
    """Build a table of model features for one soundscape recording.

    The recording is the first file under ``base`` matching
    ``{audio_id}_{site}_*.ogg``. Its features come from ``extract_features``
    followed by ``predict_layer``.

    Returns
    -------
    pandas.DataFrame
        One row per feature window with the columns ``seconds`` (5, 10, 15,
        ...), ``site`` and ``audio_id``, followed by one integer-named column
        per feature dimension.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    matches = list(Path(base).glob(f"{audio_id}_{site}_*.ogg"))
    if len(matches) == 0:
        raise ValueError("audio not found")

    embeddings = pd.DataFrame(predict_layer(model, extract_features(matches[0])))
    # Window i (0-based) is labelled (i + 1) * 5 seconds.
    window_end = (embeddings.index + 1) * 5
    metadata = pd.DataFrame(window_end, columns=["seconds"]).assign(
        site=site, audio_id=audio_id
    )
    return metadata.join(embeddings)

In [2]:
# Hide every GPU from TensorFlow so the model below runs on the CPU.
tf.config.set_visible_devices([], "GPU")

# Labelled 5-second soundscape segments.
input_df = pd.read_csv("../data/input/train_soundscape_labels.csv")

# Previously trained Keras model; its second-to-last layer supplies the features.
model = tf.keras.models.load_model("../data/models/model-16-16-3-no-mixup-full")

In [3]:
# One entry per distinct (site, audio_id) recording in the label table.
rows = list(input_df[["site", "audio_id"]].drop_duplicates().itertuples())

# Feature extraction is the slow step (about eight minutes for 20 recordings
# in the recorded run), hence the progress bar.
results = [
    extract_dataframe(model, recording.site, recording.audio_id)
    for recording in tqdm.tqdm(rows)
]
features = pd.concat(results)

# Default inner merge: label rows with no matching feature window are dropped.
merged = input_df.merge(features, on=["site", "audio_id", "seconds"])

100%|█████████████████████████████████████████████████████████████████████████| 20/20 [08:06<00:00, 24.32s/it]


In [4]:
# Cache the merged labels + features so later cells can skip the slow extraction.
# NOTE: pandas infers compression from the file suffix. This name ends in
# ".pkl", so the file is written uncompressed even though "gz" appears in it.
merged.to_pickle("../data/train_soundscape_v1.gz.pkl")

In [7]:
# Reload the cached labels + features table written by the to_pickle cell.
# Pickle files can execute code on load; only read files produced by this project.
df = pd.read_pickle("../data/train_soundscape_v1.gz.pkl")
# Bare expression: the notebook renders a truncated preview of the frame.
df

Unnamed: 0,row_id,site,audio_id,seconds,birds,0,1,2,3,4,...,118,119,120,121,122,123,124,125,126,127
0,7019_COR_5,COR,7019,5,nocall,0.0,0.000000,0.484167,0.777650,0.0,...,2.594115,1.063408,0.0,0.000000,0.0,0.000000,0.0,2.170409,0.0,0.108364
1,7019_COR_10,COR,7019,10,nocall,0.0,0.000000,2.112043,0.000000,0.0,...,2.392371,0.063970,0.0,0.000000,0.0,0.241455,0.0,1.352149,0.0,0.840994
2,7019_COR_15,COR,7019,15,nocall,0.0,0.000000,0.877503,0.343573,0.0,...,3.147103,1.334816,0.0,1.016320,0.0,0.097038,0.0,0.616815,0.0,1.086201
3,7019_COR_20,COR,7019,20,nocall,0.0,0.033138,0.369459,0.181315,0.0,...,1.617901,0.020769,0.0,0.000000,0.0,0.504223,0.0,1.119352,0.0,1.373091
4,7019_COR_25,COR,7019,25,nocall,0.0,0.515957,0.000000,0.231343,0.0,...,2.322008,1.312874,0.0,1.586848,0.0,1.329830,0.0,1.241296,0.0,1.407362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,54955_SSW_580,SSW,54955,580,nocall,0.0,2.682888,1.056530,0.000000,0.0,...,0.000000,1.021026,0.0,0.733458,0.0,3.097868,0.0,0.000000,0.0,0.000000
2396,54955_SSW_585,SSW,54955,585,grycat,0.0,0.000000,0.000000,0.930946,0.0,...,1.306528,1.262990,0.0,7.406854,0.0,3.685196,0.0,0.000000,0.0,1.638371
2397,54955_SSW_590,SSW,54955,590,grycat,0.0,1.534010,0.000000,1.047689,0.0,...,1.128624,2.651810,0.0,0.195695,0.0,0.000000,0.0,2.359753,0.0,1.986576
2398,54955_SSW_595,SSW,54955,595,nocall,0.0,0.016686,1.473288,4.940649,0.0,...,0.256271,0.000000,0.0,1.427245,0.0,0.000000,0.0,1.491268,0.0,4.137387


In [17]:
from sklearn.model_selection import train_test_split
# `SVRegression` was removed from this import: sklearn.linear_model has no such
# name, so the original line raised ImportError on a fresh kernel. It was unused.
from sklearn.linear_model import LogisticRegression

# Binary target: 1 where the segment is labelled "nocall", 0 otherwise.
y = (df.birds == "nocall").astype(np.uint8)
# The first five columns are row_id, site, audio_id, seconds and birds;
# everything after them is the model feature vector.
X = df.iloc[:, 5:].values

# Fixed seed so the split, and every score computed from it in later cells,
# is reproducible. Scores recorded from earlier unseeded runs may differ.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)
clf.score(X_test, y_test)

0.6

In [20]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, fitted and scored on the
# train/test split produced in the logistic-regression cell.
clf = SVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.63

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with default settings, fitted and scored on the
# train/test split produced in the logistic-regression cell.
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6216666666666667