In [1]:
from pyspark.sql import SparkSession, functions as F
import os
import sys

# Point both PySpark interpreter settings at the Python running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Driver sizing is passed through the builder; getOrCreate() returns an
# already-running session if one exists.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "20g")
    .config("spark.driver.cores", "12")
    .getOrCreate()
)

# Runtime SQL settings: shuffle partition count and Arrow for pandas conversion.
spark.conf.set("spark.sql.shuffle.partitions", 64)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Bare expression so the notebook renders the session summary.
spark

In [2]:
# List the contents of the input data directory (shell escape).
! ls ../data/input

birdclef-2021.zip
sample_submission.csv
test.csv
test_soundscapes
train_metadata.csv
train_short_audio
train_soundscape_labels.csv
train_soundscapes


In [None]:
from pathlib import Path

# Collect every .ogg file under train_short_audio as an absolute POSIX path.
# glob() yields entries in filesystem order, so sort to make the input order
# reproducible across machines and runs.
paths = sorted(
    p.resolve().as_posix()
    for p in Path("../data/input/train_short_audio").glob("**/*.ogg")
)

# One-column DataFrame of file paths. The explicit schema avoids inferring
# types from Python dicts and still works when the glob matches nothing.
df = spark.createDataFrame([(p,) for p in paths], schema="path: string").cache()
df.show(truncate=False, n=5)

TODO: read out parsed metadata using a UDF

In [4]:
# Number of rows in df, i.e. how many .ogg paths the glob above collected.
df.count()

62874

In [42]:
import librosa
import numpy as np
from simple_mp.simple import simple_fast
from birdclef.utils import cens_per_sec, compute_offset

@F.udf(returnType="""
    struct<
        mp: array<float>, 
        pi: array<short>,
        motif_0: short,
        motif_1: short,
        duration_cens: short,
        duration_seconds: float,
        duration_samples: int
    >
""")
def compute_simple(path, cens_sr=10, mp_window=50):
    """Compute a matrix-profile summary for a single audio file.

    The file is loaded with ``librosa.load``, converted to CENS chroma
    features, and self-joined with ``simple_fast`` to obtain a matrix
    profile. The position of the profile minimum and its paired index are
    reported as the motif.

    Args:
        path: Location of the audio file to load.
        cens_sr: Passed to ``cens_per_sec`` together with the audio sample
            rate to derive the CENS hop length (presumably the number of
            CENS frames per second -- confirm against ``birdclef.utils``).
        mp_window: Window length handed to ``simple_fast``, measured in
            CENS frames.

    Returns:
        dict matching the UDF struct schema:
            mp: matrix profile values returned by ``simple_fast``.
            pi: index array returned by ``simple_fast``.
            motif_0: index of the minimum of ``mp``.
            motif_1: ``pi`` entry at ``motif_0``.
            duration_cens: number of CENS frames (columns of the feature
                matrix).
            duration_seconds: clip duration in seconds, rounded to 2 places.
            duration_samples: number of audio samples loaded.
    """
    data, sample_rate = librosa.load(path)

    # Pass the audio and sample rate by keyword: recent librosa releases
    # reject positional arguments here, while keywords work on old and new
    # versions alike.
    cens = librosa.feature.chroma_cens(
        y=data, sr=sample_rate, hop_length=cens_per_sec(sample_rate, cens_sr)
    )

    # Self-join of the CENS features. With the defaults (window of 50 frames,
    # cens_sr of 10) the window presumably covers about 5 seconds.
    # NOTE(review): clips with fewer than mp_window CENS frames are not
    # special-cased here -- confirm how simple_fast handles them.
    mp, pi = simple_fast(cens, cens, mp_window)
    motif = np.argmin(mp)

    # Convert numpy values to plain Python types for the Spark struct.
    return dict(
        mp=mp.tolist(),
        pi=pi.tolist(),
        motif_0=int(motif),
        motif_1=int(pi[motif]),
        duration_seconds=round(
            librosa.get_duration(y=data, sr=sample_rate), 2
        ),
        duration_cens=cens.shape[1],
        duration_samples=data.shape[0]
    )

In [45]:
# Smoke-test the UDF on 20 rows and flatten the returned struct into columns.
res = (
    df.limit(20)
    .select(compute_simple(F.col("path")).alias("data"))
    .select(F.col("data.*"))
)
# Show a single record vertically, clipping long array cells at 80 characters.
res.show(1, truncate=80, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------
 mp               | [0.61007583, 0.5870274, 0.56637144, 0.54786026, 0.5313231, 0.51626456, 0.5029... 
 pi               | [471, 472, 473, 474, 475, 475, 476, 477, 478, 479, 479, 324, 325, 326, 327, 3... 
 motif_0          | 100                                                                              
 motif_1          | 58                                                                               
 duration_cens    | 646                                                                              
 duration_seconds | 63.67                                                                            
 duration_samples | 1404025                                                                          
only showing top 1 row



In [46]:
# Run the UDF over every path and persist the path plus the flattened struct
# fields as parquet. No save mode is specified, so Spark's default applies
# (the write fails if the target directory already exists).
(
    df.select(F.col("path"), compute_simple(F.col("path")).alias("data"))
    .select(F.col("path"), F.col("data.*"))
    .write.format("parquet")
    .save("../data/motif_v2")
)