In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def slice_cens(x, k=50):
    """Cut a 2-D array into consecutive, non-overlapping windows along axis 1.

    With the default ``k=50`` this matches the original note of "5 second
    slices" (presumably at 10 frames per second -- confirm against the
    CENS sample rate used upstream).

    Parameters
    ----------
    x : numpy.ndarray
        2-D array; windows are taken along the second axis.
    k : int
        Number of columns per window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, x.shape[0], k)``. Trailing columns that
        do not fill a complete window are discarded. When ``x`` has fewer
        than ``k`` columns the result has zero windows.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive number of frames")
    n_frames = x.shape[1]
    # Start offsets of every window that fits entirely inside the array. The
    # "+ 1" keeps the final window when n_frames is an exact multiple of k.
    starts = np.arange(0, n_frames - k + 1, k)
    # Integer index matrix of shape (n_windows, k); it stays integer-typed
    # even when there are no windows, so the fancy indexing below is valid.
    indexes = starts[:, None] + np.arange(k)[None, :]
    return np.transpose(x[:, indexes], [1, 0, 2])

def with_metadata(row, seconds=5):
    """Pair each item of ``row`` with a running time offset.

    Parameters
    ----------
    row : iterable
        Sequence of slices (for example the output of ``slice_cens``).
    seconds : int or float
        Duration attributed to each slice. Defaults to 5, the value that
        was previously hard-coded.

    Returns
    -------
    list of tuple
        ``[(seconds, row[0]), (2 * seconds, row[1]), ...]``; an empty list
        for an empty ``row``.
    """
    return [((i + 1) * seconds, item) for i, item in enumerate(row)]

def explode_cens(df):
    """Expand each row's ``cens`` array into one row per slice.

    The ``cens`` column is sliced with ``slice_cens`` and labelled with
    ``with_metadata``; each ``(seconds, snippet)`` pair then becomes its own
    row with ``seconds`` and ``snippet`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cens`` column. It is not modified: the temporary
        ``_section`` column is attached to a copy, not to the caller's frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per slice and the added ``seconds`` and
        ``snippet`` columns.
    """
    sections = df["cens"].apply(slice_cens).apply(with_metadata)
    # assign() returns a new frame, so the helper column never leaks into df.
    exploded = df.assign(_section=sections).explode("_section")
    exploded["seconds"] = exploded["_section"].apply(lambda pair: pair[0])
    exploded["snippet"] = exploded["_section"].apply(lambda pair: pair[1])
    return exploded.drop(columns=["_section"])


In [16]:
def slice_seconds(data, sample_rate, seconds=5, pad_seconds=0):
    """Split a 1-D signal into fixed-length windows labelled by elapsed time.

    Parameters
    ----------
    data : array-like
        Samples of the signal, indexed along the first axis.
    sample_rate : int
        Number of samples per second.
    seconds : int or float
        Step between consecutive window starts, in seconds.
    pad_seconds : int or float
        Extra seconds appended to each window, so consecutive windows overlap
        by this amount.

    Returns
    -------
    list of tuple
        ``[(label, samples), ...]`` where ``label`` is ``seconds``,
        ``2 * seconds``, ... and ``samples`` holds
        ``sample_rate * (seconds + pad_seconds)`` samples. Windows that would
        run past the end of ``data`` are dropped. A signal shorter than one
        window yields an empty list.

    Raises
    ------
    ValueError
        If ``sample_rate * seconds`` is not positive.
    """
    data = np.asarray(data)
    n = len(data)
    step = int(round(sample_rate * seconds))
    window = step + int(round(sample_rate * pad_seconds))
    if step <= 0:
        raise ValueError("sample_rate * seconds must be positive")
    # Every start offset whose full (padded) window still fits inside data.
    starts = np.arange(0, n - window + 1, step)
    # Integer index matrix (n_windows, window); valid even with zero windows.
    indexes = starts[:, None] + np.arange(window)[None, :]
    indexed = data[indexes]
    # Labels follow the requested slice length rather than a fixed 5 seconds.
    labels = (np.arange(len(indexed)) + 1) * seconds
    return list(zip(labels, indexed))

In [3]:
# Load the soundscape label table (CSV) and the gzip-compressed pickled frame
# that holds the audio/CENS data.
# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
input_df = pd.read_csv("../data/input/train_soundscape_labels.csv")
cens_df = pd.read_pickle("../data/cens/train_soundscapes/data.pkl.gz")

In [19]:
# Sanity check: number of slices (default seconds=5) for the first recording.
len(slice_seconds(cens_df.data.iloc[0], cens_df.sample_rate.iloc[0]))

120

In [20]:
# Dimensions of the loaded frame as (rows, columns).
cens_df.shape

(20, 10)

In [21]:
# Slice every recording into (seconds, samples) pairs, one list per row.
# Columns are read with row["..."] rather than row.data: attribute access can
# resolve to a pandas attribute of the same name instead of the column.
cens_df["_snippet"] = cens_df[["name", "data", "sample_rate"]].apply(
    lambda row: slice_seconds(row["data"], row["sample_rate"]), axis=1
)
cens_df["_snippet"]

0     [(5, [-0.07781942, -0.106421225, -0.09363813, ...
1     [(5, [-0.03685273, 0.0034371414, 0.0536922, -0...
2     [(5, [0.00047344487, 0.0012023906, -0.0017922,...
3     [(5, [0.033321597, 0.04495884, 0.04248728, 0.0...
4     [(5, [0.10134773, 0.14713703, 0.13197538, 0.13...
5     [(5, [0.032651156, 0.05079725, 0.030774817, 0....
6     [(5, [-0.078987055, -0.13539001, -0.114788085,...
7     [(5, [0.010165007, -0.003903959, -0.027708258,...
8     [(5, [-0.0026152704, -0.0027552508, -0.0089982...
9     [(5, [0.012256168, 0.02170331, 0.023253325, 0....
10    [(5, [-0.023844557, -0.02762556, -0.026266357,...
11    [(5, [0.04984007, 0.07124069, 0.061214574, 0.0...
12    [(5, [-0.10449072, -0.12700799, -0.08835665, -...
13    [(5, [0.057847183, 0.068220384, 0.080597214, 0...
14    [(5, [0.35199475, 0.518896, 0.44457597, 0.4652...
15    [(5, [0.11415207, 0.15958197, 0.122738324, 0.1...
16    [(5, [0.07652252, 0.0526967, -0.0141702555, 0....
17    [(5, [-0.074927814, -0.070623636, -0.01281

In [26]:
# Display the label frame read from train_soundscape_labels.csv.
input_df

Unnamed: 0,row_id,site,audio_id,seconds,birds
0,7019_COR_5,COR,7019,5,nocall
1,7019_COR_10,COR,7019,10,nocall
2,7019_COR_15,COR,7019,15,nocall
3,7019_COR_20,COR,7019,20,nocall
4,7019_COR_25,COR,7019,25,nocall
...,...,...,...,...,...
2395,54955_SSW_580,SSW,54955,580,nocall
2396,54955_SSW_585,SSW,54955,585,grycat
2397,54955_SSW_590,SSW,54955,590,grycat
2398,54955_SSW_595,SSW,54955,595,nocall


In [32]:
# One row per (recording, slice): each exploded "_snippet" cell is a
# (seconds, samples) tuple that gets unpacked into its own columns.
exploded = cens_df.explode("_snippet")
exploded["seconds"] = exploded["_snippet"].map(lambda pair: pair[0])
exploded["snippet"] = exploded["_snippet"].map(lambda pair: pair[1])

# The recording name is underscore-separated: the first token is the audio
# id and the second token is the site code.
name_parts = exploded["name"].str.split("_")
exploded["site"] = name_parts.str[1]
exploded["audio_id"] = name_parts.str[0].astype(int)

# Attach the labels to the matching slice of the matching recording.
tx_df = input_df.merge(exploded, on=["site", "audio_id", "seconds"])
tx_df

Unnamed: 0,row_id,site,audio_id,seconds,birds,name,parent,data,sample_rate,cens,cens_sample_rate,path,snippet,_snippet
0,7019_COR_5,COR,7019,5,nocall,7019_COR_20190904,train_soundscapes,"[-0.074927814, -0.070623636, -0.012813405, -0....",22050,"[[0.29982088833602566, 0.2964565244514178, 0.2...",10,data/input/train_soundscapes/7019_COR_20190904...,"[-0.074927814, -0.070623636, -0.012813405, -0....","(5, [-0.074927814, -0.070623636, -0.012813405,..."
1,7019_COR_10,COR,7019,10,nocall,7019_COR_20190904,train_soundscapes,"[-0.074927814, -0.070623636, -0.012813405, -0....",22050,"[[0.29982088833602566, 0.2964565244514178, 0.2...",10,data/input/train_soundscapes/7019_COR_20190904...,"[0.012558174, -0.0062861964, -0.0106991865, 0....","(10, [0.012558174, -0.0062861964, -0.010699186..."
2,7019_COR_15,COR,7019,15,nocall,7019_COR_20190904,train_soundscapes,"[-0.074927814, -0.070623636, -0.012813405, -0....",22050,"[[0.29982088833602566, 0.2964565244514178, 0.2...",10,data/input/train_soundscapes/7019_COR_20190904...,"[0.0211886, 0.05153161, 0.0547787, 0.06838079,...","(15, [0.0211886, 0.05153161, 0.0547787, 0.0683..."
3,7019_COR_20,COR,7019,20,nocall,7019_COR_20190904,train_soundscapes,"[-0.074927814, -0.070623636, -0.012813405, -0....",22050,"[[0.29982088833602566, 0.2964565244514178, 0.2...",10,data/input/train_soundscapes/7019_COR_20190904...,"[-0.09674744, -0.12779698, -0.11827926, -0.110...","(20, [-0.09674744, -0.12779698, -0.11827926, -..."
4,7019_COR_25,COR,7019,25,nocall,7019_COR_20190904,train_soundscapes,"[-0.074927814, -0.070623636, -0.012813405, -0....",22050,"[[0.29982088833602566, 0.2964565244514178, 0.2...",10,data/input/train_soundscapes/7019_COR_20190904...,"[0.059325323, 0.049653903, 0.06464528, 0.06322...","(25, [0.059325323, 0.049653903, 0.06464528, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,54955_SSW_580,SSW,54955,580,nocall,54955_SSW_20170617,train_soundscapes,"[0.11415207, 0.15958197, 0.122738324, 0.121970...",22050,"[[0.22879103798263076, 0.22964307854049798, 0....",10,data/input/train_soundscapes/54955_SSW_2017061...,"[-0.03339911, -0.02001738, -0.017174277, -0.01...","(580, [-0.03339911, -0.02001738, -0.017174277,..."
2396,54955_SSW_585,SSW,54955,585,grycat,54955_SSW_20170617,train_soundscapes,"[0.11415207, 0.15958197, 0.122738324, 0.121970...",22050,"[[0.22879103798263076, 0.22964307854049798, 0....",10,data/input/train_soundscapes/54955_SSW_2017061...,"[0.1398685, 0.14535491, 0.14392486, 0.14465475...","(585, [0.1398685, 0.14535491, 0.14392486, 0.14..."
2397,54955_SSW_590,SSW,54955,590,grycat,54955_SSW_20170617,train_soundscapes,"[0.11415207, 0.15958197, 0.122738324, 0.121970...",22050,"[[0.22879103798263076, 0.22964307854049798, 0....",10,data/input/train_soundscapes/54955_SSW_2017061...,"[-0.008788381, -0.017808164, -0.021097202, -0....","(590, [-0.008788381, -0.017808164, -0.02109720..."
2398,54955_SSW_595,SSW,54955,595,nocall,54955_SSW_20170617,train_soundscapes,"[0.11415207, 0.15958197, 0.122738324, 0.121970...",22050,"[[0.22879103798263076, 0.22964307854049798, 0....",10,data/input/train_soundscapes/54955_SSW_2017061...,"[0.056973755, 0.033190306, 0.017808143, 0.0071...","(595, [0.056973755, 0.033190306, 0.017808143, ..."


In [29]:
import IPython.display as ipd

In [34]:
# Listen to five randomly chosen snippets, printing each label first.
preview_columns = ["birds", "snippet", "sample_rate"]
for label, audio, rate in tx_df.sample(5)[preview_columns].itertuples(index=False, name=None):
    print(label)
    ipd.display(ipd.Audio(audio, rate=rate))

obnthr1


nocall


nocall


nocall


nocall


In [35]:
from pyspark.sql import SparkSession, functions as F
import os
import sys

# Set both PySpark interpreter variables to the interpreter running this
# kernel (sys.executable). They are assigned before the session is built.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
# Get or create the session, requesting 12g for spark.driver.memory.
spark = SparkSession.builder.config("spark.driver.memory", "12g").getOrCreate()
# Session-level SQL settings: 64 shuffle partitions, and the
# spark.sql.execution.arrow.pyspark.enabled option switched on.
spark.conf.set("spark.sql.shuffle.partitions", 64)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Echo the session object as the cell output.
spark

In [36]:
# Move only the columns needed downstream into Spark, spread over 64
# partitions, then peek at two rows.
data_df = spark.createDataFrame(
    tx_df.loc[:, ["snippet", "sample_rate", "birds"]]
).repartition(64)
data_df.show(2)

+--------------------+-----------+------+
|             snippet|sample_rate| birds|
+--------------------+-----------+------+
|[-0.01226425, 0.0...|      22050|nocall|
|[0.06923631, 0.05...|      22050|nocall|
+--------------------+-----------+------+
only showing top 2 rows



In [37]:
# Mark data_df for caching; cache() returns the DataFrame, whose schema is
# echoed as the cell output.
data_df.cache()

DataFrame[snippet: array<float>, sample_rate: bigint, birds: string]

In [44]:
# Count the rows whose label is exactly 'nocall'.
data_df.where("birds = 'nocall'").count()

1529

In [51]:
# Binary target: 1 when the label is anything other than 'nocall', else 0.
data_df = data_df.withColumn(
    "is_call",
    (F.col("birds") != "nocall").cast("int"),
)
data_df

DataFrame[snippet: array<float>, sample_rate: bigint, birds: string, is_call: int]

In [66]:
# Scratch check: draw one sample from Beta(0.4, 0.4), the same distribution
# and parameters that mixup uses by default for its mixing weight.
np.random.beta(0.4, 0.4)

0.9945245999656829

In [90]:
from pyspark.sql import Window

def _mixup_py(x1, x2, y1, y2, alpha=0.4):
    """Blend two samples and their labels with a random Beta-distributed weight.

    Parameters
    ----------
    x1, x2 : list of float
        The two signals to blend; they must have the same length.
    y1, y2 : number
        Labels belonging to ``x1`` and ``x2``.
    alpha : float
        Both shape parameters of the Beta distribution the weight is drawn
        from.

    Returns
    -------
    dict
        ``{"x": blended signal, "y": blended label}``, or an empty dict when
        either signal is missing or empty (NOTE(review): Spark is expected to
        turn the empty dict into null struct fields -- confirm).
    """
    if not x1 or not x2:
        return {}
    a = np.random.beta(alpha, alpha)
    mixed = a * np.array(x1) + (1 - a) * np.array(x2)
    return dict(x=mixed.tolist(), y=float(a * y1 + (1 - a) * y2))


# The function draws a random weight on every call, so it is registered as
# non-deterministic: Spark treats UDFs as deterministic by default and may
# otherwise eliminate or repeat invocations while optimizing the query.
mixup = F.udf(_mixup_py, "struct<x:array<float>,y:float>").asNondeterministic()

def mixup_df(df):
    """Blend every row with a randomly chosen partner row using ``mixup``.

    Rows are put in random order and each one is paired with the row that
    follows it. The last row of that ordering has no partner and is dropped;
    previously it was emitted with a null snippet and a null label.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame with ``snippet``, ``is_call`` and ``sample_rate`` columns.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``snippet`` (blended signal), ``is_call`` (blended label) and
        ``sample_rate``, with one row fewer than ``df``.
    """
    # NOTE(review): an un-partitioned window orders the whole frame at once;
    # fine for a small dataset, worth revisiting for a large one.
    win = Window.orderBy(F.rand())
    return (
        df
        # NOTE(review): keep both lead() calls in this single select. Splitting
        # them across separate withColumn calls could let the random ordering
        # be evaluated twice and pair a snippet with another row's label.
        .select(
            "snippet",
            "is_call",
            "sample_rate",
            F.lead("snippet").over(win).alias("_partner_snippet"),
            F.lead("is_call").over(win).alias("_partner_is_call"),
        )
        # The final row of the ordering has no following row to mix with.
        .where(F.col("_partner_snippet").isNotNull())
        .withColumn("mixup", mixup(
            "snippet",
            "_partner_snippet",
            "is_call",
            "_partner_is_call"
        ))
        .selectExpr(
            "mixup.x as snippet",
            "mixup.y as is_call",
            "sample_rate"
        )
    )

In [94]:
# Preview the mixed-up rows; show() prints the first rows as a text table.
mixup_df(data_df).show()

+--------------------+------------+-----------+
|             snippet|     is_call|sample_rate|
+--------------------+------------+-----------+
|[-0.033265773, -0...|   0.7415851|      22050|
|[-0.013325614, -0...|         0.0|      22050|
|[-0.012493401, -0...| 0.021477472|      22050|
|[-0.022360211, -0...|         1.0|      22050|
|[0.05048787, 0.07...| 0.009969188|      22050|
|[0.03943031, 0.04...|         0.0|      22050|
|[-7.075423E-4, -3...|         0.0|      22050|
|[-0.012199989, -7...|         0.0|      22050|
|[-0.10774716, -0....|0.0028944497|      22050|
|[-0.09002493, -0....|         1.0|      22050|
|[-0.038468827, -0...|         1.0|      22050|
|[-0.0079638045, -...|         1.0|      22050|
|[-0.01722483, -0....|         1.0|      22050|
|[-0.016411562, -0...|         1.0|      22050|
|[0.04299427, 0.04...|  0.85264266|      22050|
|[-0.004926498, -0...|         0.0|      22050|
|[-0.029883014, -0...|         0.0|      22050|
|[-0.020840243, -0...|  0.55861634|     