In [11]:
import time
import gc
import json
import glob
from typing import Sequence

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from numpy.random import SeedSequence
from sklearn.model_selection import KFold
from sklearn import metrics
from reservoirpy import ESN
from reservoirpy.mat_gen import generate_input_weights, generate_internal_weights
from reservoirpy.datasets import mackey_glass

import sys
sys.path.insert(0, "..")

from extract import fetch

In [2]:
dataset = fetch("marron1", path="../data")

In [83]:
df = dataset.df
df = df[df["wave"] == list(df.groupby("wave").groups.keys())[0]].copy()
df = df[df["syll"] != "SIL"].copy()
df = df.drop(["repertoire_file"], axis=1).reset_index(drop=True)

In [84]:
df["start_d"] = df["start"].shift(-1)
df

Unnamed: 0,wave,start,end,syll,start_d
0,100_marron1_May_24_2016_62101389.wav,0.35,0.664,call,1.359
1,100_marron1_May_24_2016_62101389.wav,1.359,2.412,Z,2.488
2,100_marron1_May_24_2016_62101389.wav,2.488,2.773,T,2.969
3,100_marron1_May_24_2016_62101389.wav,2.969,4.398,U,4.398
4,100_marron1_May_24_2016_62101389.wav,4.398,4.695,J2,4.835
5,100_marron1_May_24_2016_62101389.wav,4.835,5.697977,B1,5.697977
6,100_marron1_May_24_2016_62101389.wav,5.697977,6.770899,B2,6.768
7,100_marron1_May_24_2016_62101389.wav,6.768,6.925,R,7.059
8,100_marron1_May_24_2016_62101389.wav,7.059,7.104,J1,7.104
9,100_marron1_May_24_2016_62101389.wav,7.104,8.009,J1,8.009


In [85]:
df["diff"] = df["start_d"] - df["end"]
df

Unnamed: 0,wave,start,end,syll,start_d,diff
0,100_marron1_May_24_2016_62101389.wav,0.35,0.664,call,1.359,0.695
1,100_marron1_May_24_2016_62101389.wav,1.359,2.412,Z,2.488,0.076
2,100_marron1_May_24_2016_62101389.wav,2.488,2.773,T,2.969,0.196
3,100_marron1_May_24_2016_62101389.wav,2.969,4.398,U,4.398,0.0
4,100_marron1_May_24_2016_62101389.wav,4.398,4.695,J2,4.835,0.14
5,100_marron1_May_24_2016_62101389.wav,4.835,5.697977,B1,5.697977,0.0
6,100_marron1_May_24_2016_62101389.wav,5.697977,6.770899,B2,6.768,-0.002899
7,100_marron1_May_24_2016_62101389.wav,6.768,6.925,R,7.059,0.134
8,100_marron1_May_24_2016_62101389.wav,7.059,7.104,J1,7.104,0.0
9,100_marron1_May_24_2016_62101389.wav,7.104,8.009,J1,8.009,0.0


In [86]:
first_consecutives = df[(df["syll"].shift(-1) == df["syll"])
                        & ~(df["syll"].isin(["call"]))
                        & (df["diff"] < 0.01)
                        & (df["diff"] != np.nan)]

In [87]:
first_consecutives


Unnamed: 0,wave,start,end,syll,start_d,diff
8,100_marron1_May_24_2016_62101389.wav,7.059,7.104,J1,7.104,0.0
22,100_marron1_May_24_2016_62101389.wav,18.831,19.66,A,19.66,0.0
45,100_marron1_May_24_2016_62101389.wav,36.802,37.712,A,37.712,0.0


In [88]:
df_c = df.copy()
for first in first_consecutives.itertuples():
    next = df.index[first.Index + 1]
    df_c.at[next, "start"] = first.start
    df_c = df_c.drop(first.Index, axis=0)

df_c


Unnamed: 0,wave,start,end,syll,start_d,diff
0,100_marron1_May_24_2016_62101389.wav,0.35,0.664,call,1.359,0.695
1,100_marron1_May_24_2016_62101389.wav,1.359,2.412,Z,2.488,0.076
2,100_marron1_May_24_2016_62101389.wav,2.488,2.773,T,2.969,0.196
3,100_marron1_May_24_2016_62101389.wav,2.969,4.398,U,4.398,0.0
4,100_marron1_May_24_2016_62101389.wav,4.398,4.695,J2,4.835,0.14
5,100_marron1_May_24_2016_62101389.wav,4.835,5.697977,B1,5.697977,0.0
6,100_marron1_May_24_2016_62101389.wav,5.697977,6.770899,B2,6.768,-0.002899
7,100_marron1_May_24_2016_62101389.wav,6.768,6.925,R,7.059,0.134
9,100_marron1_May_24_2016_62101389.wav,7.059,8.009,J1,8.009,0.0
10,100_marron1_May_24_2016_62101389.wav,8.009,8.432,J2,8.617,0.185


In [5]:
def join(df):
    df = df.reset_index(drop=True)
    df["start_d"] = df["start"].shift(-1)
    df["diff"] = df["start_d"] - df["end"]
    first_consecutives = df[(df["syll"].shift(-1) == df["syll"])
                            & ~(df["syll"].isin(["call"]))
                            & (df["diff"] < 0.01)
                            & (df["diff"] != np.nan)]

    df_c = df.copy()
    for first in first_consecutives.itertuples():
        next = df.index[first.Index + 1]
        df_c.at[next, "start"] = first.start
        df_c = df_c.drop(first.Index, axis=0)

    return df_c.drop(["start_d", "diff"], axis=1)

In [9]:
def par_join():
    df = dataset.df
    df = df[df["syll"] != "SIL"].copy()
    df = df.drop(["repertoire_file"], axis=1).reset_index(drop=True)
    songs = list(df.groupby("wave").groups.keys())
    with Parallel(n_jobs=-1) as parallel:
        dfs = parallel(delayed(join)(df[df["wave"] == s].copy()) for s in songs)
    return pd.concat(dfs)

In [12]:
%timeit par_join()

902 ms ± 36.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
def seq_join():
    df = dataset.df
    df = df[df["syll"] != "SIL"].copy()
    df = df.drop(["repertoire_file"], axis=1).reset_index(drop=True)
    songs = list(df.groupby("wave").groups.keys())
    dfs = []
    for s in songs:
        dfs.append(join(df[df["wave"] == s].copy()))
    return pd.concat(dfs)

In [15]:
%timeit seq_join()

1.82 s ± 5.92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
a = glob.glob("../data/**/*.wav", recursive=True)
len(a)

17130

In [38]:
df[df["syll"] == "TRASH"].iloc[:100]

Unnamed: 0,wave,start,end,syll
216,106_marron1_May_25_2016_22908212.wav,25.422000,25.464000,TRASH
249,107_marron1_May_25_2016_24672389.wav,8.792000,8.862000,TRASH
254,107_marron1_May_25_2016_24672389.wav,11.177000,11.241000,TRASH
265,107_marron1_May_25_2016_24672389.wav,17.656000,17.724000,TRASH
475,112_marron1_May_25_2016_42727029.wav,14.761000,14.840000,TRASH
...,...,...,...,...
13832,526_marron1_June_08_2016_35860114.wav,6.786465,6.918884,TRASH
14249,549_marron1_June_15_2016_58318268.wav,7.746502,7.851333,TRASH
14284,551_marron1_June_15_2016_60066808.wav,10.052796,10.207285,TRASH
14297,552_marron1_June_15_2016_71086510.wav,11.581130,11.724584,TRASH
