In [82]:
import IPython.display as ipd
import librosa
import librosa.display
import librosa.feature
from pydub import AudioSegment

import matplotlib.pyplot as plt
from PIL import Image

from scipy import signal

from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import from_columns
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters

import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# from sklearn_pandas import DataFrameMapper

from tqdm import tqdm

from concurrent.futures import ProcessPoolExecutor

import pickle

ROOT_PATH = Path("..")

ModuleNotFoundError: No module named 'pydub'

# Get ids, labels, and train-test splits

In [6]:
# Recording metadata plus the two id lists that define the working subset.
#   filter_ids -> recordings under 20s with quality A or B
#   svc_ids    -> "song vs call": rows labelled call or song, but not both
df = pd.read_csv(ROOT_PATH / "data/raw/metadata.csv")
filter_ids = pd.read_json(ROOT_PATH / "data/raw/filter_ids.json").squeeze()
svc_ids = pd.read_json(ROOT_PATH / "data/raw/song_vs_call.json").squeeze()

# Keep only the song-vs-call rows (as an independent copy), indexed by id.
svc_df = df.loc[df["id"].isin(svc_ids)].copy().set_index("id")

# Train/test assignment of ids, read from the processed split file.
with open(ROOT_PATH / "data/processed/svc_split.json") as svc_split_file:
    svc_split = json.load(svc_split_file)
train_ids, test_ids = svc_split["train_ids"], svc_split["test_ids"]

In [7]:
# Add response variable
# Normalise the `type` field into a list of lower-case labels with spaces
# removed and split on commas, e.g. "Call, Song" -> ["call", "song"].
type_col = svc_df["type"].str.lower().str.replace(" ", "").str.split(",")
# Labels other than call/song for each row (not used when building `pred`).
filtered_type_col = type_col.apply(
    lambda labels: set(labels).difference({"call", "song"})
)
# pred = 1 when "call" is among the row's labels, 0 otherwise.
svc_df["pred"] = type_col.apply(lambda labels: "call" in labels).astype(int)

In [8]:
## Build y train-test
# indexing all (svc_df and y_df) by id
# The response column; like svc_df it is indexed by recording id.
y_df = svc_df["pred"]
# Split by membership of the index in the pre-computed id lists.
y_train = y_df.loc[y_df.index.isin(train_ids)].squeeze()
y_test = y_df.loc[y_df.index.isin(test_ids)].squeeze()

In [None]:
y_df

## Get a random smaller subset of ids and labels to work with

In [65]:
# get a random sample of n ids from a given list of ids
def get_rand_ids(n, ids_lst, replace=False):
    """Return `n` ids drawn at random from `ids_lst`.

    Ids are drawn without replacement by default, so the result contains no
    repeats beyond those already present in `ids_lst`. Repeated ids make the
    label-based lookups further down (`y.loc[ids]`, `X.loc[ids]`) return
    repeated rows.

    Parameters
    ----------
    n : int
        Number of ids to draw.
    ids_lst : sequence
        Candidate ids (list, array or Series).
    replace : bool, default False
        Pass True to allow the same position to be drawn more than once
        (needed if `n` exceeds `len(ids_lst)`).

    Returns
    -------
    pandas array
        The `.array` of the selected ids, in draw order.

    Raises
    ------
    ValueError
        From `np.random.choice` when `replace` is False and
        `n > len(ids_lst)`.
    """
    ids_ser = pd.Series(ids_lst)
    # Draw positions (not labels) using NumPy's global RNG state.
    positions = np.random.choice(len(ids_ser), size=n, replace=replace)
    # .iloc keeps the lookup positional even if ids_lst carried its own index.
    return ids_ser.iloc[positions].array

In [70]:
# random subset of train
# Draw a small random subset (10 ids each) of the train and test ids and look
# up their labels. The two draws consume the random state in this order, so
# the statements are kept in sequence.

# random subset of train
train_rand_ids = get_rand_ids(10, train_ids)
# labels for the sampled train ids, in sample order
y_train_rand = y_train.loc[train_rand_ids]
print(y_train_rand)

# random subset of test
test_rand_ids = get_rand_ids(10, test_ids)
# labels for the sampled test ids, in sample order
y_test_rand = y_test.loc[test_rand_ids]
print(y_test_rand)

# all rand ids: train sample first, then test sample
rand_ids = np.concatenate((train_rand_ids, test_rand_ids))

id
511244    0
268852    1
360233    1
345866    1
307236    0
54753     0
513210    0
59504     1
446576    0
472218    1
Name: pred, dtype: int64
id
391159    1
71006     0
314230    0
57034     1
89171     0
143698    1
288789    1
160917    1
107700    0
453754    1
Name: pred, dtype: int64


# Featurize Audio

## Extract features using tsfresh

In [10]:
# apply butter filter
def highpass_filter(audio, sr, cutoff_hz=1000, order=3):
    """Apply a Butterworth high-pass filter to an audio signal.

    Parameters
    ----------
    audio : array-like
        Signal samples to filter.
    sr : number
        Sampling rate of `audio`; passed to `signal.butter` as `fs`, so
        `cutoff_hz` is expressed in the same units.
    cutoff_hz : number, default 1000
        Cut-off frequency of the high-pass filter.
    order : int, default 3
        Order of the Butterworth filter.

    Returns
    -------
    numpy.ndarray
        The filtered signal, same length as `audio`.
    """
    # b / a are the numerator / denominator coefficients of the filter.
    b, a = signal.butter(order, cutoff_hz, btype='highpass', fs=sr)
    return signal.lfilter(b, a, audio)

In [11]:
# unpack an mp3 or wav into df of timeseries values
def unpack_audio(id):
    """Load one recording and return it as a long-format time-series frame.

    The recording is looked up as `<id>.mp3` under `data/raw/recordings/`;
    if that file is missing, `<id>.wav` is tried instead (a missing .wav
    propagates the error). The samples are high-pass filtered with
    `highpass_filter` before being tabulated.

    Parameters
    ----------
    id : recording identifier (name kept for caller compatibility even
        though it shadows the builtin inside this function).

    Returns
    -------
    pandas.DataFrame
        Columns `id` (the identifier repeated on every row), `time`
        (0-based sample position, not seconds) and `val` (filtered sample).
    """
    stem = "data/raw/recordings/" + str(id)
    try:
        # librosa.load is called with its default arguments.
        samples, sr = librosa.load(ROOT_PATH / (stem + ".mp3"))
    except FileNotFoundError:
        samples, sr = librosa.load(ROOT_PATH / (stem + ".wav"))

    filtered = highpass_filter(samples, sr)

    # Turn the positional index into the `time` column, then prepend the id.
    frame = pd.DataFrame({'val': filtered}).rename_axis('time').reset_index()
    frame.insert(0, 'id', id)
    return frame

In [None]:
%%timeit
unpack_audio(svc_ids[0])

In [180]:
# manually select features to calculate
# features can be found here: https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.fft_aggregated
# Hand-picked tsfresh feature calculators and their parameters.
extraction_settings = {
    "abs_energy": None,
    "fft_aggregated": [{"aggtype": "centroid"}, {"aggtype": "kurtosis"}],
    "root_mean_square": None,
    "spkt_welch_density": [{"coeff": 2}, {"coeff": 5}, {"coeff": 8}],
}


def featurize_audio_manual(id):
    """Extract the hand-picked `extraction_settings` features for one id.

    Returns whatever `extract_features` returns for the single-recording
    frame produced by `unpack_audio(id)`.
    """
    timeseries_df = unpack_audio(id)
    return extract_features(
        timeseries_df,
        column_id='id',
        column_sort='time',
        default_fc_parameters=extraction_settings,
        disable_progressbar=True,
        # impute fills NaN/inf feature values rather than dropping features
        # (per the tsfresh docs), so the result contains no NaNs
        impute_function=impute,
        # turn off parallelization
        n_jobs=0,
    )

In [161]:
# Per-kind feature settings: for the `val` column, compute only these three.
kind_to_fc_parameters = {
    'val': {
        'standard_deviation': None,
        'variance': None,
        'root_mean_square': None,
    }
}


def featurize_audio_selected(id):
    """Extract the features listed in `kind_to_fc_parameters` for one id.

    Returns whatever `extract_features` returns for the single-recording
    frame produced by `unpack_audio(id)`.
    """
    timeseries_df = unpack_audio(id)
    return extract_features(
        timeseries_df,
        column_id='id',
        column_sort='time',
        kind_to_fc_parameters=kind_to_fc_parameters,
        disable_progressbar=True,
        # impute fills NaN/inf feature values rather than dropping features
        # (per the tsfresh docs), so the result contains no NaNs
        impute_function=impute,
        # turn off parallelization
        n_jobs=0,
    )

In [177]:
# featurize a single id
# try lots of features
def featurize_audio(id):
    """Extract the tsfresh `EfficientFCParameters` preset for one id.

    Other presets imported by this notebook that can be swapped in:
    `ComprehensiveFCParameters()` and `MinimalFCParameters()`.
    """
    timeseries_df = unpack_audio(id)
    return extract_features(
        timeseries_df,
        column_id='id',
        column_sort='time',
        default_fc_parameters=EfficientFCParameters(),
        disable_progressbar=True,
        # impute fills NaN/inf feature values rather than dropping features
        # (per the tsfresh docs), so the result contains no NaNs
        impute_function=impute,
        # turn off parallelization
        n_jobs=0,
    )

In [181]:
# featurize dataset
# returns df of all
def featurize_set(ids, feat_func):
    """Featurize every id in `ids` and stack the results into one frame.

    Parameters
    ----------
    ids : iterable
        Recording ids; each one is passed to `feat_func`.
    feat_func : callable
        Maps one id to a DataFrame of features (e.g. one of the
        `featurize_audio*` functions).

    Returns
    -------
    pandas.DataFrame
        The per-id frames concatenated in input order, keeping their own
        index. An empty DataFrame when `ids` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies everything accumulated so far on every iteration.
    frames = [feat_func(rec_id) for rec_id in ids]
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    return pd.concat(frames)

### Test on 5%

In [155]:
# Draw a random 5% of the train ids and 5% of the test ids, with labels.
# The two draws consume the random state in this order.

# random subset of train (sample size = 5% of len(train_ids), truncated)
train_rand_ids = get_rand_ids(int(len(train_ids)*.05), train_ids)
# labels for the sampled train ids, in sample order
y_train_rand = y_train.loc[train_rand_ids]
# print(y_train_rand)

# random subset of test (sample size = 5% of len(test_ids), truncated)
test_rand_ids = get_rand_ids(int(len(test_ids)*.05), test_ids)
# labels for the sampled test ids, in sample order
y_test_rand = y_test.loc[test_rand_ids]
# print(y_test_rand)

# all rand ids: train sample first, then test sample
rand_ids = np.concatenate((train_rand_ids, test_rand_ids))

In [None]:
# featurize 5% of the dataset
# takes 3 mins for manual, 5% of dataset
# takes 168.2 s for minimal, 5% of dataset
# takes 169s for selected (3 features), 5% of dataset
X_df_rand = featurize_set(rand_ids, featurize_audio_manual)

In [None]:
# takes 1 hr
X_df = featurize_set(svc_ids, featurize_audio_manual)

In [185]:
# Persist the extracted feature table as indented, column-oriented JSON.
X_df.to_json(
    ROOT_PATH / "data/processed/audio_features_manual.json",
    orient='columns',
    indent=2,
)

### Train-Test split features

In [171]:
# get train-test split of features

# Select feature rows by membership of their index (the recording id) in the
# sampled id lists; row order follows X_df_rand.
# No .squeeze() here: it would collapse a one-column (or one-row) feature
# table into a Series/scalar, and scikit-learn estimators expect a 2-D X.
X_train_rand = X_df_rand[X_df_rand.index.isin(train_rand_ids)]
X_test_rand = X_df_rand[X_df_rand.index.isin(test_rand_ids)]

# this gives incorrect num rows :( !
# NOTE(review): likely cause - if the sampled id lists contain repeated ids
# (e.g. drawn with replacement), X_df_rand has repeated index labels and
# .loc returns every matching row once per requested label.
# X_train_rand = X_df_rand.loc[train_rand_ids,:]
# X_test_rand = X_df_rand.loc[test_rand_ids,:]

In [10]:
audio_df = pd.concat(audio_list, ignore_index=True) #crashes jupyter

In [60]:
# takes 5h
# Build one long frame of all recordings, appending to audio_df in batches of
# `buffer_size` so that the growing frame is re-copied once per batch rather
# than once per recording.
audio_df = pd.DataFrame(columns = ['id','time','val'])
data_size = len(svc_ids)
buffer_size = 10
df_buffer = []
for audio_id in tqdm(svc_ids):
    df_buffer.append(unpack_audio(audio_id))
    if len(df_buffer) == buffer_size:
        # buffer full: flush it into audio_df and start a new batch
        audio_df = pd.concat([audio_df] + df_buffer, ignore_index=True)
        df_buffer = []

# Flush whatever is still buffered. The previous version only flushed when
# data_size % buffer_size != 0, which silently dropped the final full batch
# whenever the number of ids was an exact multiple of buffer_size.
if df_buffer:
    audio_df = pd.concat([audio_df] + df_buffer, ignore_index=True)
    df_buffer = []

 17%|█▋        | 980/5800 [50:17<4:07:18,  3.08s/it]


KeyboardInterrupt: 

In [62]:
# takes 2h
# Simplest variant: append each recording's frame to audio_df as it is
# unpacked. Every pd.concat call builds a new frame from the accumulated
# audio_df plus the new recording, so the cost per iteration grows with the
# amount of data already collected.
audio_df = pd.DataFrame(columns = ['id','time','val'])
for audio_id in tqdm(svc_ids):  
    audio_df = pd.concat([audio_df,unpack_audio(audio_id)], ignore_index=True)

  1%|▏         | 75/5800 [01:23<1:46:21,  1.11s/it]


KeyboardInterrupt: 

In [8]:
# 1h20m
# ~80GB
# Write each recording's time series to its own indented JSON file.
# NOTE(review): the indented, column-oriented JSON is very verbose; the run
# recorded below stopped with "No space left on device".
timeseries_dir = ROOT_PATH / "data/processed/audio_timeseries"
# to_json does not create missing directories, so make sure the target exists
timeseries_dir.mkdir(parents=True, exist_ok=True)
for audio_id in tqdm(svc_ids):
    id_df = unpack_audio(audio_id)
    id_df.to_json(timeseries_dir / f"{audio_id}.json", indent=2, orient='columns')

 47%|████▋     | 2750/5800 [31:22<34:48,  1.46it/s]


OSError: [Errno 28] No space left on device

In [11]:
audio_df.to_json(ROOT_PATH / "data/processed/audio_timeseries.json", indent=2, orient='columns')

In [12]:
## Better (?) way (this takes ~1h):
# dict[id] = df with timeseries data for that id's audio recording

# audio_dict = pd.Series(index=svc_ids.head())
audio_dict = {}
# The loop variable is named audio_id (not `id`) so the builtin `id` is not
# rebound at notebook scope for every later cell.
for audio_id in tqdm(svc_ids):
    audio_dict[audio_id] = unpack_audio(audio_id)

100%|██████████| 5800/5800 [1:06:39<00:00,  1.45it/s]


In [17]:
len(audio_dict)

5800

In [20]:
# Serialise the {id: time-series DataFrame} dict to a single pickle file,
# using the highest pickle protocol available to this interpreter.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open(ROOT_PATH / "data/processed/audio_timeseries.pkl", "wb+") as pkl_file:
    pickle.dump(audio_dict, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [154]:
# Draw a random 5% of the train ids and 5% of the test ids, with labels.
# Re-running this cell draws a new sample and overwrites the *_rand names.

# random subset of train (sample size = 5% of len(train_ids), truncated)
train_rand_ids = get_rand_ids(int(len(train_ids)*.05), train_ids)
# labels for the sampled train ids, in sample order
y_train_rand = y_train.loc[train_rand_ids]
# print(y_train_rand)

# random subset of test (sample size = 5% of len(test_ids), truncated)
test_rand_ids = get_rand_ids(int(len(test_ids)*.05), test_ids)
# labels for the sampled test ids, in sample order
y_test_rand = y_test.loc[test_rand_ids]
# print(y_test_rand)

# all rand ids: train sample first, then test sample
rand_ids = np.concatenate((train_rand_ids, test_rand_ids))

In [None]:
# featurize 5% of the dataset
# takes 3 mins for manual, 5% of dataset
# takes 168.2 s for minimal, 5% of dataset
# featurize_set requires the per-id feature function as its second argument;
# pass featurize_audio / featurize_audio_selected instead to try other sets.
X_df_rand = featurize_set(rand_ids, featurize_audio_manual)

In [170]:
# get train-test split of features

# Select feature rows by membership of their index (the recording id) in the
# sampled id lists; row order follows X_df_rand.
# No .squeeze() here: it would collapse a one-column (or one-row) feature
# table into a Series/scalar, and scikit-learn estimators expect a 2-D X.
X_train_rand = X_df_rand[X_df_rand.index.isin(train_rand_ids)]
X_test_rand = X_df_rand[X_df_rand.index.isin(test_rand_ids)]

# this gives incorrect num rows :( !
# NOTE(review): likely cause - if the sampled id lists contain repeated ids
# (e.g. drawn with replacement), X_df_rand has repeated index labels and
# .loc returns every matching row once per requested label.
# X_train_rand = X_df_rand.loc[train_rand_ids,:]
# X_test_rand = X_df_rand.loc[test_rand_ids,:]

In [173]:
# Baseline classifier with scikit-learn's default settings, fitted on the
# random-subset features and labels (the full-data fit is left commented out).
# NOTE(review): X_train_rand and y_train_rand must have the same number of
# rows in the same order - confirm after any change to the sampling cells.
lr = LogisticRegression()
# lr.fit(X_train, y_train)
lr.fit(X_train_rand, y_train_rand)

LogisticRegression()

In [174]:
# print(lr.score(X_test, y_test))
print(lr.score(X_test_rand, y_test_rand))

0.4861111111111111


Test-subset score (`lr.score`) on the random 5% sample, by feature set:

- selected (standard_deviation, variance, root_mean_square): 0.486111
- minimal: 0.55
- manual: 0.56

In [None]:
audio_list

In [119]:
kind_to_fc_parameters

{'val': {'standard_deviation': None,
  'variance': None,
  'root_mean_square': None}}

In [None]:
# do extract_features in parallel - either using tsfresh n_jobs, or ProcessPoolExecutor
# def parallel_extract_features(id_set, params):
#     # params are a dict of parameters for extract_features
#     with ProcessPoolExecutor(max_workers=10) as ppe:
#         pd.concat(
#             list)
#                 tqdm(
#                     ppe.map(lambda p: extract_features(*p), download_set), total=len(id_set)
#                 )
#             )
#         )

In [None]:
# have ts-fresh figure out which features are good
# presets can be found here: https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.settings.ComprehensiveFCParameters
# with more details here: https://tsfresh.readthedocs.io/en/latest/_modules/tsfresh/feature_extraction/settings.html#MinimalFCParameters

# X_df_autofilt = extract_relevant_features(audio_df_head, y_train.loc[rand_ids], 
# # X_df_autofilt = extract_relevant_features(audio_df_head, y_train.iloc[rand_idx], 
# # X_df_autofilt = extract_relevant_features(audio_df, y_train, 
#                                             column_id='id', column_sort='time', 
#                                             default_fc_parameters=MinimalFCParameters(), n_jobs=0)
# get X and y
# NOTE(review): X_extracted and feat_select_ids are not defined in any cell
# shown in this notebook; they must exist in the kernel before this runs.
y_for_selected = y_train.loc[feat_select_ids]
# Pass the labels built on the line above (the bare name `y` is never
# defined in this notebook).
# presumably X_extracted is indexed by the same ids as y_for_selected - verify
X_selected = select_features(X_extracted, y_for_selected)

# get a dictionary of the good features parameters, to use again later
kind_to_fc_parameters = from_columns(X_selected)

## Extract selected features

In [None]:
# extract the selected features from full dataset
# kind_to_fc_parameters maps each time-series kind (here 'val') to its own
# feature settings, so it is passed via the kind_to_fc_parameters argument -
# the same way featurize_audio_selected does - not as default_fc_parameters,
# which expects a flat {feature_name: params} mapping.
X_df = extract_features(audio_df, column_id='id', column_sort='time',
                     kind_to_fc_parameters=kind_to_fc_parameters,
                     # impute fills NaN/inf feature values rather than
                     # dropping features (per the tsfresh docs)
                     impute_function=impute)

## Map Features

In [None]:
# feature_mapper = DataFrameMapper(
#     [
#         ("id", None),
#         (["gen"], OneHotEncoder()),
#         (["sp"], OneHotEncoder()),
#         (["ssp"], OneHotEncoder()),
#         (["en"], OneHotEncoder()),
#         (["lat"], [MinMaxScaler(), SimpleImputer()]),
#         (["lng"], [MinMaxScaler(), SimpleImputer()]),
#         (["gender"], OneHotEncoder()),
#         (["age"], OneHotEncoder()),
#     ],
#     df_out=True,
# )

In [None]:
# X_feat_df = feature_mapper.fit_transform(X_df)
# X_train, X_test = (
#     X_df[X_df.id.isin(train_ids)].drop(columns=["id"]),
#     X_df[X_df.id.isin(test_ids)].drop(columns=["id"]),
# )

## Model

In [172]:
# Baseline classifier with scikit-learn's default settings, fitted on the
# random-subset features and labels (the full-data fit is left commented out).
# NOTE(review): this repeats the fit from the earlier cell; the result
# depends on whichever X_train_rand / y_train_rand are currently in the kernel.
lr = LogisticRegression()
# lr.fit(X_train, y_train)
lr.fit(X_train_rand, y_train_rand)

LogisticRegression()

In [146]:
# print(lr.score(X_test, y_test))
print(lr.score(X_test_rand, y_test_rand))

0.3888888888888889
