# Species classification by whistles, Oswald data
# Expt 2: ENCOUNTER split;  cross-validation

# May  12, 2021

In [1]:
import numpy as np
from itertools import permutations
import random
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import pandas as pd
from os import makedirs
from datetime import datetime
from collections import Counter

from math import floor

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

# from MulticoreTSNE import MulticoreTSNE as TSNE
# from sklearn.manifold import TSNE
import seaborn as sns

from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.layers import Conv2D, Lambda, Flatten, MaxPooling2D, LSTM, ConvLSTM2D, GlobalAveragePooling2D, GlobalMaxPooling2D  # Reshape, Lambda, Concatenate
from tensorflow.keras.layers import Bidirectional, LSTM, GRU
from tensorflow.keras.models import Model, Sequential
# from tensorflow.keras.regularizers import l2
# from tensorflow.keras import backend as K

from tensorflow.keras.optimizers.schedules import ExponentialDecay, PiecewiseConstantDecay
from tensorflow.keras.optimizers import SGD,Adam
from tensorflow.keras.losses import binary_crossentropy, categorical_crossentropy  # CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
# import tensorflow_addons.layers.spatial_pyramid_pooling as spp
# import tensorflow_datasets as tfds
from tensorflow.math import l2_normalize

import tensorflow as tf

from lib_validation import DataGenerator, find_best_model
from lib_model import model_cnn14_spp, model_cnn14_attention_multi

In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Hyper parameters
learning_rate = 1.0e-3
conv_dim = 16
rnn_dim = 16
pool_size = 2
pool_stride = 2
l2_regu = 0.01
drop_rate = 0.4
hidden_units = 256
fcn_dim = 256

# learning_rate = 1.e-4
# conv_dim = 64
# rnn_dim = 16
# pool_size = 2
# pool_stride = 2
# l2_regu = 0.00
# drop_rate = 0.2
# # drop_rate = 0.5
# hidden_units = 512
# fcn_dim = 512

num_epoch = 200
# batch_size = 128
# batch_size = 32  # for cnn14+attention
batch_size = 16  # for cnn14+spp
copies_of_aug =  10

num_patience = 40

num_fold = 5

In [4]:
# data_type_dict = {1: 'universal', 2: 'file', 3: 'encounter', 4: 'domain'}
# data_type = 2

work_path = '/home/ys587/__Data/__whistle/__whislte_30_species'
fit_result_path =  os.path.join(work_path, '__fit_result_species')
# feature_path = os.path.join(work_path, '__feature_species')
feature_path = os.path.join(work_path, '__dataset/20210210')

In [5]:
species_dict = {'BD': 0, 'CD': 1, 'STR': 2, 'SPT': 3, 'SPIN': 4, 'PLT': 5, 'RT': 6,  'FKW': 7}
num_species = len(species_dict)
species_list = list(species_dict.keys())
species_id = list(species_dict.values())

In [7]:
## Models

In [11]:
# Model compile, class weight & fitting

## Features

In [13]:
feature_path = '/home/ys587/__Data/__whistle/__whislte_30_species/__dataset/__feature'

In [14]:
# 'all.csv', 'all_orig.npz', 'all_aug.npz'

In [15]:
# original data
fea_temp_orig = np.load(os.path.join(feature_path, 'all_orig.npz'))
feas_orig = fea_temp_orig['feas_orig']
labels_orig = fea_temp_orig['labels_orig']
print('The shape of feas_orig: ', end='')
print(feas_orig.shape)

# augmented data
fea_temp_aug = np.load(os.path.join(feature_path, 'all_aug.npz'))
feas_aug = fea_temp_aug['feas_aug']
labels_aug = fea_temp_aug['labels_aug']
print('The shape of feas_aug: ', end='')
print(feas_aug.shape)

The shape of feas_orig: (20074, 101, 128)
The shape of feas_aug: (200740, 101, 128)


In [16]:
feas_orig[1, :, :]

array([[-5.5306255e-06, -5.5306255e-06, -5.5306255e-06, ...,
        -5.5306255e-06, -5.5306255e-06, -5.5306255e-06],
       [ 6.7983616e-05, -2.1578808e-05, -9.3846163e-04, ...,
         1.3514265e-04,  1.4964730e-04, -7.9413832e-07],
       [-1.9058969e-04,  2.9266728e-04, -8.4963525e-05, ...,
         2.2916051e-05,  5.7070574e-06,  2.0603933e-05],
       ...,
       [ 1.8069613e-05, -2.1375941e-05,  3.0118985e-05, ...,
         2.9827048e-05,  2.3533221e-06,  6.4520614e-06],
       [-3.0250607e-05, -2.1315935e-05,  3.5012261e-05, ...,
         3.7479298e-05, -1.9081334e-04, -2.6004465e-04],
       [-2.0094125e-05, -1.4169634e-04,  3.3643784e-04, ...,
         1.4158267e-05, -7.2127339e-05, -5.5356446e-05]], dtype=float32)

In [17]:
feas_aug[1*10+0, :, :]

array([[-2.9602832e-05, -2.9602832e-05, -2.9602832e-05, ...,
        -2.9602832e-05, -2.9602832e-05, -2.9602832e-05],
       [-4.8709131e-05, -1.6511540e-04, -1.9601225e-04, ...,
        -2.3991024e-05, -5.6935693e-05, -6.8633824e-05],
       [-4.8636775e-05, -5.4319677e-05,  5.7387711e-05, ...,
        -2.6437801e-05, -9.7370776e-06,  7.7130687e-07],
       ...,
       [-3.5884852e-05, -4.7145946e-05, -6.9007998e-05, ...,
         8.6452183e-06,  4.7872513e-06, -5.1426055e-06],
       [-1.5889327e-06, -6.8359346e-05, -4.4359949e-05, ...,
        -1.3005194e-05, -2.0617288e-05, -2.1008937e-05],
       [-4.3875840e-05, -6.4716137e-06, -1.1660403e-05, ...,
        -2.7213528e-05,  3.3381439e-05,  7.0470451e-05]], dtype=float32)

In [18]:
# df_species = pd.read_csv(os.path.join(feature_path, 'all.csv'))
df_species = pd.read_csv(os.path.join(feature_path, 'all_species.csv'))
df_noise = pd.read_csv(os.path.join(feature_path, 'all_noise.csv'))

In [20]:
# generate data separated by encounters
# use species & encounter as keys!
species_list = []
encounter_unique = pd.unique(df_species['encounter'])
species_unique = []
for ee in encounter_unique:
    species_unique.append(df_species[df_species['encounter']==ee]['species'])
for ii in range(len(species_unique)):
    print(encounter_unique[ii])
    species_name = pd.unique(species_unique[ii])
    species_list.append(species_name[0])
# make an dataframe consisting of encounter_unique & species_list
df_encounter_species = pd.DataFrame({'encounter': encounter_unique, 'species': species_list})
# df_encounter_species.to_csv(os.path.join(dataset_path, 'encounter_species'+'.csv'), index=False)

HICEAS2002_s165
PICEAS2005_a101
PICEAS2005_a73
STAR2000_s352
STAR2000_s46
STAR2003_s494
STAR2003_s516
STAR2003_s586
STAR2000_s282
STAR2000_s302
STAR2000_s303
STAR2000_s313
STAR2000_s329
STAR2000_s338
STAR2000_s346
STAR2000_s368
STAR2000_s374
STAR2000_s375
STAR2000_s376
STAR2000_s377
STAR2000_s378
STAR2000_s48
STAR2000_s515
STAR2000_s561
STAR2003_s482
STAR2003_s489
STAR2003_s627
STAR2003_s628
STAR2003_s631
STAR2003_s640
STAR2003_s792
STAR2006_s216
PICEAS2005_a215
PICEAS2005_a245
PICEAS2005_a249
PICEAS2005_a250
PICEAS2005_a253
PICEAS2005_a57
PICEAS2005_a67
STAR2003_s776
STAR2006_s128
HICEAS2002_s219
HICEAS2002_s228
HICEAS2002_s261
HICEAS2002_s317
HICEAS2002_s318
PICEAS2005_a86
PICEAS2005_a93
STAR2000_s288
STAR2006_s142
STAR2006_s144
STAR2006_s145
STAR2006_s154
STAR2006_s156
STAR2006_s223
STAR2006_s230
HICEAS2002_s125
HICEAS2002_s167
HICEAS2002_s194
HICEAS2002_s234
HICEAS2002_s245
PICEAS2005_a178
PICEAS2005_a179
PICEAS2005_a23
PICEAS2005_a75
STAR2000_s417
STAR2006_s112
STAR2006_s153
HICEA

## Split over encounters

In [21]:
today = datetime.now()
# create a folder based on date & time
fit_result_path1 = os.path.join(fit_result_path, today.strftime('%Y%m%d_%H%M%S'))

In [22]:
label_pred_all = []
label_test_all = []

In [23]:
## fold split over encounters

In [24]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=num_fold)

In [25]:
# k-fold split
for train_set, test_set in skf.split(encounter_unique, species_list):
    print('train_set')
    print(train_set)
    print('test_set')
    print(test_set)
    
    fea_ind_orig = []
    fea_ind_aug = []
    # (a) testing
    for tt in test_set:
        encounter_curr = df_encounter_species.iloc[tt]['encounter']
        # print(encounter_curr)
        df_species_test = df_species[(df_species['encounter'] == encounter_curr)]
        # print(list(df_species_test.index))
        fea_ind_orig += list(df_species_test.index)
    
    fea_ind_orig = np.array(fea_ind_orig)
    # original features & labels
    # fea_ind_orig = np.array(df_species_test.index)
    fea_test = feas_orig[fea_ind_orig, :, :]
    label_test = labels_orig[fea_ind_orig]
    label_test = np.array([species_dict[ll] for ll in label_test])
    print('')
    print(len(test_set))
    print(fea_test.shape)
    print('')
    
    # (b) training
    for tt in train_set:
        encounter_curr = df_encounter_species.iloc[tt]['encounter']
        # print(encounter_curr)
        df_species_train = df_species[(df_species['encounter'] == encounter_curr)]
        # print(list(df_species_test.index))
        fea_ind_aug += list(df_species_train.index)
    
    fea_ind_aug = np.array(fea_ind_aug)
    fea_train = feas_aug[fea_ind_aug, :, :]
    label_train = labels_aug[fea_ind_aug]
    label_train = np.array([species_dict[ll] for ll in label_train])
    print('')
    print(len(train_set))
    print(fea_train.shape)
    print('')
    
    
#     df_species_train = df_species[(df_species['deployment'] != ee)]
#     print(df_species_train.shape)


#     for ii in test_set:
#         print(species_list[ii]+', ', end='')


train_set
[  2   3   4   5   6   7  13  14  15  16  17  18  19  20  21  22  23  24
  25  26  27  28  29  30  31  34  35  36  37  38  39  40  44  45  46  47
  48  49  50  51  52  53  54  55  58  59  60  61  62  63  64  65  66  67
  70  71  72  73  74  75  76  77  78  79  84  85  86  87  88  89  90  91
  92  93  94  95  96  97 105 106 107 108 109 110 111 112 113 114 115 116
 117 118 119 120 121 122 123 124 125 126 127 128 129 130]
test_set
[  0   1   8   9  10  11  12  32  33  41  42  43  56  57  68  69  80  81
  82  83  98  99 100 101 102 103 104]

27
(5040, 101, 128)


104
(15034, 101, 128)

train_set
[  0   1   4   5   6   7   8   9  10  11  12  18  19  20  21  22  23  24
  25  26  27  28  29  30  31  32  33  35  36  37  38  39  40  41  42  43
  47  48  49  50  51  52  53  54  55  56  57  61  62  63  64  65  66  67
  68  69  72  73  74  75  76  77  78  79  80  81  82  83  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 111 112 113 114 115
 116 117 118 119 120 121 1

In [None]:
list(df_species_test.index)

In [None]:
# for ee in ['STAR2000', 'STAR2003', 'STAR2006', 'HICEAS2002', 'PICEAS2005']:
for ee in ['STAR2000']:
    print(ee)
    
    # (a) testing
    df_species_test = df_species[(df_species['deployment'] == ee)]
    print(df_species_test.shape)
    
    # original features & labels
    fea_ind_orig = np.array(df_species_test.index)
    fea_test = feas_orig[fea_ind_orig, :, :]
    label_test = labels_orig[fea_ind_orig]
    label_test = np.array([species_dict[ll] for ll in label_test])
    print(fea_test.shape)
    
    # (b) training
    df_species_train = df_species[(df_species['deployment'] != ee)]
    print(df_species_train.shape)
    
    # original features & labels
    fea_ind_orig = np.array(df_species_train.index)
    #  fea_orig_curr = feas_orig[fea_ind_orig, :, :]
    # labels_orig_curr = labels_orig[fea_ind_orig]
    # print(fea_orig_curr.shape)

    # augmented features & labels
    fea_ind_aug = []
    for ff in list(fea_ind_orig):
        for ii in range(10):
            fea_ind_aug.append(ff*10+ii)
        
    fea_train = feas_aug[fea_ind_aug, :, :]
    label_train = labels_aug[fea_ind_aug]
    label_train = np.array([species_dict[ll] for ll in label_train])
    print(fea_train.shape)    
    
    print('feature train shape: '+str(fea_train.shape))
    print('feature test shape: '+str(fea_test.shape))
    print('label train shape: '+str(label_train.shape))
    print('label test shape: '+str(label_test.shape))

    dim_time = fea_train.shape[1]
    dim_freq = fea_train.shape[2]
    print('dim_time: '+str(dim_time))
    print('dim_freq: '+str(dim_freq))
    
    # shuffle features & labels
    fea_train, label_train = shuffle(fea_train, label_train, random_state=0)
    fea_test, label_test = shuffle(fea_test, label_test, random_state=0)
    
    fea_train = np.expand_dims(fea_train, axis=3)
    fea_test = np.expand_dims(fea_test, axis=3)
    
    fea_train, fea_validate, label_train, label_validate = train_test_split(fea_train, label_train, test_size=0.10, random_state=42+4)

    train_generator = DataGenerator(fea_train, label_train, batch_size=batch_size, num_classes=num_species)
    del fea_train
    validate_generator = DataGenerator(fea_validate, label_validate, batch_size=batch_size, num_classes=num_species)
    del fea_validate
    
    # deployment folder
    fit_result_path2 = os.path.join(fit_result_path1, ee)
    if not os.path.exists(fit_result_path2):
        makedirs(fit_result_path2)
        
    # class weight
    weights = compute_class_weight(class_weight='balanced', classes=np.unique(label_train), y=label_train)

    class_weights = dict()
    for ii in range(num_species):
        class_weights[ii] = weights[ii]


In [None]:
### Training the model
model = model_cnn14_attention_multi(dim_time, dim_freq, num_species, model_type='feature_level_attention', conv_dim=conv_dim, pool_size=pool_size, pool_stride=pool_stride, hidden_units=hidden_units, l2_regu=l2_regu, drop_rate=drop_rate)
# model = model_cnn14_spp(dim_time, dim_freq, num_species, conv_dim=conv_dim, pool_size=pool_size, pool_stride=pool_stride, hidden_units=hidden_units, l2_regu=l2_regu, drop_rate=drop_rate)
# loss = categorical_crossentropy
loss = binary_crossentropy
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate_fn), loss=loss, metrics=['accuracy'])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss=loss, metrics=['accuracy'])
model.summary()

# With classes
# history = model.fit(fea_train, to_categorical(label_train), class_weight=class_weights, validation_split=0.3, batch_size=batch_size, epochs=num_epoch, callbacks=[EarlyStopping(patience=num_patience), ModelCheckpoint(filepath=os.path.join(fit_result_path, '{epoch:02d}-{val_loss:.4f}.hdf5'), verbose=1, monitor="val_loss", save_best_only=True)])
# history = model.fit(train_generator, validation_data=validate_generator, class_weight=class_weights, epochs=num_epoch, callbacks=[EarlyStopping(patience=num_patience, monitor='val_loss', mode='min', verbose=1), TensorBoard(log_dir=fit_result_path1), ModelCheckpoint(filepath=os.path.join(fit_result_path1, '{epoch:02d}-{val_loss:.4f}.hdf5'), verbose=1, monitor="val_loss", save_best_only=True)])
# history = model.fit(train_generator, validation_data=validate_generator, class_weight=class_weights, epochs=num_epoch, callbacks=[EarlyStopping(patience=num_patience, monitor='val_loss', mode='min', verbose=1), TensorBoard(log_dir=fit_result_path2), ModelCheckpoint(filepath=os.path.join(fit_result_path2, 'epoch_{epoch:02d}_valloss_{val_loss:.4f}_valacc_{val_accuracy:.4f}.hdf5' ), verbose=1, monitor="val_loss", save_best_only=True)])
# history = model.fit(train_generator, validation_data=validate_generator, class_weight=class_weights, epochs=num_epoch, callbacks=[EarlyStopping(patience=num_patience, monitor='val_loss', mode='min', verbose=1), TensorBoard(log_dir=fit_result_path2), ModelCheckpoint(filepath=os.path.join(fit_result_path2, 'epoch_{epoch:02d}_valloss_{val_loss:.4f}_valacc_{val_accuracy:.4f}.hdf5' ), verbose=1, monitor="val_accuracy", save_best_only=True)])
history = model.fit(train_generator, validation_data=validate_generator, class_weight=class_weights, epochs=num_epoch, callbacks=[EarlyStopping(patience=num_patience, monitor='val_accuracy', mode='max', verbose=1), TensorBoard(log_dir=fit_result_path2), ModelCheckpoint(filepath=os.path.join(fit_result_path2, 'epoch_{epoch:02d}_valloss_{val_loss:.4f}_valacc_{val_accuracy:.4f}.hdf5' ), verbose=1, monitor="val_accuracy", save_best_only=True)])

In [None]:
# Testing
the_best_model, _ = find_best_model(fit_result_path2, purge=False)
model = load_model(the_best_model)
label_pred = model.predict(fea_test)

label_pred_all.append(label_pred)
label_test_all.append(label_test)

In [None]:
## STAR2003

In [None]:
## STAR2006

In [None]:
## HICEAS2002

In [None]:
## PICEAS2005

In [None]:
np.set_printoptions(linewidth=200, precision=2, suppress=True)

In [None]:
print("Confusion matrix:")
cm = confusion_matrix(label_train[:label_train_pred.shape[0]], np.argmax(label_train_pred, axis=1), labels=species_id)

print(species_list)
print('')
print(cm)
print('')

cm2 = cm*1.0
for ii in range(cm.shape[0]):
    cm_row = cm[ii, :]*1.0

    cm_row_sum = cm_row.sum()
    if cm_row_sum != 0:
        cm2[ii, :] = cm_row / cm_row_sum
    else:
        cm2[ii, :] = np.zeros(cm.shape[1])

print(cm2)

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=species_list)
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=species_list)

In [None]:
fig, ax = plt.subplots(figsize=[15, 15])
disp.plot(include_values=True,
                     cmap='viridis', ax=ax, xticks_rotation='horizontal',
                     values_format=None, colorbar=True)


In [None]:
fig, ax = plt.subplots(figsize=[15, 15])
disp2.plot(include_values=True,
                     cmap='viridis', ax=ax, xticks_rotation='horizontal',
                     values_format='.2f', colorbar=True)

In [None]:
## top k accuracy score

In [None]:
from sklearn.metrics import top_k_accuracy_score

In [None]:
top_k = []
for kk in range(1, num_species+1):
    print('k='+str(kk)+':  ')
    this_acc = top_k_accuracy_score(label_train[:label_train_pred.shape[0]], label_train_pred, k=kk, labels=list(range(num_species)))
    print(this_acc)
    top_k.append(this_acc)

In [None]:
# %matplotlib inline
fig = plt.figure()
ax = fig.subplots()
ax.bar(list(range(1, num_species+1)), top_k)
ax.grid(axis='y')