# Main CNN model for bat call classification

### **TO RUN THIS FILE**, just press the "restart&execute" button, and when the kernel dies, restart the notebook process, go to the `exit()` cell and execute all cells after that one

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from keras import backend as K 
import gc
from tensorflow.keras.optimizers.legacy import Adam
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek
from tensorflow.keras import regularizers
import tensorflow_model_optimization as tfmot
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

2023-12-26 00:28:13.352599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-26 00:28:13.474128: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-12-26 00:28:13.474154: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-12-26 00:28:14.009733: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
data = pd.read_pickle('./data/images_df_numerical.pkl')

def split_df_equal_class_distribution(df, batch_size):
    
    df['temp_id'] = range(len(df))
    
    num_batches = int(np.ceil(len(df) / batch_size))
    
    grouped = df.groupby('Species', group_keys=False)
    
    chunks = []
    
    for i in range(num_batches):
        chunk = pd.DataFrame(columns=df.columns)
        for _, group in grouped:
            num_samples = int(batch_size * len(group) / len(df))
            sample_indices = np.random.choice(group['temp_id'], size=num_samples, replace=False)
            chunk = pd.concat([chunk, df[df['temp_id'].isin(sample_indices)]])
        chunk = chunk.drop('temp_id', axis=1)
        chunks.append(chunk)
    
    return chunks

chunk_size = 1000

chunks_with_same_dist = split_df_equal_class_distribution(data, chunk_size)
del data
classes = chunks_with_same_dist[0]["Species"].unique()
x_len = chunks_with_same_dist[0].iloc[0]["data"].size
number_of_classes = classes.size
most_x_in_one_class = chunks_with_same_dist[0]["Species"].value_counts().iloc[0]

In [3]:
# Alleiniges undersampling wird keinen Sinn machen, da wir extrem wenig Datenpunkte overall haben
def resample(resampler) -> tuple[np.array, np.array]:
    # 0.3 as buffer
    array_size = int(most_x_in_one_class * number_of_classes * (len(chunks_with_same_dist) + 0.3))
    X = np.empty((array_size, x_len), dtype=np.uint8)
    y = np.empty((array_size), dtype=np.uint8)

    current_index = 0
    for chunk in chunks_with_same_dist:
        X_batch, y_batch = chunk['data'], chunk['Species']
        X_batch, y_batch = np.stack(X_batch).astype(np.uint8), y_batch.astype(np.uint8)
        X_resampled, y_resampled = resampler.fit_resample(X_batch, y_batch)
        num_samples = X_resampled.shape[0]
        X[current_index:current_index + num_samples] = X_resampled.astype(np.uint8)
        y[current_index:current_index + num_samples] = y_resampled.astype(np.uint8)
        current_index += num_samples

    X.resize((current_index, X.shape[1]))
    y.resize(current_index)
    print(f"{resampler}: ", pd.Series(y, dtype=pd.UInt8Dtype()).value_counts())

    return X, y

# oversampling
smote = SMOTE()
adasyn = ADASYN()

X, y = resample(adasyn)

# Kombination aus over und undersampling
smoteenn = SMOTEENN()
smotettomek = SMOTETomek()

ADASYN():  2    2295
3    2272
0    2260
4    2256
5    2256
1    2215
dtype: Int64


In [4]:
#skips cell (idk when we dont use chunks the validation acc is simply trash?)
%%skip
data = pd.read_pickle('./data/images_df_numerical.pkl')
classes = data["Species"].unique()
number_of_classes = classes.size

# Alleiniges undersampling wird keinen Sinn machen, da wir extrem wenig Datenpunkte overall haben
def resample(resampler) -> tuple[np.array, np.array]:
    X, y = data['data'], data['Species']
    X, y = np.stack(X).astype(np.uint8), y.astype(np.uint8)
    X_resampled, y_resampled = resampler.fit_resample(X, y)

    print(f"{resampler}: ", pd.Series(y_resampled, dtype=pd.UInt8Dtype()).value_counts())

    return X_resampled, y_resampled

# oversampling
smote = SMOTE()
adasyn = ADASYN()

# Kombination aus over und undersampling
smoteenn = SMOTEENN()
smotettomek = SMOTETomek()

X, y = resample(adasyn)

UsageError: Line magic function `%%skip` not found.


In [5]:
image_size = X[0].size
samples = X.size
image_shape = (65, 100, 3) # height, width , color channel
smallest_float16 = np.finfo(np.float16).tiny
# normalize to 0-1
X = X / 255.
X = X.reshape((-1,) + image_shape)
#X = X.astype(smallest_float16)

In [6]:
with open("X.npy", "wb") as file:
    np.save(file, X)
with open("y.npy", "wb") as file:
    np.save(file, y)

In [7]:
kfold = KFold(n_splits=10, shuffle=True)

tf.keras.utils.set_random_seed(1)

# If using TensorFlow, this will make GPU ops as deterministic as possible,
# but it will affect the overall performance, so be mindful of that.
tf.config.experimental.enable_op_determinism()

In [8]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=30, min_delta=0.001, start_from_epoch=15, restore_best_weights=True)
epochs = 200
batch_size = 32
dropout_rate = 0.2 # https://www.kaggle.com/code/rafjaa/dealing-with-very-small-datasets interessant bzgl oberfitting
weight_decay_alpha = 0.01

def create_model():
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Input(shape=image_shape))
    model.add(tf.keras.layers.Conv2D(32, 3, strides=2, padding='same', activation='relu', kernel_regularizer=regularizers.l2(weight_decay_alpha)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu', kernel_regularizer=regularizers.l2(weight_decay_alpha)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Conv2D(128, 3, padding='same', activation='relu', kernel_regularizer=regularizers.l2(weight_decay_alpha)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(weight_decay_alpha)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(weight_decay_alpha)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    #model.add(tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(weight_decay_alpha)))
    #model.add(tf.keras.layers.BatchNormalization())
    #model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(number_of_classes, activation='softmax'))

    return model

In [9]:
def fit_model(model, X_train, y_train, worker=8):
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        workers=worker, # workers are number of cores
        callbacks=[early_stopping, tfmot.sparsity.keras.UpdatePruningStep()],
        validation_split=0.2,
        verbose=1)

    return history

In [10]:
end_step = np.ceil(X.shape[0] / batch_size).astype(np.int32) * epochs

pruning_params = {
    # In this example, you start the model with 50% sparsity (50% zeros in weights) and end with 80% sparsity.
      'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=0.50,
                                                                final_sparsity=0.80,
                                                                begin_step=0,
                                                                end_step=end_step)}

model = create_model()
model = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params)

2023-12-26 00:28:21.328330: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-12-26 00:28:21.328364: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-12-26 00:28:21.328394: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (marcel-laptop): /proc/driver/nvidia/version does not exist
2023-12-26 00:28:21.328691: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [11]:
#del X, y
#X, y = None, None
#gc.collect()

In [12]:
# for confusion_matrix
true_labels = list()
pred_labels = list()
train_accuracies = list()
val_accuracies = list()
train_losses = list()
val_losses = list()

#X = np.load("X.npy", mmap_mode="r+")
#y = np.load("y.npy", mmap_mode="r+")
for train_indezes, test_indezes in kfold.split(X, y):
    K.clear_session()
    X_train, y_train = tf.convert_to_tensor(X[train_indezes]), tf.convert_to_tensor(y[train_indezes])
    X_test, y_test = tf.convert_to_tensor(X[train_indezes]), tf.convert_to_tensor(y[test_indezes])
    model.compile(optimizer=Adam(0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    history = fit_model(model, X_train, y_train)

    # for confusion matrix
    predictions = np.argmax(model.predict(X_test), axis=-1)
    true_labels.extend(y_test)
    pred_labels.extend(predictions)

    # for accuracy curves
    train_accuracies.extend(history.history['accuracy'])
    val_accuracies.extend(history.history['val_accuracy'])

    # for loss curves
    train_losses.extend(history.history['loss'])
    val_losses.extend(history.history['val_loss'])

2023-12-26 00:28:22.762072: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1902888000 exceeds 10% of free system memory.
2023-12-26 00:28:24.740329: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1902888000 exceeds 10% of free system memory.
2023-12-26 00:28:26.272522: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1522248000 exceeds 10% of free system memory.
2023-12-26 00:28:26.717429: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1522248000 exceeds 10% of free system memory.


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200

KeyboardInterrupt: 

In [None]:
confusion_matrix = confusion_matrix(true_labels, pred_labels)
fig, ax = plt.subplots(figsize=(8, 6))
cm_display = ConfusionMatrixDisplay(confusion_matrix, display_labels=classes)
cm_display.plot(ax=ax, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Plotting the training and validation curves
epochs = len(train_losses)
plt.figure(figsize=(12, 4))

# Plotting loss curves
plt.subplot(1, 2, 1)
plt.plot(range(1, epochs + 1), train_losses, label='Training Loss')
plt.plot(range(1, epochs + 1), val_losses, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Plotting accuracy curves
plt.subplot(1, 2, 2)
plt.plot(range(1, epochs + 1), train_accuracies, label='Training Accuracy')
plt.plot(range(1, epochs + 1), val_accuracies, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()