In [6]:
from typing import Union

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.decomposition

from lib.data_preprocessing import remove_correlated_columns, normalize_data
from lib.ds.bird_classes import NUM_CLASSES
from lib.ds.dataset_loading import load_all_data, flatten
from lib.ds.dataset_splitting import split
from lib.ds.torch_dataset import create_data_loader
from lib.ds.challenge_dataset import load_challenge_data
from lib.model.attention_classifier import AttentionClassifier, AttentionClassifierHyperParameters
from lib.attention_classifier_training import train_attention_classifier_with_cv, train_attention_classifier, evaluate_attention_classifier
from lib.training_hyper_parameters import TrainingHyperParameters
from lib.ds.numpy_dataset import NumpyDataset
from lib.model.model_persistence import save_model, load_model
from lib.random import set_random_seed
from lib.metrics import calculate_average_metrics_for_final_epoch_of_folds, calculate_average_metrics_per_epoch
from lib.ds.bird_combiner import combine_birds, combine_birds_labels_only
from lib.challenge import predict_for_challenge, save_results_to_csv, load_results_from_csv
from lib.label_fixing import fix_labels_information_gain
import lib.torch_device as tdev

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
data_raw, labels = load_all_data('dataset')
print(f'{data_raw.shape = }')
print(f'{labels.shape   = }')

data_raw.shape = (1200, 100, 548)
labels.shape   = (1200, 100)


In [40]:
labels_combined = combine_birds_labels_only(
    labels, 
    sequence_length=3000,
    num_duplicates=2,
    random_seed=42
)

Creating random label sequence from 2 duplicates: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 240000/240000 [00:01<00:00, 234884.93it/s]


In [41]:
n_sequences, sequence_length = labels_combined.shape
print(f'{n_sequences = }, {sequence_length = }')
labels_combined

n_sequences = 80, sequence_length = 3000


array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 5., ..., 5., 0., 5.],
       [5., 0., 5., ..., 0., 0., 0.],
       [0., 2., 2., ..., 0., 2., 2.]])

In [42]:
set_random_seed(42)

NOISE_RATIO = 0.2

def add_noise(labels: np.ndarray) -> np.ndarray:
    labels_noisy = np.copy(labels)
    
    noise_amount = int(sequence_length * NOISE_RATIO)

    for sequence_nr in range(n_sequences):
        noise_indices = np.random.choice(range(sequence_length), noise_amount, replace=False)
        labels_noisy[sequence_nr, noise_indices] = np.random.randint(0, NUM_CLASSES - 1, size=noise_amount)
    
    return labels_noisy

labels_noisy = add_noise(labels_combined)
labels_noisy

array([[5., 0., 0., ..., 1., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 3., 0., 1.],
       ...,
       [2., 0., 5., ..., 5., 0., 5.],
       [5., 0., 5., ..., 0., 0., 0.],
       [0., 2., 2., ..., 0., 2., 5.]])

In [55]:
labels_fixed = fix_labels_information_gain(
    labels=labels_noisy, 
    window_size=32, 
    window_overlap=9, 
    splitting_point_window_shrink=5,
    split_at_0_only=True,
    information_gain_threshold=0.1,
)

Fixing label sequences: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:14<00:00,  5.66it/s]


In [56]:
num_labels = labels_combined.size

def print_stats(labels_true, labels_pred):
    num_labels = labels_true.size
    
    num_correct = (labels_true == labels_pred).sum()
    num_incorrect = num_labels - num_correct
    
    assert num_correct + num_incorrect == num_labels

    num_correct_ratio = num_correct / num_labels
    num_incorrect_ratio = num_incorrect / num_labels
    
    print(f'{num_labels = }, {num_correct = }, {num_incorrect = }, {num_correct_ratio = }, {num_incorrect_ratio = }')

print_stats(labels_combined, labels_noisy)
print_stats(labels_combined, labels_fixed)

num_labels = 240000, num_correct = 199620, num_incorrect = 40380, num_correct_ratio = 0.83175, num_incorrect_ratio = 0.16825
num_labels = 240000, num_correct = 203235, num_incorrect = 36765, num_correct_ratio = 0.8468125, num_incorrect_ratio = 0.1531875
