In [1]:
from typing import Union, Callable

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.decomposition
from tqdm import tqdm

from lib.data_preprocessing import remove_correlated_columns, normalize_data
from lib.ds.bird_classes import NUM_CLASSES
from lib.ds.dataset_loading import load_all_data, flatten
from lib.ds.dataset_splitting import split
from lib.ds.torch_dataset import create_data_loader
from lib.ds.challenge_dataset import load_challenge_data
from lib.model.attention_classifier import AttentionClassifier, AttentionClassifierHyperParameters
from lib.training_hyper_parameters import TrainingHyperParameters
from lib.ds.numpy_dataset import NumpyDataset
from lib.model.model_persistence import save_model, load_model, load_models_with_scalers_with_prefix
from lib.random import set_random_seed
from lib.metrics import calculate_average_metrics_for_final_epoch_of_folds, calculate_average_metrics_per_epoch
from lib.ds.bird_combiner import combine_birds
from lib.challenge import predict_for_challenge, save_results_to_csv, load_results_from_csv
from lib.label_fixing import fix_labels_information_gain
from lib.voting import perform_weighted_voting
import lib.torch_device as tdev

%load_ext autoreload
%autoreload 2

In [2]:
tdev.PREFERRED = 'cpu'
device = tdev.get_torch_device()
device

device(type='cpu')

# Challenge

In [3]:
challenge_data = load_challenge_data()
challenge_data.shape

(16, 3000, 548)

In [7]:
species_models_with_scalers_and_scores = load_models_with_scalers_with_prefix('saved_models', 'species_classifier')       

In [8]:
def predict_species():
    predictions_list: list[np.ndarray] = []
    for model, normalization_scaler, score in tqdm(species_models_with_scalers_and_scores):
        predictions_list.append(
            predict_for_challenge(challenge_data, model, normalization_scaler, device)
        )
    return np.moveaxis(np.array(predictions_list), 0, -1)

species_predictions = predict_species()
species_predictions.shape

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:43<00:00,  6.20s/it]


(16, 3000, 7)

In [9]:
bird_no_bird_models_with_scalers_and_scores = load_models_with_scalers_with_prefix('saved_models', 'bird_no_bird_classifier')       

In [10]:
def predict_bird_no_bird():
    predictions_list: list[np.ndarray] = []
    for model, normalization_scaler, score in tqdm(bird_no_bird_models_with_scalers_and_scores):
        predictions_list.append(
            predict_for_challenge(challenge_data, model, normalization_scaler, device)
        )
    return np.moveaxis(np.array(predictions_list), 0, -1)

bird_no_bird_predictions = predict_bird_no_bird()
bird_no_bird_predictions.shape

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:47<00:00,  5.92s/it]


(16, 3000, 8)

In [15]:
def get_weights(models_with_scalers_and_scores, non_linearity_exponent):
    weights = np.array([score for _, _, score in models_with_scalers_and_scores])
    if len(weights) <= 1:
        raise ValueError('nono')
    if len(weights) > 1:
        weights -= np.min(weights)
        weights /= np.max(weights)
        weights **= non_linearity_exponent
        weights /= weights.sum()
    return weights
    

In [57]:
species_weights = get_weights(species_models_with_scalers_and_scores, 1.5)
bird_no_bird_weights = get_weights(bird_no_bird_models_with_scalers_and_scores, 1.5)

print(f'species_weights      = {", ".join([f"{w:.4f}" for w in species_weights])}')
print(f'bird_no_bird_weights = {", ".join([f"{w:.4f}" for w in bird_no_bird_weights])}\n')

species_weights_modifier = 1.0
bird_no_bird_weights_modifier = 1.0

print(f'species_weights (modified)      = {", ".join([f"{w:.4f}" for w in (species_weights * species_weights_modifier)])}')
print(f'bird_no_bird_weights (modified) = {", ".join([f"{w:.4f}" for w in (bird_no_bird_weights * bird_no_bird_weights_modifier)])}\n')

voting_results = perform_weighted_voting(
    species_predictions=species_predictions,
    species_classifier_voting_weights=species_weights * species_weights_modifier,
    bird_no_bird_predictions=bird_no_bird_predictions,
    bird_no_bird_classifier_voting_weights=bird_no_bird_weights * bird_no_bird_weights_modifier,
)
voting_results

species_weights      = 0.1385, 0.1751, 0.0000, 0.2802, 0.1682, 0.2332, 0.0049
bird_no_bird_weights = 0.1297, 0.0204, 0.1232, 0.2895, 0.2167, 0.0957, 0.1248, 0.0000

species_weights (modified)      = 0.1385, 0.1751, 0.0000, 0.2802, 0.1682, 0.2332, 0.0049
bird_no_bird_weights (modified) = 0.1297, 0.0204, 0.1232, 0.2895, 0.2167, 0.0957, 0.1248, 0.0000



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 3, 3, 3],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 6, 6, 6],
       [0, 0, 0, ..., 0, 0, 0]])

In [44]:
save_results_to_csv(voting_results, 'submissions/challenge_submission score=?.csv')

OSError: [Errno 22] Invalid argument: 'submissions/challenge_submission score=?.csv'

In [58]:
# saved_predictions = load_results_from_csv('submissions/challenge_submission.csv')

fixed_labels = fix_labels_information_gain(
    labels=voting_results, 
    window_size=32, 
    window_overlap=9, 
    splitting_point_window_shrink=5,
    split_at_0_only=True,
    information_gain_threshold=0.1,
)

Fixing label sequences: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.77it/s]

Skipped 1002 out of 2064 windows





In [None]:
save_results_to_csv(fixed_labels, 'submissions/challenge_submission_2023-06-10_17.40 score=TBD.csv')

In [34]:
from collections import Counter

def print_label_count(labels: np.ndarray):
    counter = Counter(labels.reshape((-1)).tolist())
    
    total = sum(counter.values())
    
    for key in counter.keys():
        counter[key] /= total
    
    items = sorted(counter.items(), key=lambda item: item[0])
    
    print([f'{int(item[0])}: {item[1]:.4f}' for item in items])

In [59]:
print_label_count(fixed_labels)

['0: 0.6761', '1: 0.0575', '2: 0.0693', '3: 0.0568', '4: 0.0229', '5: 0.0603', '6: 0.0570']


In [42]:
print_label_count(load_results_from_csv('submissions/challenge_submission_original_ac_fixed score=11124.csv'))

['0: 0.6216', '1: 0.0763', '2: 0.0789', '3: 0.0522', '4: 0.0285', '5: 0.0729', '6: 0.0696']


In [56]:
print_label_count(load_results_from_csv('submissions/challenge_submission_2023-06-10_17.00 score=10460.csv'))

['0: 0.6780', '1: 0.0558', '2: 0.0681', '3: 0.0552', '4: 0.0223', '5: 0.0620', '6: 0.0586']
