In [31]:
from typing import Union, Callable

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.decomposition
from tqdm import tqdm

from lib.data_preprocessing import remove_correlated_columns, normalize_data
from lib.ds.bird_classes import NUM_CLASSES
from lib.ds.dataset_loading import load_all_data, flatten
from lib.ds.dataset_splitting import split
from lib.ds.torch_dataset import create_data_loader
from lib.ds.challenge_dataset import load_challenge_data
from lib.model.attention_classifier import AttentionClassifier, AttentionClassifierHyperParameters
from lib.attention_classifier_training import train_attention_classifier_with_cv, train_attention_classifier, evaluate_attention_classifier
from lib.training_hyper_parameters import TrainingHyperParameters
from lib.ds.numpy_dataset import NumpyDataset
from lib.model.model_persistence import save_model, load_model, load_models_with_scalers_with_prefix
from lib.random import set_random_seed
from lib.metrics import calculate_average_metrics_for_final_epoch_of_folds, calculate_average_metrics_per_epoch
from lib.ds.bird_combiner import combine_birds
from lib.challenge import predict_for_challenge, save_results_to_csv, load_results_from_csv
from lib.label_fixing import fix_labels_information_gain
from lib.voting import perform_weighted_voting
import lib.torch_device as tdev

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Select the torch device used by the prediction cells via the project helper
# module `lib.torch_device`. In the recorded run this resolved to cuda:0.
# NOTE(review): uncommenting the next line presumably forces CPU execution —
# confirm against lib.torch_device before relying on it.
# tdev.PREFERRED = 'cpu'
device = tdev.get_torch_device()
device  # bare expression: shown as the cell output

device(type='cuda', index=0)

# Challenge

In [4]:
# Load the challenge data set through the project loader.
# The recorded run reports shape (16, 3000, 548); the axis meanings are not
# visible in this notebook (presumably recordings x frames x features — TODO confirm).
challenge_data = load_challenge_data()
challenge_data.shape

(16, 3000, 548)

In [5]:
# Load the saved species classifiers whose file names start with the given
# prefix. Later cells iterate the result as (model, scaler, score) triples.
species_models_with_scalers_and_scores = load_models_with_scalers_with_prefix(
    'saved_models',
    'species_classifier cv2023-06-05_18.56',
)

In [17]:
def predict_species():
    """Run every loaded species classifier on the challenge data.

    Reads the notebook globals ``species_models_with_scalers_and_scores``,
    ``challenge_data`` and ``device``.

    Returns:
        np.ndarray: the per-model results of ``predict_for_challenge`` with
        the model axis moved to the last position.
    """
    per_model_predictions: list[np.ndarray] = [
        predict_for_challenge(challenge_data, classifier, scaler, device)
        for classifier, scaler, _ in tqdm(species_models_with_scalers_and_scores)
    ]
    stacked = np.array(per_model_predictions)
    # The model index is axis 0 after stacking; callers expect it last.
    return np.moveaxis(stacked, 0, -1)


species_predictions = predict_species()
species_predictions.shape

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.55it/s]


(16, 3000, 8)

In [19]:
# Load the saved bird/no-bird classifiers whose file names start with the given
# prefix. Later cells iterate the result as (model, scaler, score) triples.
bird_no_bird_models_with_scalers_and_scores = load_models_with_scalers_with_prefix(
    'saved_models',
    'bird_no_bird_classifier cv2023-06-06_22.49',
)

In [23]:
def predict_bird_no_bird():
    """Run every loaded bird/no-bird classifier on the challenge data.

    Reads the notebook globals ``bird_no_bird_models_with_scalers_and_scores``,
    ``challenge_data`` and ``device``.

    Returns:
        np.ndarray: the per-model results of ``predict_for_challenge`` with
        the model axis moved to the last position.
    """
    per_model_predictions: list[np.ndarray] = [
        predict_for_challenge(challenge_data, classifier, scaler, device)
        for classifier, scaler, _ in tqdm(bird_no_bird_models_with_scalers_and_scores)
    ]
    stacked = np.array(per_model_predictions)
    # The model index is axis 0 after stacking; callers expect it last.
    return np.moveaxis(stacked, 0, -1)


bird_no_bird_predictions = predict_bird_no_bird()
bird_no_bird_predictions.shape

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.81it/s]


(16, 3000, 1)

In [28]:
def get_weights(models_with_scalers_and_scores, non_linearity_exponent):
    """Turn the scores of an ensemble into voting weights.

    Args:
        models_with_scalers_and_scores: Iterable of ``(model, scaler, score)``
            triples. Only the third element of each triple is used.
        non_linearity_exponent: Exponent applied to the min-max normalized
            scores when there is more than one model.

    Returns:
        np.ndarray: 1-D float array with one weight per triple.
        * Zero or one triple: the raw scores are returned unchanged.
        * Several triples: scores are rescaled so the lowest becomes 0 and the
          highest becomes 1, then raised to ``non_linearity_exponent``. The
          lowest-scoring model therefore gets weight 0.
        * Several triples with identical scores: every weight is 1, since
          normalizing would otherwise divide by zero.
    """
    # Use the argument, not a notebook global, so each ensemble gets weights
    # derived from its own scores. Force float so the in-place division and
    # power below also work when the scores are integers.
    weights = np.array(
        [score for _, _, score in models_with_scalers_and_scores],
        dtype=float,
    )
    if len(weights) > 1:
        weights -= np.min(weights)
        score_range = np.max(weights)
        if score_range == 0:
            # All models scored the same: vote uniformly instead of producing NaN.
            return np.ones_like(weights)
        weights /= score_range
        weights **= non_linearity_exponent
    return weights
    

In [30]:
# Merge the species and bird/no-bird ensemble predictions into one result
# array with the project helper `perform_weighted_voting`. Each ensemble's
# voting weights come from `get_weights` applied to its own model list.
# NOTE(review): the exponent 1.1 is a hand-picked constant used for both
# ensembles; consider naming it in a config cell.
voting_results = perform_weighted_voting(
    species_predictions=species_predictions,
    species_classifier_voting_weights=get_weights(species_models_with_scalers_and_scores, 1.1),
    bird_no_bird_predictions=bird_no_bird_predictions,
    bird_no_bird_classifier_voting_weights=get_weights(bird_no_bird_models_with_scalers_and_scores, 1.1),
)
voting_results

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 3, 3, 3],
       ...,
       [0, 0, 0, ..., 5, 0, 0],
       [0, 0, 0, ..., 6, 6, 6],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# Write the voted results to a submission CSV (the file name still carries a
# TODO placeholder).
# NOTE(review): this cell needs `voting_results` from the voting cell. The
# recorded NameError comes from an execution (In [11]) in which that name was
# not defined — run the voting cell first.
save_results_to_csv(voting_results, 'submissions/challenge_submission_TODO.csv')

NameError: name 'voting_results' is not defined

In [14]:
# Post-process an earlier submission: reload its saved results from CSV, pass
# them through `fix_labels_information_gain`, and write the outcome to a new CSV.
saved_predictions = load_results_from_csv('submissions/challenge_submission_original_ac score=10861.csv')

# NOTE(review): the window, overlap, shrink and threshold values below are
# hand-picked constants; their exact semantics are defined in lib.label_fixing
# and are not visible here — confirm there before tuning.
fixed_labels = fix_labels_information_gain(
    labels=saved_predictions, 
    window_size=32, 
    window_overlap=9, 
    splitting_point_window_shrink=5,
    split_at_0_only=True,
    information_gain_threshold=0.1,
)
# The output name keeps a "score=TBD" placeholder.
save_results_to_csv(fixed_labels, 'submissions/challenge_submission_original_ac_fixed score=TBD.csv')

Fixing label sequences: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.67it/s]
