### Download BTHOWeN

In [None]:
# Fetch the BTHOWeN reference implementation into ./BTHOWeN.
!git clone https://github.com/ZSusskind/BTHOWeN.git

Cloning into 'BTHOWeN'...
remote: Enumerating objects: 104, done.[K
remote: Counting objects: 100% (104/104), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 104 (delta 44), reused 94 (delta 34), pack-reused 0 (from 0)[K
Receiving objects: 100% (104/104), 1.65 MiB | 13.61 MiB/s, done.
Resolving deltas: 100% (44/44), done.


In [None]:
# Mount Google Drive so files under /content/gdrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Reescrevendo train_swept_models para nosso dataset

In [None]:
import sys
import itertools
import argparse
import ctypes as c
import numpy as np
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from multiprocessing import Pool, cpu_count
from scipy.stats import norm

# For saving models
import pickle
import lzma
import os

from wisard import WiSARD

ModuleNotFoundError: No module named 'wisard'

In [None]:
# Perform inference operations using provided test set on provided model with specified bleaching value (default 1)
def run_inference(inputs, labels, model, bleach=1):
    """Evaluate ``model`` on a labelled set at a given bleaching value.

    Args:
        inputs: Indexable collection of samples; each one is passed to ``model.predict``.
        labels: Indexable collection of expected labels, aligned with ``inputs``.
        model: Object exposing ``set_bleaching(bleach)`` and ``predict(sample)``.
            ``predict`` must return a non-empty indexable of candidate classes.
        bleach: Bleaching value applied to the model before predicting (default 1).

    Returns:
        The number of samples whose first predicted class equals the label.
    """
    num_samples = len(inputs)
    correct = 0
    ties = 0
    model.set_bleaching(bleach)
    for d in range(num_samples):
        prediction = model.predict(inputs[d])
        # More than one returned class means the model could not separate them.
        if len(prediction) > 1:
            ties += 1
        # Ties are resolved by taking the first class returned.
        if prediction[0] == labels[d]:
            correct += 1
    # An empty evaluation set would otherwise divide by zero; report 0% instead.
    if num_samples:
        correct_percent = round((100 * correct) / num_samples, 4)
        tie_percent = round((100 * ties) / num_samples, 4)
    else:
        correct_percent = 0.0
        tie_percent = 0.0
    print(f"With bleaching={bleach}, accuracy={correct}/{num_samples} ({correct_percent}%); ties={ties}/{num_samples} ({tie_percent}%)")
    return correct

In [None]:
def parameterized_run(train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels, unit_inputs, unit_entries, unit_hashes):
    """Train one WiSARD model, tune its bleaching value on the validation set, and test it.

    Args:
        train_inputs, train_labels: Training samples and labels. ``train_inputs[0].size``
            and ``train_labels.max()`` are read, so both are expected to be NumPy arrays.
        val_inputs, val_labels: Validation samples and labels used to pick the bleach value.
        test_inputs, test_labels: Held-out samples and labels for the final score.
        unit_inputs, unit_entries, unit_hashes: Filter hyperparameters, forwarded
            unchanged to the ``WiSARD`` constructor.

    Returns:
        ``(model, accuracy)`` where ``accuracy`` is the count of correct test
        predictions returned by ``run_inference`` at the selected bleach value.
    """
    model = WiSARD(train_inputs[0].size, train_labels.max()+1, unit_inputs, unit_entries, unit_hashes)

    print("Training model")
    for d in range(len(train_inputs)):
        model.train(train_inputs[d], train_labels[d])
        if ((d+1) % 10000) == 0:
            print(d+1)

    # The largest value stored in any filter bounds the useful bleach range.
    max_val = 0
    for d in model.discriminators:
        for f in d.filters:
            max_val = max(max_val, f.data.max())
    print(f"Maximum possible bleach value is {max_val}")
    # Use a binary search-based strategy to find the value of b that maximizes accuracy on the validation set
    best_bleach = max_val // 2
    step = max(max_val // 4, 1)
    bleach_accuracies = {}  # cache: bleach value -> validation correct count
    while True:
        values = [best_bleach-step, best_bleach, best_bleach+step]
        accuracies = []
        for b in values:
            if b in bleach_accuracies:
                accuracies.append(bleach_accuracies[b])
            elif b < 1:
                # Bleach values below 1 are never evaluated. Score them below any
                # real result (real counts are >= 0) so a validation accuracy of 0
                # cannot tie with them; with a score of 0 the first-max rule picked
                # the invalid lower value and the search walked downward forever.
                accuracies.append(-1)
            else:
                accuracy = run_inference(val_inputs, val_labels, model, b)
                bleach_accuracies[b] = accuracy
                accuracies.append(accuracy)
        # On ties the lowest candidate wins (list.index returns the first maximum).
        new_best_bleach = values[accuracies.index(max(accuracies))]
        if (new_best_bleach == best_bleach) and (step == 1):
            break
        best_bleach = new_best_bleach
        if step > 1:
            step //= 2
    print(f"Best bleach: {best_bleach}; inputs/entries/hashes = {unit_inputs},{unit_entries},{unit_hashes}")
    # Evaluate on test set
    print("Testing model")
    accuracy = run_inference(test_inputs, test_labels, model, bleach=best_bleach)
    return model, accuracy

In [None]:
def get_datasets_from_folders(base_dir):
    """Load an image dataset stored as ``<base_dir>/<split>/<class index>/<image file>``.

    The ``train`` and ``test`` splits are required; ``valid`` is optional. Each image
    is converted to grayscale, resized to 160x160, thresholded at its own mean and
    flattened into a ``uint8`` vector of 0/1 values. Directory listings are sorted so
    the sample order is reproducible across runs.

    When no validation samples are available, the last 10% of *each class* of the
    training data is held out as validation. The split is done per class because
    samples are loaded class by class, so slicing the tail of the whole list would
    leave only the last class or classes in validation.

    Args:
        base_dir: Root directory containing the split sub-directories.

    Returns:
        ``(train_dataset, test_dataset, valid_dataset)``; each is a list of
        ``(binarized_image, class_label)`` tuples.

    Raises:
        ValueError: If the ``train`` or ``test`` directory is missing, or if a class
            directory name is not an integer.
    """
    # NOTE(review): `Image` is not imported in the visible notebook cells; it is
    # presumably `PIL.Image`. Until that name is in scope every file fails with a
    # NameError, which the handler below reports as a per-image error.
    train_dataset = []
    test_dataset = []
    valid_dataset = []
    destinations = {'train': train_dataset, 'test': test_dataset, 'valid': valid_dataset}

    for split in ['train', 'test', 'valid']:  # 'valid' is an additional, optional split
        split_path = os.path.join(base_dir, split)
        if not os.path.exists(split_path):
            if split == 'valid':  # validation is optional
                print(f"Aviso: Split de validação {split_path} não encontrado. Pulando.")
                continue
            else:
                raise ValueError(f"Split path {split_path} não encontrado.")

        for class_name in sorted(os.listdir(split_path)):
            class_path = os.path.join(split_path, class_name)
            if not os.path.isdir(class_path):
                continue  # ignore stray files next to the class folders

            class_label = int(class_name)  # the folder name is taken as the class index
            for img_file in sorted(os.listdir(class_path)):
                img_path = os.path.join(class_path, img_file)
                try:
                    # Open the image and pre-process it: grayscale, fixed size.
                    img = Image.open(img_path).convert("L").resize((160, 160))
                    img_array = np.array(img)
                    # Binarize against the image's own mean intensity.
                    binarized_img = (img_array >= img_array.mean()).astype(np.uint8).flatten()
                    destinations[split].append((binarized_img, class_label))
                except Exception as e:
                    # Best effort: report the unreadable file and keep loading the rest.
                    print(f"Erro ao processar {img_path}: {e}")

    if not valid_dataset:
        print("Aviso: Nenhum dado de validação encontrado. Dividindo o dataset de treino.")
        split_ratio = 0.9  # default train/validation ratio
        samples_by_class = {}
        for sample in train_dataset:
            samples_by_class.setdefault(sample[1], []).append(sample)
        train_dataset = []
        for class_samples in samples_by_class.values():
            split_idx = int(len(class_samples) * split_ratio)
            train_dataset.extend(class_samples[:split_idx])
            valid_dataset.extend(class_samples[split_idx:])

    return train_dataset, test_dataset, valid_dataset

def get_datasets(dset_name):
    """Return ``(train_dataset, test_dataset, valid_dataset)`` for the named dataset.

    ``'custom'`` loads the image-folder dataset from Google Drive. Any other name is
    delegated to ``get_datasets_original``, which returns only a train and a test
    split. Callers unpack three values, so 10% of that training data is held out as
    validation here, after a shuffle with a fixed seed so the split is reproducible.

    Args:
        dset_name: ``'custom'`` or a name understood by ``get_datasets_original``.

    Returns:
        A 3-tuple ``(train_dataset, test_dataset, valid_dataset)``.
    """
    if dset_name == 'custom':
        return get_datasets_from_folders('/content/gdrive/MyDrive/datasets/ships_simpleCNN')

    # Original loader: yields two splits only.
    train_dataset, test_dataset = get_datasets_original(dset_name)
    train_dataset = list(train_dataset)
    # Shuffle before splitting so the held-out tail does not depend on storage order.
    order = np.random.default_rng(0).permutation(len(train_dataset))
    shuffled = [train_dataset[i] for i in order]
    split_idx = int(len(shuffled) * 0.9)
    return shuffled[:split_idx], test_dataset, shuffled[split_idx:]

In [None]:
def get_datasets_original(dset_name):
    """Load the train and test splits of a named dataset.

    ``'mnist'`` (case-insensitive) is fetched through torchvision into ``./data``, and
    every image tensor is flattened into a 1-D NumPy array. Any other name is
    forwarded to ``tabular_tools.get_dataset``.

    Args:
        dset_name: Dataset name; lower-cased before use.

    Returns:
        ``(train_dataset, test_dataset)``. For MNIST each is a list of
        ``(flattened_image, label)`` tuples.
    """
    # NOTE(review): `tabular_tools` is not imported in the visible cells; presumably
    # it is the helper module shipped with BTHOWeN. Confirm it is importable.
    dset_name = dset_name.lower()
    print(f"Loading dataset ({dset_name})")

    if dset_name != 'mnist':
        train_dataset, test_dataset = tabular_tools.get_dataset(dset_name)
        return train_dataset, test_dataset

    mnist_train = dsets.MNIST(
        root='./data',
        train=True,
        transform=transforms.ToTensor(),
        download=True)
    train_dataset = [(sample[0].numpy().flatten(), sample[1]) for sample in mnist_train]

    mnist_test = dsets.MNIST(
        root='./data',
        train=False,
        transform=transforms.ToTensor())
    test_dataset = [(sample[0].numpy().flatten(), sample[1]) for sample in mnist_test]

    return train_dataset, test_dataset

In [None]:
def binarize_datasets(train_dataset, test_dataset, valid_dataset, bits_per_input, use_gaussian_encoding=True):
    """Thermometer-encode the train, validation and test datasets.

    Thresholds are derived from the training inputs only and then applied unchanged
    to every split, so validation and test are encoded exactly like training.

    Args:
        train_dataset: Sequence of ``(input_vector, label)`` pairs.
        test_dataset: Sequence of ``(input_vector, label)`` pairs.
        valid_dataset: Sequence of ``(input_vector, label)`` pairs; may be empty or
            ``None``, in which case the validation outputs are ``None``.
        bits_per_input: Number of thresholds (output bits) per input feature.
        use_gaussian_encoding: When true (the default), thresholds are
            ``mean + z * std`` per feature, with ``z`` taken at evenly spaced
            quantiles of the standard normal, compared with ``>=``. When false,
            thresholds are evenly spaced between each feature's min and max and
            compared with ``>``.

    Returns:
        ``(train_inputs, train_labels, val_inputs, val_labels, test_inputs,
        test_labels)``. Encoded inputs are ``uint8`` arrays with
        ``bits_per_input`` times as many columns as the raw inputs.
    """
    print("Binarizando dataset de treino e validação")
    train_inputs, train_labels = _split_samples(train_dataset)

    if use_gaussian_encoding:
        std_skews = [norm.ppf((i + 1) / (bits_per_input + 1)) for i in range(bits_per_input)]
        mean_inputs = train_inputs.mean(axis=0)
        std_inputs = train_inputs.std(axis=0)
        thresholds = [mean_inputs + (skew * std_inputs) for skew in std_skews]
        strict = False
    else:
        min_inputs = train_inputs.min(axis=0)
        max_inputs = train_inputs.max(axis=0)
        thresholds = [
            min_inputs + (((i + 1) / (bits_per_input + 1)) * (max_inputs - min_inputs))
            for i in range(bits_per_input)
        ]
        strict = True

    # Thermometer encoding: one 0/1 plane per threshold, concatenated feature-wise.
    train_inputs = _encode_with_thresholds(train_inputs, thresholds, strict)

    if valid_dataset:
        val_inputs, val_labels = _split_samples(valid_dataset)
        val_inputs = _encode_with_thresholds(val_inputs, thresholds, strict)
    else:
        val_inputs, val_labels = None, None

    print("Binarizando dataset de teste")
    test_inputs, test_labels = _split_samples(test_dataset)
    test_inputs = _encode_with_thresholds(test_inputs, thresholds, strict)

    return train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels


def _split_samples(dataset):
    """Split a sequence of ``(input, label)`` pairs into an input array and a label array."""
    inputs = np.array([sample[0] for sample in dataset])
    labels = np.array([sample[1] for sample in dataset])
    return inputs, labels


def _encode_with_thresholds(inputs, thresholds, strict):
    """Compare ``inputs`` with each threshold and concatenate the 0/1 results along axis 1."""
    planes = [
        ((inputs > threshold) if strict else (inputs >= threshold)).astype(np.uint8)
        for threshold in thresholds
    ]
    return np.concatenate(planes, axis=1)

In [None]:
def create_models(dset_name, unit_inputs, unit_entries, unit_hashes, bits_per_input, num_workers, save_prefix="model"):
    """Train one model per hyperparameter combination, report the scores, and save every model.

    Args:
        dset_name: Dataset name passed to ``get_datasets``.
        unit_inputs, unit_entries, unit_hashes: Iterables of candidate values; the
            sweep covers their Cartesian product.
        bits_per_input: Number of encoding bits per input, passed to ``binarize_datasets``.
        num_workers: Size of the process pool; ``-1`` means ``cpu_count()``.
        save_prefix: Partial path/file name prepended to each saved model file.
    """
    # Load the three splits and encode them once; every configuration shares them.
    train_dataset, test_dataset, valid_dataset = get_datasets(dset_name)
    datasets = binarize_datasets(train_dataset, test_dataset, valid_dataset, bits_per_input)

    # Each job's argument tuple is the six dataset arrays followed by one
    # (inputs, entries, hashes) combination.
    hyperparams = list(itertools.product(unit_inputs, unit_entries, unit_hashes))
    configurations = [datasets + combo for combo in hyperparams]

    if num_workers == -1:
        num_workers = cpu_count()

    print(f"Lançando jobs para {len(configurations)} configurações usando {num_workers} workers")
    with Pool(num_workers) as pool:
        results = pool.starmap(parameterized_run, configurations)

    # Best score for each candidate number of filter entries.
    for entries in unit_entries:
        scores = [outcome[1] for outcome, combo in zip(results, hyperparams) if combo[1] == entries]
        print(f"Melhor com {entries} entradas: {max(scores)}")

    # Rank all configurations by score, best first.
    ranked = sorted(
        ([combo] + list(outcome) for combo, outcome in zip(hyperparams, results)),
        key=lambda row: row[2],
        reverse=True,
    )
    num_test_samples = len(datasets[4])
    for row in ranked:
        print(f"{row[0]}: {row[2]} ({row[2] / num_test_samples})")  # score normalized by test-set size

    # Make sure the output directory exists before writing any model.
    os.makedirs(os.path.dirname(f"./models/{dset_name}/{save_prefix}"), exist_ok=True)

    # Persist every trained model, one file per configuration.
    num_raw_inputs = datasets[0][0].size // bits_per_input
    for (model_inputs, model_entries, model_hashes), outcome in zip(hyperparams, results):
        save_model(outcome[0], num_raw_inputs,
                   f"./models/{dset_name}/{save_prefix}_{model_inputs}input_{model_entries}entry_{model_hashes}hash_{bits_per_input}bpi.pickle.lzma")

In [None]:
def save_model(model, num_inputs, fname):
    """Binarize ``model`` and write it, with a metadata summary, to an LZMA-compressed pickle.

    Args:
        model: Trained model exposing ``binarize()``, ``discriminators`` and ``input_order``.
        num_inputs: Number of raw (pre-encoding) inputs; used to derive ``bits_per_input``.
        fname: Destination file path.
    """
    model.binarize()

    # All filter-level metadata is read from the first filter of the first discriminator.
    reference_filter = model.discriminators[0].filters[0]
    model_info = {
        "num_inputs": num_inputs,
        "num_classes": len(model.discriminators),
        "bits_per_input": len(model.input_order) // num_inputs,
        "num_filter_inputs": reference_filter.num_inputs,
        "num_filter_entries": reference_filter.num_entries,
        "num_filter_hashes": reference_filter.num_hashes,
        "hash_values": reference_filter.hash_values,
    }

    with lzma.open(fname, "wb") as out_file:
        pickle.dump({"info": model_info, "model": model}, out_file)

def read_arguments():
    """Parse the command-line options of the sweep script.

    Returns:
        The ``argparse.Namespace`` with ``dset_name``, the four list-valued sweep
        options, ``save_prefix`` and ``num_workers``.
    """
    parser = argparse.ArgumentParser(description="Train BTHOWeN models for a dataset with specified hyperparameter sweep")
    parser.add_argument("dset_name", help="Name of dataset to use")

    # The four sweep options share one shape: required, one or more integers.
    sweep_options = (
        ("--filter_inputs", "Number of inputs to each Bloom filter (accepts multiple values)"),
        ("--filter_entries", "Number of entries in each Bloom filter (accepts multiple values; must be powers of 2)"),
        ("--filter_hashes", "Number of distinct hash functions for each Bloom filter (accepts multiple values)"),
        ("--bits_per_input", "Number of thermometer encoding bits for each input in the dataset (accepts multiple values)"),
    )
    for flag, description in sweep_options:
        parser.add_argument(flag, nargs="+", required=True, type=int, help=description)

    parser.add_argument("--save_prefix", default="model", help="Partial path/fname to prepend to each output file")
    parser.add_argument("--num_workers", default=-1, type=int, help="Number of processes to run in parallel; defaults to number of logical CPUs")
    return parser.parse_args()

def main():
    """Run the full hyperparameter sweep once for every requested bits-per-input value."""
    cli = read_arguments()
    sweep = (cli.filter_inputs, cli.filter_entries, cli.filter_hashes)

    for bits in cli.bits_per_input:
        print(f"Do runs with {bits} bit(s) per input")
        create_models(cli.dset_name, *sweep, bits, cli.num_workers, cli.save_prefix)

# Script entry point: start the sweep only when this code is executed directly.
# NOTE(review): inside a notebook kernel __name__ is also "__main__", so running
# this cell calls main() and argparse reads the kernel's own argv. Presumably this
# code is meant to be saved as a script and launched from the shell; confirm.
if __name__ == "__main__":
    main()

### **Testando execução**

In [None]:
# Make the sweep script executable for its owner only; mode 777 would also make it
# writable by every user.
!chmod u+x /content/BTHOWeN/software_model/train_swept_models_alt.py

In [None]:
# Run the sweep script for a single configuration: 29 filter inputs, 8192 filter
# entries, 3 hash functions, 4 bits per input, using 2 worker processes.
# NOTE(review): no visible cell writes train_swept_models_alt.py, and the dataset
# name 'classnavios' is not the 'custom' name handled by get_datasets in this
# notebook. Presumably the script on disk differs from the cells shown; confirm.
!/content/BTHOWeN/software_model/train_swept_models_alt.py classnavios --filter_inputs 29 --filter_entries 8192 --filter_hashes 3 --bits_per_input 4 --num_workers 2

Do runs with 4 bit(s) per input
Binarizando dataset de treino e validação
Binarizando dataset de teste
Lançando jobs para 1 configurações usando 2 workers
Training model
Maximum possible bleach value is 65
With bleaching=16, accuracy=23.55%; precision=0.2710; recall=0.2355
Ties=295/484 (60.9504%)
With bleaching=32, accuracy=22.93%; precision=0.3656; recall=0.2293
Ties=399/484 (82.438%)
With bleaching=48, accuracy=18.39%; precision=0.1469; recall=0.1839
Ties=460/484 (95.0413%)
With bleaching=8, accuracy=29.96%; precision=0.4375; recall=0.2996
Ties=165/484 (34.0909%)
With bleaching=24, accuracy=25.41%; precision=0.2494; recall=0.2541
Ties=348/484 (71.9008%)
With bleaching=4, accuracy=42.77%; precision=0.4767; recall=0.4277
Ties=64/484 (13.2231%)
With bleaching=12, accuracy=24.17%; precision=0.2367; recall=0.2417
Ties=238/484 (49.1736%)
With bleaching=2, accuracy=53.72%; precision=0.5862; recall=0.5372
Ties=14/484 (2.8926%)
With bleaching=6, accuracy=37.60%; precision=0.4468; recall=0.376