### Neural Network Model for Comparison (Max-all)
In comparison to state of the art models, we train a neural network on the normalized data including the raw data. 
Since our dataset comparises image data, a natural thing to do is compare with a CNN model. However, we do not compare with a CNN model because the resolution of the data is small. The data are 10x10 images. We trained CNN models, though but did not get any better results.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from time import perf_counter

class DynamicBinaryClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super(DynamicBinaryClassifier, self).__init__()
        layers = []
        prev_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.ReLU())
            prev_size = hidden_size
        self.hidden_layers = nn.Sequential(*layers)
        self.output_layer = nn.Linear(prev_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.hidden_layers(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x

class BinaryClassifierTrainer:
    def __init__(self, model, criterion, optimizer, device="cuda" if torch.cuda.is_available() else "cpu"):
        self.model = model.to(device)
        self.criterion = criterion
        self.optimizer = optimizer
        self.device = device

    def _to_tensor(self, data):
        if not torch.is_tensor(data):
            data = torch.tensor(data, dtype=torch.float32)
        return data.to(self.device)

    def train(self, input_data, labels, num_epochs=100, batch_size=32):
        input_data, labels = self._to_tensor(input_data), self._to_tensor(labels)

        for epoch in range(num_epochs):
            total_loss = 0.0

            # Forward pass and calculate loss
            for i in range(0, len(input_data), batch_size):
                batch_input = input_data[i:i + batch_size]
                batch_labels = labels[i:i + batch_size]

                outputs = self.model(batch_input)
                loss = self.criterion(outputs, batch_labels)

                # Backward pass and optimization
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            average_loss = total_loss / (len(input_data) / batch_size)

            # Print the average loss every 100 epochs
            if (epoch + 1) % 100 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_loss:.8f}')

    def evaluate(self, input_data, labels):
        input_data, labels = self._to_tensor(input_data), self._to_tensor(labels)

        with torch.no_grad():
            outputs = self.model(input_data)
            predictions = (outputs > 0.5).float()

        accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        return accuracy
    
from glob import glob

# Example usage:
input_size = 100
hidden_sizes = [70,64, 32]
output_size = 1

model = DynamicBinaryClassifier(input_size, hidden_sizes, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

trainer = BinaryClassifierTrainer(model, criterion, optimizer)

start = perf_counter()
# read data
batch_size = 128
files = glob('../../data/data-norm/max-pixel-all/*.csv')
file_names = [f.split('/')[-1] for f in files]
results = {name:[] for name in file_names}
for name in results:
    for _ in range(3):
        print(name)
        dat = pd.read_csv(f'../../data/data-norm/max-pixel-all/{name}')
        data = dat.iloc[:, 1:].values
        labels = dat.iloc[:, 0].values.reshape(-1, 1)
        # Split the data into training and test sets
        input_train, input_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.2, random_state=42)
        # Training
        trainer.train(input_train, labels_train, num_epochs=200, batch_size=batch_size)
        # Evaluation
        test_accuracy = trainer.evaluate(input_test, labels_test)
        print(f'Test Accuracy: {test_accuracy:.4f}')
        results[name].append(test_accuracy)
data_accuracies = pd.DataFrame(results)
data_accuracies.to_csv('../../data/data-norm/accuracies-nnet-max-all.csv', index=False)

end = perf_counter()
print(f'Total time taken: {(end-start)/60:.6f} minutes')
# dat = pd.read_csv('../../data/data-norm/max-only/raw_image_data.csv')
# data = dat.iloc[:, 1:].values
# labels = dat.iloc[:, 0].values.reshape(-1, 1)

# # Split the data into training and test sets
# input_train, input_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# # Training
# trainer.train(input_train, labels_train, num_epochs=1000, batch_size=32)

# # Evaluation
# test_accuracy = trainer.evaluate(input_test, labels_test)
# print(f'Test Accuracy: {test_accuracy:.4f}')


norm_1_data.csv
Epoch [100/200], Average Loss: 0.50220957
Epoch [200/200], Average Loss: 0.47047489
Test Accuracy: 0.7443
norm_1_data.csv
Epoch [100/200], Average Loss: 0.44671910
Epoch [200/200], Average Loss: 0.42460004
Test Accuracy: 0.7501
norm_1_data.csv
Epoch [100/200], Average Loss: 0.40238596
Epoch [200/200], Average Loss: 0.38110497
Test Accuracy: 0.7492
nthroot_0.4828.csv
Epoch [100/200], Average Loss: 0.52759860
Epoch [200/200], Average Loss: 0.50698699
Test Accuracy: 0.7336
nthroot_0.4828.csv
Epoch [100/200], Average Loss: 0.49583961
Epoch [200/200], Average Loss: 0.48641860
Test Accuracy: 0.7384
nthroot_0.4828.csv
Epoch [100/200], Average Loss: 0.47884587
Epoch [200/200], Average Loss: 0.47195439
Test Accuracy: 0.7406
nthroot_0.5172.csv
Epoch [100/200], Average Loss: 0.46667791
Epoch [200/200], Average Loss: 0.46149010
Test Accuracy: 0.7496
nthroot_0.5172.csv
Epoch [100/200], Average Loss: 0.45612927
Epoch [200/200], Average Loss: 0.45262347
Test Accuracy: 0.7531
nthroot_0

In [2]:
data_accuracies

Unnamed: 0,norm_1_data.csv,nthroot_0.4828.csv,nthroot_0.5172.csv,norm_31.csv,nthroot_0.5862_data.csv,nthroot_0.7931.csv,nthroot_0.5862.csv,nthroot_0.4483_data.csv,nthroot_1.0.csv,norm_31_data.csv,...,nthroot_0.2069_data.csv,norm_5_data.csv,nthroot_0.7931_data.csv,nthroot_0.4483.csv,norm_2.csv,nthroot_log.csv,nthroot_0.9655.csv,nthroot_log_data.csv,norm_1.csv,norm_5.csv
0,0.744267,0.733627,0.749587,0.452027,0.765731,0.762979,0.77417,0.774537,0.745735,0.607595,...,0.442855,0.442855,0.442855,0.452027,0.45166,0.452027,0.451844,0.442855,0.452027,0.452027
1,0.750138,0.738397,0.753073,0.452027,0.767932,0.761328,0.777105,0.775087,0.750688,0.624839,...,0.442855,0.442855,0.442855,0.452027,0.45166,0.452027,0.451844,0.442855,0.452027,0.452027
2,0.74922,0.740598,0.754174,0.452027,0.766098,0.765731,0.770501,0.774904,0.754174,0.624289,...,0.442855,0.442855,0.442855,0.452027,0.45166,0.452027,0.451844,0.442855,0.452027,0.452027


In [3]:
import numpy as np
np.max(data_accuracies, axis=1)

0    0.774537
1    0.777105
2    0.774904
dtype: float64

In [6]:
idcs = np.argmax(data_accuracies.values, axis=1)
idcs

array([7, 6, 7])

In [7]:
data_accuracies.iloc[:, idcs]

Unnamed: 0,nthroot_0.4483_data.csv,nthroot_0.5862.csv,nthroot_0.4483_data.csv.1
0,0.774537,0.77417,0.774537
1,0.775087,0.777105,0.775087
2,0.774904,0.770501,0.774904


Of all normalizations as well as the raw data, the $r^{th}$ with max-all over each image performs the best at an average of $77\%$ accuracy on $20\%$ test data for neural nets models. So judging from these runs, we conlcude that for neural net models, dividing each pixel with the absolute max over the entire dataset is more benefitial than dividing by max over each image.