# Estimating accuracy errors

### Importing libraries

In [12]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import precision_score, recall_score, f1_score

import os
from os import listdir

import json


### Directories and principal variables

In [13]:
# Number of PCA components of the dataset to work on. It selects the
# 'data_pca_<dim>.pth' input file and names the 'results_<dim>.txt' output.
dim = 373
# Presumably the other settings this notebook is run with: 20, 90, 373, 1544

In [14]:
# Every project path below is resolved relative to the current working directory.
base_dir = os.getcwd()

In [15]:
# Project sub-folders, all located directly under base_dir.
data_dir, output_dir, processed_data_dir = (
    os.path.join(base_dir, folder) for folder in ('data', 'outputs', 'processed_data')
)

# PCA-reduced dataset matching the selected number of components.
file_path = os.path.join(processed_data_dir, f'data_pca_{dim}.pth')

# Number of entries found in the raw data folder.
files = os.listdir(data_dir)
print(len(files))

280


## Data

In [16]:
# The saved object is indexed by four keys: train/test features and train/test labels.
data_pca = torch.load(file_path)
train_data_pca, test_data_pca, train_labels, test_labels = (
    data_pca[key] for key in ('train', 'test', 'tr_lab', 'te_lab')
)

### Useful functions and classes

In [17]:
class FeedforwardNetwork(nn.Module):
    """Single-hidden-layer network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: number of units in the hidden layer.
        out_dim: number of output scores produced per sample.
        dropout_prob: dropout probability applied to the hidden activations.
    """

    def __init__(self, input_dim, hidden_dim, out_dim, dropout_prob):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        """Return the raw output scores (no softmax applied) for the batch ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(self.dropout(hidden))

# Training function
def train(model, train_loader_pca, loss_function, optimizer):
    """Run one optimisation pass over every batch of ``train_loader_pca``.

    Args:
        model: module to optimise; it is switched to training mode.
        train_loader_pca: iterable yielding ``(inputs, targets)`` batches. It
            does not need to support ``len()``: batches are counted while
            iterating, so generators work as well.
        loss_function: callable mapping ``(outputs, targets)`` to a scalar loss
            tensor.
        optimizer: optimizer bound to the parameters of ``model``.

    Returns:
        The mean of the per-batch loss values, as a Python float.

    Raises:
        ValueError: if the loader yields no batches (for example when
            ``drop_last=True`` and the dataset is smaller than one batch).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for x, y in train_loader_pca:
        optimizer.zero_grad()
        out = model(x)
        loss = loss_function(out, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        # An average over zero batches is undefined; fail with a clear message
        # instead of a bare ZeroDivisionError.
        raise ValueError("train_loader_pca yielded no batches")
    return total_loss / num_batches


In [18]:
def evaluate(model, data_loader_pca):
    """Score ``model`` on every batch of ``data_loader_pca`` without tracking gradients.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: module producing one score per class for each input row.
        data_loader_pca: iterable yielding ``(inputs, targets)`` batches.

    Returns:
        A tuple ``(accuracy, targets, predictions)``: the percentage of
        correctly classified samples, followed by numpy arrays holding every
        target and every predicted class index in iteration order.
    """
    model.eval()
    n_seen = 0
    n_correct = 0
    seen_targets, seen_predictions = [], []
    with torch.no_grad():
        for inputs, labels in data_loader_pca:
            # Predicted class = index of the largest output score.
            guesses = model(inputs).max(dim=1).indices
            n_seen += labels.size(0)
            n_correct += (guesses == labels).sum().item()
            seen_targets.extend(labels.cpu().numpy())
            seen_predictions.extend(guesses.cpu().numpy())
    accuracy = 100 * n_correct / n_seen
    return accuracy, np.array(seen_targets), np.array(seen_predictions)

In [19]:
def calculate_metrics(targets, predictions):
    """Compute binary precision, recall and F1 of ``predictions`` against ``targets``.

    Returns:
        A tuple ``(precision, recall, f1)``, each obtained from the matching
        scikit-learn scorer called with ``average='binary'``.
    """
    scorers = (precision_score, recall_score, f1_score)
    return tuple(score(targets, predictions, average='binary') for score in scorers)

## Training

In [20]:
# Experiment settings: number of independent training runs and mini-batch size.
iteration = 10
batch_size = 32

# Filled by the training cell: run index -> final metrics of that run.
results = {}

# Features are converted to float32 tensors, labels to int64 tensors.
train_data_pca = torch.tensor(train_data_pca, dtype=torch.float32)
test_data_pca = torch.tensor(test_data_pca, dtype=torch.float32)
train_targets = torch.tensor(train_labels, dtype=torch.long)
test_targets = torch.tensor(test_labels, dtype=torch.long)

# Pair every feature row with its label so DataLoader can batch them together.
train_dataset_pca = torch.utils.data.TensorDataset(train_data_pca, train_targets)
test_dataset_pca = torch.utils.data.TensorDataset(test_data_pca, test_targets)

In [21]:
# Hyper-parameters shared by every run; they do not change between runs, so
# they are set once instead of on every pass of the loop.
input_dim = train_data_pca.shape[1]
hidden_dim = 100
out_dim = 2
dropout_prob = 0.5
num_epochs = 100

# Run i is seeded with base_seed + i: runs still differ from each other, but
# re-executing the notebook reproduces the same sequence of runs.
base_seed = 0

# The test loader is never shuffled, so a single instance serves every run.
test_loader_pca = torch.utils.data.DataLoader(test_dataset_pca, batch_size=batch_size, shuffle=False)

for i in range(iteration):
    # Seeds the global torch RNG used for weight initialisation, dropout and shuffling.
    torch.manual_seed(base_seed + i)

    # drop_last=True discards the final incomplete batch of every epoch; the
    # train accuracy below is computed through this same loader.
    train_loader_pca = torch.utils.data.DataLoader(train_dataset_pca, batch_size=batch_size, shuffle=True, drop_last=True)

    print(f"Input dimension: {input_dim}")

    # Fresh model, loss and optimizer for every run
    model = FeedforwardNetwork(input_dim, hidden_dim, out_dim, dropout_prob)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.0005)

    # Per-epoch history of the current run
    train_losses_pca = []
    train_accuracies_pca = []
    test_accuracies_pca = []
    test_precisions_pca = []
    test_recalls_pca = []
    test_f1_scores_pca = []

    for epoch in range(num_epochs):
        train_loss_pca = train(model, train_loader_pca, loss_function, optimizer)
        train_losses_pca.append(train_loss_pca)

        # Only the accuracy is needed for the training split
        train_accuracy = evaluate(model, train_loader_pca)[0]
        train_accuracies_pca.append(train_accuracy)

        # Stored under its own name so the `test_targets` label tensor built
        # in the setup cell is not replaced by a numpy array.
        test_accuracy, epoch_targets, test_predictions = evaluate(model, test_loader_pca)
        test_accuracies_pca.append(test_accuracy)

        precision, recall, f1 = calculate_metrics(epoch_targets, test_predictions)
        test_precisions_pca.append(precision)
        test_recalls_pca.append(recall)
        test_f1_scores_pca.append(f1)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss_pca:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

    print(f"Final Test Precision: {test_precisions_pca[-1]:.2f}")
    print(f"Final Test Recall: {test_recalls_pca[-1]:.2f}")
    print(f"Final Test F1 Score: {test_f1_scores_pca[-1]:.2f}")

    # Keep only the last-epoch value of every metric for this run.
    results[i] = (train_losses_pca[-1], train_accuracies_pca[-1], test_accuracies_pca[-1], test_precisions_pca[-1], test_recalls_pca[-1], test_f1_scores_pca[-1])

Input dimension: 373
Epoch 1/100, Train Loss: 0.4159, Train Accuracy: 83.47%, Test Accuracy: 85.64%, Precision: 0.84, Recall: 0.60, F1 Score: 0.70
Epoch 2/100, Train Loss: 0.3899, Train Accuracy: 84.14%, Test Accuracy: 85.75%, Precision: 0.85, Recall: 0.60, F1 Score: 0.70


KeyboardInterrupt: 

### Saving Data

In [None]:
# Write the per-run final metrics as JSON into the processed-data folder.
# NOTE(review): this rebinds `file_path`, which earlier pointed at the PCA
# dataset; the cell that reads the results back relies on the new value.
title = f'results_{dim}.txt'
file_path = os.path.join(processed_data_dir, title)
with open(file_path, 'w') as results_file:
    results_file.write(json.dumps(results))

print(f"Risultati {title} salvati in {file_path}")

## Standard deviation and mean

In [None]:
# Read the saved per-run metrics back from the JSON file written above.
with open(file_path, 'r') as results_file:
    loaded_results = json.loads(results_file.read())

In [None]:
# Every value of loaded_results holds the final metrics of one run, in the
# order: train loss, train accuracy, test accuracy, precision, recall, F1.
# The metrics are picked by position so that no loop target rebinds the
# per-epoch history lists (train_losses_pca, ...) created by the training cell.
run_metrics = list(loaded_results.values())

train_losses_list = [run[0] for run in run_metrics]
train_accuracies_list = [run[1] for run in run_metrics]
test_accuracies_list = [run[2] for run in run_metrics]
test_precisions_list = [run[3] for run in run_metrics]
test_recall_list = [run[4] for run in run_metrics]
test_f1_score_list = [run[5] for run in run_metrics]

train_losses_array = np.array(train_losses_list)
train_accuracies_array = np.array(train_accuracies_list)
test_accuracies_array = np.array(test_accuracies_list)
test_precisions_array = np.array(test_precisions_list)
test_recall_array = np.array(test_recall_list)
test_f1_score_array = np.array(test_f1_score_list)

# Each array holds one scalar per run, so axis=0 averages over the runs.
mean_train_losses = np.mean(train_losses_array, axis=0)
mean_train_accuracies = np.mean(train_accuracies_array, axis=0)
mean_test_accuracies = np.mean(test_accuracies_array, axis=0)
mean_test_precisions = np.mean(test_precisions_array, axis=0)
mean_test_recall = np.mean(test_recall_array, axis=0)
mean_test_f1_score = np.mean(test_f1_score_array, axis=0)

# np.std is called with its default ddof, i.e. the population standard deviation.
std_train_losses = np.std(train_losses_array, axis=0)
std_train_accuracies = np.std(train_accuracies_array, axis=0)
std_test_accuracies = np.std(test_accuracies_array, axis=0)
std_test_precisions = np.std(test_precisions_array, axis=0)
std_test_recall = np.std(test_recall_array, axis=0)
std_test_f1_score = np.std(test_f1_score_array, axis=0)

# NOTE(review): nothing visible in this notebook writes these values to the
# 'mean_data_pca_<n>_le.pth' files loaded further down - confirm where they are saved.

In [None]:
# Keys read from every 'mean_data_pca_<n>_le.pth' file, in the order in which
# the values are unpacked below (six means first, then the six matching stds).
_STAT_KEYS = (
    'm_loss', 'm_train', 'm_test', 'mean_prec', 'mean_rec', 'mean_f1',
    'std_loss', 'std_train', 'std_test', 'std_prec', 'std_rec', 'std_f1',
)


def _unpack_stats(stats):
    """Return the values of ``stats`` for every key of ``_STAT_KEYS``, in order."""
    return tuple(stats[key] for key in _STAT_KEYS)


# 3 principal components
mean_std_values_3 = torch.load('mean_data_pca_3_le.pth')
(mean_train_losses_3, mean_train_accuracies_3, mean_test_accuracies_3,
 mean_test_precisions_3, mean_test_recall_3, mean_test_f1_score_3,
 std_train_losses_3, std_train_accuracies_3, std_test_accuracies_3,
 std_test_precisions_3, std_test_recall_3, std_test_f1_score_3) = _unpack_stats(mean_std_values_3)

# 20 principal components
mean_std_values_20 = torch.load('mean_data_pca_20_le.pth')
(mean_train_losses_20, mean_train_accuracies_20, mean_test_accuracies_20,
 mean_test_precisions_20, mean_test_recall_20, mean_test_f1_score_20,
 std_train_losses_20, std_train_accuracies_20, std_test_accuracies_20,
 std_test_precisions_20, std_test_recall_20, std_test_f1_score_20) = _unpack_stats(mean_std_values_20)

# 90 principal components
mean_std_values_90 = torch.load('mean_data_pca_90_le.pth')
(mean_train_losses_90, mean_train_accuracies_90, mean_test_accuracies_90,
 mean_test_precisions_90, mean_test_recall_90, mean_test_f1_score_90,
 std_train_losses_90, std_train_accuracies_90, std_test_accuracies_90,
 std_test_precisions_90, std_test_recall_90, std_test_f1_score_90) = _unpack_stats(mean_std_values_90)

# 373 principal components
mean_std_values_373 = torch.load('mean_data_pca_373_le.pth')
(mean_train_losses_373, mean_train_accuracies_373, mean_test_accuracies_373,
 mean_test_precisions_373, mean_test_recall_373, mean_test_f1_score_373,
 std_train_losses_373, std_train_accuracies_373, std_test_accuracies_373,
 std_test_precisions_373, std_test_recall_373, std_test_f1_score_373) = _unpack_stats(mean_std_values_373)

# 1544 principal components
mean_std_values_1544 = torch.load('mean_data_pca_1544_le.pth')
(mean_train_losses_1544, mean_train_accuracies_1544, mean_test_accuracies_1544,
 mean_test_precisions_1544, mean_test_recall_1544, mean_test_f1_score_1544,
 std_train_losses_1544, std_train_accuracies_1544, std_test_accuracies_1544,
 std_test_precisions_1544, std_test_recall_1544, std_test_f1_score_1544) = _unpack_stats(mean_std_values_1544)

In [None]:
# One list per metric; every list is ordered by number of principal
# components: 3, 20, 90, 373, 1544.

# Train loss
mean_losses = [
    mean_train_losses_3, mean_train_losses_20, mean_train_losses_90,
    mean_train_losses_373, mean_train_losses_1544,
]
std_losses = [
    std_train_losses_3, std_train_losses_20, std_train_losses_90,
    std_train_losses_373, std_train_losses_1544,
]

# Train accuracy
mean_t_accuracies = [
    mean_train_accuracies_3, mean_train_accuracies_20, mean_train_accuracies_90,
    mean_train_accuracies_373, mean_train_accuracies_1544,
]
std_t_accuracies = [
    std_train_accuracies_3, std_train_accuracies_20, std_train_accuracies_90,
    std_train_accuracies_373, std_train_accuracies_1544,
]

# Test accuracy
mean_accuracies = [
    mean_test_accuracies_3, mean_test_accuracies_20, mean_test_accuracies_90,
    mean_test_accuracies_373, mean_test_accuracies_1544,
]
std_accuracies = [
    std_test_accuracies_3, std_test_accuracies_20, std_test_accuracies_90,
    std_test_accuracies_373, std_test_accuracies_1544,
]

# Test precision
mean_precision = [
    mean_test_precisions_3, mean_test_precisions_20, mean_test_precisions_90,
    mean_test_precisions_373, mean_test_precisions_1544,
]
std_precision = [
    std_test_precisions_3, std_test_precisions_20, std_test_precisions_90,
    std_test_precisions_373, std_test_precisions_1544,
]

# Test recall
mean_recall = [
    mean_test_recall_3, mean_test_recall_20, mean_test_recall_90,
    mean_test_recall_373, mean_test_recall_1544,
]
std_recall = [
    std_test_recall_3, std_test_recall_20, std_test_recall_90,
    std_test_recall_373, std_test_recall_1544,
]

# Test F1 score
mean_f1 = [
    mean_test_f1_score_3, mean_test_f1_score_20, mean_test_f1_score_90,
    mean_test_f1_score_373, mean_test_f1_score_1544,
]
std_f1 = [
    std_test_f1_score_3, std_test_f1_score_20, std_test_f1_score_90,
    std_test_f1_score_373, std_test_f1_score_1544,
]

## Plots

In [None]:
title = 'PCA_train_metrics'

# Kept under its own name: rebinding `dim` to a list would overwrite the
# scalar setting that the data/results file names are built from.
pca_dims = [3, 20, 90, 373, 1544]

# One entry per panel, in row-major order:
# (mean values, std values, panel title and legend label, y-axis label)
panels = [
    (mean_losses, std_losses, 'Train Losses', 'Train Loss'),
    (mean_t_accuracies, std_t_accuracies, 'Train Accuracies', 'Train Accuracy (%)'),
    (mean_accuracies, std_accuracies, 'Test Accuracies', 'Test Accuracy (%)'),
    (mean_precision, std_precision, 'Precisions', 'Precision'),
    (mean_recall, std_recall, 'Recall', 'Recall'),
    (mean_f1, std_f1, 'F1 Score', 'F1 Score'),
]

fig, axs = plt.subplots(3, 2, figsize=(15, 15))
for ax, (means, stds, name, ylabel) in zip(axs.flat, panels):
    # Mean over the runs with the run-to-run standard deviation as error bar
    ax.errorbar(pca_dims, means, yerr=stds, fmt='-o', label=name)
    ax.set_title(name)
    ax.set_xlabel('Componenti Principali')
    ax.set_ylabel(ylabel)
    ax.grid(True)
    ax.legend()

plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, title+'.png'))
plt.show()

In [None]:
title = 'PCA_test_metrics'

# Kept under its own name: rebinding `dim` to a list would overwrite the
# scalar setting that the data/results file names are built from.
pca_dims = [3, 20, 90, 373, 1544]

# One entry per panel, in row-major order:
# (mean values, std values, series label, panel title, y-axis label)
panels = [
    (mean_accuracies, std_accuracies, 'Test Accuracies', 'Test Accuracies vs. PC', 'Test Accuracy (%)'),
    (mean_precision, std_precision, 'Precisions', 'Precisions vs. PC', 'Precision'),
    (mean_recall, std_recall, 'Recall', 'Recall vs. PC', 'Recall'),
    (mean_f1, std_f1, 'F1 Score', 'F1 Score vs. PC', 'F1 Score'),
]

fig, axs = plt.subplots(2, 2, figsize=(12, 12))
for ax, (means, stds, label, panel_title, ylabel) in zip(axs.flat, panels):
    # Mean over the runs with the run-to-run standard deviation as error bar
    ax.errorbar(pca_dims, means, yerr=stds, fmt='-o', label=label, color='cadetblue')
    ax.set_title(panel_title)
    ax.set_xlabel('Componenti Principali')
    ax.set_ylabel(ylabel)
    ax.grid(True)

# Same layout adjustment as the train-metrics figure, so labels do not overlap.
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, title+'.png'))
plt.show()