# PCA metrics

### Importing libraries

In [19]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import precision_score, recall_score, f1_score

import os
from os import listdir

import json

### Directories and setup

In [20]:
# All paths are resolved relative to the current working directory at the
# time this cell runs.
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'data')
# Aggregated mean/std files and the comparison figure are written here.
output_dir = os.path.join(base_dir, 'outputs')
# PCA-reduced inputs are read from here; per-dimension result files are
# written here as well.
processed_data_dir = os.path.join(base_dir, 'processed_data')

In [21]:
# Create the working directories up front; exist_ok=True makes re-running
# this cell harmless when they already exist.
for directory in [data_dir, output_dir, processed_data_dir]:
    os.makedirs(directory, exist_ok=True)

In [22]:
# Hyperparameters consumed by run_training().
config = {
    'batch_size': 32,        # batch size for both the train and test DataLoader
    'hidden_dim': 100,       # width of the network's hidden layer
    'out_dim': 2,            # number of output logits
    'dropout_prob': 0.5,     # dropout probability applied after the hidden layer
    'learning_rate': 0.01,   # SGD learning rate
    'weight_decay': 0.0005,  # SGD weight decay
    'num_epochs': 100        # epochs per training run
}
# Dimension identifiers iterated by the training cell; each one selects the
# input file data_pca_{dim}.pth.
dims = [3, 20, 90, 373, 154]

### Functions and classes

In [23]:
class FeedforwardNetwork(nn.Module):
    """Single-hidden-layer classifier: Linear -> ReLU -> Dropout -> Linear.

    The class used to be defined twice in a row with identical bodies; the
    redundant second definition has been removed.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Number of units in the hidden layer.
        out_dim: Number of output logits.
        dropout_prob: Dropout probability applied to the hidden activations.
    """

    def __init__(self, input_dim, hidden_dim, out_dim, dropout_prob):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        """Return the raw (un-normalised) output logits for ``x``."""
        x = torch.relu(self.fc1(x))
        # Dropout is only active in training mode; model.eval() disables it.
        x = self.dropout(x)
        x = self.fc2(x)
        return x

def train(model, train_loader_pca, loss_function, optimizer):
    """Run one optimisation pass over ``train_loader_pca``.

    Args:
        model: Module to optimise; it is switched to training mode.
        train_loader_pca: Iterable of ``(inputs, targets)`` batches.
        loss_function: Callable applied as ``loss_function(outputs, targets)``.
        optimizer: Optimiser stepped once per batch.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for inputs, targets in train_loader_pca:
        optimizer.zero_grad()
        batch_loss = loss_function(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(train_loader_pca)

def evaluate(model, data_loader_pca):
    """Score ``model`` on every batch of ``data_loader_pca`` without gradients.

    The predicted class of a sample is the index of its largest output along
    dimension 1.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        data_loader_pca: Iterable of ``(inputs, labels)`` batches.

    Returns:
        A tuple ``(accuracy, targets, predictions)`` where ``accuracy`` is the
        percentage of correct predictions and the other two are NumPy arrays
        holding every label and every prediction in iteration order.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    seen_targets, seen_predictions = [], []
    with torch.no_grad():
        for inputs, labels in data_loader_pca:
            logits = model(inputs)
            predicted = torch.max(logits.data, 1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()
            seen_targets.extend(labels.cpu().numpy())
            seen_predictions.extend(predicted.cpu().numpy())
    accuracy = 100 * n_correct / n_seen
    return accuracy, np.array(seen_targets), np.array(seen_predictions)

def calculate_metrics(targets, predictions):
    """Return ``(precision, recall, f1)`` for ``predictions`` against ``targets``.

    All three scores are computed by scikit-learn with ``average='binary'``.
    """
    scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = (
        scorer(targets, predictions, average='binary') for scorer in scorers
    )
    return precision, recall, f1

def run_training(iteration, train_data, train_labels, test_data, test_labels, config):
    """Train a fresh network ``iteration`` times and collect final-epoch metrics.

    Every run builds new data loaders, a new FeedforwardNetwork and a new SGD
    optimiser, trains for ``config['num_epochs']`` epochs and prints one
    progress line per epoch.

    Args:
        iteration: Number of independent training runs.
        train_data: Training inputs; ``shape[1]`` is used as the input size.
        train_labels: Training targets.
        test_data: Test inputs.
        test_labels: Test targets.
        config: Mapping providing ``batch_size``, ``hidden_dim``, ``out_dim``,
            ``dropout_prob``, ``learning_rate``, ``weight_decay`` and
            ``num_epochs``.

    Returns:
        Dict mapping the run index to a tuple of last-epoch values:
        ``(train loss, train accuracy, test accuracy, precision, recall, f1)``.
    """
    data_utils = torch.utils.data
    train_dataset_pca = data_utils.TensorDataset(train_data, train_labels)
    test_dataset_pca = data_utils.TensorDataset(test_data, test_labels)

    results = {}
    for i in range(iteration):
        # The training loader reshuffles and drops the last incomplete batch.
        train_loader_pca = data_utils.DataLoader(
            train_dataset_pca,
            batch_size=config['batch_size'],
            shuffle=True,
            drop_last=True,
        )
        test_loader_pca = data_utils.DataLoader(
            test_dataset_pca,
            batch_size=config['batch_size'],
            shuffle=False,
        )

        model = FeedforwardNetwork(
            train_data.shape[1],
            config['hidden_dim'],
            config['out_dim'],
            config['dropout_prob'],
        )
        loss_function = nn.CrossEntropyLoss()
        optimizer = optim.SGD(
            model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config['weight_decay'],
        )
        num_epochs = config['num_epochs']

        # One series per reported metric, in the order used for the result tuple.
        history = {
            name: []
            for name in ('loss', 'train_acc', 'test_acc', 'precision', 'recall', 'f1')
        }

        for epoch in range(num_epochs):
            train_loss_pca = train(model, train_loader_pca, loss_function, optimizer)
            train_accuracy, _, _ = evaluate(model, train_loader_pca)
            test_accuracy, test_targets, test_predictions = evaluate(model, test_loader_pca)
            precision, recall, f1 = calculate_metrics(test_targets, test_predictions)

            epoch_values = (train_loss_pca, train_accuracy, test_accuracy, precision, recall, f1)
            for series, value in zip(history.values(), epoch_values):
                series.append(value)

            print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss_pca:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

        # Only the values of the last epoch are kept for this run.
        results[i] = tuple(series[-1] for series in history.values())

    return results

In [24]:
def load_data_for_dimension(dim, processed_data_dir):
    """Load the stored dataset for ``dim`` and convert it to tensors.

    Reads ``data_pca_{dim}.pth`` from ``processed_data_dir``. The stored object
    is indexed with the keys ``'train'``, ``'test'``, ``'tr_lab'`` and
    ``'te_lab'``.

    Args:
        dim: Value substituted into the file name.
        processed_data_dir: Directory containing the ``.pth`` file.

    Returns:
        ``(train_data, test_data, train_labels, test_labels)``; the data
        tensors are float32 and the label tensors are int64.

    Raises:
        FileNotFoundError: If the expected file does not exist.
    """
    file_path = os.path.join(processed_data_dir, f'data_pca_{dim}.pth')
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found")

    data_pca = torch.load(file_path)

    def _to_tensor(key, dtype):
        return torch.tensor(data_pca[key], dtype=dtype)

    return (
        _to_tensor('train', torch.float32),
        _to_tensor('test', torch.float32),
        _to_tensor('tr_lab', torch.long),
        _to_tensor('te_lab', torch.long),
    )

In [25]:
def load_and_plot_results(dims, output_dir):
    """Plot the stored mean/std metrics against ``dims`` in a 3x2 grid.

    For every entry of ``dims`` the file ``mean_data_pca_{dim}_le.pth`` is
    loaded from ``output_dir``. Each metric is drawn as an error-bar line
    (mean with the std as error). The figure is saved as
    ``performance_comparison.png`` in ``output_dir`` and then shown.

    Fix: the accumulator lists were previously created with
    ``mean_x, std_x = []``, which raises ``ValueError`` (an empty list cannot
    be unpacked into two names), so the function failed on its first
    statement. Each list is now created independently.

    Args:
        dims: Values used both in the file names and as x coordinates.
        output_dir: Directory holding the ``.pth`` files; also receives the
            saved figure.
    """
    # (panel title, key of the mean value, key of the std value), in panel order.
    metric_specs = [
        ('Mean Loss', 'mean_loss', 'std_loss'),
        ('Mean Train Accuracy', 'mean_train', 'std_train'),
        ('Mean Test Accuracy', 'mean_test', 'std_test'),
        ('Mean Precision', 'mean_prec', 'std_prec'),
        ('Mean Recall', 'mean_rec', 'std_rec'),
        ('Mean F1 Score', 'mean_f1', 'std_f1'),
    ]
    mean_series = {title: [] for title, _, _ in metric_specs}
    std_series = {title: [] for title, _, _ in metric_specs}

    for dim in dims:
        # NOTE(review): recent PyTorch releases default torch.load to
        # weights_only=True; confirm the stored files still load under the
        # installed version.
        mean_std_values = torch.load(os.path.join(output_dir, f'mean_data_pca_{dim}_le.pth'))
        for title, mean_key, std_key in metric_specs:
            mean_series[title].append(mean_std_values[mean_key])
            std_series[title].append(mean_std_values[std_key])

    fig, axs = plt.subplots(3, 2, figsize=(15, 15))
    # axs.flat walks the grid row by row, matching the order of metric_specs.
    for ax, (title, _, _) in zip(axs.flat, metric_specs):
        ax.errorbar(dims, mean_series[title], yerr=std_series[title], fmt='-o', capsize=5)
        ax.set_title(title)
        ax.set_xlabel('Dimensions')
        ax.set_ylabel('Score')
        ax.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'performance_comparison.png'))
    plt.show()

In [26]:
def calculate_mean_std(file_path):
    """Aggregate per-run results from a JSON file into means and stds.

    The file must hold a JSON object whose values are six-element sequences
    ordered as ``(train loss, train accuracy, test accuracy, precision,
    recall, f1)``. The object's keys are ignored.

    Args:
        file_path: Path of the JSON results file.

    Returns:
        Dict with a ``mean_*`` and a ``std_*`` entry (NumPy mean / population
        std) for each of ``loss``, ``train``, ``test``, ``prec``, ``rec`` and
        ``f1``.
    """
    with open(file_path, 'r') as f:
        loaded_results = json.load(f)

    # One bucket per metric, in the same order as the stored sequences.
    buckets = [[] for _ in range(6)]
    for run in loaded_results.values():
        # Strict unpacking: a run must contain exactly six values.
        train_loss, train_acc, test_acc, test_prec, test_rec, test_f1 = run
        run_values = (train_loss, train_acc, test_acc, test_prec, test_rec, test_f1)
        for bucket, value in zip(buckets, run_values):
            bucket.append(value)

    key_pairs = (
        ('mean_loss', 'std_loss'),
        ('mean_train', 'std_train'),
        ('mean_test', 'std_test'),
        ('mean_prec', 'std_prec'),
        ('mean_rec', 'std_rec'),
        ('mean_f1', 'std_f1'),
    )
    mean_std_dict = {}
    for (mean_key, std_key), values in zip(key_pairs, buckets):
        mean_std_dict[mean_key] = np.mean(values)
        mean_std_dict[std_key] = np.std(values)

    return mean_std_dict

## Training

In [None]:
# For every configured dimension: load its data, run 10 independent training
# runs, persist the per-run results, then persist their mean/std summary.
for dim in dims:
    print(f"Training for dimension {dim}")
    train_data_pca, test_data_pca, train_labels, test_labels = load_data_for_dimension(dim, processed_data_dir)
    results = run_training(10, train_data_pca, train_labels, test_data_pca, test_labels, config)
    
    # Save the per-run results. The file is named .txt but its content is
    # JSON; json.dump writes the integer run indices as string keys.
    title = f'results_{dim}.txt'
    file_path = os.path.join(processed_data_dir, title)
    with open(file_path, 'w') as f:
        json.dump(results, f)
    print(f"Results for dimension {dim} saved in {file_path}")

    # Calculate mean and standard deviation by re-reading the file just
    # written, and store the summary for the plotting step.
    mean_std_values = calculate_mean_std(file_path)
    torch.save(mean_std_values, os.path.join(output_dir, f'mean_data_pca_{dim}_le.pth'))


## Plots

In [None]:
# NOTE(review): this rebinds the global `dims`. The training cell writes
# mean_data_pca_{dim}_le.pth only for the dims configured earlier
# ([3, 20, 90, 373, 154]); 2, 4 and 5 are not among them, so those files would
# have to come from a different run — confirm the intended list.
dims = [2, 3, 4, 5]
load_and_plot_results(dims, output_dir)