In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
import pickle
import seaborn as sns

from torch_geometric.utils import to_networkx
#install required packages
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt

from torch_geometric.data import Dataset
import torch_geometric.utils as pyg_utils
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool,GATv2Conv
from torch_geometric.nn.models import GCN, GAT
from torch.nn import Linear

from torch_geometric.utils import degree

import torch.nn as nn
from torch_geometric.utils import softmax
import math
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr
import random
from sklearn.metrics import root_mean_squared_error,mean_absolute_error


2.4.1+cu118


In [2]:
# Population standard deviation (NumPy's default, ddof=0) of five hard-coded values.
# NOTE(review): presumably per-run scores; their origin is not recorded in this notebook — confirm.
np.std([0.7280, 0.7030, 0.7110, 0.715,0.72])

0.008404760555780284

In [3]:

def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU, the current CUDA
    device and all CUDA devices), then switches cuDNN to deterministic mode
    with auto-tuning disabled so that repeated runs select the same kernels.

    Args:
        seed: Integer seed handed to every generator.
    """
    seeders = (
        random.seed,                 # Python stdlib RNG
        np.random.seed,              # NumPy global RNG
        torch.manual_seed,           # PyTorch CPU RNG
        torch.cuda.manual_seed,      # PyTorch RNG of the current GPU
        torch.cuda.manual_seed_all,  # PyTorch RNG of every GPU (multi-GPU runs)
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # deterministic cuDNN behaviour
    cudnn.benchmark = False     # no dynamic algorithm selection (it hurts reproducibility)

# Imposta il seed
set_seed(42)


In [4]:
import esm

# Load the pre-trained ESM2 model
model_esm, alphabet_esm = esm.pretrained.esm2_t33_650M_UR50D()
#esm.pretrained.esm2_t48_15B_UR50D() #esm.pretrained.esm2_t33_650M_UR50D()

# # available models
# esm.pretrained.esm2_t6_8M_UR50D()   # Smallest model (8 million parameters)
# esm.pretrained.esm2_t33_650M_UR50D() # Medium model (650 million parameters)
# esm.pretrained.esm2_t36_3B_UR50D()   # Larger model (3 billion parameters)

# Move the model to the GPU, if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_esm = model_esm.to(device)
batch_converter_esm = alphabet_esm.get_batch_converter()
# Switch to evaluation mode: the model is only used for inference here
model_esm.eval()

def Esm2_embedding(seq, model_esm = model_esm, batch_converter_esm = batch_converter_esm, repr_layer = 33):
    """Compute the per-residue embedding of one protein sequence.

    Args:
        seq: Amino-acid sequence as a string.
        model_esm: Model used to embed the sequence (defaults to the
            notebook-level model loaded in this cell).
        batch_converter_esm: Batch converter matching ``model_esm``.
        repr_layer: Index of the layer whose representations are returned.
            The default, 33, is the value the original code hard-coded
            (described there as the last layer); pass a different index when
            using a checkpoint with a different depth.

    Returns:
        NumPy array with the representations of ``repr_layer`` for the single
        input sequence, with the first and last token (the special
        start/end-of-sequence tokens) removed, i.e. a matrix of shape
        (sequence length, embedding dimension).
    """
    # The batch converter expects a list of (label, sequence) pairs.
    sequences = [("protein", seq)]

    # Convert the sequence into the token format required by the model.
    _, _, batch_tokens = batch_converter_esm(sequences)
    batch_tokens = batch_tokens.to(device)

    # No gradients are needed for feature extraction; this saves memory.
    with torch.no_grad():
        results = model_esm(batch_tokens, repr_layers=[repr_layer])
        token_representations = results["representations"][repr_layer]

    # Drop the special start/end tokens of the single sequence in the batch.
    embedding = token_representations[0, 1:-1].cpu().numpy()
    return embedding

In [5]:
from random import sample

class DeltaDataset(Dataset):
    """Dataset of wild-type/mutant embedding pairs with their ddG label.

    Args:
        data: Indexable collection of records; each record is a mapping with
            the keys ``id``, ``wild_type``, ``mut_type``, ``length``, ``ddg``
            and ``pos_mut``.
        dim_embedding: Stored on the instance; not used by this class itself.
        inv: When true, every item is returned as the reverse mutation: the
            wild-type and mutant embeddings are swapped and the sign of ddG
            is flipped.
    """

    def __init__(self, data, dim_embedding, inv = False):
        self.data = data
        self.dim_embedding = dim_embedding
        self.inv = inv

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of tensors (``id`` is passed through)."""
        record = self.data[idx]

        start_emb = record['wild_type']
        end_emb = record['mut_type']
        ddg_value = float(record['ddg'])
        if self.inv:
            # Reverse mutation: swap the two states and negate the label.
            start_emb, end_emb = end_emb, start_emb
            ddg_value = -ddg_value

        return {
            'id': record['id'],
            'wild_type': torch.tensor(start_emb, dtype=torch.float32),
            'mut_type': torch.tensor(end_emb, dtype=torch.float32),
            'length': torch.tensor(record['length'], dtype=torch.float32),
            'ddg': torch.tensor(ddg_value, dtype=torch.float32),
            'pos_mut': torch.tensor(record['pos_mut'], dtype=torch.int64),
            # Placeholder intermediate state: the record's mutant embedding
            # multiplied by zero (the same expression in both directions).
            'hydra_slim': torch.tensor(record['mut_type']*0, dtype=torch.float32),
        }



# class DeltaDataset_nohydra(Dataset):
#     def __init__(self, data, dim_embedding, inv = False):
#         self.data = data
#         self.dim_embedding = dim_embedding
#         self.inv = inv

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sample = self.data[idx]

#         if self.inv: 
#             return {
#                 'id': sample['id'],
#                 'wild_type': torch.tensor(sample['mut_type'], dtype=torch.float32),    # inverto mut con wild 
#                 'mut_type': torch.tensor(sample['wild_type'], dtype=torch.float32),    # inverto mut con wild             
#                 'length': torch.tensor(sample['length'], dtype=torch.float32),
#                 'ddg': torch.tensor(-float(sample['ddg']), dtype=torch.float32),       # -ddg
#                 'pos_mut': torch.tensor(sample['pos_mut'], dtype=torch.int64),
#                 'hydra_slim': torch.tensor(sample['wild_type'], dtype=torch.float32),
#                 }

#         else:
#             return {
#                 'id': sample['id'],
#                 'wild_type': torch.tensor(sample['wild_type'], dtype=torch.float32),
#                 'mut_type': torch.tensor(sample['mut_type'],dtype=torch.float32),
#                 'length': torch.tensor(sample['length'], dtype=torch.float32),
#                 'ddg': torch.tensor(float(sample['ddg']), dtype=torch.float32),
#                 'pos_mut': torch.tensor(sample['pos_mut'], dtype=torch.int64),
#                 'hydra_slim': torch.tensor(sample['wild_type'], dtype=torch.float32),
#                 }

In [6]:
from torch_geometric.loader import DataLoader
import random

import torch
import torch.nn.functional as F

def collate_fn(batch):
    """Zero-pad a list of samples to a common shape and stack them into a batch.

    Each sample is a dict holding 2-D tensors under ``'wild_type'``,
    ``'mut_type'`` and ``'hydra_slim'``, tensors under ``'length'`` and
    ``'ddg'``, an ``'id'``, and optionally ``'pos_mut'``. The target shape is
    taken from the largest ``'wild_type'`` tensor in the batch.

    Args:
        batch: Non-empty list of sample dicts.

    Returns:
        Dict with:
          * ``'wild_type'``, ``'mut_type'``, ``'hydra_slim'``: tensors of shape
            (batch_size, max_len, max_features);
          * ``'length'``, ``'ddg'``: stacked per-sample tensors;
          * ``'id'``: list of ids;
          * ``'pos_mut'``: list with one entry per sample. It is kept as a list
            rather than stacked so that per-sample entries of different sizes
            do not break batching. The original code created this key but
            never filled it, so it was always an empty list.
    """
    max_len = max(sample['wild_type'].shape[0] for sample in batch)       # longest sequence in the batch
    max_features = max(sample['wild_type'].shape[1] for sample in batch)  # widest feature dimension

    def _pad_to_batch_shape(tensor):
        # F.pad lists the last dimension first: (feat_left, feat_right, seq_top, seq_bottom).
        return F.pad(tensor, (0, max_features - tensor.shape[1],
                              0, max_len - tensor.shape[0]))

    padded_batch = {
        'id': [],
        'wild_type': [],
        'mut_type': [],
        'length': [],
        'ddg': [],
        'pos_mut': [],
        'hydra_slim': [],
    }

    for sample in batch:
        padded_batch['id'].append(sample['id'])
        for key in ('wild_type', 'mut_type', 'hydra_slim'):
            padded_batch[key].append(_pad_to_batch_shape(sample[key]))
        padded_batch['length'].append(sample['length'])
        padded_batch['ddg'].append(sample['ddg'])
        if 'pos_mut' in sample:
            padded_batch['pos_mut'].append(sample['pos_mut'])

    # Turn the lists of equally-shaped tensors into single batch tensors.
    for key in ('wild_type', 'mut_type', 'length', 'ddg', 'hydra_slim'):
        padded_batch[key] = torch.stack(padded_batch[key])

    return padded_batch


def dataloader_generation(path, collate_fn, batch_size = 128, dataloader_shuffle = True, inv= False):
    """Build a DataLoader over the concatenation of one or more pickled datasets.

    Args:
        path: Iterable of pickle file paths (a single string is also accepted).
            Each file must unpickle to a list of records.
        collate_fn: Function that merges a list of samples into a batch.
        batch_size: Number of samples per batch.
        dataloader_shuffle: Whether the loader reshuffles the data every epoch.
        inv: Forwarded to ``DeltaDataset``.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``DeltaDataset``.
    """
    # Use the plain PyTorch loader explicitly. The name `DataLoader` imported at
    # the top of this cell is the torch_geometric loader, which replaces any
    # user-supplied `collate_fn` with its own collater; relying on a later cell
    # to rebind `DataLoader` made the result depend on cell execution order.
    from torch.utils.data import DataLoader as TorchDataLoader

    if isinstance(path, str):
        # A bare string would otherwise be iterated character by character.
        path = [path]

    dim_embedding = 1280
    records = []

    for pickle_path in path:
        # NOTE: pickle.load executes arbitrary code contained in the file;
        # only load dataset files from a trusted source.
        with open(pickle_path, 'rb') as f:
            records += pickle.load(f)

    delta_dataset = DeltaDataset(records, dim_embedding, inv = inv)
    dataloader_delta = TorchDataLoader(delta_dataset, batch_size=batch_size,
                                       shuffle=dataloader_shuffle, collate_fn=collate_fn)

    return dataloader_delta

# def dataloader_generation_nohydra(path, collate_fn, batch_size = 128, dataloader_shuffle = True, inv= False):
    
#     dim_embedding = 1280
#     dataset= []

#     for path in path:
#         with open(path, 'rb') as f:
#             dataset += pickle.load(f)

#     delta_dataset = DeltaDataset_nohydra(dataset, dim_embedding, inv = inv)  
#     dataloader_delta = DataLoader(delta_dataset, batch_size=batch_size, shuffle=dataloader_shuffle, collate_fn=collate_fn)

#     return dataloader_delta




In [7]:
from torch.utils.data import DataLoader  # Use standard PyTorch DataLoader
import random
from itertools import chain
from collections import Counter

# Training files: the five s2450 folds plus their `_inv` counterparts.
# NOTE(review): the `_inv` files presumably already contain the reverse mutations,
# which would explain why the loaders below are built with inv=False — confirm.
train_path =[f's2450_fold_{i}_hydra_slim.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_hydra_slim_inv.pkl' for i in [0,1,2,3,4]]#+[f's2450_fold_{i}_hydra_slim.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_hydra_slim_inv.pkl' for i in [0,1,2,3,4]]


# Alternative training sets tried previously (kept for reference):
#[f's2450_fold_{i}_hydra_slim_SHIFTED1.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_hydra_slim_SHIFTED1_inv.pkl' for i in [0,1,2,3,4]] #
#[f's2450_fold_{i}.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_inv.pkl' for i in [0,1,2,3,4]]#['DA_th0.0_foldx_train.pkl'] + ['foldx_train.pkl'] + ['Double_mut_DA_0.0_foldx_train.pkl']+['test_TS16.pkl']#[f's2450_fold_{i}.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_inv.pkl' for i in [0,1,2,3,4]] + ['ptmul_train.pkl']+['DA_s2450.pkl'] #
# Validation set (used by the training loop for model selection / early stopping).
val_path = ['ptmul_train.pkl']#['s669_Castrense.pkl']#[f's2450_fold_{i}.pkl' for i in val_set]+[f's2450_fold_{i}_inv.pkl' for i in val_set]
# Held-out test set, evaluated every epoch for monitoring only.
test_path = ['M28_test.pkl']#['s669_Castrense_hydra_slim.pkl']#['test_TS16.pkl']

# Training batches are shuffled; validation and test are read one sample at a time, in order.
dataloader_train = dataloader_generation(path = train_path, batch_size = 6,collate_fn=collate_fn, dataloader_shuffle = True, inv= False)
dataloader_validation = dataloader_generation(path = val_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)
dataloader_test = dataloader_generation(path = test_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)


In [8]:
import copy


def output_model_from_batch(batch, model, device, hydra = False,train=False):
    """Move one collated batch to ``device`` and run ``model`` on it.

    Args:
        batch: Dict with tensors under ``'wild_type'``, ``'mut_type'``,
            ``'hydra_slim'``, ``'ddg'`` and ``'length'``.
        model: Callable invoked as
            ``model(x_wild, x_mut, hydra_slim, length, hydra=..., train=...)``.
        device: Target device for all tensors.
        hydra: Forwarded to the model.
        train: Forwarded to the model.

    Returns:
        ``(predictions, labels)`` where ``labels`` is ``batch['ddg']`` cast to
        float on ``device``.
    """
    def _float_on_device(key):
        return batch[key].float().to(device)

    wild_emb = _float_on_device('wild_type')
    mut_emb = _float_on_device('mut_type')
    slim_emb = _float_on_device('hydra_slim')
    targets = _float_on_device('ddg')
    # The lengths are moved as they are, without a float cast.
    seq_lengths = batch['length'].to(device)

    predictions = model(wild_emb, mut_emb, slim_emb, seq_lengths, hydra=hydra, train = train)
    return predictions, targets






def _collect_ddg_predictions(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and gather its outputs.

    Every batch is predicted twice through ``output_model_from_batch``: once
    with ``hydra=False`` and once with ``hydra=True`` (both with
    ``train=False``). The caller is responsible for calling ``model.eval()``.

    Returns:
        ``(preds, preds_hydra, labels)`` as three flat Python lists.
    """
    preds, preds_hydra, labels = [], [], []
    with torch.no_grad():
        for batch in dataloader:
            out_direct, batch_labels = output_model_from_batch(batch, model, device, hydra=False, train=False)
            out_hydra, _ = output_model_from_batch(batch, model, device, hydra=True, train=False)

            preds.extend(out_direct.cpu().reshape(-1).tolist())
            labels.extend(batch_labels.cpu().tolist())
            preds_hydra.extend(out_hydra.cpu().reshape(-1).tolist())
    return preds, preds_hydra, labels


def training_and_validation_loop_ddg(model, dataloader_train, dataloader_test, dataloader_validation, path_save_fig, epochs=20, lr =0.001, patience=10):
    """Train ``model`` on ddG regression and evaluate it after every epoch.

    Each training step minimises ``MSE(direct prediction, label)`` plus a
    consistency term ``MSE(hydra prediction, direct prediction)``, where the
    two predictions come from calling the model with ``hydra=False`` and
    ``hydra=True``. After every epoch the model is evaluated on the test and
    validation loaders. Training stops early once more than ``patience``
    epochs have passed since the best validation Pearson correlation.

    Args:
        model: Model to optimise in place.
        dataloader_train: Loader providing the training batches.
        dataloader_test: Loader evaluated every epoch (monitoring only).
        dataloader_validation: Loader whose Pearson r drives the highlighted
            log lines and early stopping.
        path_save_fig: Unused; kept for interface compatibility.
        epochs: Maximum number of epochs.
        lr: Adam learning rate.
        patience: Epochs to wait after the best validation Pearson r.

    Returns:
        Tuple of twelve per-epoch lists, in this order:
        ``pearson_r_train, pearson_r_validation, pearson_r_test,
        loss_ddg_train, loss_ddg_validation, loss_ddg_test,
        loss_ddg_train_TRANS, loss_ddg_validation_TRANS, loss_ddg_test_TRANS,
        loss_ddg_train_TOT, loss_ddg_validation_TOT, loss_ddg_test_TOT``.
        The ``*_TRANS`` lists hold the MSE between hydra and direct
        predictions; the ``*_TOT`` lists hold the sum of the two losses.
    """
    criterion = nn.MSELoss()  # alternatives tried: nn.HuberLoss(), nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr = lr)

    pearson_r_train, pearson_r_test, pearson_r_validation = [], [], []
    loss_ddg_train, loss_ddg_train_TRANS, loss_ddg_train_TOT = [], [], []
    loss_ddg_validation, loss_ddg_validation_TRANS, loss_ddg_validation_TOT = [], [], []
    loss_ddg_test, loss_ddg_test_TRANS, loss_ddg_test_TOT = [], [], []

    num_epochs = epochs
    for epoch in range(num_epochs):

        # ---- Training ----
        model.train()
        preds_ddg_train = []
        labels_tot_epoch = []
        preds_ddg_train_TRANS = []

        for batch in dataloader_train:
            optimizer.zero_grad()
            output_ddg_train, labels_train = output_model_from_batch(batch, model, device, hydra=False, train=True)
            output_ddg_HYDRA_SLIM_train, _ = output_model_from_batch(batch, model, device, hydra=True, train=True)

            # Supervised loss on the direct prediction plus a consistency loss
            # pulling the hydra prediction towards the direct one.
            loss_ddg = criterion(output_ddg_train, labels_train)
            tot_loss = loss_ddg + criterion(output_ddg_HYDRA_SLIM_train, output_ddg_train)

            tot_loss.backward()
            optimizer.step()

            preds_ddg_train.extend(output_ddg_train.detach().cpu().reshape(-1).tolist())
            labels_tot_epoch.extend(labels_train.cpu().tolist())
            preds_ddg_train_TRANS.extend(output_ddg_HYDRA_SLIM_train.detach().cpu().reshape(-1).tolist())

        train_loss = mean_squared_error(preds_ddg_train, labels_tot_epoch)
        # The target of the consistency loss is the direct prediction itself.
        train_loss_TRANS = mean_squared_error(preds_ddg_train_TRANS, preds_ddg_train)

        train_correlation = pearsonr(preds_ddg_train, labels_tot_epoch)[0]
        train_spearman = spearmanr(preds_ddg_train, labels_tot_epoch)[0]

        loss_ddg_train.append(train_loss)
        loss_ddg_train_TRANS.append(train_loss_TRANS)
        loss_ddg_train_TOT.append(train_loss_TRANS+train_loss)
        pearson_r_train.append(train_correlation)

        # ---- Evaluation (test first, then validation) ----
        model.eval()

        all_preds_test, all_preds_test_TRANS, all_labels_test = _collect_ddg_predictions(model, dataloader_test)

        test_loss = mean_squared_error(all_preds_test, all_labels_test)
        loss_ddg_test.append(test_loss)

        test_loss_TRANS = mean_squared_error(all_preds_test_TRANS, all_preds_test)
        loss_ddg_test_TRANS.append(test_loss_TRANS)

        loss_ddg_test_TOT.append(test_loss+test_loss_TRANS)

        test_correlation, _ = pearsonr(all_preds_test, all_labels_test)
        pearson_r_test.append(test_correlation)

        test_correlation_TRANS = pearsonr(all_preds_test_TRANS, all_preds_test)

        all_preds_validation, all_preds_validation_TRANS, all_labels_validation = _collect_ddg_predictions(model, dataloader_validation)

        val_loss = mean_squared_error(all_preds_validation, all_labels_validation)
        loss_ddg_validation.append(val_loss)

        val_loss_TRANS = mean_squared_error(all_preds_validation_TRANS, all_preds_validation)
        loss_ddg_validation_TRANS.append(val_loss_TRANS)

        loss_ddg_validation_TOT.append(val_loss+val_loss_TRANS)

        val_correlation, _ = pearsonr(all_preds_validation, all_labels_validation)
        pearson_r_validation.append(val_correlation)

        # ---- Logging ----
        print(f'pearson tra triangolazione e non triangolazione : {test_correlation_TRANS}\n')
        print(f'pearson tra triangolazione e true ddg: {pearsonr(all_preds_test_TRANS, all_labels_test)}\n')

        val_spearman = spearmanr(all_preds_validation, all_labels_validation)[0]
        test_spearman = spearmanr(all_preds_test, all_labels_test)[0]

        if val_correlation >= max(pearson_r_validation):
            # New best validation Pearson r: highlight the epoch in red.
            # NOTE: the original deep-copied the model here but never returned
            # or used the copy; that dead work was removed. Save a checkpoint
            # at this point if the best weights are needed.
            print(f'\033[91mEpoch {epoch+1}/{num_epochs}')
            print(f'Train -  trans_loss={train_loss_TRANS:.4f},    Loss: {train_loss:.4f}, Pearson r: {train_correlation:.4f}, Rho spearman: {train_spearman:.4f}')
            print(f'Validation - Loss: {val_loss:.4f}, Pearson r: {val_correlation:.4f}, Rho spearman: {val_spearman:.4f}',)
            print(f'Test - trans_loss={test_loss_TRANS:.4f},      Loss: {test_loss:.4f}, Pearson r: {test_correlation:.4f}, Rho spearman: {test_spearman:.4f}\033[0m\n')

        else:
            print(f'Epoch {epoch+1}/{num_epochs}')
            print(f'Train -    trans_loss={train_loss_TRANS:.4f},    Loss: {train_loss:.4f}, Pearson r: {train_correlation:.4f}, Rho spearman: {train_spearman:.4f}')
            print(f'Validation - Loss: {val_loss:.4f}, Pearson r: {val_correlation:.4f}, Rho spearman: {val_spearman:.4f}',)
            print(f'Test -  trans_loss={test_loss_TRANS:.4f}      Loss: {test_loss:.4f}, Pearson r: {test_correlation:.4f}, Rho spearman: {test_spearman:.4f}\n')

        # ---- Early stopping on the validation Pearson r ----
        if epoch > (np.argmax(pearson_r_validation) + patience):
            print(f'\033[91mEarly stopping at epoch {epoch+1}\033[0m')
            break

    return pearson_r_train, pearson_r_validation, pearson_r_test, loss_ddg_train, loss_ddg_validation, loss_ddg_test, loss_ddg_train_TRANS, loss_ddg_validation_TRANS, loss_ddg_test_TRANS, loss_ddg_train_TOT, loss_ddg_validation_TOT, loss_ddg_test_TOT

In [9]:

class Cross_Attention_DDG(nn.Module):
    """Wrapper that turns a base ddG regressor into a direct or two-step predictor.

    The base module is called as ``base(delta, reference, length)`` where
    ``delta = start - end`` and ``reference = start``.

    * ``hydra=False``: one step, wild type -> mutant.
    * ``hydra=True``: two steps through the intermediate ``hydra_slim`` state
      (wild type -> intermediate, intermediate -> mutant), summed.
    * ``train`` true: each step is a single forward prediction.
    * ``train`` false: each step is antisymmetrised as
      ``(base(start -> end) - base(end -> start)) / 2``.
    """

    def __init__(self, base_module, cross_att=False, dual_cross_att= False, hydra=True ,**transf_parameters):
        super().__init__()
        backbone = base_module(**transf_parameters, cross_att=cross_att, dual_cross_att= dual_cross_att)
        self.base_ddg = backbone.to(device)
        self.hydra=hydra

    def _step_ddg(self, start, end, length, antisymmetric):
        """Predict the ddG of going from ``start`` to ``end``."""
        forward_pred = self.base_ddg(start - end, start, length)
        if not antisymmetric:
            return forward_pred
        # Average the forward prediction with the negated reverse prediction.
        reverse_pred = self.base_ddg(end - start, end, length)
        return (forward_pred - reverse_pred) / 2

    def forward(self, x_wild, x_mut, hydra_slim, length, hydra=False, train = True):
        antisymmetric = not train

        if hydra:
            # ddG from the wild type to the intermediate state ...
            wild_half_DDG = self._step_ddg(x_wild, hydra_slim, length, antisymmetric)
            # ... plus ddG from the intermediate state to the mutant.
            half_mut_DDG = self._step_ddg(hydra_slim, x_mut, length, antisymmetric)
            return wild_half_DDG + half_mut_DDG

        # Direct ddG from the wild type to the mutant.
        return self._step_ddg(x_wild, x_mut, length, antisymmetric)


In [10]:
import torch
import torch.nn as nn


def apply_masked_pooling(position_attn_output, padding_mask):
    """Masked global average and global max pooling over the sequence axis.

    Args:
        position_attn_output: Tensor of shape (batch_size, seq_len, feature_dim).
        padding_mask: Tensor of shape (batch_size, seq_len) that is true (1)
            at padded positions and false (0) at valid ones.

    Returns:
        ``(gap, gmp)``, both of shape (batch_size, feature_dim): the mean and
        the maximum over the valid positions of each sequence.
    """
    pad = padding_mask.float()   # 1.0 where padded, 0.0 where valid
    pad_col = pad.unsqueeze(-1)  # broadcastable over the feature axis
    keep_col = 1 - pad_col

    # Padded positions are zeroed so they contribute nothing to the sum.
    visible = position_attn_output * keep_col

    # Average pooling: divide the masked sum by the number of valid positions.
    n_valid = torch.sum((1 - pad).float(), dim=1)
    gap = torch.sum(visible, dim=1) / n_valid.unsqueeze(-1)

    # Max pooling: push padded positions down to -1e10 so they never win.
    gmp = torch.max(visible + (pad_col * (- 1e10)), dim=1).values

    return gap, gmp


class SinusoidalPositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of sequences.

    Even feature columns hold ``sin(position * frequency)`` and odd columns
    hold ``cos(position * frequency)``. The table is precomputed once for
    ``max_len`` positions and stored as a non-trainable buffer named ``pe``.

    Args:
        embedding_dim: Size of the feature dimension. Odd sizes are supported
            (the original implementation failed on them because the cosine
            columns are one fewer than the sine columns).
        max_len: Number of positions to precompute.
    """

    def __init__(self, embedding_dim, max_len=3700):
        super(SinusoidalPositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embedding_dim))
        angles = position * div_term  # (max_len, ceil(embedding_dim / 2))
        pe[:, 0::2] = torch.sin(angles)
        # There are only embedding_dim // 2 odd columns; for even sizes this
        # slice is the full table, so the values are unchanged.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
        pe = pe.unsqueeze(0)  # shape (1, max_len, embedding_dim)
        self.register_buffer('pe', pe)  # stored as a fixed tensor, not a parameter

    def forward(self, x):
        """Return ``x`` (batch, seq_len, embedding_dim) plus the first ``seq_len`` encodings."""
        return x + self.pe[:, :x.size(1), :]


class TransformerRegression(nn.Module):
    """Dual cross-attention regressor that predicts one ddG value per sample.

    Pipeline: two separate 1-D convolutions project the difference embedding
    and the reference embedding to 128 channels; positional encodings are
    added; two multi-head attentions cross the streams (query = reference,
    key/value = difference, and the reverse); the two outputs are concatenated,
    passed through a position-wise feed-forward block with a residual
    connection, pooled with masked average + max pooling, and mapped to a
    scalar by a linear layer.

    Args:
        input_dim: Feature size of the input embeddings.
        num_heads: Number of heads of both attention layers.
        dropout_rate: Dropout inside both attention layers.
        num_experts: Output size of the ``router`` layer (the layer is created
            but not used in ``forward``).
        f_activation: Activation used in the feed-forward block.
        kernel_size: Kernel size of both convolutions. The original code
            overwrote this argument with a hard-coded 20; it is now honoured,
            and the default (20) reproduces the previous behaviour.
        cross_att: Accepted for interface compatibility; unused.
        dual_cross_att: Accepted for interface compatibility; unused.
    """

    def __init__(self, input_dim=1280, num_heads=8, dropout_rate=0., num_experts=1, f_activation = nn.ReLU(), kernel_size=20, cross_att = True,
                dual_cross_att=True):

        super(TransformerRegression, self).__init__()

        self.embedding_dim = input_dim
        self.act = f_activation
        self.max_len = 3700  # maximum protein length
        out_channels = 128   # number of 1-D convolution filters
        padding = 0

        # NOTE: layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.conv1d = nn.Conv1d(in_channels=self.embedding_dim,
                                out_channels=out_channels,
                                kernel_size=kernel_size,
                                padding=padding)

        self.conv1d_wild = nn.Conv1d(in_channels=self.embedding_dim,
                                     out_channels=out_channels,
                                     kernel_size=kernel_size,
                                     padding=padding)

        self.norm1 = nn.LayerNorm(out_channels)
        self.norm2 = nn.LayerNorm(out_channels)

        # Cross-attention layers
        self.positional_encoding = SinusoidalPositionalEncoding(out_channels, self.max_len)
        self.speach_att_type = True  # print the attention-type banner only on the first forward pass
        self.multihead_attention = nn.MultiheadAttention(embed_dim=out_channels, num_heads=num_heads, dropout=dropout_rate, batch_first=True )
        self.inverse_attention = nn.MultiheadAttention(embed_dim=out_channels, num_heads=num_heads, dropout=dropout_rate, batch_first =True)

        dim_position_wise_FFN = out_channels*2

        self.norm3 = nn.LayerNorm(dim_position_wise_FFN)
        self.router = nn.Linear(dim_position_wise_FFN, num_experts)

        self.pw_ffnn = nn.Sequential(
            nn.Linear(dim_position_wise_FFN, 512),
            self.act,
            nn.Linear(512, dim_position_wise_FFN)
            )

        # Input is the concatenation of average- and max-pooled features.
        self.Linear_ddg = nn.Linear(dim_position_wise_FFN*2, 1)

    def create_padding_mask(self, length, seq_len, batch_size):
        """
        Create a padding mask for multihead attention.
        length: Tensor of shape (batch_size,) containing the actual lengths of the sequences.
        seq_len: The maximum sequence length.
        batch_size: The number of sequences in the batch (unused).

        Returns a boolean mask of shape (batch_size, seq_len) that is True at
        positions whose index is >= the sequence length (i.e. padding).
        """
        mask = torch.arange(seq_len, device=length.device).unsqueeze(0) >= length.unsqueeze(1)
        return mask

    def forward(self, delta_w_m, x_wild, length):
        """Predict ddG from a difference embedding and a reference embedding.

        Args:
            delta_w_m: Difference embedding, (batch_size, seq_len, input_dim).
            x_wild: Reference embedding, (batch_size, seq_len, input_dim).
            length: Sequence lengths, (batch_size,).

        Returns:
            Tensor of shape (batch_size,) with one prediction per sample.
        """
        # Conv1d expects channels first: (batch, seq, feat) -> (batch, feat, seq).
        delta_w_m = delta_w_m.transpose(1, 2)
        C_delta_w_m = self.conv1d(delta_w_m)
        # Back to (batch, seq', out_channels) for the attention layers.
        C_delta_w_m = C_delta_w_m.transpose(1, 2)
        C_delta_w_m = self.positional_encoding(C_delta_w_m)

        x_wild = x_wild.transpose(1, 2)
        C_x_wild = self.conv1d_wild(x_wild)
        C_x_wild = C_x_wild.transpose(1, 2)
        C_x_wild = self.positional_encoding(C_x_wild)

        batch_size, seq_len, feature_dim = C_x_wild.size()

        # NOTE(review): with padding=0 the convolution shortens the sequence to
        # seq_len = input_len - kernel_size + 1, while `length` still holds the
        # lengths passed in by the caller. Confirm the mask is meant to be
        # built from those lengths.
        padding_mask = self.create_padding_mask(length, seq_len, batch_size)

        if self.speach_att_type:
            print('ATTENTION TYPE: Dual cross Attention\n q = wild , k = delta, v = delta and q = delta , k = wild, v = wild \n ----------------------------------')
            self.speach_att_type = False

        # Direct attention: reference queries attend to the difference stream.
        direct_attn_output, _ = self.multihead_attention(C_x_wild, C_delta_w_m, C_delta_w_m, key_padding_mask=padding_mask)
        direct_attn_output += C_delta_w_m
        direct_attn_output = self.norm1(direct_attn_output)

        # Inverse attention: difference queries attend to the reference stream.
        inverse_attn_output, _ = self.inverse_attention(C_delta_w_m, C_x_wild, C_x_wild, key_padding_mask=padding_mask)
        inverse_attn_output += C_x_wild
        inverse_attn_output = self.norm2(inverse_attn_output)

        attn_output = torch.cat([direct_attn_output, inverse_attn_output], dim=-1)

        # Position-wise feed-forward block with residual connection and norm.
        output = self.pw_ffnn(attn_output)
        position_attn_output = attn_output + output
        position_attn_output = self.norm3(position_attn_output)

        gap, gmp = apply_masked_pooling(position_attn_output, padding_mask)

        # Concatenate average and max pooling: (batch_size, 2 * feature_dim).
        pooled_output = torch.cat([gap, gmp], dim=-1)

        # Final linear layer maps the pooled features to the ddG prediction.
        x = self.Linear_ddg(pooled_output)

        return x.squeeze(-1)

In [11]:
# Select the compute device: CUDA when available, otherwise CPU.
# This repeats the assignment made in the ESM model-loading cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [12]:
# Always raises AssertionError, so a top-to-bottom run stops at this cell.
# NOTE(review): presumably a deliberate guard against launching the training cell below by accident — confirm.
assert False

AssertionError: 

In [13]:
# Baseline experiment: continue training a saved model.

lr = 1e-4  # other value tried: 1e-5

input_dim = 1280

transf_parameters={'input_dim':1280, 'num_heads':8,
                    'dropout_rate':0.,}

patience = 300
DDG_model = TransformerRegression

# Load the full pickled model.
#  * map_location=device lets a checkpoint saved on GPU load on a CPU-only machine.
#  * weights_only=False is stated explicitly because the file holds a whole
#    model object rather than a state_dict; this also silences the torch
#    FutureWarning about the changing default.
# SECURITY: unpickling executes arbitrary code — only load checkpoints you trust.
# To train from scratch instead, use:
#   Cross_Attention_DDG(DDG_model, cross_att = True, dual_cross_att=True, **transf_parameters)
Final_model = torch.load('JanusDDG_300epochs_ARXIVE.pth', map_location=device, weights_only=False)

path_save_fig = 'DDGemb \n ----------------------------------'
print(path_save_fig)
pearson_r_train, pearson_r_validation, pearson_r_test, loss_ddg_train, loss_ddg_validation, loss_ddg_test, loss_ddg_train_TRANS, loss_ddg_validation_TRANS, loss_ddg_test_TRANS, loss_ddg_train_TOT, loss_ddg_validation_TOT, loss_ddg_test_TOT = training_and_validation_loop_ddg(Final_model, dataloader_train, dataloader_test,
                                                                                   dataloader_validation,
                                                                                   path_save_fig, epochs=28, lr =lr,patience = patience)

  Final_model = torch.load('JanusDDG_300epochs_ARXIVE.pth')#Cross_Attention_DDG(DDG_model, cross_att = True, dual_cross_att=True, **transf_parameters)


DDGemb 
 ----------------------------------
pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.8851722255810179, pvalue=3.974044070153077e-10)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.4639411055627248, pvalue=0.012888891490112912)

[91mEpoch 1/28
Train -  trans_loss=0.9823,    Loss: 0.6594, Pearson r: 0.9825, Rho spearman: 0.9929
Validation - Loss: 4.2563, Pearson r: 0.5609, Rho spearman: 0.4918
Test - trans_loss=0.3563,      Loss: 6.0229, Pearson r: 0.6649, Rho spearman: 0.6317[0m

pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.8870922896436889, pvalue=3.2278810149658024e-10)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.44472164548481674, pvalue=0.017731226106191735)

Epoch 2/28
Train -    trans_loss=0.7961,    Loss: 0.7018, Pearson r: 0.9936, Rho spearman: 0.9924
Validation - Loss: 4.1834, Pearson r: 0.5345, Rho spearman: 0.4619
Test -  trans_loss=0.3859      Loss: 5.6940, Pearso

In [15]:
# Zero-based index of the epoch with the highest test-set Pearson correlation.
np.argmax(pearson_r_test)

27

In [None]:
pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.8851722255810179, pvalue=3.974044070153077e-10)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.4639411055627248, pvalue=0.012888891490112912)

Epoch 1/28
Train -  trans_loss=0.9823,    Loss: 0.6594, Pearson r: 0.9825, Rho spearman: 0.9929
Validation - Loss: 4.2563, Pearson r: 0.5609, Rho spearman: 0.4918
Test - trans_loss=0.3563,      Loss: 6.0229, Pearson r: 0.6649, Rho spearman: 0.6317

In [None]:
# Always raises AssertionError, so a top-to-bottom run stops before the plotting cells.
# NOTE(review): presumably a deliberate stop marker — confirm whether it is still needed.
assert False

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Figure with two side-by-side panels: Pearson correlation (left) and losses (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True)

# FIRST PANEL: Pearson correlation per epoch.
# The test loader is built from 'M28_test.pkl', so the curves are labelled M28
# (the previous "s669" labels were left over from an earlier test set).
#sns.lineplot(pearson_r_train, label="Pearson train", ax=axes[0])
sns.lineplot(pearson_r_test, label="Pearson M28", ax=axes[0])
sns.lineplot(pearson_r_validation, label="Pearson Ptmul-NR", ax=axes[0])

# Horizontal reference lines
axes[0].axhline(y=0.595, color='r', linestyle='--', label="y = 0.595")
axes[0].axhline(y=0.59, color='g', linestyle='--', label="y = 0.59")
axes[0].axhline(y=0.545, color='b', linestyle='--', label="y = 0.545")
axes[0].axhline(y=0.54, color='y', linestyle='--', label="y = 0.54")

axes[0].set_title("Pearson Correlation")
axes[0].legend()

# SECOND PANEL: losses per epoch.
# The training loop returns the base and TRANS training losses as two separate
# per-epoch lists. The previous even/odd slicing of `loss_ddg_train` assumed an
# interleaved list, plotted the wrong values and overwrote the real
# `loss_ddg_train_TRANS`, so both lists are now plotted directly.
sns.lineplot(loss_ddg_train, label="Loss train BASE", ax=axes[1])
sns.lineplot(loss_ddg_train_TRANS, label="Loss train TRANS", ax=axes[1])
sns.lineplot(loss_ddg_train_TOT, label="Loss train TOT", ax=axes[1])

sns.lineplot(loss_ddg_test, label="Loss M28 BASE", ax=axes[1])
sns.lineplot(loss_ddg_test_TRANS, label="Loss M28 TRANS", ax=axes[1])
sns.lineplot(loss_ddg_test_TOT, label="Loss M28 TOT", ax=axes[1])

sns.lineplot(loss_ddg_validation, label="Loss Ptmul-NR BASE", ax=axes[1])
sns.lineplot(loss_ddg_validation_TRANS, label="Loss Ptmul-NR TRANS", ax=axes[1])
sns.lineplot(loss_ddg_validation_TOT, label="Loss Ptmul-NR TOT" , ax=axes[1])


axes[1].set_title("Loss DDG")
axes[1].legend()

# Show the final figure
plt.tight_layout()
plt.show()




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

# Figure with a 2x3 grid: Pearson curves on the top row, losses on the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(18, 10), sharex=True)


def _draw_panel(ax, title, curves):
    """Plot each (values, label, color) triple on `ax`, then add its title and legend."""
    for values, label, color in curves:
        sns.lineplot(values, label=label, ax=ax, color=color)
    ax.set_title(title)
    ax.legend()


### 1️⃣ PEARSON PANELS ###
# (the horizontal reference lines at 0.595 / 0.59 / 0.545 / 0.54 are disabled in this figure)
_draw_panel(axes[0, 0], "Train Pearson", [
    (pearson_r_train, "Pearson Train", 'blue'),
])
_draw_panel(axes[0, 1], "M28 Pearson", [
    (pearson_r_test, "Pearson M28", 'blue'),
])
_draw_panel(axes[0, 2], "PTMUL-TRAIN Pearson", [
    (pearson_r_validation, "Pearson PTMUL-TRAIN", 'blue'),
])

### 2️⃣ LOSS PANELS ###
# NOTE(review): here loss_ddg_train itself is labelled BASE, whereas another cell
# derives BASE as loss_ddg_train[::2] — confirm which convention is current.
_draw_panel(axes[1, 0], "Train Loss", [
    (loss_ddg_train, "Loss train BASE", 'blue'),
    (loss_ddg_train_TRANS, "Loss train TRANS", 'orange'),
    (loss_ddg_train_TOT, "Loss train TOT", 'green'),
])
_draw_panel(axes[1, 1], "M28 Loss", [
    (loss_ddg_test, "Loss M28 BASE", 'blue'),
    (loss_ddg_test_TRANS, "Loss M28 TRANS", 'orange'),
    (loss_ddg_test_TOT, "Loss M28 TOT", 'green'),
])
_draw_panel(axes[1, 2], "PTMUL-TRAIN Loss", [
    (loss_ddg_validation, "Loss PTMUL-TRAIN BASE", 'blue'),
    (loss_ddg_validation_TRANS, "Loss PTMUL-TRAIN TRANS", 'orange'),
    (loss_ddg_validation_TOT, "Loss PTMUL-TRAIN TOT", 'green'),
])

# Tighten the layout and render.
plt.tight_layout()
plt.show()


In [None]:
# 0-based position of the largest value in pearson_r_test
# (presumably one entry per epoch, i.e. the best test epoch — TODO confirm).
np.argmax(pearson_r_test)

In [None]:
pearson_r_test

In [None]:
# Reload the saved metrics and show the validation and test Pearson r at index 27.
# 27 is the value np.argmax(pearson_r_test) displayed in an earlier executed cell.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open("metrics_data_zeroes.pkl", "rb") as file:
    loaded_data = pickle.load(file)

print(
    loaded_data['pearson_r_validation'][27],
    loaded_data['pearson_r_test'][27],
    sep="\n",
)


In [None]:
import pickle

# Every metric list to persist, keyed by name.
data_to_save = {
    "pearson_r_train": pearson_r_train,
    "pearson_r_test": pearson_r_test,
    "pearson_r_validation": pearson_r_validation,
    "loss_ddg_train": loss_ddg_train,
    # NOTE(review): the full loss_ddg_train list is stored under this key, whereas a
    # plotting cell derives loss_ddg_train_BASE as loss_ddg_train[::2] — confirm
    # which of the two is intended.
    "loss_ddg_train_BASE": loss_ddg_train,
    "loss_ddg_train_TRANS": loss_ddg_train_TRANS,
    "loss_ddg_train_TOT": loss_ddg_train_TOT,
    "loss_ddg_test": loss_ddg_test,
    "loss_ddg_test_TRANS": loss_ddg_test_TRANS,
    "loss_ddg_test_TOT": loss_ddg_test_TOT,
    "loss_ddg_validation": loss_ddg_validation,
    "loss_ddg_validation_TRANS": loss_ddg_validation_TRANS,
    "loss_ddg_validation_TOT": loss_ddg_validation_TOT
}

# Single source of truth for the output file, so the status message cannot drift
# from the path actually written (it previously reported 'metrics_data.pkl').
METRICS_PATH = "metrics_data_zeroes.pkl"

# 🔹 Save to a pickle file
with open(METRICS_PATH, "wb") as file:
    pickle.dump(data_to_save, file)

print(f"✅ Dati salvati in '{METRICS_PATH}' con successo!")

# 🔹 Reload to verify the round trip.
# pickle.load can execute arbitrary code; acceptable here because the file was
# written by this very cell.
with open(METRICS_PATH, "rb") as file:
    loaded_data = pickle.load(file)

print("✅ Dati ricaricati con successo!")
print("Chiavi disponibili:", loaded_data.keys())


In [None]:
loss_ddg_train_TOT

In [None]:
#torch.save(Final_model, 'JanusDDG_28epochs_finetuned_zeros_MODELLO_FINALE.pth')

In [14]:
# Persist the whole Final_model object (not only a state_dict) to disk.
# NOTE(review): loading such a file back presumably requires the model's class
# definitions to be available — confirm before sharing the checkpoint.
torch.save(Final_model, 'JanusDDG_fine_tuned.pth')

In [None]:
print('ciao')

In [None]:
# 
# lr = 1e-5
# train_path =[f's2450_fold_{i}_hydra_slim_SHIFTED1.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_hydra_slim_SHIFTED1_inv.pkl' for i in [0,1,2,3,4]]
# val_path = ['s669_Castrense_hydra_slim.pkl']#['s669_Castrense.pkl']#[f's2450_fold_{i}.pkl' for i in val_set]+[f's2450_fold_{i}_inv.pkl' for i in val_set]
# test_path = ['s669_Castrense_hydra_slim.pkl']#['s669_Castrense_hydra_slim.pkl']#['test_TS16.pkl']
# dataloader_train = dataloader_generation(path = train_path, batch_size = 6,collate_fn=collate_fn, dataloader_shuffle = True, inv= False)
# dataloader_validation = dataloader_generation(path = val_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)
# dataloader_test = dataloader_generation(path = test_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)
            # output_ddg_train, labels_train = output_model_from_batch(batch, model, device, hydra=False, train=False)
            # output_ddg_HYDRA_SLIM_train, _ = output_model_from_batch(batch, model, device, hydra=True, train=False)

# DDGemb 
#  ----------------------------------
# pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9588071372114625, pvalue=0.0)

# pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.529450390983784, pvalue=1.3203795258932152e-49)

# Epoch 1/100
# Train -  trans_loss=0.2259,    Loss: 0.0191, Pearson r: 0.9992, Rho spearman: 0.9992
# Validation - Loss: 1.9125, Pearson r: 0.5449, Rho spearman: 0.5671
# Test - trans_loss=0.0645,      Loss: 1.9125, Pearson r: 0.5449, Rho spearman: 0.5671

# pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.96427994120678, pvalue=0.0)

# pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5319033664525803, pvalue=3.9247209250389655e-50)

# Epoch 2/100
# Train -  trans_loss=0.1719,    Loss: 0.0237, Pearson r: 0.9985, Rho spearman: 0.9985
# Validation - Loss: 1.9049, Pearson r: 0.5456, Rho spearman: 0.5671
# Test - trans_loss=0.0568,      Loss: 1.9049, Pearson r: 0.5456, Rho spearman: 0.5671

# pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9675307802090347, pvalue=0.0)

# pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5334596332503565, pvalue=1.8083434971915834e-50)

# Epoch 3/100
# Train -  trans_loss=0.1426,    Loss: 0.0232, Pearson r: 0.9978, Rho spearman: 0.9981
# Validation - Loss: 1.9022, Pearson r: 0.5457, Rho spearman: 0.5672
# Test - trans_loss=0.0521,      Loss: 1.9022, Pearson r: 0.5457, Rho spearman: 0.5672



##################
##################
# 
# lr = 1e-4
# train_path =[f's2450_fold_{i}_hydra_slim_SHIFTED1.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_hydra_slim_SHIFTED1_inv.pkl' for i in [0,1,2,3,4]]
# val_path = ['s669_Castrense_hydra_slim.pkl']#['s669_Castrense.pkl']#[f's2450_fold_{i}.pkl' for i in val_set]+[f's2450_fold_{i}_inv.pkl' for i in val_set]
# test_path = ['s669_Castrense_hydra_slim.pkl']#['s669_Castrense_hydra_slim.pkl']#['test_TS16.pkl']
# dataloader_train = dataloader_generation(path = train_path, batch_size = 6,collate_fn=collate_fn, dataloader_shuffle = True, inv= False)
# dataloader_validation = dataloader_generation(path = val_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)
# dataloader_test = dataloader_generation(path = test_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)
            # output_ddg_train, labels_train = output_model_from_batch(batch, model, device, hydra=False, train=False)
            # output_ddg_HYDRA_SLIM_train, _ = output_model_from_batch(batch, model, device, hydra=True, train=False)


# DDGemb 
#  ----------------------------------
# pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9771304453887012, pvalue=0.0)

# pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5287687980041857, pvalue=1.846449274271924e-49)

# Epoch 1/100
# Train -  trans_loss=0.1348,    Loss: 0.0303, Pearson r: 0.9965, Rho spearman: 0.9971
# Validation - Loss: 1.9438, Pearson r: 0.5371, Rho spearman: 0.5625
# Test - trans_loss=0.0333,      Loss: 1.9438, Pearson r: 0.5371, Rho spearman: 0.5625

# pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9779563155914957, pvalue=0.0)

# pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5248583635712959, pvalue=1.246010758494287e-48)

# Epoch 2/100
# Train -    trans_loss=0.0476,    Loss: 0.0235, Pearson r: 0.9965, Rho spearman: 0.9970
# Validation - Loss: 1.9273, Pearson r: 0.5343, Rho spearman: 0.5602
# Test -  trans_loss=0.0364      Loss: 1.9273, Pearson r: 0.5343, Rho spearman: 0.5602


#########################
#########################
# BELOW: TRAINING RUN WITH train=True
# lr = 1e-5
# train_path =[f's2450_fold_{i}_hydra_slim_SHIFTED1.pkl' for i in [0,1,2,3,4]]+[f's2450_fold_{i}_hydra_slim_SHIFTED1_inv.pkl' for i in [0,1,2,3,4]]
# val_path = ['s669_Castrense_hydra_slim.pkl']#['s669_Castrense.pkl']#[f's2450_fold_{i}.pkl' for i in val_set]+[f's2450_fold_{i}_inv.pkl' for i in val_set]
# test_path = ['s669_Castrense_hydra_slim.pkl']#['s669_Castrense_hydra_slim.pkl']#['test_TS16.pkl']
# dataloader_train = dataloader_generation(path = train_path, batch_size = 6,collate_fn=collate_fn, dataloader_shuffle = True, inv= False)
# dataloader_validation = dataloader_generation(path = val_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)
# dataloader_test = dataloader_generation(path = test_path, batch_size = 1, collate_fn=collate_fn, dataloader_shuffle = False, inv= False)
            # output_ddg_train, labels_train = output_model_from_batch(batch, model, device, hydra=False, train=True)
            # output_ddg_HYDRA_SLIM_train, _ = output_model_from_batch(batch, model, device, hydra=True, train=True)


DDGemb 
 ----------------------------------
pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9566641535294127, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5277396095591367, pvalue=3.0592915998106224e-49)

Epoch 1/100
Train -  trans_loss=0.2762,    Loss: 0.0218, Pearson r: 0.9988, Rho spearman: 0.9989
Validation - Loss: 1.9221, Pearson r: 0.5441, Rho spearman: 0.5663
Test - trans_loss=0.0660,      Loss: 1.9221, Pearson r: 0.5441, Rho spearman: 0.5663

pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9610431624147351, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5294964590627442, pvalue=1.290753378238322e-49)

Epoch 2/100
Train -  trans_loss=0.2115,    Loss: 0.0338, Pearson r: 0.9985, Rho spearman: 0.9982
Validation - Loss: 1.9127, Pearson r: 0.5446, Rho spearman: 0.5663
Test - trans_loss=0.0605,      Loss: 1.9127, Pearson r: 0.5446, Rho spearman: 0.5663

pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9640050340567555, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5307356800858392, pvalue=7.001034432792789e-50)

Epoch 3/100
Train -  trans_loss=0.1839,    Loss: 0.0300, Pearson r: 0.9981, Rho spearman: 0.9978
Validation - Loss: 1.9063, Pearson r: 0.5450, Rho spearman: 0.5669
Test - trans_loss=0.0568,      Loss: 1.9063, Pearson r: 0.5450, Rho spearman: 0.5669

pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9661287212983981, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5318791816748505, pvalue=3.9721413760502284e-50)

Epoch 4/100
Train -  trans_loss=0.1613,    Loss: 0.0279, Pearson r: 0.9977, Rho spearman: 0.9975
Validation - Loss: 1.9020, Pearson r: 0.5453, Rho spearman: 0.5667
Test - trans_loss=0.0544,      Loss: 1.9020, Pearson r: 0.5453, Rho spearman: 0.5667

pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9674782908934236, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5322556619700721, pvalue=3.2944397234713975e-50)

Epoch 5/100
Train -  trans_loss=0.1427,    Loss: 0.0258, Pearson r: 0.9976, Rho spearman: 0.9975
Validation - Loss: 1.8991, Pearson r: 0.5454, Rho spearman: 0.5670
Test - trans_loss=0.0529,      Loss: 1.8991, Pearson r: 0.5454, Rho spearman: 0.5670

pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9686013014612453, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.532149442873165, pvalue=3.4730707910935084e-50)

Epoch 6/100
Train -    trans_loss=0.1268,    Loss: 0.0241, Pearson r: 0.9974, Rho spearman: 0.9974
Validation - Loss: 1.9022, Pearson r: 0.5451, Rho spearman: 0.5667
Test -  trans_loss=0.0504      Loss: 1.9022, Pearson r: 0.5451, Rho spearman: 0.5667

pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9692794166233685, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5322163147865776, pvalue=3.3595208299555624e-50)

Epoch 7/100
Train -    trans_loss=0.1115,    Loss: 0.0236, Pearson r: 0.9973, Rho spearman: 0.9975
Validation - Loss: 1.8996, Pearson r: 0.5452, Rho spearman: 0.5664
Test -  trans_loss=0.0500      Loss: 1.8996, Pearson r: 0.5452, Rho spearman: 0.5664






In [None]:
DDGemb 
 ----------------------------------
pearson tra triangolazione e non triangolazione : PearsonRResult(statistic=0.9455839781553668, pvalue=0.0)

pearson tra triangolazione e true ddg: PearsonRResult(statistic=0.5151058755154342, pvalue=1.3092470725534226e-46)

Epoch 1/100
Train -  trans_loss=0.9060,    Loss: 0.7454, Pearson r: 0.9846, Rho spearman: 0.9960
Validation - Loss: 2.3962, Pearson r: 0.5331, Rho spearman: 0.5574
Test - trans_loss=0.0252,      Loss: 2.3962, Pearson r: 0.5331, Rho spearman: 0.5574

In [None]:
def model_performance_test(model, dataloader_test, hydra=False):
    """Run `model` over every batch of `dataloader_test` without gradient tracking.

    The model is switched to evaluation mode and left in that mode. Each batch
    is passed to `output_model_from_batch` together with the notebook-level
    `device` and the `hydra` flag.

    Args:
        model: object exposing `eval()`; forwarded to `output_model_from_batch`.
        dataloader_test: iterable of batches.
        hydra: forwarded unchanged as the `hydra` keyword of
            `output_model_from_batch`.

    Returns:
        A pair `(predictions, labels)` of lists holding, per batch and in
        iteration order, the first and second value returned by
        `output_model_from_batch`.
    """
    # Evaluation mode; the function does not switch back afterwards.
    model.eval()

    # No gradients are needed for inference.
    with torch.no_grad():
        batch_outputs = [
            output_model_from_batch(batch, model, device, hydra=hydra)
            for batch in dataloader_test
        ]

    predictions = [prediction for prediction, _ in batch_outputs]
    labels = [label for _, label in batch_outputs]
    return predictions, labels

In [None]:
# Load the checkpoint as a full Python object (presumably a model saved with
# torch.save(model, path) — it is used directly as the model in later cells).
# weights_only=False is passed explicitly: relying on the implicit default emits a
# FutureWarning on recent PyTorch releases and fails for whole-model pickles once
# the default becomes True.
# SECURITY: this unpickles arbitrary Python objects — only load trusted checkpoints.
Final_model = torch.load('JanusDDG_300epochs.pth', weights_only=False)

In [None]:
###SECONDA PROVA


# class DeltaDataset(Dataset):
#     def __init__(self, data, dim_embedding, inv = False):
#         self.data = data
#         self.dim_embedding = dim_embedding
#         self.inv = inv

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sample = self.data[idx]

#         if self.inv: 
#             return {
#                 'id': sample['id'],
#                 #'wild_type': torch.tensor(sample['mut_type'], dtype=torch.float32),    #inverto mut con wild 
#                 'wild_type': torch.tensor(sample['mut_type'], dtype=torch.float32),    #inverto mut con wild
#                 #'wild_type': torch.tensor(sample['hydra_slim'], dtype=torch.float32),    #inverto mut con wild 

#                 #'mut_type': torch.tensor(sample['wild_type'], dtype=torch.float32),    #inverto mut con wild
#                 'mut_type': torch.tensor(sample['hydra_slim'], dtype=torch.float32),    #inverto mut con wild             

#                 'length': torch.tensor(sample['length'], dtype=torch.float32),
#                 'ddg': torch.tensor(-float(sample['ddg']), dtype=torch.float32),       # -ddg
#                 'pos_mut': torch.tensor(sample['pos_mut'], dtype=torch.int64),
#                 'hydra_slim': torch.tensor(sample['hydra_slim'], dtype=torch.float32),
#                 }

#         else:
#             return {
#                 'id': sample['id'],
#                 #'wild_type': torch.tensor(sample['wild_type'], dtype=torch.float32),
#                 'wild_type': torch.tensor(sample['hydra_slim'], dtype=torch.float32),
#                 'mut_type': torch.tensor(sample['mut_type'],dtype=torch.float32),
#                 #'mut_type': torch.tensor(sample['hydra_slim'],dtype=torch.float32),

#                 #'mut_type': torch.tensor(sample['mut_type'],dtype=torch.float32),
#                 'length': torch.tensor(sample['length'], dtype=torch.float32),
#                 'ddg': torch.tensor(float(sample['ddg']), dtype=torch.float32),
#                 'pos_mut': torch.tensor(sample['pos_mut'], dtype=torch.int64),
#                 'hydra_slim': torch.tensor(sample['hydra_slim'], dtype=torch.float32),
#                 }


In [None]:
# Evaluation with the loader built with inv=False: build the loader, run the
# model, then concatenate the per-batch outputs into one pandas Series each.
path_test = ['s669_Castrense_hydra_slim.pkl']

dataloader_test_dir = dataloader_generation(
    path=path_test,
    batch_size=6,
    collate_fn=collate_fn,
    dataloader_shuffle=False,
    inv=False,
)

# model_performance_test returns (predictions, labels); each is a list of
# per-batch tensors that is concatenated and moved to CPU.
all_predictions_test_dir, all_lables_test_dir = (
    pd.Series(torch.cat(batches, dim=0).cpu())
    for batches in model_performance_test(Final_model, dataloader_test_dir, hydra=False)
)

In [None]:
# Same evaluation as the direct pass, but with the loader built with inv=True.
dataloader_test_inv = dataloader_generation(
    path=path_test,
    batch_size=6,
    collate_fn=collate_fn,
    dataloader_shuffle=False,
    inv=True,
)

# Concatenate the per-batch predictions and labels into one CPU Series each.
all_predictions_test_inv, all_lables_test_inv = (
    pd.Series(torch.cat(batches, dim=0).cpu())
    for batches in model_performance_test(Final_model, dataloader_test_inv, hydra=False)
)

In [None]:
pearsonr(all_predictions_test_dir,all_lables_test_dir)

In [None]:
#PREDIZIONI_DIRETTE = all_predictions_test_dir

In [None]:
#PREDIZIONI_1_PASSO = (all_predictions_test_dir-all_predictions_test_inv)/2

In [None]:
#PREDIZIONI_1_PASSO = PREDIZIONI_DIRETTE

In [None]:
#PREDIZIONI_2_PASSO = all_predictions_test_dir

In [None]:
pearsonr(PREDIZIONI_1_PASSO+PREDIZIONI_2_PASSO, PREDIZIONI_DIRETTE)

In [None]:
# Evaluation with inv=False, one sample per batch, with the hydra flag enabled.
path_test = ['s669_Castrense_hydra_slim.pkl']

dataloader_test_dir = dataloader_generation(
    path=path_test,
    batch_size=1,
    collate_fn=collate_fn,
    dataloader_shuffle=False,
    inv=False,
)

# Concatenate the per-batch predictions and labels into one CPU Series each.
all_predictions_test_dir, all_lables_test_dir = (
    pd.Series(torch.cat(batches, dim=0).cpu())
    for batches in model_performance_test(Final_model, dataloader_test_dir, hydra=True)
)

In [None]:
pearsonr(all_predictions_test_dir, pd.read_csv('../git_JANUS_DDG/Results/Result_s669_to_process.csv')['DDG_JanusDDG'])

In [None]:
# Evaluate Final_model on the test set with the loader built in direct (inv=False)
# and/or inverse (inv=True) mode, collecting predictions and labels as pandas Series.
# NOTE(review): `dir` shadows the Python builtin dir(); the name is kept because
# other cells may read it.
dir = True  # run the inv=False pass
inv = True  # run the inv=True pass

path_test = ['s669_Castrense_hydra_slim.pkl']
# Alternative test sets used previously:
#   ['s669_hydra_Castrense.pkl']
#   ['Ssym_correct_by_KORPM.pkl']
#   ['s461_Castrense.pkl']
#   ['s669_Castrense.pkl']
#   ['../DeltaDelta_BELLO/cdna117k_fold_1.pkl'] + ['../DeltaDelta_BELLO/cdna117k_fold_2.pkl']
#   ['dataset_doppie.pkl']

# Start from None so that a skipped pass leaves no stale values behind.
all_predictions_test_dir = None
all_lables_test_dir = None
all_predictions_test_inv = None
all_lables_test_inv = None

dir_predictions = []
dir_lables = []

inv_predictions = []
inv_lables = []


if dir:
    dataloader_test_dir = dataloader_generation(path=path_test, batch_size=6, collate_fn=collate_fn, dataloader_shuffle=False, inv=False)
    all_predictions_test_dir, all_lables_test_dir = model_performance_test(Final_model, dataloader_test_dir, hydra=False)
    # Concatenate the per-batch tensors and move them to CPU before wrapping in a Series.
    all_predictions_test_dir = pd.Series(torch.cat(all_predictions_test_dir, dim=0).cpu())
    all_lables_test_dir = pd.Series(torch.cat(all_lables_test_dir, dim=0).cpu())
    dir_predictions.append(all_predictions_test_dir)
    dir_lables.append(all_lables_test_dir)

if inv:
    dataloader_test_inv = dataloader_generation(path=path_test, batch_size=6, collate_fn=collate_fn, dataloader_shuffle=False, inv=True)
    all_predictions_test_inv, all_lables_test_inv = model_performance_test(Final_model, dataloader_test_inv, hydra=False)
    all_predictions_test_inv = pd.Series(torch.cat(all_predictions_test_inv, dim=0).cpu())
    all_lables_test_inv = pd.Series(torch.cat(all_lables_test_inv, dim=0).cpu())
    inv_predictions.append(all_predictions_test_inv)
    inv_lables.append(all_lables_test_inv)
    # Logical negation, not bitwise: `~True == -2` and `~False == -1` are both
    # truthy, so the former `if ~dir:` always ran and overwrote the labels
    # obtained from the direct pass.
    if not dir:
        # Direct pass skipped: derive its labels from the inverse ones
        # (presumably the inverse dataset carries sign-flipped labels — TODO confirm).
        all_lables_test_dir = -all_lables_test_inv

#metrics(all_predictions_test_dir,all_predictions_test_inv, all_lables_test_dir)

In [None]:

# Re-run both existing loaders with the hydra flag enabled and flatten the
# per-batch outputs into one CPU pandas Series each.

# Loader built with inv=False.
all_predictions_test_dir_HYDRA, all_lables_test_dir_HYDRA = (
    pd.Series(torch.cat(batches, dim=0).cpu())
    for batches in model_performance_test(Final_model, dataloader_test_dir, hydra=True)
)

# Loader built with inv=True.
all_predictions_test_inv_HYDRA, all_lables_test_inv_HYDRA = (
    pd.Series(torch.cat(batches, dim=0).cpu())
    for batches in model_performance_test(Final_model, dataloader_test_inv, hydra=True)
)


In [None]:
pearsonr((all_predictions_test_dir-all_predictions_test_inv)/2, (all_predictions_test_dir_HYDRA-all_predictions_test_inv_HYDRA)/2)

In [None]:
pearsonr((all_predictions_test_dir-all_predictions_test_inv)/2, (all_predictions_test_dir_HYDRA-all_predictions_test_inv_HYDRA)/2)

In [None]:
#HYDRA_1_PASSO = (all_predictions_test_dir-all_predictions_test_inv)/2

In [None]:
#HYDRA_2_PASSO =  (all_predictions_test_dir-all_predictions_test_inv)/2

In [None]:
pearsonr(HYDRA_1_PASSO, all_lables_test_dir)

In [None]:
len(HYDRA_1_PASSO+HYDRA_2_PASSO)

In [None]:
sns.scatterplot(x =all_lables_test_dir,y=PREDIZIONE_DIRETTA )
sns.scatterplot(x =all_lables_test_dir,y=HYDRA_1_PASSO+HYDRA_2_PASSO )


In [None]:
#PREDIZIONE_DIRETTA = (all_predictions_test_dir-all_predictions_test_inv)/2

In [None]:
# Write one row per sample with named columns. The previous un-transposed,
# unnamed frame produced a CSV without a 'pred' column, although a later cell
# reads that column back from this file; the layout now matches the later
# export cell that writes the same file.
(
    pd.DataFrame([p_TRANS, l_TRANS, lab_true])
    .T
    .rename(columns={0: 'pred_trans', 1: 'pred', 2: 'true'})
    .to_csv('p_trans_l_trans_TRAIN.csv')
)

In [None]:
pd.DataFrame([p_TRANS,l_TRANS,lab_true]).T

In [None]:
pearsonr(pd.read_csv('../git_JANUS_DDG/Results/Result_s669_to_process.csv')['DDG_JanusDDG'], pd.read_csv('p_trans_l_trans_TRAIN.csv')['pred'])

In [None]:
l_TRANS

In [None]:
l_TRANS

In [None]:
pd.DataFrame([p_TRANS,l_TRANS,lab_true]).T.rename(columns={0:'pred_trans',1:'pred',2:'true'})

In [None]:
d = pd.DataFrame([p_TRANS,l_TRANS,lab_true]).T.rename(columns={0:'pred_trans',1:'pred',2:'true'})
d.to_csv('p_trans_l_trans_TRAIN.csv')

In [None]:
# Ad-hoc check: Pearson correlation between two hard-coded 10-value vectors
# (their provenance is not recorded here — presumably copied from model outputs).
pearsonr([-0.14371299743652344, -0.3913414478302002, -0.4835125207901001, 0.04273629188537598, -0.025092124938964844, 0.10471123456954956, -0.21423470973968506, -0.004149198532104492, -0.11719012260437012, -1.5348817110061646],
        [-1.326751708984375, -1.0506248474121094, -1.758748173713684, 0.08321564644575119, -0.9314181804656982, 0.11959119886159897, -0.8220889568328857, -0.9604212045669556, -0.46827802062034607, -1.5829718112945557])

In [None]:
# Persist the whole Final_model object (not only a state_dict) to disk.
# NOTE(review): loading such a file back presumably requires the model's class
# definitions to be available — confirm before sharing the checkpoint.
torch.save(Final_model, 'JanusDDG_3epochs_finetuned.pth')

In [None]:
#FOLD0
Epoch 1/50
Train -  trans_loss=1.0051,    Loss: 0.6700, Pearson r: 0.9803, Rho spearman: 0.9930
Validation - Loss: 2.3989, Pearson r: 0.5375, Rho spearman: 0.5636
Test - trans_loss=0.8241,      Loss: 0.7336, Pearson r: 0.9964, Rho spearman: 0.9964

Epoch 2/50
Train -  trans_loss=0.8126,    Loss: 0.7286, Pearson r: 0.9963, Rho spearman: 0.9963
Validation - Loss: 2.2764, Pearson r: 0.5382, Rho spearman: 0.5629
Test - trans_loss=0.8360,      Loss: 0.7141, Pearson r: 0.9960, Rho spearman: 0.9952

Epoch 3/50
Train -  trans_loss=0.7881,    Loss: 0.6885, Pearson r: 0.9932, Rho spearman: 0.9926
Validation - Loss: 2.2742, Pearson r: 0.5474, Rho spearman: 0.5675
Test - trans_loss=0.9066,      Loss: 0.6478, Pearson r: 0.9933, Rho spearman: 0.9910

Epoch 4/50
Train -  trans_loss=0.7462,    Loss: 0.6107, Pearson r: 0.9879, Rho spearman: 0.9875
Validation - Loss: 2.1597, Pearson r: 0.5489, Rho spearman: 0.5638
Test - trans_loss=0.9318,      Loss: 0.6003, Pearson r: 0.9849, Rho spearman: 0.9793

Epoch 5/50
Train -  trans_loss=0.6601,    Loss: 0.5096, Pearson r: 0.9769, Rho spearman: 0.9780
Validation - Loss: 2.0195, Pearson r: 0.5551, Rho spearman: 0.5708
Test - trans_loss=1.1015,      Loss: 0.4820, Pearson r: 0.9784, Rho spearman: 0.9711

Epoch 6/50
Train -    trans_loss=0.5753,    Loss: 0.4252, Pearson r: 0.9677, Rho spearman: 0.9718
Validation - Loss: 2.0257, Pearson r: 0.5517, Rho spearman: 0.5637
Test -  trans_loss=0.9866      Loss: 0.5066, Pearson r: 0.9758, Rho spearman: 0.9682

Epoch 7/50
Train -    trans_loss=0.5163,    Loss: 0.3995, Pearson r: 0.9613, Rho spearman: 0.9690
Validation - Loss: 2.1413, Pearson r: 0.5520, Rho spearman: 0.5645
Test -  trans_loss=1.0329      Loss: 0.5064, Pearson r: 0.9751, Rho spearman: 0.9684

Epoch 8/50
Train -    trans_loss=0.4697,    Loss: 0.3746, Pearson r: 0.9607, Rho spearman: 0.9703
Validation - Loss: 1.9638, Pearson r: 0.5467, Rho spearman: 0.5570
Test -  trans_loss=1.1225      Loss: 0.4557, Pearson r: 0.9745, Rho spearman: 0.9686

Epoch 9/50
Train -    trans_loss=0.4549,    Loss: 0.3532, Pearson r: 0.9612, Rho spearman: 0.9708
Validation - Loss: 2.0069, Pearson r: 0.5480, Rho spearman: 0.5602
Test -  trans_loss=1.0320      Loss: 0.4329, Pearson r: 0.9736, Rho spearman: 0.9673

Epoch 10/50
Train -    trans_loss=0.4402,    Loss: 0.3430, Pearson r: 0.9612, Rho spearman: 0.9710
Validation - Loss: 2.1210, Pearson r: 0.5482, Rho spearman: 0.5616
Test -  trans_loss=1.0447      Loss: 0.4585, Pearson r: 0.9732, Rho spearman: 0.9675

Epoch 11/50
Train -    trans_loss=0.4231,    Loss: 0.3225, Pearson r: 0.9624, Rho spearman: 0.9720
Validation - Loss: 2.1139, Pearson r: 0.5457, Rho spearman: 0.5569
Test -  trans_loss=1.0647      Loss: 0.4612, Pearson r: 0.9724, Rho spearman: 0.9667

Epoch 12/50
Train -    trans_loss=0.4001,    Loss: 0.3117, Pearson r: 0.9632, Rho spearman: 0.9728
Validation - Loss: 1.9700, Pearson r: 0.5481, Rho spearman: 0.5611
Test -  trans_loss=1.2254      Loss: 0.3882, Pearson r: 0.9715, Rho spearman: 0.9652

Epoch 13/50
Train -    trans_loss=0.3953,    Loss: 0.3048, Pearson r: 0.9635, Rho spearman: 0.9734
Validation - Loss: 2.0333, Pearson r: 0.5436, Rho spearman: 0.5555
Test -  trans_loss=1.2169      Loss: 0.3918, Pearson r: 0.9728, Rho spearman: 0.9670

Epoch 14/50
Train -    trans_loss=0.3848,    Loss: 0.2875, Pearson r: 0.9655, Rho spearman: 0.9752
Validation - Loss: 1.9745, Pearson r: 0.5484, Rho spearman: 0.5619
Test -  trans_loss=1.2349      Loss: 0.3598, Pearson r: 0.9740, Rho spearman: 0.9682

Epoch 15/50
Train -    trans_loss=0.3699,    Loss: 0.2698, Pearson r: 0.9672, Rho spearman: 0.9756
Validation - Loss: 1.9451, Pearson r: 0.5485, Rho spearman: 0.5589
Test -  trans_loss=1.2406      Loss: 0.3384, Pearson r: 0.9714, Rho spearman: 0.9648

Epoch 16/50
Train -    trans_loss=0.3592,    Loss: 0.2555, Pearson r: 0.9687, Rho spearman: 0.9765
Validation - Loss: 2.0307, Pearson r: 0.5489, Rho spearman: 0.5580
Test -  trans_loss=1.3852      Loss: 0.3271, Pearson r: 0.9720, Rho spearman: 0.9656

Epoch 17/50
Train -    trans_loss=0.3400,    Loss: 0.2458, Pearson r: 0.9692, Rho spearman: 0.9765
Validation - Loss: 2.0573, Pearson r: 0.5476, Rho spearman: 0.5574
Test -  trans_loss=1.3532      Loss: 0.3319, Pearson r: 0.9698, Rho spearman: 0.9613


#FOLD1
Epoch 1/50
Train -  trans_loss=1.0213,    Loss: 0.6567, Pearson r: 0.9786, Rho spearman: 0.9919
Validation - Loss: 2.3328, Pearson r: 0.5326, Rho spearman: 0.5588
Test - trans_loss=0.8726,      Loss: 0.7057, Pearson r: 0.9975, Rho spearman: 0.9979

Epoch 2/50
Train -  trans_loss=0.7989,    Loss: 0.7327, Pearson r: 0.9959, Rho spearman: 0.9961
Validation - Loss: 2.2474, Pearson r: 0.5411, Rho spearman: 0.5646
Test - trans_loss=0.9015,      Loss: 0.6788, Pearson r: 0.9976, Rho spearman: 0.9976

Epoch 3/50
Train -  trans_loss=0.7955,    Loss: 0.6960, Pearson r: 0.9947, Rho spearman: 0.9947
Validation - Loss: 2.2197, Pearson r: 0.5426, Rho spearman: 0.5622
Test - trans_loss=0.8991,      Loss: 0.6783, Pearson r: 0.9964, Rho spearman: 0.9955

Epoch 4/50
Train -  trans_loss=0.7607,    Loss: 0.6404, Pearson r: 0.9902, Rho spearman: 0.9900
Validation - Loss: 2.1576, Pearson r: 0.5451, Rho spearman: 0.5616
Test - trans_loss=0.9809,      Loss: 0.6245, Pearson r: 0.9937, Rho spearman: 0.9911

#FOLD2
Epoch 1/50
Train -  trans_loss=0.9933,    Loss: 0.6544, Pearson r: 0.9779, Rho spearman: 0.9905
Validation - Loss: 2.4072, Pearson r: 0.5332, Rho spearman: 0.5592
Test - trans_loss=0.8146,      Loss: 0.8195, Pearson r: 0.9968, Rho spearman: 0.9973

Epoch 2/50
Train -  trans_loss=0.7967,    Loss: 0.7431, Pearson r: 0.9960, Rho spearman: 0.9957
Validation - Loss: 2.3575, Pearson r: 0.5432, Rho spearman: 0.5669
Test - trans_loss=0.8477,      Loss: 0.7497, Pearson r: 0.9979, Rho spearman: 0.9983

Epoch 3/50
Train -  trans_loss=0.7892,    Loss: 0.7186, Pearson r: 0.9955, Rho spearman: 0.9948
Validation - Loss: 2.2764, Pearson r: 0.5464, Rho spearman: 0.5667
Test - trans_loss=0.8429,      Loss: 0.7175, Pearson r: 0.9962, Rho spearman: 0.9961

Epoch 4/50
Train -  trans_loss=0.7676,    Loss: 0.6752, Pearson r: 0.9937, Rho spearman: 0.9927
Validation - Loss: 2.2042, Pearson r: 0.5521, Rho spearman: 0.5690
Test - trans_loss=0.9290,      Loss: 0.6196, Pearson r: 0.9911, Rho spearman: 0.9895


# NOTE: THE RUNS BELOW USED FOLD 4, NOT 6... RETRY WITH 6
#FOLD VAL 3
Epoch 1/50
Train -  trans_loss=0.9583,    Loss: 0.6801, Pearson r: 0.9810, Rho spearman: 0.9916
Validation - Loss: 2.4221, Pearson r: 0.5380, Rho spearman: 0.5647
Test - trans_loss=0.8525,      Loss: 0.7802, Pearson r: 0.9974, Rho spearman: 0.9980

Epoch 2/50
Train -  trans_loss=0.7983,    Loss: 0.7384, Pearson r: 0.9948, Rho spearman: 0.9933
Validation - Loss: 2.3831, Pearson r: 0.5480, Rho spearman: 0.5710
Test - trans_loss=0.8355,      Loss: 0.7686, Pearson r: 0.9967, Rho spearman: 0.9971

Epoch 3/50
Train -  trans_loss=0.7731,    Loss: 0.6979, Pearson r: 0.9925, Rho spearman: 0.9906
Validation - Loss: 2.2818, Pearson r: 0.5509, Rho spearman: 0.5685
Test - trans_loss=0.9439,      Loss: 0.6446, Pearson r: 0.9928, Rho spearman: 0.9933

Epoch 4/50
Train -  trans_loss=0.7341,    Loss: 0.6146, Pearson r: 0.9844, Rho spearman: 0.9809
Validation - Loss: 2.2227, Pearson r: 0.5523, Rho spearman: 0.5673
Test - trans_loss=0.9851,      Loss: 0.5983, Pearson r: 0.9786, Rho spearman: 0.9809

Epoch 5/50
Train -    trans_loss=0.6154,    Loss: 0.4760, Pearson r: 0.9712, Rho spearman: 0.9683
Validation - Loss: 1.9757, Pearson r: 0.5462, Rho spearman: 0.5624
Test -  trans_loss=1.2552      Loss: 0.4895, Pearson r: 0.9611, Rho spearman: 0.9621




#fold val 4 
Epoch 1/50
Train -  trans_loss=0.9633,    Loss: 0.7264, Pearson r: 0.9832, Rho spearman: 0.9923
Validation - Loss: 2.4758, Pearson r: 0.5391, Rho spearman: 0.5653
Test - trans_loss=0.5034,      Loss: 0.5450, Pearson r: 0.9960, Rho spearman: 0.9975

Epoch 2/50
Train -  trans_loss=0.8204,    Loss: 0.7638, Pearson r: 0.9944, Rho spearman: 0.9937
Validation - Loss: 2.2981, Pearson r: 0.5420, Rho spearman: 0.5653
Test - trans_loss=0.4875,      Loss: 0.5152, Pearson r: 0.9947, Rho spearman: 0.9957

Epoch 3/50
Train -  trans_loss=0.8100,    Loss: 0.7233, Pearson r: 0.9916, Rho spearman: 0.9893
Validation - Loss: 2.3350, Pearson r: 0.5457, Rho spearman: 0.5637
Test - trans_loss=0.4898,      Loss: 0.4965, Pearson r: 0.9885, Rho spearman: 0.9901

Epoch 4/50
Train -  trans_loss=0.7602,    Loss: 0.6453, Pearson r: 0.9842, Rho spearman: 0.9825
Validation - Loss: 2.1598, Pearson r: 0.5485, Rho spearman: 0.5621
Test - trans_loss=0.6440,      Loss: 0.4266, Pearson r: 0.9716, Rho spearman: 0.9780

Epoch 5/50
Train -    trans_loss=0.6597,    Loss: 0.5232, Pearson r: 0.9709, Rho spearman: 0.9721
Validation - Loss: 2.0976, Pearson r: 0.5470, Rho spearman: 0.5613
Test -  trans_loss=0.6680      Loss: 0.3780, Pearson r: 0.9572, Rho spearman: 0.9708




In [None]:
# Plot 25 hand-typed values against their position in the list (x = 0..24).
# NOTE(review): presumably a per-epoch loss curve copied from a training log —
# the run it comes from is not recorded here; confirm and label the axes.
sns.lineplot([
    0.9685, 0.8048, 0.7809, 0.7176, 0.6070,
    0.5258, 0.4876, 0.4547, 0.4229, 0.4095,
    0.4016, 0.3825, 0.3685, 0.3515, 0.3472,
    0.3227, 0.3129, 0.3053, 0.2971, 0.2757,
    0.2846, 0.2682, 0.2608, 0.2552, 0.2385
]
)

In [None]:





#s2450 +s2450inv+ ptmul_train : 

# Epoch 111/300
# Train -      Loss: 0.0096, Pearson r: 0.9987, Rho spearman: 0.9984
# Validation - Loss: 4.6144, Pearson r: 0.5621, Rho spearman: 0.5345
# Test -       Loss: 1.9337, Pearson r: 0.5208, Rho spearman: 0.5338

# Epoch 112/300
# Train -      Loss: 0.0107, Pearson r: 0.9985, Rho spearman: 0.9983
# Validation - Loss: 4.6020, Pearson r: 0.5630, Rho spearman: 0.5403
# Test -       Loss: 1.9419, Pearson r: 0.5208, Rho spearman: 0.5367

##############################################################################

# training with hydra
# Epoch 145/300
# Train -      Loss: 0.0034, Pearson r: 0.9995, Rho spearman: 0.9995
# Validation - Loss: 0.9176, Pearson r: 0.8205, Rho spearman: 0.8206
# Test -       Loss: 2.0290, Pearson r: 0.5171, Rho spearman: 0.5325

# Epoch 146/300
# Train -      Loss: 0.0040, Pearson r: 0.9994, Rho spearman: 0.9994
# Validation - Loss: 0.9274, Pearson r: 0.8196, Rho spearman: 0.8189
# Test -       Loss: 2.0288, Pearson r: 0.5173, Rho spearman: 0.5318

In [None]:
# Persist the whole `Final_model` object (not just its state_dict): loading this
# file back requires the model's class definition to be importable.
# NOTE(review): `Final_model` is not defined in this part of the notebook — it
# must come from an earlier training cell.
torch.save(Final_model, 'JanusDDG_300epochs_plus25_hydra_slim.pth')

In [None]:
#torch.save(Final_model, 'JanusDDG_300_all_train.pth') #window 20 

In [None]:
#torch.save(best_model, 'DDGemb_Cross_0.pth')

In [None]:
assert False

In [None]:
import torch

# Checkpoints saved every 50 epochs, from epoch 100 to epoch 300 inclusive.
model_paths = [f'JanusDDG_{epoch}_ensamble.pth' for epoch in range(100, 301,50)]

# Every file holds a fully pickled model. Load each one exactly once; the first
# loaded model is reused below as the container for the averaged weights, so the
# first checkpoint no longer has to be unpickled a second time.
# SECURITY NOTE: torch.load unpickles arbitrary objects — only use trusted files.
loaded_models = [torch.load(path) for path in model_paths]
state_dicts = [loaded.state_dict() for loaded in loaded_models]

# Element-wise mean of every floating-point tensor across the checkpoints.
avg_state_dict = {}
for key, reference in state_dicts[0].items():
    if reference.is_floating_point():
        avg_state_dict[key] = sum(d[key] for d in state_dicts) / len(state_dicts)
    else:
        # Integer/bool buffers (counters, masks, ...) have no meaningful mean and
        # true division would turn them into floats: keep the first checkpoint's value.
        avg_state_dict[key] = reference.clone()

# The averages are already materialised as new tensors, so overwriting the first
# model's parameters here cannot affect the values computed above.
final_model = loaded_models[0]
final_model.load_state_dict(avg_state_dict)

# Save the weight-averaged model.
torch.save(final_model, "JanusDDG_avg_final.pth")

In [None]:

# con

#SENZA RELU NELLA CONV1D
# lr = 1e-4
# input_dim = 1280

# transf_parameters={'input_dim':1280, 'num_heads':8,
#                     'dropout_rate':0.,}
# arrivo a 0.54  (dopo un po' meno di 200 epoche)


#     def __init__(self, input_dim=1280, num_heads=8, dropout_rate=0., num_experts=1, f_activation = nn.ReLU(), kernel_size=15, cross_att = False,
#                 dual_cross_att=False):
        
#         super(TransformerRegression, self).__init__()
#         self.cross_att = cross_att
#         self.dual_cross_att = dual_cross_att
        
#         print(f'Cross Attention: {cross_att}')
#         print(f'Dual Cross Attention: {dual_cross_att}')

#         self.embedding_dim = input_dim
#         self.act = f_activation
#         self.max_len = 3700 #lunghezza massima proteina
#         out_channels = 128  #num filtri conv 1D
#         kernel_size = 20
#         padding = 0



In [None]:
# import pickle
# with open("JanusDDG_loss_train.pkl", "wb") as f:
#     pickle.dump(l_tr, f)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=False)
ax_pearson, ax_loss = axes

# Left panel: Pearson r per epoch for the train and test sets.
# (The validation curve `p_val` and the `pearson_max_val` annotation were
# already disabled in the original cell and are still not drawn.)
for curve, curve_label in ((p_tr, 'Train r'), (p_te, 'Test r')):
    sns.lineplot(data=curve, ax=ax_pearson, label=curve_label)

ax_pearson.legend()
ax_pearson.set_title('Pearson r Values for Train and Test Set')
ax_pearson.set_xlabel('Epochs')
ax_pearson.set_ylabel('Pearson r')

# Annotate the best test-set Pearson r and draw the fixed 0.545 reference line.
best_test_r = round(max(p_te), 3)
ax_pearson.text(10, 0.53, str(best_test_r), fontsize=12, color='red')
ax_pearson.axhline(y=0.545, color='r', linestyle='--', linewidth=2)

# Right panel: loss per epoch for the train and test sets.
for curve, curve_label in ((l_tr, 'Train Loss'), (l_te, 'Test Loss')):
    sns.lineplot(data=curve, ax=ax_loss, label=curve_label)

ax_loss.legend()
ax_loss.set_title('Loss Values for Train and Test Set')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Huber Loss')

# Overall figure title.
fig.suptitle('JanusDDG Pearson r Values for Train and Test Sets', fontsize=16)

In [None]:
assert False

In [None]:
def metrics(pred_dir, pred_inv, true):
    """Print agreement metrics for direct, inverse and pooled predictions.

    Args:
        pred_dir: predictions for the direct mutations.
        pred_inv: predictions for the inverse mutations.
        true: experimental values for the direct mutations; ``-true`` is used
            as the reference for the inverse mutations.

    All three arguments must be pandas objects, because the pooled ("tot")
    section stacks them with ``pd.concat``. Nothing is returned; every result
    is printed.
    """

    def _report(tag, reference, predicted):
        # One four-line section: Pearson, Spearman, RMSE, MAE (+ blank line).
        print(f'Pearson test {tag}: {pearsonr(reference,predicted)[0]}')
        print(f'Spearmanr test {tag}: {spearmanr(reference,predicted)[0]}')
        print(f'RMSE {tag}: {root_mean_squared_error(reference,predicted)}')
        print(f'MAE {tag}: {mean_absolute_error(reference,predicted)}\n')

    # Direct mutations.
    _report('dirette', true, pred_dir)

    # Inverse mutations: the reference is the sign-flipped direct value.
    _report('inverse', -true, pred_inv)

    # Direct and inverse pooled together (built once instead of once per metric).
    pooled_true = pd.concat([true,-true],axis=0)
    pooled_pred = pd.concat([pred_dir,pred_inv],axis=0)
    _report('tot', pooled_true, pooled_pred)

    # Correlation between direct and inverse predictions (full result object is
    # printed) and the mean of their sum.
    print(f'PCC d-r: {pearsonr(pred_dir,pred_inv)}\n')
    print(f'anti-symmetry bias: {np.mean(pred_dir + pred_inv)}\n-----------------------\n')


In [None]:
# def output_model_from_batch_inv(batch, model, device, train=True):

#     '''Dato un modello pytorch e batch restituisce: output_modello, True labels'''
    
#     x_wild = batch['mut_type'].float().to(device)
#     x_mut = batch['wild_type'].float().to(device)
#     labels = -batch['ddg'].float().to(device)
#     length = batch['length'].to(device)
#     output_ddg = model(x_wild, x_mut, length, train = train)
    
#     return output_ddg, labels

In [None]:
def dataloader_generation_pred(E_TYPE, test_path, batch_size = 128, dataloader_shuffle = True, inv= False):
    """Build a DataLoader over one or more pickled evaluation datasets.

    Args:
        E_TYPE: embedding type; only ``'ESM2'`` (embedding size 1280) is supported.
        test_path: a single pickle path (``str`` / ``os.PathLike``) or an iterable
            of paths. Each pickle must contain a list of samples; the lists are
            concatenated in the given order.
        batch_size: batch size forwarded to ``DataLoader``.
        dataloader_shuffle: forwarded to ``DataLoader`` as ``shuffle``.
        inv: forwarded unchanged to ``DeltaDataset``.

    Returns:
        A ``DataLoader`` over ``DeltaDataset(samples, 1280, inv=inv)`` using the
        notebook-level ``collate_fn``.

    Raises:
        AssertionError: if ``E_TYPE`` is not ``'ESM2'`` (same exception type as
            the previous bare ``assert False``, but with a message and not
            stripped by ``python -O``).

    SECURITY NOTE: ``pickle.load`` executes arbitrary code — only pass trusted files.
    """
    EMBEDDING_TYPE = E_TYPE

    if EMBEDDING_TYPE != 'ESM2':
        raise AssertionError(f"Unsupported embedding type {E_TYPE!r}; only 'ESM2' is implemented")

    # Legacy note (translated from the original inline string): "train made of
    # s2648 + UnionV and DA; 1000 of the DA are used in validation together
    # with s669 DA". NOTE(review): looks stale for a prediction-only loader.
    dim_embedding = 1280

    # A bare string would otherwise be iterated character by character and
    # open('d'), open('d'), ... would fail: treat a single path as a 1-item list.
    if isinstance(test_path, (str, bytes, os.PathLike)):
        test_path = [test_path]

    dataset_test = []
    for path in test_path:
        with open(path, 'rb') as f:
            dataset_test += pickle.load(f)

    dataset_test = DeltaDataset(dataset_test, dim_embedding, inv = inv)

    # Build the DataLoader (the multi-mutation collate_fn_MULTIPLE variant is not used).
    dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=dataloader_shuffle, collate_fn=collate_fn)

    return dataloader_test


In [None]:

# def collate_fn_MULTIPLE(batch):
#     max_len = max(sample['wild_type'].shape[0] for sample in batch)  # Max sequence length in batch   700
#     max_features = max(sample['wild_type'].shape[1] for sample in batch)  # Max feature size

#     padded_batch = {
#         'id': [],
#         'wild_type': [],
#         'mut_type': [],
#         'length': [],
#         'ddg': [],
#         #'alpha_vec': [],
#         'pos_mut': []
#     }

#     for sample in batch:
#         wild_type_padded = F.pad(sample['wild_type'], (0, max_features - sample['wild_type'].shape[1], 
#                                                        0, max_len - sample['wild_type'].shape[0]))
#         mut_type_padded = F.pad(sample['mut_type'], (0, max_features - sample['mut_type'].shape[1], 
#                                                      0, max_len - sample['mut_type'].shape[0]))

#         padded_batch['id'].append(sample['id'])  
#         padded_batch['wild_type'].append(wild_type_padded)  
#         padded_batch['mut_type'].append(mut_type_padded)  
#         padded_batch['length'].append(sample['length'])#append(torch.tensor(sample['length'], dtype=torch.float32))  
#         padded_batch['ddg'].append(sample['ddg'])#append(torch.tensor(float(sample['ddg']), dtype=torch.float32))  
#         #padded_batch['alpha_vec'].append(sample['alpha_vec'])#append(torch.tensor(sample['alpha_vec'], dtype=torch.float32))  
#         #padded_batch['pos_mut'].append(sample['pos_mut'])#append(torch.tensor(sample['pos_mut'], dtype=torch.int64))  

#     # Convert list of tensors into a single batch tensor
#     padded_batch['wild_type'] = torch.stack(padded_batch['wild_type'])  # Shape: (batch_size, max_len, max_features)
#     padded_batch['mut_type'] = torch.stack(padded_batch['mut_type'])  
#     padded_batch['length'] = torch.stack(padded_batch['length'])  
#     padded_batch['ddg'] = torch.stack(padded_batch['ddg'])  
#     #padded_batch['alpha_vec'] = torch.stack(padded_batch['alpha_vec'])  
#     #padded_batch['pos_mut'] = torch.stack(padded_batch['pos_mut'])  

#     return padded_batch



In [None]:
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
E_TYPE = 'ESM2'
dataloader_test = dataloader_generation_pred(E_TYPE,  test_path=['s669_Castrense.pkl'],  batch_size = 6, dataloader_shuffle = False, inv= False)
#['s669_Castrense.pkl']#'PTMUL_D.pkl'
#'ddg_S2648_ESM2_ALL_LENGTH.pkl' #'s2450_fold_4.pkl' 
#['../DeltaDelta_BELLO/cdna117k_fold_1.pkl'] + ['../DeltaDelta_BELLO/cdna117k_fold_2.pkl']

In [None]:
# test model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def model_performance_test(model, dataloader_test, inv= False, train = False):
    """Run ``model`` over ``dataloader_test`` without gradients.

    Args:
        model: the network to evaluate; it is switched to ``eval()`` mode.
        dataloader_test: iterable of batches accepted by ``output_model_from_batch``.
        inv: accepted for call-site compatibility; not used by this function.
        train: forwarded to ``output_model_from_batch`` as its ``train`` flag.
            Previously this argument was ignored and ``False`` was always
            forwarded; the default keeps that behaviour for existing callers.

    Returns:
        ``(all_predictions_test, all_lables_test)``: two lists with one entry per
        batch, exactly as returned by ``output_model_from_batch``.

    Relies on the notebook-level ``device`` and ``output_model_from_batch``.
    """
    # Make sure the model is in evaluation mode.
    model.eval()

    # Per-batch outputs and labels.
    all_predictions_test = []
    all_lables_test = []

    with torch.no_grad():  # Disable gradient calculation
        for batch in dataloader_test:
            predictions_test, labels_test = output_model_from_batch(batch, model, device, train=train)

            all_predictions_test.append(predictions_test)
            all_lables_test.append(labels_test)

    return all_predictions_test, all_lables_test

In [None]:
# Load the pickled 300-epoch model used by the evaluation cells below.
# NOTE(review): `lr`, `input_dim`, `transf_parameters` and `i` are not used by
# the load in this cell; `lr` is also rebound to a LinearRegression object in
# later cells, so do not rely on it still being the learning rate afterwards.
lr = 1e-4
input_dim = 1280

transf_parameters={'input_dim':1280, 'num_heads':8,
                    'dropout_rate':0.,}
i=4
best_model = torch.load('JanusDDG_300epochs.pth')#(f'JanusDDG_300epochs.pth')
#torch.load(f'DDGemb_Cross_4.pth')

In [None]:
all_predictions_test, all_lables_test = model_performance_test(best_model,dataloader_test,
                                                          inv=False, train=False)

In [None]:
# Concatenate the per-batch outputs once instead of once per metric.
# The labels are sign-flipped before comparison, exactly as in the original cell.
# NOTE(review): presumably this dataset stores labels with the opposite sign
# convention — confirm.
test_predictions_cpu = torch.cat(all_predictions_test, dim=0).cpu()
negated_labels_cpu = -torch.cat(all_lables_test, dim=0).cpu()

print(f'Pearson test dirette: {pearsonr(test_predictions_cpu, negated_labels_cpu)}')   
print(f'RMSE dirette: {root_mean_squared_error(test_predictions_cpu, negated_labels_cpu)}')
print(f'MAE dirette: {mean_absolute_error(test_predictions_cpu, negated_labels_cpu)}')

In [None]:
pearsonr(torch.cat(all_predictions_test, dim=0).cpu()[:669],torch.cat(all_predictions_test, dim=0).cpu()[669:])

In [None]:
(torch.cat(all_predictions_test, dim=0).cpu()[:669] + torch.cat(all_predictions_test, dim=0).cpu()[669:]).mean()

In [None]:
#pd.Series(torch.cat(all_predictions_test, dim=0).cpu()).to_pickle('DDGemb_cross_0_predictions.pkl')

In [None]:
pd.read_pickle('indici_ordinati_s669.pkl').sort_values(by='index_castrense')['DDG'].values

In [None]:
# Index table mapping the 'index_castrense' ordering to the original s669 rows.
# Read once and reused (the original cell re-read the same pickle three times).
s669_index_table = pd.read_pickle('indici_ordinati_s669.pkl')
s669_sorted = s669_index_table.sort_values(by='index_castrense')

indici_ordinati = s669_sorted['index'].values

# Pythia scores and mutation info, re-ordered to match the rows selected above.
pythia_reordered = pd.read_csv('../DeltaDeltaG/pythia_s669.csv').iloc[indici_ordinati,:]
pythia_s669 = pythia_reordered['Pythia_inv']
info_mut = pythia_reordered[['wildtype','mutation']]

pred_janus= np.array(torch.cat(all_predictions_test, dim=0).cpu())
# Ground truth comes from the index table, not from the dataloader labels.
true_ddg = s669_sorted['DDG'].values

df = pd.DataFrame({
    'wild':info_mut.iloc[:,0].values,
    'mut':info_mut.iloc[:,1].values,
    'pythia_s669': pythia_s669.values,  # must have the same length as the predictions
    'predictions': pred_janus,
    'true_ddg':true_ddg
})
df.index = indici_ordinati
print('Pearson s669: ', pearsonr(df['predictions'],df['true_ddg']))   


# Rows that have an 's461_pdb' entry: drop the missing ones first, then sort
# (same order of operations as the original cell).
indici_ordinati_s461  = s669_index_table.dropna(subset='s461_pdb').sort_values(by='index_castrense')['index'].values



In [None]:
rose_csv = pd.read_csv('rose1985.csv')

def rose_score(row):
    """Return Rose1985(wild) - Rose1985(mut) for one row of ``df``.

    Each residue in ``row['wild']`` / ``row['mut']`` is looked up in the
    ``Parameter`` column of ``rose_csv`` and the first matching ``Rose1985``
    value is used. Raises IndexError when a residue has no match.
    """
    def _first_rose_value(residue):
        matching = rose_csv.loc[rose_csv['Parameter'] == residue, 'Rose1985']
        return matching.values[0]

    return _first_rose_value(row['wild']) - _first_rose_value(row['mut'])

df['Rose'] = df.apply(rose_score,axis=1)
df

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression(fit_intercept=False)

lr.fit(df[['pythia_s669','Rose']].values,df['true_ddg'].values)
pearsonr(lr.predict(df[['pythia_s669','Rose']].values),df['true_ddg'].values)



In [None]:
pearsonr(df['pythia_s669'].values * 0.11 - df['Rose'].values*0.0088,df['true_ddg'].values)


In [None]:
df_s461 = df.loc[indici_ordinati_s461,:]

In [None]:
# s461 subset: Pearson r of the hand-weighted Pythia/Rose combination and of the
# model predictions against the true values.
print(pearsonr(df_s461['pythia_s669']*0.11-df_s461['Rose']*0.0088 ,df_s461['true_ddg']))   
print(pearsonr(df_s461['predictions'],df_s461['true_ddg']))   
# NOTE(review): the two error metrics below are computed on the raw
# 'pythia_s669' column, not on the weighted combination above — confirm intent.
print('RMSE: ', root_mean_squared_error(df_s461['pythia_s669'],df_s461['true_ddg']))
# Fixed: the line labelled MAE previously called mean_squared_error.
print('MAE: ', mean_absolute_error(df_s461['pythia_s669'],df_s461['true_ddg']))



In [None]:
df

In [None]:
print(pearsonr(df_s461['predictions'],df_s461['true_ddg']))   
print(pearsonr(df['predictions'],df['true_ddg']))   


In [None]:
print(f'Pearson test dirette: {pearsonr(-0.07416111+ pythia_s669.values*0.09+np.array(torch.cat(all_predictions_test, dim=0).cpu()*0.57), torch.cat(all_lables_test, dim=0).cpu())}')   


In [None]:
pd.Series(torch.cat(all_predictions_test, dim=0).cpu()).to_pickle('Janus_s669_fake.pkl')

In [None]:
# NOTE(review): `dir` shadows the Python builtin of the same name. This cell is
# textually identical to the following `inv = ...` cell, so the two only differ
# if `all_predictions_test` was recomputed in between — that hidden state does
# not survive Restart & Run All.
dir = torch.cat(all_predictions_test, dim=0).cpu()

In [None]:
inv = torch.cat(all_predictions_test, dim=0).cpu()


In [None]:
tot =(dir-inv) /2

In [None]:
tot

In [None]:
# NOTE(review): `tot` is passed as BOTH pred_dir and pred_inv, so `metrics`
# compares the same vector against `true` and against `-true`. If `tot` is the
# antisymmetrised direct prediction, the inverse argument was probably meant to
# be `-tot` — confirm before quoting these numbers.
metrics( pd.DataFrame(tot),  pd.DataFrame(tot), pd.DataFrame(torch.cat(all_lables_test, dim=0).cpu()))

In [None]:
pred_inv = all_predictions_test

In [None]:
pred_dirette = all_predictions_test
true = all_lables_test

In [None]:
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

metrics( pd.DataFrame(torch.cat(pred_dirette, dim=0).cpu()), pd.DataFrame(torch.cat(pred_inv, dim=0).cpu()), pd.DataFrame(torch.cat(true, dim=0).cpu()))

In [None]:
50+49+53+51+51

In [None]:
# Evaluate each cross-validation model on its own fold (direct + inverse files).
for i in range(5):
    # The four other folds; printed for reference only.
    val_Set = [fold for fold in [0,1,2,3,4] if fold != i]
    print(val_Set)
    best_model = torch.load(f'DDGemb_Cross_{i}.pth')

    fold_files = [f's2450_fold_{i}.pkl'] + [f's2450_fold_{i}_inv.pkl']
    dataloader_val = dataloader_generation_pred(
        E_TYPE,
        test_path=fold_files,
        batch_size = 6,
        dataloader_shuffle = False,
        inv= False,
    )

    all_predictions_val, all_lables_val = model_performance_test(
        best_model, dataloader_val, inv=False, train=False
    )

    fold_predictions = torch.cat(all_predictions_val, dim=0).cpu()
    fold_labels = torch.cat(all_lables_val, dim=0).cpu()
    print(f'Pearson test dirette: {pearsonr(fold_predictions, fold_labels)}')   


In [None]:
pesi = np.array([0.2,0.21,0.2,0.19,0.2])

In [None]:
# One Series of test-set predictions per cross-validation model (folds 0-4).
pred_tot = []

for i in range(5):
    best_model = torch.load(f'DDGemb_Cross_{i}.pth')

    all_predictions_test, all_lables_test = model_performance_test(
        best_model, dataloader_test, inv=False, train=False
    )

    fold_predictions = torch.cat(all_predictions_test, dim=0).cpu()
    pred_tot.append(pd.Series(fold_predictions))

In [None]:
pred_inv = pred_tot.copy()

In [None]:
pred_dir = pred_tot.copy()

In [None]:
pred_tot

In [None]:
# Weighted ensemble of the per-fold predictions (weights in `pesi`).
somma_pesata_dir = sum(w * v for w, v in zip(pesi, pred_tot))

# `somma_pesata` is the name printed here and read by later cells, but it was
# never assigned (NameError on a fresh kernel): alias it to the value just computed.
somma_pesata = somma_pesata_dir

print(somma_pesata)

In [None]:
pesi

In [None]:
somma_pesata_inv = sum(w * v for w, v in zip(pesi, pred_inv))


In [None]:
r = pred_tot[0]+pred_tot[1]+pred_tot[2]+pred_tot[3]+pred_tot[4]

In [None]:
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

print(f'Pearson test dirette: {pearsonr(somma_pesata, torch.cat(all_lables_test, dim=0).cpu())}')   
print(f'Spearmanr test dirette: {spearmanr(somma_pesata, torch.cat(all_lables_test, dim=0).cpu())}')    
print(f'RMSE dirette: {root_mean_squared_error(somma_pesata, torch.cat(all_lables_test, dim=0).cpu())}')
print(f'MAE dirette: {mean_absolute_error(somma_pesata, torch.cat(all_lables_test, dim=0).cpu())}')

In [None]:
somma_pesata

In [None]:
somma_pesata_inv

In [None]:
metrics(pd.DataFrame(somma_pesata_dir),pd.DataFrame(somma_pesata_inv),-pd.DataFrame(torch.cat(all_lables_test, dim=0).cpu()))

In [None]:
np.mean([386,342,187,497,105])

In [None]:
best_model = torch.load(f'DDGemb_Cross_4.pth')
all_predictions_test, all_lables_test = model_performance(best_model,dataloader_test,
                                                          dataloader_train=None,dataloader_validation=None,inv=True, train=False)

pred_tot.append(pd.Series(torch.cat(all_predictions_test, dim=0).cpu()))#.to_pickle('DDGemb_Cross_0_predictions_s669.pkl')

In [None]:
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

print(f'Pearson test dirette: {pearsonr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')   
print(f'Spearmanr test dirette: {spearmanr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')    
print(f'RMSE dirette: {root_mean_squared_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')
print(f'MAE dirette: {mean_absolute_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')

In [None]:
#BOOTSTRAPPING
# Repeat the test-set evaluation 1000 times and store the Pearson r of each pass.
# NOTE(review): `train=True` presumably keeps stochastic layers active so that
# passes differ — confirm in `model_performance`.
# NOTE(review): the dataloader is rebuilt on every iteration although its
# arguments never change; hoist it if `DeltaDataset` is deterministic.

results_naive=[]
for i in range(1000):
    # test_path must be a list of paths: a bare string is iterated character by
    # character by dataloader_generation_pred as defined in this notebook.
    dataloader_test = dataloader_generation_pred(E_TYPE,  test_path=['ddg_s669_ESM2_HYDRA_LITE.pkl'],  batch_size = 128, dataloader_shuffle = False, inv= False)

    all_predictions_test, all_lables_test = model_performance(
        best_model,
        dataloader_test,
        dataloader_train=None,
        dataloader_validation=None,
        inv=False,
        train=True,
    )
    results_naive.append(pearsonr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())[0]) 

pd.DataFrame(results_naive).to_pickle('result_bootstrapping_MLP_27_01_2025.pkl')

In [None]:
# Same 1000-pass evaluation as the naive bootstrap, for the HYDRA model.
best_model = torch.load('MLP_HYDRA_MEAN_29_01_2025.pth')#('MLP_HYDRA_MEAN_29_01_2025.pth')#'MLP_27_01_2025.pth')#('MPL_HYDRA_model_7W_256_256.pth')
best_model.hydra=True
results_hydra=[]
for i in range(1000):
    # test_path must be a list of paths: a bare string is iterated character by
    # character by dataloader_generation_pred as defined in this notebook.
    dataloader_test = dataloader_generation_pred(E_TYPE,  test_path=['ddg_s669_ESM2_HYDRA_LITE.pkl'],  batch_size = 128, dataloader_shuffle = False, inv= False)

    all_predictions_test, all_lables_test = model_performance(
        best_model,
        dataloader_test,
        dataloader_train=None,
        dataloader_validation=None,
        inv=False,
        train=True,
    )
    results_hydra.append(pearsonr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())[0]) 

pd.DataFrame(results_hydra).to_pickle('result_bootstrapping_MLP_HYDRA_MEAN_29_01_2025.pkl')

In [None]:
sns.histplot(results_hydra,kde=True,color='blue')
sns.histplot(results_naive,kde=True,color='red')
from scipy.stats import ks_2samp
ks_2samp(results_hydra,results_naive)

In [None]:
sns.histplot(results_hydra,kde=True)
sns.histplot(results_naive,kde=True)
from scipy.stats import ks_2samp
ks_2samp(results_hydra,results_naive)

In [None]:
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

print(f'Pearson test dirette: {pearsonr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')   
print(f'Spearmanr test dirette: {spearmanr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')    
print(f'RMSE dirette: {root_mean_squared_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')
print(f'MAE dirette: {mean_absolute_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')

In [None]:
pd.Series(torch.cat(all_predictions_test, dim=0).cpu())

In [None]:
#pd.Series(torch.cat(all_predictions_train, dim=0).cpu()).to_pickle('MLP_HYDRA_7W_256_256_predictions_esm2_train.pkl')
pd.Series(torch.cat(all_predictions_test, dim=0).cpu()).to_pickle('MLP_HYDRA_MEAN_29_01_2025.pkl')
#pd.Series(torch.cat(all_predictions_validation, dim=0).cpu()).to_pickle('MLP_HYDRA_7W_256_256_predictions_esm2_val.pkl')
# pd.Series(torch.cat(all_lables_train, dim=0).cpu()).to_pickle('MLP_all_lables_train.pkl')
# pd.Series(torch.cat(all_lables_test, dim=0).cpu()).to_pickle('MLP_all_lables_test.pkl')
# pd.Series(torch.cat(all_lables_validation, dim=0).cpu()).to_pickle('MLP_all_lables_val.pkl')

In [None]:
#####NOMI 
#'MLP_HYDRA_7W_256_256_predictions_esm2_train.pkl'


#####

In [None]:
pd.Series(torch.cat(all_predictions_test, dim=0).cpu()).to_pickle('MLP_HYDRA_7W_256_256_predictions_Protherm_Doubles_pdbnum_2.pkl')


In [None]:
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

print(f'Pearson test d-r: {pearsonr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_predictions_test_inv, dim=0).cpu())}') 

In [None]:
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

print(f'Pearson test dirette: {pearsonr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')   
print(f'Spearmanr test dirette: {spearmanr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')    
print(f'RMSE dirette: {root_mean_squared_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')
print(f'MAE dirette: {mean_absolute_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')


# Pearson test dirette: PearsonRResult(statistic=0.49811210357961794, pvalue=3.0676935334080987e-43)
# Pearson test dirette: SignificanceResult(statistic=0.5209411003368696, pvalue=8.229743816346383e-48)
# RMSE dirette: 1.4306520223617554
# MAE dirette: 1.0020039081573486

# Pearson test dirette: PearsonRResult(statistic=0.4984754880369162, pvalue=2.6104865839652876e-43)
# Pearson test dirette: SignificanceResult(statistic=0.5219817530960136, pvalue=4.996030095808217e-48)
# RMSE dirette: 1.4335042238235474
# MAE dirette: 1.0032145977020264

In [None]:
# Fixed label: this line reports Spearman's rho, but was printed as "Pearson".
print(f'Spearmanr test dirette: {spearmanr(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')    
print(f'RMSE dirette: {root_mean_squared_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')
print(f'MAE dirette: {mean_absolute_error(torch.cat(all_predictions_test, dim=0).cpu(), torch.cat(all_lables_test, dim=0).cpu())}')

In [None]:
print(f'Pearson test inverse: {pearsonr(torch.cat(all_predictions_test_inv, dim=0).cpu(), torch.cat(all_lables_test_inv, dim=0).cpu())}')    
print(f'RMSE inve: {root_mean_squared_error(torch.cat(all_predictions_test_inv, dim=0).cpu(), torch.cat(all_lables_test_inv, dim=0).cpu())}')
print(f'MAE inve: {mean_absolute_error(torch.cat(all_predictions_test_inv, dim=0).cpu(), torch.cat(all_lables_test_inv, dim=0).cpu())}')

In [None]:
# Pool direct and inverse results into NEW lists. The previous in-place
# `.extend()` grew `all_predictions_test` / `all_lables_test` on every run of
# this cell and left them holding direct+inverse data for every later cell.
all_predictions_test_tot = [*all_predictions_test, *all_predictions_test_inv]
all_lables_test_tot = [*all_lables_test, *all_lables_test_inv]

pooled_predictions = torch.cat(all_predictions_test_tot, dim=0).cpu()
pooled_labels = torch.cat(all_lables_test_tot, dim=0).cpu()

print(f'Pearson test dirette+inverse: {pearsonr(pooled_predictions, pooled_labels)}')    
print(f'RMSE dirette+inverse: {root_mean_squared_error(pooled_predictions, pooled_labels)}')
print(f'MAE dirette+inverse: {mean_absolute_error(pooled_predictions, pooled_labels)}')

In [None]:
all_predictions_test

In [None]:
## 

In [None]:
y_esm2_train = pd.read_pickle('MLP_all_predictions_esm2_s2648.pkl')
y_esm2_test = pd.read_pickle('MLP_all_predictions_esm2_s669.pkl')

y_ohe_train = pd.read_pickle('MLP_all_predictions_ohe_s2648.pkl')
y_ohe_test = pd.read_pickle('MLP_all_predictions_ohe_s669.pkl')

y_true_test = pd.DataFrame(torch.cat(all_lables_test, dim=0).cpu())
y_true_train = pd.DataFrame(torch.cat(all_lables_train, dim=0).cpu())

In [None]:
from sklearn.linear_model import LinearRegression

X_train = pd.concat([y_esm2_train,y_ohe_train],axis=1)
X_test = pd.concat([y_esm2_test,y_ohe_test],axis=1) 
y_train = y_true_train

lr = LinearRegression()
lr.fit(X_train, y_train)
prediction_lr = lr.predict(X_test)

In [None]:
# `prediction_lr` is the NumPy array returned by `lr.predict`, which has no
# `.values` attribute; flatten it directly. `-1` avoids hard-coding 669 rows.
prediction_lr.reshape(-1).shape

In [None]:
y_true_test

In [None]:
# Assicurati che entrambe le variabili siano monodimensionali
y_true_test_flat = y_true_test.values.ravel()  # Oppure .flatten()
prediction_lr_flat = prediction_lr.reshape(669).ravel()  # Oppure np.squeeze()

print(f'Pearson test dirette: {pearsonr(y_true_test_flat, prediction_lr_flat)}')    
print(f'RMSE dirette: {root_mean_squared_error(y_true_test_flat, prediction_lr_flat)}')  
print(f'MAE dirette: {mean_absolute_error(y_true_test_flat, prediction_lr_flat)}')  

In [None]:
print(f'Pearson test dirette: {pearsonr(y_true_test_flat, X_test.iloc[:,0])}')    
print(f'RMSE dirette: {root_mean_squared_error(y_true_test_flat, X_test.iloc[:,0])}')  
print(f'MAE dirette: {mean_absolute_error(y_true_test_flat, X_test.iloc[:,0])}')  

In [None]:
#################FINE

In [None]:
with open('ddg_s2648_ESM2.pkl', 'rb') as f:
    dataset_s2648 = pickle.load(f)

with open('ddg_s669_ESM2.pkl', 'rb') as f:
    dataset_s669 = pickle.load(f)

In [None]:

# Build the datasets from the pickled s2648 (train) and s669 (test) samples.
dataset_train = ProteinDataset(dataset_s2648, threshold=5.0)
dataset_test = ProteinDataset(dataset_s669, threshold=5.0)


# Create the DataLoaders; shuffle=False keeps the sample order fixed.
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)

In [None]:
# The 20 standard amino acids, in the order that defines the one-hot index.
aa_a = "ACDEFGHIKLMNPQRSTVWY"

# Residue letter -> position in `aa_a`.
ordine_aa = {name_aa: ind_pos for ind_pos, name_aa in enumerate(aa_a)}

# Residue letter -> length-20 one-hot vector with a 1 at that position.
dic_embedding_aa = {}
for name_aa, ind_pos in ordine_aa.items():
    one_hot = np.zeros(20)
    one_hot[ind_pos] = 1
    dic_embedding_aa[name_aa] = one_hot

# The original counter ended one past the last index; keep that final value.
ind_pos = len(ordine_aa)


In [None]:
def _decode_mutation_ids(dataloader):
    """Decode every sample of ``dataloader`` into an ``<id><wild><pos><mut>`` string.

    For each sample, ``batch['V']`` is scanned for the entries equal to +1 (wild
    residue) and -1 (mutant residue); both patterns are matched against the
    one-hot vectors in ``dic_embedding_aa`` to recover the residue letters.

    Returns:
        ``(mutation_ids, last_batch_letters)``: the decoded strings for all
        samples, and the two-letter codes of the last batch only (kept because
        the original cell left that list in ``info_letters_*``).

    Raises:
        ValueError: if a sample's wild or mutant pattern matches no residue
            (previously an opaque ``None + str`` TypeError).
    """
    mutation_ids = []
    batch_letters = []

    for batch in dataloader:
        batch_letters = []

        for batch_i in range(len(batch['id'])):
            vector_w = [1 if i == 1 else 0 for i in batch['V'][batch_i][:]]
            vector_m = [1 if i == -1 else 0 for i in batch['V'][batch_i][:]]
            letter_w = next((k for k, v in dic_embedding_aa.items() if (v == vector_w).all()), None)
            letter_m = next((k for k, v in dic_embedding_aa.items() if (v == vector_m).all()), None)
            if letter_w is None or letter_m is None:
                raise ValueError(
                    f"Cannot decode wild/mutant residue for sample {batch['id'][batch_i]!r}"
                )
            batch_letters.append(letter_w + letter_m)

        mutation_ids.extend(
            sample_id + letters[0] + str(pos) + letters[-1]
            for sample_id, pos, letters in zip(batch['id'], batch['position'].tolist(), batch_letters)
        )

    return mutation_ids, batch_letters


# Same decoding for both splits (previously two copy-pasted loops).
info_mutation_test, info_letters_test = _decode_mutation_ids(dataloader_test)
info_mutation_train, info_letters_train = _decode_mutation_ids(dataloader_train)



In [None]:
len(info_mutation_train)

In [None]:
def _collect_predictions(dataloader):
    """Run ``DDG_model`` over ``dataloader`` without gradients.

    Returns:
        ``(predictions, labels)``: two lists with one entry per batch — the raw
        model output and the ``batch.y`` labels moved to ``device``.

    Relies on the notebook-level ``DDG_model``, ``device``, ``position_adj``,
    ``position_mut_adj``, ``edge_index_wild`` and ``edge_index_mut``.
    """
    predictions_per_batch = []
    labels_per_batch = []

    with torch.no_grad():  # Disable gradient calculation
        for batch in dataloader:
            graph_x = batch.x.float().to(device)            # node features
            graph_edge_index = batch.edge_index.to(device)
            labels = batch.y.float().to(device)
            position_wild = position_adj(batch.position, batch.ptr)
            position_mut = position_mut_adj(batch.position, batch.ptr)
            intervallo_wild = edge_index_wild(graph_x, batch.ptr)
            intervallo_mut = edge_index_mut(graph_x, batch.ptr)
            V = batch.V
            edge_weights = batch.edge_weight
            # NOTE(review): the labels are passed into the model's forward call —
            # confirm they are not used to produce the prediction.
            predictions = DDG_model(graph_x, graph_edge_index, labels, position_wild, position_mut, V,
                                    edge_weights=edge_weights,
                                    intervallo_wild=intervallo_wild,
                                    intervallo_mut=intervallo_mut)

            predictions_per_batch.append(predictions)
            labels_per_batch.append(labels)

    return predictions_per_batch, labels_per_batch


# Make sure the model is in evaluation mode before predicting.
DDG_model.eval()

# Same inference routine for both splits (previously two copy-pasted loops).
all_predictions_test, all_lables_test = _collect_predictions(dataloader_test)
all_predictions_train, all_lables_train = _collect_predictions(dataloader_train)


In [None]:
all_predictions_test = torch.cat([i[0] for i in all_predictions_test], dim=0)
all_predictions_train = torch.cat([i[0] for i in all_predictions_train], dim=0)

In [None]:
all_predictions_train_dir = all_predictions_train[:int(all_predictions_train.shape[0]/2)]
all_predictions_train_inv = all_predictions_train[int(all_predictions_train.shape[0]/2):]

In [None]:
all_predictions_test_dir = all_predictions_test[:int(all_predictions_test.shape[0]/2)]
all_predictions_test_inv = all_predictions_test[int(all_predictions_test.shape[0]/2):]

In [None]:
# Pearson correlation between the two halves of the test predictions.
# `pearsonr` is already imported in the notebook's first cell, so it is not
# re-imported here. The tensors are converted to NumPy arrays explicitly instead
# of relying on SciPy to coerce torch tensors.
pearsonr(
    all_predictions_test_dir.detach().cpu().numpy(),
    all_predictions_test_inv.detach().cpu().numpy(),
)

In [None]:
# Pearson correlation between the training predictions and the concatenated
# per-batch training labels.
# `pearsonr` is already imported in the notebook's first cell, so it is not
# re-imported here. The tensors are converted to NumPy arrays explicitly instead
# of relying on SciPy to coerce torch tensors.
pearsonr(
    all_predictions_train.detach().cpu().numpy(),
    torch.cat(all_lables_train, dim=0).detach().cpu().numpy(),
)

In [None]:
# Pair every mutation identifier (column 0) with its predicted value (column 1).
# The two columns are passed separately instead of being stacked into a single
# ndarray: stacking makes NumPy choose one common dtype for both rows, which
# turns the float predictions into strings when the identifiers are strings
# (as the later float() re-parsing of column 1 suggests they are).
# `list(...)` keeps positional pairing even if the identifiers carry their own index.
prediction_guido_test = pd.DataFrame({
    0: list(info_mutation_test),
    1: all_predictions_test.cpu().numpy(),
})
prediction_guido_train = pd.DataFrame({
    0: list(info_mutation_train),
    1: all_predictions_train.cpu().numpy(),
})

In [None]:
# Display the training frame with rows repeating a column-0 value removed.
# The result is only shown, not assigned: `prediction_guido_train` itself is unchanged.
prediction_guido_train.drop_duplicates(subset=0)

In [None]:
# Turn the prediction column (label 1, the second column) into a real float column.
# The column is replaced with `df[1] = ...` rather than `df.iloc[:, 1] = ...`:
# positional assignment writes into the existing column in place, so an
# object-dtype column would stay dtype=object even though it then holds floats.
# `astype(float)` parses each value like float(x) and still raises on bad input.
prediction_guido_train[1] = prediction_guido_train[1].astype(float)

In [None]:
# Disabled sign flip of the prediction column (kept commented out):
#prediction_guido_train.iloc[:,1] = -prediction_guido_train.iloc[:,1]
# Display the training prediction frame as the cell output.
prediction_guido_train

In [None]:
# Always raises AssertionError.
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue
# into the cells below — confirm, and consider removing it from the final notebook.
assert False

In [None]:
# Persist both prediction frames as pickle files in the current working directory.
for frame, pkl_path in (
    (prediction_guido_test, 'prediction_guido_test.pkl'),
    (prediction_guido_train, 'prediction_guido_train.pkl'),
):
    frame.to_pickle(pkl_path)


In [None]:
# Load a pickled test-prediction frame and display it as the cell output.
# NOTE(review): this reads from '../DeltaDeltaG/', while the save cell writes
# 'prediction_guido_test.pkl' to the current working directory — confirm both
# refer to the same file.
# Only unpickle files you trust: unpickling can execute arbitrary code.
pd.read_pickle('../DeltaDeltaG/prediction_guido_test.pkl')

In [None]:
# Save only the model's state_dict (not the whole module object) to disk.
ddg_model_state = DDG_model.state_dict()
torch.save(ddg_model_state, 'model_weights_pyhzia.pth')

In [None]:
# Always raises AssertionError.
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue
# into the exploratory cells below — confirm, and consider removing it from the
# final notebook.
assert False

In [None]:
# Number of entries in dataset_s2648; every 'ddg' field is parsed as float along
# the way, so a non-numeric value still raises here.
len(list(map(float, (entry['ddg'] for entry in dataset_s2648))))

In [None]:
# Display the concatenated test-prediction tensor as the cell output.
all_predictions_test

In [None]:
# Overlay the distributions of the dataset_s669 'ddg' values ("true") and the
# test predictions ("pred") in one plot.
ddg_true_s669 = [float(entry['ddg']) for entry in dataset_s669]
ddg_pred_s669 = all_predictions_test.tolist()
# Each hue label list is sized from the series it labels, so `x` and `hue` always
# have equal length even if the number of predictions differs from len(dataset_s669).
sns.displot(x=ddg_true_s669 + ddg_pred_s669,
            hue=["true"] * len(ddg_true_s669) + ["pred"] * len(ddg_pred_s669))

In [None]:
# Overlay the distributions of the dataset_s2648 'ddg' values ("true") and the
# training predictions from column 1 of prediction_guido_train ("pred").
ddg_true_s2648 = [float(entry['ddg']) for entry in dataset_s2648]
ddg_pred_s2648 = prediction_guido_train[1].tolist()
# Each hue label list is sized from the series it labels, so `x` and `hue` always
# have equal length even if the number of predictions differs from len(dataset_s2648).
sns.displot(x=ddg_true_s2648 + ddg_pred_s2648,
            hue=["true"] * len(ddg_true_s2648) + ["pred"] * len(ddg_pred_s2648))


In [None]:
# Number of rows in the training prediction column (label 1).
len(prediction_guido_train[1])

In [None]:
# Distribution of the 'ddg' values stored in dataset_s669.
ddg_values_s669 = [float(entry['ddg']) for entry in dataset_s669]
sns.displot(ddg_values_s669)

In [None]:
# Distribution of the test predictions (second column of prediction_guido_test),
# cast to float before plotting.
test_pred_values = prediction_guido_test.iloc[:, 1].to_numpy().astype(float)
sns.displot(test_pred_values)

In [None]:
# Scatter of predicted vs. reference 'ddg' for dataset_s669, saved to 'prova.pdf'.
plt.clf()  # clear the current figure before drawing
ddg_reference = [float(entry['ddg']) for entry in dataset_s669]
ddg_predicted = prediction_guido_test.iloc[:, 1].values.astype(float)
sns.scatterplot(x=ddg_reference, y=ddg_predicted)
# Second layer: reference values plotted against themselves (points on y = x).
sns.scatterplot(x=ddg_reference, y=ddg_reference)
plt.xlabel('True')
plt.ylabel('Pred')
plt.savefig('prova.pdf')

#plt.yscale('log')

In [None]:
# Pearson correlation between the test predictions (second column of
# prediction_guido_test) and the 'ddg' values of dataset_s669.
test_pred_as_float = prediction_guido_test.iloc[:, 1].values.astype(float)
ddg_reference_s669 = [float(entry['ddg']) for entry in dataset_s669]
pearsonr(test_pred_as_float, ddg_reference_s669)