# Filtering Peptide Samples by Biophysical Properties

### This notebook describes additional ranking of the sampled sequences based on their biophysical properties.

### Enhancing Sequence Selection

- This ranking is useful for streamlining the selection of sequences. For instance, it can prioritise specific 
samples for experimental validation, ensuring that those with the most desirable biophysical characteristics are tested first.

## imports

In [None]:
import time
from pathlib import Path
from typing import Tuple, List, Optional, Dict
import pandas as pd
import numpy as np
import warnings


from peptide_models.aminoacids import AMINOACIDS
from peptide_models.peptide import Peptide
from peptide_models.utils_data import fasta2df, get_peptides, pep2fasta, peptides2df, save_frame
from peptide_models.utils_models import get_predictions
from peptide_models.optimize_main import select_sequences

warnings.filterwarnings("ignore")

## Utilities

In [None]:
def get_average_properties(peptides_list: List["Peptide"]) -> Dict:
    """
    Computes the mean and standard deviation of the biophysical
    properties over a group of peptides.

    The returned keys and the peptide attribute each one summarises:
    'M' (m_ext_coefficient), 'pI' (pi), 'A' (aromaticity), 'G' (gravy),
    'II' (instability_index) and 'MW' (molecular_weight).

    :param peptides_list: list of peptide instances
    :return: dictionary mapping each property key to a (mean, std) tuple
    """
    # Each key is paired with its attribute explicitly. Zipping a separate
    # list of names against a separate list of values let the two drift out
    # of order, which swapped the 'M' and 'MW' statistics.
    attribute_by_property = {'M': 'm_ext_coefficient',
                             'pI': 'pi',
                             'A': 'aromaticity',
                             'G': 'gravy',
                             'II': 'instability_index',
                             'MW': 'molecular_weight'}
    properties = {}
    for name, attribute in attribute_by_property.items():
        values = np.asarray([getattr(pep, attribute) for pep in peptides_list])
        properties[name] = (values.mean(), values.std())
    return properties

def rank_peptides(data_frame: pd.DataFrame, 
                  properties_dict: Dict) -> pd.DataFrame:
    """
    Sorts peptides based on the number of features 
    whose values are within one standard deviation 
    of the mean calculated for the
    corresponding group of training set sequences.

    The input frame is left untouched; a ranked copy is returned.

    :param data_frame: frame with one row per peptide and the numeric
        columns 'M', 'pI', 'A', 'G', 'II' and 'MW'
    :param properties_dict: maps each of those column names to a
        (mean, std) pair
    :return: copy of the frame in which every property column is replaced
        by a boolean "within mean +/- std" flag, with an added 'rank'
        column counting the flags that are True, sorted by descending
        rank and re-indexed from 1
    """
    properties = ['M', 'pI', 'A', 'G', 'II', 'MW']
    # Work on a copy: overwriting the caller's columns with booleans would
    # make a second call on the same frame compare True/False against the
    # numeric interval and silently produce wrong ranks.
    ranked = data_frame.copy()
    for prop in properties:
        m, std = properties_dict[prop][0], properties_dict[prop][1]
        # Inclusive on both ends, i.e. m - std <= x <= m + std.
        ranked[prop] = ranked[prop].between(m - std, m + std)
    ranked['rank'] = ranked[properties].sum(axis=1)
    ranked = ranked.sort_values(by=['rank'], ascending=False)
    ranked.index = range(1, len(ranked) + 1)
    return ranked

## Select from the training set relevant subsets of peptides based on their potencies

### Load training data 

In [None]:
# Excel workbook with the training data; its 'dataset' and 'alignment'
# sheets are read from this path.
training_data_path = Path('../data/training_data.xlsx')

In [None]:
# Both sheets live in the same workbook and are read with the same layout
# options: first column as index, first row as header.
workbook = str(training_data_path)
read_options = dict(index_col=0, header=0, skiprows=0)
dataset = pd.read_excel(workbook, sheet_name='dataset', **read_options)
msa = pd.read_excel(workbook, sheet_name='alignment', **read_options)

# Build one Peptide per dataset row. Rows of the two sheets are paired by
# position, so the idx-th alignment row supplies the sequence for the
# idx-th dataset record.
training_peptides = []
for idx in range(len(dataset)):
    pep_record = dataset.iloc[idx]
    aligned_record = msa.iloc[idx]
    training_peptides.append(
        Peptide(alias=pep_record.alias,
                ec_50A=pep_record.EC50_LOG_T1,
                ec_50B=pep_record.EC50_LOG_T2,
                name=idx,
                sequence=aligned_record.sequence,
                c_term=True)
    )


### Peptide selection

In [None]:
# Split the training peptides into the three activity groups.
# NOTE(review): low_th and high_th are both -11; presumably thresholds on the
# log EC50 values - confirm against select_sequences.
activity_groups = select_sequences(training_peptides, low_th=-11, high_th=-11)
GCGR_selective, GLP1R_selective, bothR_high_potency = activity_groups

### Assess the mean biophysical properties of sequences in the training set, categorised by each activity group.

In [None]:
# Property statistics of each training-set activity group, computed in the
# order GCGR-selective, GLP1R-selective, high potency at both receptors.
(properties_dict_GCGR_selective,
 properties_dict_GLP1R_selective,
 properties_dict_bothR_high_potency) = (
    get_average_properties(group)
    for group in (GCGR_selective, GLP1R_selective, bothR_high_potency)
)

In [None]:
# Display the property statistics computed for the bothR_high_potency group.
properties_dict_bothR_high_potency

## Sort samples based on the alignment of biophysical properties 

### Load samples

### a) First generation of samples

In [None]:
# Directory with the first-generation (gen_0) sample workbooks; the ranked
# output is written back into this directory as well.
samples_path = Path('../results/ligand_design/samples/predictions/gen_0')

In [None]:
# Load the sample workbooks of this generation, assigning each one to its
# data frame according to the file name.
for f_path in sorted(samples_path.iterdir()):
    # Skip entries that are not input workbooks: sub-directories and other
    # file types, hidden files and Excel lock files, and the
    # 'ranked_sequences' workbook that this notebook itself writes into the
    # same directory. Previously any such entry fell into the catch-all
    # branch below and was read as sample data.
    if (f_path.suffix.lower() != '.xlsx'
            or f_path.name.startswith(('.', '~$'))
            or f_path.stem == 'ranked_sequences'):
        continue
    print('file:', str(f_path))
    # Path.stem yields the file name without its extension regardless of
    # the platform's path separator.
    name_df = f_path.stem
    samples_frame = pd.read_excel(str(f_path),
                                  index_col=0,
                                  header=0,
                                  skiprows=0)
    if name_df == 'high_potency_at_both':
        data_high_potency_at_both = samples_frame
    elif name_df == 'selective_towards_GCGR':
        data_selective_towards_GCGR = samples_frame
    else:
        # Any remaining workbook is taken to hold the GLP1R-selective samples.
        data_selective_towards_GLP1R = samples_frame

In [None]:
# Display the loaded 'high_potency_at_both' samples.
data_high_potency_at_both

In [None]:
# Rank every sample group against the property statistics of the matching
# training-set activity group.
data_high_potency_at_both_ranked = rank_peptides(
    data_frame=data_high_potency_at_both,
    properties_dict=properties_dict_bothR_high_potency,
)
data_selective_towards_GLP1R_ranked = rank_peptides(
    data_frame=data_selective_towards_GLP1R,
    properties_dict=properties_dict_GLP1R_selective,
)
data_selective_towards_GCGR_ranked = rank_peptides(
    data_frame=data_selective_towards_GCGR,
    properties_dict=properties_dict_GCGR_selective,
)

### Save ranked sequences

In [None]:
output_path = samples_path
name_file = 'ranked_sequences'

# One sheet per sample group, written in this order into a single workbook.
ranked_sheets = {
    'high_potency_at_both': data_high_potency_at_both_ranked,
    'selective_towards_GLP1R': data_selective_towards_GLP1R_ranked,
    'selective_towards_GCGR': data_selective_towards_GCGR_ranked,
}
output_file = Path(output_path, name_file).with_suffix('.xlsx')
with pd.ExcelWriter(str(output_file)) as writer:
    for sheet_name, ranked_frame in ranked_sheets.items():
        ranked_frame.to_excel(writer, sheet_name=sheet_name,
                              index=True, float_format='%.4f')

### b) Third generation of samples

In [None]:
# Directory with the third-generation (gen_2) sample workbooks; the ranked
# output is written back into this directory as well.
samples_path = Path('../results/ligand_design/samples/predictions/gen_2')

In [None]:
# Load the sample workbooks of this generation, assigning each one to its
# data frame according to the file name.
for f_path in sorted(samples_path.iterdir()):
    # Skip entries that are not input workbooks: sub-directories and other
    # file types, hidden files and Excel lock files, and the
    # 'ranked_sequences' workbook that this notebook itself writes into the
    # same directory. Previously any such entry fell into the catch-all
    # branch below and was read as sample data.
    if (f_path.suffix.lower() != '.xlsx'
            or f_path.name.startswith(('.', '~$'))
            or f_path.stem == 'ranked_sequences'):
        continue
    print('file:', str(f_path))
    # Path.stem yields the file name without its extension regardless of
    # the platform's path separator.
    name_df = f_path.stem
    samples_frame = pd.read_excel(str(f_path),
                                  index_col=0,
                                  header=0,
                                  skiprows=0)
    if name_df == 'high_potency_at_both':
        data_high_potency_at_both = samples_frame
    elif name_df == 'selective_towards_GCGR':
        data_selective_towards_GCGR = samples_frame
    else:
        # Any remaining workbook is taken to hold the GLP1R-selective samples.
        data_selective_towards_GLP1R = samples_frame

In [None]:
# Rank every sample group against the property statistics of the matching
# training-set activity group.
data_high_potency_at_both_ranked = rank_peptides(
    data_frame=data_high_potency_at_both,
    properties_dict=properties_dict_bothR_high_potency,
)
data_selective_towards_GLP1R_ranked = rank_peptides(
    data_frame=data_selective_towards_GLP1R,
    properties_dict=properties_dict_GLP1R_selective,
)
data_selective_towards_GCGR_ranked = rank_peptides(
    data_frame=data_selective_towards_GCGR,
    properties_dict=properties_dict_GCGR_selective,
)

In [None]:
# Display the ranked 'high_potency_at_both' samples.
data_high_potency_at_both_ranked

### Save ranked sequences

In [None]:
output_path = samples_path

name_file = 'ranked_sequences'

# One sheet per sample group, written in this order into a single workbook.
ranked_sheets = {
    'high_potency_at_both': data_high_potency_at_both_ranked,
    'selective_towards_GLP1R': data_selective_towards_GLP1R_ranked,
    'selective_towards_GCGR': data_selective_towards_GCGR_ranked,
}
output_file = Path(output_path, name_file).with_suffix('.xlsx')
with pd.ExcelWriter(str(output_file)) as writer:
    for sheet_name, ranked_frame in ranked_sheets.items():
        ranked_frame.to_excel(writer, sheet_name=sheet_name,
                              index=True, float_format='%.4f')