# Расчёт признаков по последовательности  

Используя описание белковых признаков из оригинальной работы Bastion6 и код проекта pypredt6.py составить модуль для языка Python, который рассчитывает частотные (статистические) и физико-химические признаки. Разрешается использовать другие готовые решения для расчёта нужных признаков:

I. Sequence-based features: a) AAC, b) DPC, c) QSO 

II. Physicochemical features: a) seven types of physicochemical properties b) CTDT is a representation of the frequency with which a type A residue is followed by a type B residue

В качестве ответа приложите ссылку на репозиторий и пример расчёта каждого признака (выгрузить в tab или CSV файл) для наборов положительных и отрицательных примеров https://bastion6.erc.monash.edu/static/download/T6SE_training_data.zip

Для удобного считывания последовательностей можно использовать BioPython  https://biopython.org/wiki/SeqIO


## Ход работы:
1. Отредактировал функцию featureextraction из pypredt6.py, чтобы она выделяла только нужные фичи: AAC, DPC, seven types of physicochemical properties.
2. Добавил к ним признаки, которых не хватает: QSO, CTDT. Для этого исопользовал библиотеку propy.
3. Сохранил все в csv файле.

In [1]:
import sys
print(sys.version)

3.9.23 | packaged by conda-forge | (main, Jun  4 2025, 17:57:12) 
[GCC 13.3.0]


In [8]:
import sys
sys.path.append('./.local/lib/python3.9/site-packages')

In [3]:
import pandas as pd
from propy import PyPro

/home/jovyan/.conda/envs/bio/lib/python3.9/site-packages/propy/__init__.py


  import pkg_resources


In [28]:
def featureextraction(peptide_file_name, total):
    import csv
    import requests
    import webbrowser
    from selenium import webdriver
    import re
    import time
    import numpy as np

    feature = [[0 for x in range(432)] for y in range(total)]

    #amino acid feature set
    aa=['A','R','N','D','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V']
    aac_columns = [f"AAC_{aa}" for aa in ['A','R','N','D','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V']]
    dpc_columns = [f"DPC_{a1}{a2}" for a1 in aa for a2 in aa]
    physico_columns = [
        "Physicochemical_Hydrophobicity",
        "Physicochemical_VanDerWaals",
        "Physicochemical_Polarity",
        "Physicochemical_Polarizability",
        "Physicochemical_Charge",
        "SecondaryStructure_AlphaHelix",
        "SecondaryStructure_BetaSheet",
        "SecondaryStructure_Coil",
        "SolventAccessibility_Buried",
        "SolventAccessibility_Exposed",
        "SolventAccessibility_Intermediate",
        "SolventAccessibility_Intermediate2",
    ]
    feature_columns = aac_columns + dpc_columns + physico_columns
    

    #dipeptide initialization
    dipeptide=list()
    for i in range(len(aa)):
        for j in range(len(aa)):
            t=''
            t=aa[i]+aa[j]
            dipeptide.append(t)
            j=j+1
    #print(dipeptide)
    id=open(peptide_file_name,"r")
    line=id.readline()
    line=id.readline()
    str=''
    count=0
    line_number=0
    print('Извлечение AAC, DPC, physicochemical features')
    while line:
        if '>' not in line:
            str=str+line
        if '>' in line:
            #print(str)
            # Признак AAC 0-19
            for i in range(len(aa)):
                feature[line_number][i]=round(str.count(aa[i])/len(str),4)

            # Признак DPC 20 - 419
            for i in range(len(dipeptide)):
                feature[line_number][i+20]=round(str.count(dipeptide[i])/len(str),4)

            # Признаки physicochemical prroperties 420-424: Hydrophobicity, Normalized van der Waals volume, Polarity, Polarizability, Charge
            # Hydrophobicity ('A','I','L','M','F','W','V')
            feature[line_number][0+420]=round((str.count('A')+ str.count('I')+str.count('L')+str.count('M')+str.count('F')+str.count('W')+str.count('V'))/len(str),4)
    
            # VanDerWaals ('G','A','S','T','P','D','C')
            feature[line_number][1+420]=round((str.count('G')+ str.count('A')+str.count('S')+str.count('T')+str.count('P')+str.count('D')+str.count('C'))/len(str),4)

            # Polarity ('S','T','N','Q','Y','C')
            feature[line_number][2+420]=round((str.count('S')+ str.count('T')+str.count('N')+str.count('Q')+str.count('Y')+str.count('C'))/len(str),4)

            # Polarizability ('F','Y','W','H','K','R')
            feature[line_number][3+420]=round((str.count('F')+ str.count('Y')+str.count('W')+str.count('H')+str.count('K')+str.count('R'))/len(str),4)
            
            # Charge ('D','E','K','R','H')
            feature[line_number][4+420]=round((str.count('D')+ str.count('E')+str.count('K')+str.count('H')+str.count('R'))/len(str),4)
            str=''

            line_number=line_number+1
        line=id.readline()
    id.close()
    # Признак AAC 0-19 для последней поседовательности
    for i in range(len(aa)):
                feature[line_number][i]=round(str.count(aa[i])/len(str),4)
    # Признак DPC 20 - 419 для последней поседовательности
    for i in range(len(dipeptide)):
                feature[line_number][i+20]=round(str.count(dipeptide[i])/len(str),4)

    #Признаки physicochemical prroperties 420-424 для последней поседовательности

    # Hydrophobicity ('A','I','L','M','F','W','V')
    feature[line_number][0+420]=round((str.count('A')+ str.count('I')+str.count('L')+str.count('M')+str.count('F')+str.count('W')+str.count('V'))/len(str),4)

    # VanDerWaals ('G','A','S','T','P','D','C')
    feature[line_number][1+420]=round((str.count('G')+ str.count('A')+str.count('S')+str.count('T')+str.count('P')+str.count('D')+str.count('C'))/len(str),4)

    # Polarity ('S','T','N','Q','Y','C')
    feature[line_number][2+420]=round((str.count('S')+ str.count('T')+str.count('N')+str.count('Q')+str.count('Y')+str.count('C'))/len(str),4)

    # Polarizability ('F','Y','W','H','K','R')
    feature[line_number][3+420]=round((str.count('F')+ str.count('Y')+str.count('W')+str.count('H')+str.count('K')+str.count('R'))/len(str),4)
    
    # Charge ('D','E','K','R','H')
    feature[line_number][4+420]=round((str.count('D')+ str.count('E')+str.count('K')+str.count('H')+str.count('R'))/len(str),4)


    print('Извлечение AAC, DPC, physicochemical features завершено!')
    
    # Признаки physicochemical prroperties 425-431: Secondary Structure, Solvent Accessibility 
    print("Извлечение secondary structure and solvent accessibility features...")
    id=open(peptide_file_name,"r")
    line=id.readline()
    str=''
    str=str+line
    line=id.readline()
    count=0
    line_number=0
    while line:
        if '>' not in line:
            str=str+line
        if '>' in line:
            count=count+1

            structure=''
            solvent=''

            feature[line_number][425]=round((str.count('E')+str.count('A')+str.count('L')+str.count('M')+str.count('Q')+str.count('K')+str.count('R')+str.count('H'))/len(str),4)
            feature[line_number][426]=round((str.count('V')+str.count('I')+str.count('Y')+str.count('C')+str.count('W')+str.count('F')+str.count('T'))/len(str),4)
            feature[line_number][427]=round((str.count('G')+str.count('N')+str.count('P')+str.count('S')+str.count('D'))/len(str),4)


            feature[line_number][428]=round((str.count('A')+str.count('L')+str.count('F')+str.count('C')+str.count('G')+str.count('I')+str.count('V')+str.count('W'))/len(str),4)
            feature[line_number][429]=round((str.count('R')+str.count('K')+str.count('Q')+str.count('E')+str.count('N')+str.count('D'))/len(str),4)
            feature[line_number][430]=round((str.count('M')+str.count('S')+str.count('P')+str.count('T')+str.count('H')+str.count('Y'))/len(str),4)
            feature[line_number][431]=round((str.count('M')+str.count('S')+str.count('P')+str.count('T')+str.count('H')+str.count('Y'))/len(str),4)

            line_number=line_number+1
            str=''
            str=str+line
        line=id.readline()

    # Признаки physicochemical prroperties 425-431 для последней последовательности
    structure=''
    solvent=''


    #secondary
    feature[line_number][425]=round((str.count('E')+str.count('A')+str.count('L')+str.count('M')+str.count('Q')+str.count('K')+str.count('R')+str.count('H'))/len(str),4)
    feature[line_number][426]=round((str.count('V')+str.count('I')+str.count('Y')+str.count('C')+str.count('W')+str.count('F')+str.count('T'))/len(str),4)
    feature[line_number][427]=round((str.count('G')+str.count('N')+str.count('P')+str.count('S')+str.count('D'))/len(str),4)

    #solvent
    feature[line_number][428]=round((str.count('Q')+str.count('E')+str.count('D'))/len(str),4)
    feature[line_number][429]=round((str.count('R')+str.count('K')+str.count('N')+str.count('G'))/len(str),4)
    feature[line_number][430]=round((str.count('A')+str.count('L')+str.count('F')+str.count('C')+str.count('I')+str.count('V'))/len(str),4)
    feature[line_number][431]=round((str.count('W')+str.count('M')+str.count('S')+str.count('P')+str.count('T')+str.count('H')+str.count('Y'))/len(str),4)

    id.close()

    print('Извлечение secondary structure and solvent accessibility features завершено!')

    # Извлечение QSO и CTDT 432-679
    print('Извлечение QSO (100 признаков)и CTDT (147 признаков)')
    qso_features = []
    ctd_features = []
    
    id=open(peptide_file_name,"r")
    line=id.readline()
    line=id.readline()
    
    str=''
    str=str+line
    line=id.readline()
    count=0
    line_number=0
    while line:
        if '>' not in line:
            str=str+line
        if '>' in line:
            count=count+1
            seq_clean = ''.join([a for a in str if a in "ARNDCEQGHILKMFPSTWYV"])
            if len(seq_clean) >= 3:
                # print(seq_clean)
                protein = PyPro.GetProDes(seq_clean)
                qso = protein.GetQSO()
                ctd = protein.GetCTD()
                qso_features.append(qso)
                ctd_features.append(ctd)
            else:
                qso_features.append({})
                ctd_features.append({})
            line_number=line_number+1
            str=''
            # str=str+line
        line=id.readline()
    
    # Извлечение QSO и CTDT 432-679 для последней последовательности
    seq_clean = ''.join([a for a in str if a in "ARNDCEQGHILKMFPSTWYV"])
    
    if len(seq_clean) >= 3:
        protein = PyPro.GetProDes(seq_clean)
        qso = protein.GetQSO()
        ctd = protein.GetCTD()
        qso_features.append(qso)
        ctd_features.append(ctd)
    else:
        qso_features.append({})
        ctd_features.append({})
    id.close()

    features_df = pd.DataFrame(feature, columns=feature_columns)
    qso_df = pd.DataFrame(qso_features).fillna(0)
    ctd_df = pd.DataFrame(ctd_features).fillna(0)
    
    df = pd.concat([features_df, qso_df, ctd_df], axis=1)
    df.to_csv("features_final.csv", index=False)
    print('Извлечение QSO и CTDT features завершено!')
    return df

In [29]:
peptide_file_name = "T6SE_training_data/T6SE_Training_Neg_1112.fasta"
total = 0
with open(peptide_file_name) as f:
 for line in f:
    finded = line.find('>')

    if finded == 0:
        total =total+ 1


print('Total number of sequences to be classified: ',total)

feature_df = featureextraction(peptide_file_name, total)

feature_df.head()

Total number of sequences to be classified:  1112
Извлечение AAC, DPC, physicochemical features
Извлечение AAC, DPC, physicochemical features завершено!
Извлечение secondary structure and solvent accessibility features...
Извлечение secondary structure and solvent accessibility features завершено!
Извлечение QSO (100 признаков)и CTDT (147 признаков)
Извлечение QSO и CTDT features завершено!


Unnamed: 0,AAC_A,AAC_R,AAC_N,AAC_D,AAC_C,AAC_E,AAC_Q,AAC_G,AAC_H,AAC_I,...,_HydrophobicityD2001,_HydrophobicityD2025,_HydrophobicityD2050,_HydrophobicityD2075,_HydrophobicityD2100,_HydrophobicityD3001,_HydrophobicityD3025,_HydrophobicityD3050,_HydrophobicityD3075,_HydrophobicityD3100
0,0.0698,0.0814,0.0698,0.0465,0.0174,0.064,0.0,0.1047,0.0116,0.0465,...,1.17,21.053,38.012,64.912,99.415,0.585,25.146,51.462,74.269,98.83
1,0.0532,0.0532,0.038,0.0951,0.0494,0.038,0.0266,0.0456,0.0114,0.0684,...,1.908,28.244,51.527,70.229,95.42,0.382,24.427,48.092,75.573,100.0
2,0.0833,0.05,0.0389,0.0667,0.0222,0.0778,0.0278,0.0944,0.0167,0.0333,...,1.676,26.816,50.838,74.302,99.441,0.559,12.849,40.223,68.715,100.0
3,0.1081,0.0486,0.0243,0.0541,0.0189,0.0676,0.0135,0.1054,0.0081,0.0649,...,0.813,24.39,49.051,72.087,98.374,0.271,20.596,49.322,78.32,99.187
4,0.1157,0.0463,0.0509,0.0625,0.0093,0.0602,0.0231,0.0833,0.0116,0.0602,...,0.464,25.058,49.188,71.462,99.304,0.232,24.362,46.636,74.014,100.0
