In [1]:
from Bio.PDB import PDBParser
import numpy as np
import pandas as pd

In [2]:
# Helper functions and variables used by the cells below
def calc_distance(coord1, coord2):
    """Return the Euclidean distance between two coordinate arrays.

    Args:
        coord1: NumPy array holding the first point.
        coord2: NumPy array holding the second point (same shape as ``coord1``).

    Returns:
        The norm of ``coord1 - coord2``.
    """
    displacement = coord1 - coord2
    return np.linalg.norm(displacement)

def calc_angle(coord1, coord2, coord3):
    """Return the angle, in degrees, at ``coord2`` between ``coord1`` and ``coord3``.

    Args:
        coord1: NumPy array with the position of the first outer point.
        coord2: NumPy array with the position of the vertex.
        coord3: NumPy array with the position of the second outer point.

    Returns:
        The angle in degrees, in the range [0, 180]. The result is NaN when
        either outer point coincides with the vertex, because the direction
        of a zero-length vector is undefined.
    """
    vector1 = coord1 - coord2
    vector2 = coord3 - coord2
    cos_angle = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    # Floating-point rounding can push the cosine of (anti)parallel vectors
    # marginally outside [-1, 1]; arccos would then return NaN, so clamp first.
    cos_angle = np.clip(cos_angle, -1.0, 1.0)
    return np.degrees(np.arccos(cos_angle))

def is_contact(atom, ligand_atoms, cutoff_distance=5.0):
    """Tell whether ``atom`` lies within ``cutoff_distance`` of any ligand atom.

    Args:
        atom: Object exposing a ``coord`` array.
        ligand_atoms: Iterable of objects exposing a ``coord`` array.
        cutoff_distance: Largest distance (inclusive) that counts as a contact.

    Returns:
        True as soon as one ligand atom is close enough, otherwise False
        (including when ``ligand_atoms`` is empty).
    """
    return any(
        calc_distance(atom.coord, other.coord) <= cutoff_distance
        for other in ligand_atoms
    )

In [8]:

# Position of each interaction feature inside a residue's feature vector.
feature_map = {
    "contact": 0,
    "backbone": 1,
    "sidechain": 2,
    "polar": 3,
    "hydrophobic": 4,
    "acceptor": 5,
    "donor": 6,
    "aromatic": 7,
    "charged": 8,
}

# Residue classes, keyed by three-letter residue name.
polar_residues = {"ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "HIS", "LYS", "SER", "THR", "TYR"}
hydrophobic_residues = {"ALA", "ILE", "LEU", "MET", "PHE", "PRO", "TRP", "VAL"}
aromatic_residues = {"PHE", "TYR", "TRP", "HIS"}
# Residues with an ionisable side chain. ASN, GLN and CYS are polar but
# uncharged, so they must not be flagged as "charged".
charged_residues = {"ARG", "ASP", "GLU", "HIS", "LYS"}

# Load the PDB files
protein_pdb_file = "4Z43.pdb"
ligand_pdb_file = "Structure1.pdb"

parser = PDBParser()
protein_structure = parser.get_structure("protein", protein_pdb_file)
ligand_structure = parser.get_structure("ligand", ligand_pdb_file)

# Extract the protein and ligand atoms
protein_atoms = list(protein_structure.get_atoms())
ligand_atoms = list(ligand_structure.get_atoms())

# Hydrogen-bond criteria
hbond_max_distance = 2.5
hbond_donor_min_angle = 120
hbond_acceptor_min_angle = 90
# Atom pairs at or below this separation are rejected by the hydrogen-bond test.
hbond_min_separation = 1.0

# Atom names treated as backbone; every other atom counts as side chain.
backbone_atom_names = {"N", "CA", "C", "O"}


def _residue_features(residue, ligand_atoms):
    """Return the set of ``feature_map`` keys that apply to one residue.

    Args:
        residue: Bio.PDB residue whose atoms are examined.
        ligand_atoms: Sequence of ligand atoms (each exposing ``coord`` and ``name``).

    Returns:
        A set of feature names:
        * "contact" plus "backbone"/"sidechain" for every residue atom that
          ``is_contact`` reports as close to the ligand;
        * "polar", "hydrophobic", "aromatic", "charged" from the residue name;
        * "donor"/"acceptor" from the hydrogen-bond heuristic below.
    """
    features = set()
    residue_atoms = list(residue.get_atoms())

    # Contact features
    for atom in residue_atoms:
        if is_contact(atom, ligand_atoms):
            features.add("contact")
            features.add("backbone" if atom.name in backbone_atom_names else "sidechain")

    # Residue-class features
    if residue.resname in polar_residues:
        features.add("polar")
    if residue.resname in hydrophobic_residues:
        features.add("hydrophobic")
    if residue.resname in aromatic_residues:
        features.add("aromatic")
    if residue.resname in charged_residues:
        features.add("charged")

    # Atoms used by the hydrogen-bond check: the last atom named "H" and the
    # last atom named "O" or "N" found in the residue.
    # NOTE(review): PDB files without explicit hydrogens contain no atom named
    # "H", in which case "donor"/"acceptor" are never set -- confirm the input
    # structure is protonated.
    donor = None
    acceptor = None
    for atom in residue_atoms:
        if atom.name == "H":
            donor = atom
        elif atom.name in ("O", "N"):
            acceptor = atom

    if donor is None or acceptor is None:
        return features

    # This separation does not depend on the ligand atom, so test it once.
    if not calc_distance(donor.coord, acceptor.coord) > hbond_min_separation:
        return features

    # Donor/acceptor decision
    for ligand_atom in ligand_atoms:
        # NOTE(review): this matches atoms literally named "O" or "N"; ligand
        # files that number their atoms ("O1", "N2", ...) will never match.
        if ligand_atom.name not in ("O", "N"):
            continue

        distance = calc_distance(donor.coord, ligand_atom.coord)
        acceptor_ligand_distance = calc_distance(acceptor.coord, ligand_atom.coord)
        if not (distance <= hbond_max_distance and acceptor_ligand_distance > hbond_min_separation):
            continue

        # Angles are only needed once the distance criteria are satisfied.
        donor_acceptor_angle = calc_angle(donor.coord, acceptor.coord, ligand_atom.coord)
        acceptor_donor_angle = calc_angle(acceptor.coord, ligand_atom.coord, donor.coord)
        if donor_acceptor_angle >= hbond_donor_min_angle:
            features.add("donor")
        if acceptor_donor_angle >= hbond_acceptor_min_angle:
            features.add("acceptor")
        # Only the first ligand atom that passes the distance test is evaluated.
        break

    return features


# Feature vector for every standard residue (blank hetero flag in the Bio.PDB
# residue id), in get_residues() order.
one_hot_vectors = []
for residue in protein_structure.get_residues():
    if residue.id[0] != " ":
        continue

    one_hot_vector = [0] * len(feature_map)
    for feature in _residue_features(residue, ligand_atoms):
        one_hot_vector[feature_map[feature]] = 1
    one_hot_vectors.append(one_hot_vector)

# Print the results (debug output, disabled)
#for residue, one_hot_vector in zip(protein_structure.get_residues(), one_hot_vectors):
#    if residue.id[0] == " ":
#        print(f"Residue: {residue.resname} {residue.id[1]}\nOne-hot vector: {one_hot_vector}\n")

# Residues with no ligand contact are zeroed out entirely; the remaining
# vectors are kept as-is and everything is flattened into one 1-D array.
contact_index = feature_map["contact"]
result = np.array(
    [vec if vec[contact_index] != 0 else [0] * len(vec) for vec in one_hot_vectors]
).flatten()



In [9]:
# Build the per-residue feature DataFrame.
#
# The preceding cell already stored one feature vector per standard residue in
# `one_hot_vectors` (same residue filter, same get_residues() order), so those
# vectors are reused here instead of repeating the whole feature computation.
standard_residues = [
    residue for residue in protein_structure.get_residues() if residue.id[0] == " "
]
if len(standard_residues) != len(one_hot_vectors):
    raise ValueError(
        f"one_hot_vectors has {len(one_hot_vectors)} entries but the protein has "
        f"{len(standard_residues)} standard residues; re-run the feature cell first."
    )

# One single-row frame per residue, with columns named "<residue number>_<feature>".
residue_frames = [
    pd.DataFrame([vector], columns=[f"{residue.id[1]}_{k}" for k in feature_map.keys()])
    for residue, vector in zip(standard_residues, one_hot_vectors)
]

# Concatenate once rather than growing the frame inside the loop (which copies
# the whole frame on every iteration). Because every residue contributes its
# own columns, each row is NaN outside that residue's columns.
# NOTE(review): the cells shown after this one only read df.columns.
df = pd.concat(residue_frames, axis=0, ignore_index=True) if residue_frames else pd.DataFrame()

In [11]:
# Prefix every column name with "A" (presumably the chain ID -- TODO confirm).
# The freshly built names start with the residue number, so a name that already
# starts with "A" has been prefixed by an earlier run of this cell; skipping it
# keeps the cell idempotent instead of producing "AA..." names on re-run.
new_columns = [
    column if column.startswith('A') else 'A' + column
    for column in df.columns
]

# Rename the DataFrame columns
df.columns = new_columns
columns_list = df.columns.tolist()

# Fail with a clear message when the flattened vector and the column list
# disagree (for example when two residues share a residue number).
if result.size != len(columns_list):
    raise ValueError(
        f"result has {result.size} values but there are {len(columns_list)} column names"
    )

# Single-row table: one column per (residue, feature) pair.
DATA_CSV = pd.DataFrame(result.reshape(1, -1), columns=columns_list)

In [12]:
# Display the one-row feature table.
DATA_CSV

Unnamed: 0,A2_contact,A2_backbone,A2_sidechain,A2_polar,A2_hydrophobic,A2_acceptor,A2_donor,A2_aromatic,A2_charged,A3_contact,...,A516_charged,A517_contact,A517_backbone,A517_sidechain,A517_polar,A517_hydrophobic,A517_acceptor,A517_donor,A517_aromatic,A517_charged
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Write the feature table to a CSV file; index=False leaves out the row index.
DATA_CSV.to_csv('Structure1.csv', index=False)