# Generating Features

Here we generate extended input features that also include the C/O/N atoms from the previous and next residues.

In [None]:
import os
import mdtraj as md
import numpy as np

# data set tag; it is interpolated into the PDB list file name and the output file name below
tag='longer'

# projection function used below
def project(vector,midpoint,xloc,yloc,zloc):
    """Express `vector` in the local frame centred at `midpoint`.

    The offset `vector - midpoint` is dotted with each of the three local
    axes in turn.

    Returns:
        A Python list `[x, y, z]` holding the three components. It is a list
        (not an array) so that results can be concatenated with `+`.
    """
    offset = vector - midpoint
    return [np.dot(offset, axis) for axis in (xloc, yloc, zloc)]

# list of "<pdb id> <chain>" entries to process, one per line
pdblist_filename = f"../extractingpdbs/{tag}_clean_pdb_chain.txt"

# directory holding the prepared per-chain PDB files
chain_directory = "../extractingpdbs/chains"

# file name to write output to
target_feature_filename = f"{tag}_local_cai_aa_capm2opmnpmcpm.dat"


def _backbone_xyz(traj, atom_name):
    """Select all atoms called `atom_name` and return their frame-0 coordinates.

    Returns:
        (indices, xyz): the atom indices from the topology selection and an
        (n, 3) float64 array with one coordinate row per selected atom.
    """
    indices = traj.topology.select(f"name {atom_name}")
    # mdtraj coordinates are typically float32; promote to float64 so the
    # vector algebra and the printed values are computed in double precision
    return indices, traj.xyz[0][indices].astype(np.float64)


# Each output row is: 3 target coordinates (CA_i in the local frame),
# the residue name of CA_i, then 30 input feature values (10 atoms x 3).
with open(pdblist_filename) as pdblist, \
        open(target_feature_filename, "w") as target_feature:
    for line in pdblist:
        items = line.split()
        if len(items) < 2:
            # blank lines are ignored; anything else too short is reported
            if items:
                print(f"skipping malformed line: {line.strip()}")
            continue
        pdb_id, chain = items[0], items[1]

        chain_filename = f"{chain_directory}/{pdb_id}_{chain}.pdb"

        # check whether we can find the prepared PDB file
        if not os.path.isfile(chain_filename):
            print(f"cannot find {chain_filename}, skipping")
            continue

        # loading PDB file using mdtraj
        traj = md.load_pdb(chain_filename)

        # coordinates of the C-alpha, O, N and C atoms
        calist, caxyz = _backbone_xyz(traj, "CA")
        _, oxyz = _backbone_xyz(traj, "O")
        _, nxyz = _backbone_xyz(traj, "N")
        _, cxyz = _backbone_xyz(traj, "C")

        # the four arrays are indexed by the same residue counter below,
        # so they must all have the same length
        n_res = len(caxyz)
        if not (len(oxyz) == n_res and len(nxyz) == n_res and len(cxyz) == n_res):
            print(f"skipping {chain_filename} because of missing atoms")
            continue

        # go through all residues (excluding the first two and last two residues)
        for i in range(2, n_res - 2):
            ca_i = caxyz[i]
            ca_m1 = caxyz[i - 1]
            ca_m2 = caxyz[i - 2]
            ca_p1 = caxyz[i + 1]
            ca_p2 = caxyz[i + 2]

            # origin of the local frame: midpoint between CA(i-1) and CA(i+1)
            m = 0.5 * (ca_m1 + ca_p1)

            # local coordinate system:
            #   x along CA(i-1) -> CA(i+1)
            #   z perpendicular to x and to the summed outward CA-CA vectors
            #   y completes the orthogonal frame (z cross x)
            l1 = ca_p1 - ca_m1
            l2 = (ca_m2 - ca_m1) + (ca_p2 - ca_p1)
            l3 = np.cross(l1, l2)
            ol2 = np.cross(l3, l1)

            xlocal = l1 / np.linalg.norm(l1)
            ylocal = ol2 / np.linalg.norm(ol2)
            zlocal = l3 / np.linalg.norm(l3)

            # projection of Ca_i onto local coordinates
            # this is the TARGET that we want to predict
            ca_i_local = project(ca_i, m, xlocal, ylocal, zlocal)

            # reject targets with any component outside [-1, 1]
            # (all three components x, y and z are checked)
            if any(abs(component) > 1 for component in ca_i_local):
                print(f"coordinates out of range {chain_filename} residue {i}")
                print(*ca_i_local)
                print("\n")
                continue

            # projection of neighbouring atoms onto local coordinates
            # these are the INPUT FEATURES that we use to make the prediction
            # order: CA(i-1), CA(i-2), CA(i+1), CA(i+2),
            #        O(i-1), O(i+1), N(i-1), N(i+1), C(i-1), C(i+1)
            neighbours = (
                ca_m1, ca_m2, ca_p1, ca_p2,
                oxyz[i - 1], oxyz[i + 1],
                nxyz[i - 1], nxyz[i + 1],
                cxyz[i - 1], cxyz[i + 1],
            )
            input_feature = []
            for atom_xyz in neighbours:
                input_feature.extend(project(atom_xyz, m, xlocal, ylocal, zlocal))

            # amino acid type of Ca_i
            aa = traj.topology.atom(calist[i]).residue.name

            print(*ca_i_local, aa, *input_feature, file=target_feature)

        print(f"worked on {chain_filename}")
        