# Generating Features

Here we use extended input features but also generate target data for C/O/N/Cbeta positions so that we can learn how to rebuild other atoms as well.

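Note that the output format of the generated file is slightly different: each line contains the residue name, followed by the target values and then the input features, which makes it easier to process later on.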

In [None]:
import os
import sys
import mdtraj as md
import numpy as np

# data set tag; it is interpolated into the name of the PDB/chain list file
# that is read and into the name of the feature file that is written
tag='longer'

# projection function used below
def project(vector,midpoint,xloc,yloc,zloc):
    """Express a point in a local coordinate frame.

    The point `vector` is first shifted so that `midpoint` becomes the
    origin; the shifted vector is then dotted with each of the three axis
    vectors `xloc`, `yloc` and `zloc`.

    Returns a plain Python list with the three components, so that results
    of several calls can be concatenated with `+`.
    """
    offset = vector - midpoint
    return [np.dot(offset, axis) for axis in (xloc, yloc, zloc)]

# list of "<pdb id> <chain id>" entries, one per line
pdblist_filename = f"../extractingpdbs/{tag}_clean_pdb_chain.txt"

# directory holding the prepared single-chain PDB files
chain_directory = "../extractingpdbs/chains"

# file name to write output to
output_filename = f"{tag}_local_i_aa_capm2opmnpmcpm.dat"

# atoms whose coordinates are collected for every residue
atom_names = ('CA', 'C', 'N', 'O', 'CB')

# Both files are opened in a `with` block so that they are closed even if
# processing one of the chains raises an exception.
with open(pdblist_filename) as pdblist, open(output_filename, "w") as output_file:
    for line in pdblist:
        items = line.split()
        if not items:
            # blank line (e.g. trailing newline at the end of the list)
            continue
        if len(items) < 2:
            print(f"malformed line {line.rstrip()!r} in {pdblist_filename}, skipping")
            continue
        pdb = items[0]
        chain = items[1]

        chain_filename = f"{chain_directory}/{pdb}_{chain}.pdb"

        # check whether we can find the prepared PDB file
        if not os.path.isfile(chain_filename):
            print(f"cannot find {chain_filename}, skipping")
            continue

        # loading PDB file using mdtraj
        # (kept in its own name so that the PDB id in `pdb` is not overwritten)
        structure = md.load_pdb(chain_filename)
        topology = structure.topology
        n_residues = topology.n_residues
        frame_xyz = structure.xyz[0]

        # Per-residue coordinates for each atom of interest. A residue that
        # lacks an atom keeps a zero row, so presence is tracked separately
        # per atom name instead of relying on a single counter.
        coords = {name: np.zeros((n_residues, 3)) for name in atom_names}
        present = {name: np.zeros(n_residues, dtype=bool) for name in atom_names}

        for atom in topology.atoms:
            if atom.name in coords:
                rinx = atom.residue.index
                coords[atom.name][rinx] = frame_xyz[atom.index]
                present[atom.name][rinx] = True

        caxyz = coords['CA']
        cxyz = coords['C']
        nxyz = coords['N']
        oxyz = coords['O']
        cbxyz = coords['CB']

        have_ca = present['CA']
        have_cb = present['CB']
        have_backbone = have_ca & present['C'] & present['N'] & present['O']

        # residues i-2 .. i+2 are needed, so the first and last two are skipped
        for i in range(2, n_residues - 2):
            aa = topology.residue(i).name

            # central residue: full backbone plus CB (CB is not required for GLY)
            if not (have_backbone[i] and (have_cb[i] or aa == 'GLY')):
                continue
            # direct neighbours contribute CA, C, N and O to the input features
            if not (have_backbone[i - 1] and have_backbone[i + 1]):
                continue
            # second neighbours contribute CA to the local frame and the features
            if not (have_ca[i - 2] and have_ca[i + 2]):
                continue

            ca_m1 = caxyz[i - 1]
            ca_m2 = caxyz[i - 2]
            ca_p1 = caxyz[i + 1]
            ca_p2 = caxyz[i + 2]

            # Local frame built from the four neighbouring CA atoms:
            #   origin m : midpoint of CA(i-1) and CA(i+1)
            #   x        : along CA(i-1) -> CA(i+1)
            #   z        : l1 x l2, with l2 the sum of the two outward CA-CA vectors
            #   y        : z x x, completing the orthogonal set
            m = 0.5 * (ca_m1 + ca_p1)
            l1 = ca_p1 - ca_m1
            l2 = (ca_m2 - ca_m1) + (ca_p2 - ca_p1)
            l3 = np.cross(l1, l2)
            ol2 = np.cross(l3, l1)

            l1_norm = np.linalg.norm(l1)
            ol2_norm = np.linalg.norm(ol2)
            l3_norm = np.linalg.norm(l3)

            # A degenerate frame (e.g. collinear or coincident CA atoms) would
            # divide by zero and write NaN values; skip such residues. The
            # comparison is written so that NaN norms are rejected as well.
            if not (l1_norm > 0 and ol2_norm > 0 and l3_norm > 0):
                continue

            xlocal = l1 / l1_norm
            ylocal = ol2 / ol2_norm
            zlocal = l3 / l3_norm

            # projection of residue coordinates onto local coordinates
            # these are the TARGETs that we want to predict
            # NOTE: for a GLY without CB the CB target is the projection of the
            # zero placeholder row; downstream code has to ignore it.
            ca_i_local = project(caxyz[i], m, xlocal, ylocal, zlocal)
            c_i_local = project(cxyz[i], m, xlocal, ylocal, zlocal)
            n_i_local = project(nxyz[i], m, xlocal, ylocal, zlocal)
            o_i_local = project(oxyz[i], m, xlocal, ylocal, zlocal)
            cb_i_local = project(cbxyz[i], m, xlocal, ylocal, zlocal)

            target_feature = ca_i_local + c_i_local + n_i_local + o_i_local + cb_i_local

            # projection of atoms from residues before and after onto local coordinates
            # these are the INPUT FEATURES that we use to make the prediction
            ca_m1_local = project(ca_m1, m, xlocal, ylocal, zlocal)
            ca_m2_local = project(ca_m2, m, xlocal, ylocal, zlocal)
            ca_p1_local = project(ca_p1, m, xlocal, ylocal, zlocal)
            ca_p2_local = project(ca_p2, m, xlocal, ylocal, zlocal)

            o_m1_local = project(oxyz[i - 1], m, xlocal, ylocal, zlocal)
            o_p1_local = project(oxyz[i + 1], m, xlocal, ylocal, zlocal)
            n_m1_local = project(nxyz[i - 1], m, xlocal, ylocal, zlocal)
            n_p1_local = project(nxyz[i + 1], m, xlocal, ylocal, zlocal)
            c_m1_local = project(cxyz[i - 1], m, xlocal, ylocal, zlocal)
            c_p1_local = project(cxyz[i + 1], m, xlocal, ylocal, zlocal)

            # could be others such as Cbetas, or closest 10 atoms in space
            input_feature = (
                ca_m1_local + ca_m2_local + ca_p1_local + ca_p2_local
                + o_m1_local + o_p1_local + n_m1_local + n_p1_local
                + c_m1_local + c_p1_local
            )

            # one line per residue: name, 15 target values, 30 input values
            print(aa, *target_feature, *input_feature, file=output_file)

        print(f"worked on {chain_filename}")
        