In [2]:
import pandas as pd

# Feature Vector Processing
This notebook is dedicated to processing the feature vector outputs from FEATURE.

FEATURE outputs are generally a count of nearby items within a specified shell. Therefore, you can sum the values over every atom in a residue to get the total microenvironment around that residue.
- Don't want to just use the C-alpha because that would lose important microenvironment information for large, bulky residues.

### Things that need to be done:

- [x] Create feature vector header parser (Need for 1 shell and extension to n shells)
- [ ] Collapse multiple atoms into a single residue value

FEATURE outputs are generally a count of nearby items within a specified shell. Therefore, you can sum the values over every atom in a residue to get the total microenvironment around that residue.
- Don't want to just use the C-alpha because that would lose important microenvironment information for large, bulky residues.

## Processing Functions and Storage

In [3]:
### Basic Column Headers ###
# Build the list of base feature names from 'feature-vector-comments.md'.
# Each line is split on whitespace; commas are stripped and tokens lower-cased,
# then the first 'properties' token is removed and the rest are collected.
column_headers = []
# `with` guarantees the file handle is closed even if a line fails to parse.
with open('feature-vector-comments.md') as fp:
    for line in fp:
        header_item = [item.replace(',','').lower() for item in line.split()]
        if not header_item:
            # Blank line: nothing to collect and no 'properties' token to strip.
            continue
        # Still raises ValueError on a non-blank line without 'properties',
        # so a malformed header file is not silently accepted.
        header_item.remove('properties')
        column_headers.extend(header_item)

### Valid Residues ###
# Lower-case three-letter residue codes, stored as a set for membership tests.
valid_resid = set(
    'ala arg asn asp cys gln glu gly his ile '
    'leu lys met phe pro ser thr trp tyr val'.split()
)

## Extend column headers for n shells?
def extend_columns(n, headers=None):
    """Repeat the base feature names once per shell, tagging each with its shell number.

    Parameters
    ----------
    n : int
        Number of shells. Values <= 0 give an empty list.
    headers : iterable of str, optional
        Base feature names to repeat. Defaults to the module-level
        ``column_headers``, so existing ``extend_columns(n)`` calls behave
        exactly as before.

    Returns
    -------
    list of str
        Shell-major list: every name suffixed ``_1``, then every name
        suffixed ``_2``, and so on up to ``_n``.
    """
    if headers is None:
        # Resolved at call time; the global is only needed on this default path.
        headers = column_headers
    # Materialize once so a one-shot iterator can be reused for every shell.
    headers = list(headers)
    return [
        header + '_' + str(shell)
        for shell in range(1, n + 1)
        for header in headers
    ]

def gen_headers(n):
    """Build the full column list for an n-shell feature table.

    The result is 'env_name', followed by the shell-tagged names returned by
    extend_columns(n), followed by 'residue'.
    """
    return ['env_name', *extend_columns(n), 'residue']

def drop_coordinates(df):
    """Remove, in place, the five columns that sit just before the last column.

    Columns are chosen by position rather than by label, so this works both on
    a frame that still has default integer labels (read with ``header=None``)
    and on one whose columns have already been named.
    NOTE(review): presumably these five columns are the coordinate block of
    the FEATURE output -- confirm against the file format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least six columns. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``df`` has fewer than six columns, since there is then no
        five-column block before the last column to remove.
    """
    num_cols = len(df.columns)
    if num_cols < 6:
        raise ValueError(
            'drop_coordinates needs at least 6 columns, got %d' % num_cols
        )
    # Positions num_cols-6 .. num_cols-2 inclusive; the final column is kept.
    cols_to_delete = df.columns[num_cols - 6:num_cols - 1]
    df.drop(columns=cols_to_delete, inplace=True)

## Extract PDB from environment description column
def get_pdb(row):
    """Return the known PDB ID that appears in a row's environment name.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'env_name'`` (e.g. a DataFrame row) whose
        value is a string.

    Returns
    -------
    str or None
        The first ID from the known list that occurs as a substring of
        ``row['env_name']``, or None when none of them occurs.
    """
    env_name = row['env_name']
    # Checked in this order; the first match wins.
    known_pdb_ids = ('1btl', '1nd4', '6dc6', '1tp3',
                     '1ifw', '1jmq', '2xjx', '2qmt')
    for pdb_id in known_pdb_ids:
        if pdb_id in env_name:
            # Return exactly the ID that matched. The '1ifw' case used to
            # return '1if2', a label that never occurs in the environment name.
            return pdb_id
    return None

## Extract Residue Number from res column
def get_resnum(row):
    """Return the integer that follows the first three characters of row['res']."""
    res_label = row['res']
    number_part = res_label[3:]
    return int(number_part)

## Extract residue id from res column
def get_resid(row):
    """Return the first three characters of row['res'], lower-cased."""
    three_letter = row['res'][:3]
    return three_letter.lower()

## Clean atom string
def clean_atom(row):
    """Return row['atom'] with its first two characters removed (e.g. 'A@CA' -> 'CA')."""
    atom_field = row['atom']
    return atom_field[2:]

## Single Shell (pdb-shell1.ff)

In [5]:
## Basic Processing
# Read the tab-separated file without a header row, let drop_coordinates trim
# the frame in place, then label the remaining columns for a single shell.
shell1_path = 'data/feature-files/pdb-shell1.ff'
shell1 = pd.read_csv(shell1_path, header=None, sep='\t')
drop_coordinates(shell1)
shell1.columns = gen_headers(1)

# Generate better descriptive columns
# Each helper takes a row, so it is passed to apply directly.
shell1['pdb'] = shell1.apply(get_pdb, axis=1)
res_and_atom = shell1['residue'].str.split(":", expand=True)
shell1[['res', 'atom']] = res_and_atom
shell1['resnum'] = shell1.apply(get_resnum, axis=1)
shell1['resid'] = shell1.apply(get_resid, axis=1)
# Must run after 'atom' exists: clean_atom reads and then overwrites it.
shell1['atom'] = shell1.apply(clean_atom, axis=1)

Unnamed: 0,env_name,atom_type_is_c_1,atom_type_is_ct_1,atom_type_is_ca_1,atom_type_is_n_1,atom_type_is_n2_1,atom_type_is_n3_1,atom_type_is_na_1,atom_type_is_o_1,atom_type_is_o2_1,...,secondary_structure2_is_beta_1,secondary_structure2_is_coil_1,secondary_structure2_is_het_1,secondary_structure2_is_unknown_1,residue,pdb,res,atom,resnum,resid
0,Env_1btl_0,2,3,1,2,0,0,0,1,1,...,3,6,0,0,HIS26:A@N,1btl,HIS26,N,26,his
1,Env_1btl_1,1,4,2,2,0,0,1,1,1,...,1,8,0,0,HIS26:A@CA,1btl,HIS26,CA,26,his
2,Env_1btl_2,2,6,1,4,0,0,1,1,1,...,1,7,0,0,HIS26:A@C,1btl,HIS26,C,26,his
3,Env_1btl_3,3,6,1,5,0,0,0,2,1,...,1,6,0,0,HIS26:A@O,1btl,HIS26,O,26,his
4,Env_1btl_4,1,4,3,2,0,0,2,1,0,...,1,10,0,0,HIS26:A@CB,1btl,HIS26,CB,26,his


In [8]:
# Filtering and Aggregating
# Sum the per-atom rows into one row per (pdb, resnum, resid). numeric_only=True
# keeps the string columns (env_name, residue, res, atom) out of the sum
# regardless of pandas version.
shell1_agg = (
    shell1
    .groupby(['pdb', 'resnum', 'resid'])
    .sum(numeric_only=True)
    .reset_index()
)
# Build the mask from shell1_agg itself. A mask taken from the per-atom `shell1`
# has a different length and index, so it would select the wrong residue rows.
shell1_filt = shell1_agg[shell1_agg['resid'].isin(valid_resid)]

In [11]:
# Show the column labels of the aggregated, filtered frame.
shell1_filt.columns

Index(['pdb', 'resnum', 'resid', 'atom_type_is_c_1', 'atom_type_is_ct_1',
       'atom_type_is_ca_1', 'atom_type_is_n_1', 'atom_type_is_n2_1',
       'atom_type_is_n3_1', 'atom_type_is_na_1', 'atom_type_is_o_1',
       'atom_type_is_o2_1', 'atom_type_is_oh_1', 'atom_type_is_s_1',
       'atom_type_is_sh_1', 'atom_type_is_other_1', 'partial_charge_1',
       'element_is_any_1', 'element_is_c_1', 'element_is_n_1',
       'element_is_o_1', 'element_is_s_1', 'element_is_other_1', 'hydroxyl_1',
       'amide_1', 'amine_1', 'carbonyl_1', 'ring_system_1', 'peptide_1',
       'vdw_volume_1', 'charge_1', 'neg_charge_1', 'pos_charge_1',
       'charge_with_his_1', 'hydrophobicity_1', 'mobility_1',
       'solvent_accessibility_1', 'residue_name_is_ala_1',
       'residue_name_is_arg_1', 'residue_name_is_asn_1',
       'residue_name_is_asp_1', 'residue_name_is_cys_1',
       'residue_name_is_gln_1', 'residue_name_is_glu_1',
       'residue_name_is_gly_1', 'residue_name_is_his_1',
       'residue_

## 4 Shell (Default) (pdb-shell4.ff)

In [27]:
# Read the multi-shell file (tab-separated, no header row) and let
# drop_coordinates trim the frame in place.
shell4_path = 'data/feature-files/pdb-shell4.ff'
shell4 = pd.read_csv(shell4_path, header=None, sep='\t')
drop_coordinates(shell4)

[481, 482, 483, 484, 485]
