# FEATURE Vector Processing
This notebook is dedicated to processing FEATURE vector data into a single, cleaned dataset with the necessary overlap to merge with the original dataset (PDB accession number and residue number).

FEATURE outputs are generally a count of nearby items within a specified shell. Therefore, you can sum the vectors of all atoms in a residue to get the total microenvironment around that residue.
- Don't want to just use the C-alpha because that would lose important microenvironment information for large, bulky residues.

In [2]:
import pandas as pd

## Helpful Processing Functions and Data

In [27]:
### Basic Column Headers ###
# Build the base (per-shell) column names from the FEATURE comments file.
# Each non-blank line is split on whitespace; every token is lowercased and
# stripped of commas, and the 'properties' label token on that line is removed.
column_headers = []
# The context manager closes the file even if parsing raises part-way through
# (e.g. ValueError from .remove() when a line has no 'properties' token).
with open('feature-vector-comments.md') as fp:
    for line in fp:
        header_item = [item.replace(',', '').lower() for item in line.split()]
        if not header_item:
            # Blank line (such as a trailing newline at end of file): it
            # carries no headers, so skip it instead of failing in .remove().
            continue
        header_item.remove('properties')
        column_headers.extend(header_item)

### Valid Residues ###
# Lowercase three-letter residue codes that are kept when filtering;
# rows whose residue ID is not in this set are discarded later on.
valid_resid = set((
    'ala', 'arg', 'asn', 'asp', 'cys',
    'gln', 'glu', 'gly', 'his', 'ile',
    'leu', 'lys', 'met', 'phe', 'pro',
    'ser', 'thr', 'trp', 'tyr', 'val',
))

### Mapping Protein -> PDB ###
# Protein identifier -> PDB accession number.
# Insertion order is preserved; the processing loop iterates over .values().
prot_pdb_dict = dict([
    ('TEM-1', '1xpb'),
    ('Kka2', '1nd4'),
    ('Uba1', '3cmm'),
    ('PSD95pdz3', '1be9'),
    ('Pab1', '1cvj'),
    ('Yap65', '1jmq'),
    ('hsp90', '2cg9'),
    ('gb1', '1pga'),
])

### Helper Function Definitions

def extend_columns(n):
    """ Repeats the base column headers once per shell, tagging each with its shell number

    Reads the module-level ``column_headers`` list. For every shell number
    from 1 to ``n`` (inclusive) each base header gets the suffix
    ``_<shell>``; all headers of shell 1 come first, then shell 2, and so on.

    Returns a new list with ``n * len(column_headers)`` strings.
    """

    return [
        base_name + '_' + str(shell)
        for shell in range(1, n + 1)
        for base_name in column_headers
    ]

def gen_headers(n):
    """ Builds the full list of column names for a FEATURE vector DataFrame

    The result is ``'env_name'``, followed by the shell-tagged headers
    produced by ``extend_columns(n)``, followed by ``'res_atom'``.
    """

    shell_headers = extend_columns(n)
    return ['env_name'] + shell_headers + ['res_atom']

def drop_coordinates(df):
    """ Drops coordinate/position columns from FEATURE vector data

    Removes, in place, the five columns that sit immediately before the last
    column of ``df`` (positions ``ncols - 6`` through ``ncols - 2``); the last
    column itself is kept. Returns ``None``.

    Columns are chosen by position rather than by integer label, so the
    function behaves the same whether the frame still has the default
    0..N-1 labels from ``header=None`` or already carries named columns.

    Raises ``KeyError`` if ``df`` has fewer than six columns, since there is
    then no complete five-column span to remove.
    """

    num_cols = len(df.columns)
    if num_cols < 6:
        # A negative start would wrap around when slicing and silently drop
        # the wrong columns, so fail loudly instead.
        raise KeyError(
            'expected at least 6 columns to drop coordinates, got {}'.format(num_cols)
        )
    start = num_cols - 6
    stop = num_cols - 1
    # Translate the positional span into whatever labels those columns carry.
    cols_to_delete = df.columns[start:stop]
    df.drop(cols_to_delete, axis=1, inplace=True)

def prot_to_pdb(row):
    """ Looks up the PDB accession number for a row's protein identifier

    Reads ``row['protein']`` and returns the matching value from the
    module-level ``prot_pdb_dict``. An identifier that is not a key of that
    dict raises ``KeyError``.
    """

    protein_name = row['protein']
    return prot_pdb_dict[protein_name]

def get_resnum(row):
    """ Pulls the residue number out of a row's ``res`` field

    Everything after the first three characters of ``row['res']`` is
    converted with ``int``, so the field is assumed to be a three-character
    residue code followed directly by the number (e.g. ``'ALA123'``).
    """

    res_label = row['res']
    number_part = res_label[3:]
    return int(number_part)

def get_resid(row):
    """ Pulls the residue ID out of a row's ``res`` field

    Returns the first three characters of ``row['res']`` in lowercase
    (e.g. ``'ALA123'`` -> ``'ala'``).
    """

    res_label = row['res']
    code = res_label[:3]
    return code.lower()

## Clean atom string
def clean_atom(row):
    """ Strips the leading two characters from a row's atom description

    Returns ``row['atom']`` without its first two characters.
    """

    atom_label = row['atom']
    return atom_label[2:]

## Initial Feature Vector Processing
- Reading in from .ff file
- Extracting PDB, residue, and atom information
- Cleaning PDB, residue, and atom information (select specific chains)

In [57]:
# Number of shells per FEATURE vector; sets how many times the base column
# headers are repeated by gen_headers().
num_shells = 6

# Per-PDB residue-level tables, keyed by PDB accession number.
feature_list = dict()

for pdb in prot_pdb_dict.values():
    #DEBUG: Print current PDB
    print(pdb)

    # Basic Processing: read the tab-separated, header-less .ff file and
    # remove the coordinate columns so the remaining columns line up with
    # the generated header list.
    feature_path = 'data/feature-files/{}.ff'.format(pdb)
    feature_data = pd.read_csv(feature_path, sep='\t', header=None)
    drop_coordinates(feature_data)

    feature_data.columns = gen_headers(num_shells)

    ## Parse Residue/Atom Information ##
    # res_atom is split on ':' into the residue label and the remainder,
    # and the remainder is split on '@' into chain and atom.
    feature_data[['res', 'atom_temp']] = feature_data.res_atom.str.split(':', expand=True)
    feature_data[['chain', 'atom']] = feature_data.atom_temp.str.split('@', expand=True)
    feature_data['resnum'] = feature_data.apply(get_resnum, axis=1)
    feature_data['resid'] = feature_data.apply(get_resid, axis=1)
    feature_data = feature_data.drop(['res_atom', 'res', 'atom_temp', 'env_name'], axis=1)

    ## Reparse PDB Information ##
    feature_data['pdb'] = pdb

    ## Filtering ##
    # Non-AA residues
    feature_data = feature_data[feature_data.resid.isin(valid_resid)]

    # Extra Chains
    feature_data = feature_data[feature_data.chain == 'A']

    # Aggregate atom microenvironments into residue microenvironments.
    # numeric_only=True keeps the string columns (chain, atom, resid) out of
    # the sum. Older pandas dropped them silently, but pandas >= 2.0 would
    # concatenate the strings and keep them as extra columns.
    feature_data = (
        feature_data
        .groupby(['pdb', 'resnum'])
        .sum(numeric_only=True)
        .reset_index()
    )
    feature_list[pdb] = feature_data

# Generate Final Feature Dataset
# NOTE: concatenating a dict uses its keys (the PDB ids) as an outer index
# level, and to_csv writes that index as the leading columns of the file.
# This layout is kept as-is so existing readers of the CSV are unaffected.
feature_merged = pd.concat(feature_list)
feature_merged.to_csv('data/feature-files/feature_merged.csv')

1xpb
1nd4
3cmm
1be9
1cvj
1jmq
2cg9
1pga


In [59]:
feature_merged.shape

(2514, 482)