# Data Merging and Final Processing

This notebook contains code to prepare and clean the original dataset for machine learning applications (one-hot encoding, removal of missing or incorrect values, etc.) and to merge it with the FEATURE vector data to produce the final combined dataset.

In [1]:
import pandas as pd

## Helper Functions and Data Storage

In [2]:
# Lookup table: protein name (value of the `protein` column) -> PDB accession
prot_pdb_dict = {
    'TEM-1': '1xpb',
    'Kka2': '1nd4',
    'Uba1': '3cmm',
    'PSD95pdz3': '1be9',
    'Pab1': '1cvj',
    'Yap65': '1jmq',
    'hsp90': '2cg9',
    'gb1': '1pga',
}


def prot_to_pdb(row):
    """Return the PDB accession for the protein named in ``row['protein']``.

    ``row`` is anything indexable by the key ``'protein'`` (for example a
    DataFrame row). Raises ``KeyError`` when the protein name has no entry in
    ``prot_pdb_dict``.
    """
    protein_name = row['protein']
    return prot_pdb_dict[protein_name]

## Main Dataset Cleaning

In [3]:
# Load the raw table from disk
main_data_path = 'data/mmc2.csv'
main_data = pd.read_csv(main_data_path)

# Keep only rows whose 'predicted?' flag is 'NO', i.e. discard the "predicted"
# variant effects from the original dataset
main_data = main_data.loc[main_data['predicted?'] == 'NO']

# Discard the identifier columns, plus the flag column that was just filtered on
main_data = main_data.drop(
    columns=['variant_id', 'position_id', 'dms_id', 'uniprot', 'predicted?']
)

# Derive the PDB accession of every row from its protein name, then rename
# 'position' to 'resnum'
main_data['pdb'] = main_data.apply(prot_to_pdb, axis=1)
main_data = main_data.rename(columns={'position': 'resnum'})


In [4]:
# Remove columns that are redundant or have too much missing data
main_data = main_data.drop(
    columns=['local_density', 'local_biochem', 'predictions',
             'selection_coefficient', 'local_conservation']
)

# Fix data errors: per-column replacement values for missing entries.
# NOTE(review): the constants (e.g. 1.4 and 0.04 for the MSA congruency columns)
# are carried over unchanged; their origin is not documented in this notebook.
fill_values = {
    'accessibility': 0,
    'dssp_sec_str': 'O',
    'phi_psi_reg': 'O',
    'wt_mut': 'NA',
    'evolutionary_coupling_avg': 0,
    'mut_msa_congruency': 1.4,
    'mut_mut_msa_congruency': 0.04,
    'seq_ind_closest_mut': 0,
    'b_factor': 0,
    'delta_solvent_accessibility': 0,
}

# A frame-level ``fillna`` silently skips unknown keys, so fail loudly instead
# (selecting a missing column by name would have raised ``KeyError`` as well).
missing_cols = [col for col in fill_values if col not in main_data.columns]
if missing_cols:
    raise KeyError(f"Columns expected for imputation are missing: {missing_cols}")

# The results are assigned back explicitly rather than calling
# ``main_data[col].fillna(..., inplace=True)``: that chained in-place form acts on
# a temporary Series and is a no-op under pandas Copy-on-Write, in which case the
# ``dropna`` below would discard every row that was meant to be imputed.
main_data = main_data.assign(
    dssp_sec_str=main_data['dssp_sec_str'].replace('.', 'O')
)
main_data = main_data.fillna(value=fill_values)

# Drop every row that still has a missing value in any column. This also covers
# rows without a 'grantham' score, so no separate subset pass is needed.
main_data = main_data.dropna()

In [5]:
# Sanity check after cleaning: the proteins still present, then the frame's shape
print(main_data['protein'].unique())
main_data.shape

['TEM-1' 'Kka2' 'Uba1' 'PSD95pdz3' 'Pab1' 'Yap65' 'hsp90' 'gb1']


(52922, 31)

After filtering and cleaning data, all proteins were retained, leaving 52,922 samples to learn from.

## Main Dataset Encoding
After cleaning, the categorical columns need to be one-hot encoded.

In [6]:
# One-hot encode the categorical columns. ``dtype`` is pinned so the indicator
# columns are written as 0/1 regardless of pandas version; pandas >= 2.0 would
# otherwise default to bool, which is serialised as True/False in the CSV.
main_encode = pd.get_dummies(
    main_data,
    columns=['aa1', 'aa2', 'wt_mut', 'aa1_polarity', 'aa2_polarity',
             'dssp_sec_str', 'phi_psi_reg'],
    dtype='uint8',
)
main_encode.to_csv('data/mmc2_updated.csv')

# Kept as the last expression so the encoded shape is actually displayed
main_encode.shape

## Merging with FEATURE Vectors
Next, the encoded dataset is merged with the FEATURE vector data on the `pdb` and `resnum` columns.

In [7]:
# Load the FEATURE vectors and join them onto the encoded table by structure id
# and residue number. The default inner join keeps only rows that have a match
# on both sides.
# Alternative kept for reference: passing how='left' retains every main_encode
# row (original note: "Add on left to duplicate microenvironments").
feature_data = pd.read_csv('data/feature-files/feature_merged.csv')
merged_data = main_encode.merge(feature_data, on=['pdb', 'resnum'])

# Discard the two unnamed columns, persist the result, and report its shape
merged_data = merged_data.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
merged_data.to_csv('data/merged.csv')
merged_data.shape

(49581, 971)

In [9]:
# Structures that still have rows after the merge
merged_data['pdb'].unique()

array(['1xpb', '1nd4', '3cmm', '1be9', '1cvj', '2cg9'], dtype=object)