In [1]:
import os

import numpy as np
import pandas as pd

def standard_scale(x):
    """Standardize every column of ``x`` to zero mean and unit deviation.

    Each column is replaced by ``(column - mean) / std``, where ``std`` is the
    pandas default (sample) standard deviation.

    A column whose standard deviation is zero or undefined (a constant column,
    or one with fewer than two non-missing values) cannot be rescaled. Such a
    column is only centered, so its non-missing values become 0 rather than
    NaN produced by a 0/0 division.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are all numeric. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, returned for convenience.
        Existing callers may rely on either the return value or the
        in-place modification.
    """
    for col in x.columns:
        centered = x[col] - x[col].mean()
        data_stdev = x[col].std()

        # Guard against dividing by a zero / NaN deviation, which would wipe
        # the whole column out to NaN.
        if pd.isna(data_stdev) or data_stdev == 0:
            x[col] = centered
        else:
            x[col] = centered / data_stdev

    return x

In [2]:
# Reads every feature table of the dataset, one CSV per table.
# The names on the left are paired positionally with the paths below.

(
    dezso_features,
    go_components,
    go_functions,
    go_processes,
    gdpc,
    paac,
    fpocket,
) = map(pd.read_csv, (
    "raw_data/dezso_features.csv",
    "raw_data/go_components_10-14-22.csv",
    "raw_data/go_functions_10-14-22.csv",
    "raw_data/go_processes_10-14-22.csv",
    "raw_data/gdpc_10-14-22.csv",
    "raw_data/paac_10-14-22.csv",
    "raw_data/fpocket_output.csv",
))


# Reads the list of proteins with their sequences and labels

protein_list = pd.read_csv(filepath_or_buffer="raw_data/all_proteins.csv")

In [3]:
# Columns of the Dezso table that are treated as categorical
categorical_cols = ['Enzyme Classification', 'Localization', 'Essentiality']

dezso_names = dezso_features['Protein']


# One-hot encodes the categorical columns (values are cast to str first)

features_categorical = pd.get_dummies(dezso_features[categorical_cols].astype(str))


# Everything that is neither the protein name nor categorical is numeric.
# The two binary string-labelled columns are mapped to 1 / 0, then every
# numeric column is standardized.

features_numeric = standard_scale(
    dezso_features
    .drop(columns=['Protein', *categorical_cols])
    .replace({'Signal Peptide': {'Y': 1, 'N': 0}})
    .replace({'PEST region': {'Potential': 1, 'Poor': 0}})
)


# Re-assembles the feature matrix: protein name, one-hot columns, numeric columns

n_features = features_categorical.shape[1] + features_numeric.shape[1]

dezso_processed = pd.concat(
    [dezso_names, features_categorical, features_numeric],
    axis='columns',
)

In [4]:
# Turns the long (Protein, go_term) table into a wide 0/1 indicator matrix
# with one row per protein and one column per GO term.
go_components_processed = (
    go_components
    .assign(label=1)
    .pivot(index='Protein', columns='go_term', values='label')
    .fillna(0)
)

# Keeps only the terms whose indicator column sums to at least 10, then
# moves 'Protein' back from the index into a regular column.
frequent_terms = go_components_processed.sum() >= 10

go_components_processed = go_components_processed.loc[:, frequent_terms].reset_index()

In [5]:
# Turns the long (Protein, go_term) table into a wide 0/1 indicator matrix
# with one row per protein and one column per GO term.
go_functions_processed = (
    go_functions
    .assign(label=1)
    .pivot(index='Protein', columns='go_term', values='label')
    .fillna(0)
)

# Keeps only the terms whose indicator column sums to at least 10, then
# moves 'Protein' back from the index into a regular column.
frequent_terms = go_functions_processed.sum() >= 10

go_functions_processed = go_functions_processed.loc[:, frequent_terms].reset_index()

In [6]:
# Turns the long (Protein, go_term) table into a wide 0/1 indicator matrix
# with one row per protein and one column per GO term.
go_processes_processed = (
    go_processes
    .assign(label=1)
    .pivot(index='Protein', columns='go_term', values='label')
    .fillna(0)
)

# Keeps only the terms whose indicator column sums to at least 10, then
# moves 'Protein' back from the index into a regular column.
frequent_terms = go_processes_processed.sum() >= 10

go_processes_processed = go_processes_processed.loc[:, frequent_terms].reset_index()

In [7]:
# The protein identifiers are set aside so that only the remaining columns
# are standardized.
paac_names = paac['Protein']

# Bind the frame returned by standard_scale explicitly, rather than relying
# on the function having modified its argument in place.
paac_numeric = standard_scale(paac.drop('Protein', axis=1))

# Identifier column followed by the standardized feature columns
paac_processed = pd.concat([paac_names, paac_numeric], axis=1)

In [8]:
# The protein identifiers are set aside so that only the remaining columns
# are standardized.
fpocket_names = fpocket['Protein']

# Bind the frame returned by standard_scale explicitly, rather than relying
# on the function having modified its argument in place.
fpocket_numeric = standard_scale(fpocket.drop('Protein', axis=1))

# Identifier column followed by the standardized feature columns
fpocket_processed = pd.concat([fpocket_names, fpocket_numeric], axis=1)

In [9]:
# Collects the column labels of each processed table as plain Python lists.
# Note: paac_names and fpocket_names are rebound here; before this point
# they held the 'Protein' columns of their tables.

go_components_names = go_components_processed.columns.tolist()
go_functions_names = go_functions_processed.columns.tolist()
go_processes_names = go_processes_processed.columns.tolist()
gdpc_names = gdpc.columns.tolist()
paac_names = paac_processed.columns.tolist()
fpocket_names = fpocket_processed.columns.tolist()

# Feature names assigned to the "sequence and structure" group: a fixed list
# of column names (presumably all from the Dezso table - TODO confirm),
# followed by every column of the PAAC, GDPC and fpocket tables.
seq_and_struc_names = [
    'Molecularweight', 'Residues', 'AvResWeight', 'Charge', 'Isoelectric',
    'A280_molar_ext', 'A280_molar_ext_cyst',
    'A280_ext_coeff', 'A280_ext_coeff_cyst',
    'Prob',
    'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile',
    'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val',
    'Acidic', 'Aliphatic', 'Aromatic', 'Basic', 'Charged',
    'Non.polar', 'Polar', 'Small', 'Tiny',
    'Glyc_N', 'Glyc_O',
    'ph_TYR', 'ph_THR', 'ph_SER',
    'Transmembrane Helices', 'Signal Peptide', 'PEST region',
    'Secondary Structure (Helices)', 'Secondary Structure (Betasheet)',
    'Secondary Structure (turn)', 'Secondary Structure (coil)',
    'Solvent Accessibility',
    *paac_names,
    *gdpc_names,
    *fpocket_names,
]

# Feature names assigned to the "localization" group: a fixed list followed
# by every column of the processed GO components table. The 'Localization_*'
# entries have the form of one-hot column names for the 'Localization'
# column (presumably matching the values present in the data - TODO confirm).
localization_names = [
    'Tissue Specificity',
    'Localization_Chloroplast', 'Localization_Cytoplasmic',
    'Localization_Cytoskeletal', 'Localization_ER',
    'Localization_Extracellular', 'Localization_Golgi',
    'Localization_Lysosomal', 'Localization_Mitochondrial',
    'Localization_Nuclear', 'Localization_Peroxisomal',
    'Localization_PlasmaMembrane', 'Localization_Vacuole',
    *go_components_names,
]

# Feature names assigned to the "biological function" group: a fixed list
# followed by every column of the processed GO functions and GO processes
# tables. The 'Enzyme Classification_*' and 'Essentiality_*' entries have
# the form of one-hot column names for those two columns (presumably
# matching the values present in the data - TODO confirm).
bio_func_names = [
    'Enzyme Classification_Hydrolases', 'Enzyme Classification_Isomerases',
    'Enzyme Classification_Ligases', 'Enzyme Classification_Lyases',
    'Enzyme Classification_Not-Enzyme', 'Enzyme Classification_Oxireductases',
    'Enzyme Classification_Transferases', 'Enzyme Classification_Translocases',
    'Essentiality_Essential', 'Essentiality_Non-Essential', 'Essentiality_UNK',
    'Biological Process (1)', 'Biological Process (2)', 'Biological Process (3)',
    'Molecular Function (1)', 'Molecular Function (2)', 'Molecular Function (3)',
    *go_functions_names,
    *go_processes_names,
]

# Feature names assigned to the "network information" group, stored as a
# single-column frame whose column is named after the group.
network_info_names = pd.DataFrame(
    [
        'Maps (1)', 'Maps (2)', 'Maps (3)',
        'Degree', 'Closeness', 'Betweeness', 'EigenCent', 'PageRank',
    ],
    columns=['network_info_names'],
)

def _names_frame(group, names):
    """Return a one-column frame named ``group`` holding ``names`` minus 'Protein'."""
    kept = [name for name in names if name != 'Protein']
    return pd.DataFrame(data={group: kept})


# Each group list is replaced by a single-column frame; the 'Protein' key
# column picked up from the per-table column lists is not a feature and is
# filtered out.
seq_and_struc_names = _names_frame('seq_and_struc_names', seq_and_struc_names)
localization_names = _names_frame('localization_names', localization_names)
bio_func_names = _names_frame('bio_func_names', bio_func_names)

In [10]:
# Builds the full feature set by joining the protein list (with labels) to
# every feature table on the 'Protein' column.
#
# * GO tables are left-joined: a protein with no row in a GO table is kept,
#   and its missing GO columns are later filled with 0.
# * The remaining tables are inner-joined: a protein missing from any of
#   them is dropped.
#
# NOTE(review): an earlier comment here mentioned ablating sets by moving
# them into a `zeroed_sets` list; no such list is defined in the visible code.

go_sets = [go_components_processed, go_functions_processed, go_processes_processed]
feature_sets = [dezso_processed, gdpc, paac_processed, fpocket_processed]

full_set = protein_list.copy()

for join_kind, tables in (('left', go_sets), ('inner', feature_sets)):
    for table in tables:
        full_set = full_set.merge(table, on='Protein', how=join_kind)

# Applies to every column of the merged frame, not just the GO indicators
full_set = full_set.fillna(0)

In [11]:
# to_csv does not create missing directories, so make sure the output
# folder exists before writing (e.g. on a fresh checkout).
os.makedirs("processed_data", exist_ok=True)

# Merged feature matrix (the row index is written as the first column)
full_set.to_csv("processed_data/full_set.csv")

# One file per feature group, listing the column names that belong to it
network_info_names.to_csv("processed_data/network_info_names.csv")
seq_and_struc_names.to_csv("processed_data/seq_and_struc_names.csv")
localization_names.to_csv("processed_data/localization_names.csv")
bio_func_names.to_csv("processed_data/bio_func_names.csv")