In [1]:
import numpy as np
import pandas as pd

## Data Preprocessing

In [2]:
# read the two raw csv tables: df_1 is the info table, df_2 the sequence table
df_1, df_2 = (
    pd.read_csv(csv_name)
    for csv_name in ('pdb_data_no_dups.csv', 'pdb_data_seq.csv')
)

In [3]:
def data_preprocess(df_info, df_seqs):
    '''
    Preprocess csv protein sequence and information data
    Input:
    df_info: pd dataframe from info csv; must contain the columns
             'structureId', 'classification' and 'macromoleculeType'
             (any other column is ignored)
    df_seqs: pd dataframe from sequence csv; must contain the columns
             'structureId' and 'sequence' (any other column is ignored)
    Output:
    df_new: joint preprocessed pd dataframe with exactly the columns
            ['Id', 'classification', 'sequence']; the row labels are those
            of the merged frame (the index is not reset)
    Raises:
    KeyError: if one of the required columns is missing from an input
    '''
    info_cols = ['structureId', 'classification', 'macromoleculeType']
    seq_cols = ['structureId', 'sequence']

    # fail early with a readable message instead of a late, cryptic error
    for arg_name, frame, required in (('df_info', df_info, info_cols),
                                      ('df_seqs', df_seqs, seq_cols)):
        missing = [col for col in required if col not in frame.columns]
        if missing:
            raise KeyError('{} is missing required column(s): {}'.format(arg_name, missing))

    # keep only the wanted features (selected by name, so extra or reordered
    # input columns cannot break or mislabel the result) and keep the first
    # row of every structureId
    info = df_info[info_cols].drop_duplicates(subset='structureId', keep='first')
    seqs = df_seqs[seq_cols].drop_duplicates(subset='structureId', keep='first')

    # join two dataframes (inner join on the shared key)
    merged = pd.merge(info, seqs, on='structureId')

    # select only protein sequences, then drop the now-constant type column
    proteins = merged[merged['macromoleculeType'] == 'Protein']
    proteins = proteins.drop(columns='macromoleculeType')

    # remove duplicate sequences and NA values
    df_new = proteins.drop_duplicates(subset='sequence', keep='first')
    df_new = df_new.dropna(how='any')

    # rename structureId to Id (by name, not by position)
    return df_new.rename(columns={'structureId': 'Id'})

In [4]:
# build the joint table (columns: Id, classification, sequence) from the two raw frames
df = data_preprocess(df_1, df_2)

In [5]:
# preview the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,Id,classification,sequence
2,101M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
4,102L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...
5,102M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
7,103L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...
10,104L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...
11,104M,OXYGEN TRANSPORT,VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...
15,106M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
17,107L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKGEL...
20,108L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKIEL...
23,109L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKKEL...


In [8]:
# number of distinct Ids per classification ...
count = df.groupby('classification')['Id'].nunique()
# ... showing only the classifications with more than 1000 of them
count.loc[count.gt(1000)]

classification
HYDROLASE                                10782
IMMUNE SYSTEM                             2495
ISOMERASE                                 1540
LIGASE                                    1261
LYASE                                     2335
MEMBRANE PROTEIN                          1118
OXIDOREDUCTASE                            6242
PROTEIN BINDING                           1381
SIGNALING PROTEIN                         1914
STRUCTURAL GENOMICS, UNKNOWN FUNCTION     1515
TRANSCRIPTION                             2135
TRANSFERASE                               8122
TRANSPORT PROTEIN                         1837
UNKNOWN FUNCTION                          1033
VIRAL PROTEIN                             1499
Name: Id, dtype: int64

In [10]:
# list every classification whose label contains 'UNKNOWN', with its Id count
for name, n_ids in count.items():
    if 'UNKNOWN' in name:
        print(name, n_ids)

HYDROLASE/UNKNOWN FUNCTION 1
METAL BINDING PROTEIN, UNKNOWN FUNCTION 2
METAL BINDING PROTEIN,UNKNOWN FUNCTION 1
ONCOPROTEIN,UNKNOWN FUNCTION 1
STRUCTURAL GENOMICS  UNKNOWN FUNCTION 1
STRUCTURAL GENOMICS UNKNOWN FUNCTION 2
STRUCTURAL GENOMICS,   UNKNOWN FUNCTION 2
STRUCTURAL GENOMICS,  UNKNOWN FUNCTION 5
STRUCTURAL GENOMICS, UNKNOWN FUNCTION 1515
STRUCTURAL GENOMICS,UNKNOWN FUNCTION 6
Structural Genomics, UNKNOWN FUNCTION 7
Structural genomics, UNKNOWN FUNCTION 1
TRANSPORT PROTEIN/UNKNOWN FUNCTION 1
UNKNOWN BACTERIAL HYDROLASE 1
UNKNOWN FUNCTION 1033
UNKNOWN FUNCTION, PROTEIN BINDING 1
UNKNOWN FUNCTION, STRUCTURAL GENOMICS 2
UNKNOWN FUNCTION/IMMUNE SYSTEM 2
UNKNOWN FUNCTION/Ligase 1
UNKNOWN PROTEIN 1
VIRAL PROTEIN, UNKNOWN FUNCTION 2


In [11]:
def seq_dict(df, n_min=1000, include_unknown=False):
    '''
    Split the preprocessed dataframe into one sub-dataframe per classification
    Input:
    df: joint preprocessed dataframe (uses the 'classification' and 'Id' columns)
    n_min: a classification is kept only if it has more than n_min distinct Ids
    include_unknown: boolean; when False, classifications whose label contains
                     'UNKNOWN' are left out
    Output:
    seqs_dict: dictionary with the kept classifications as keys and the matching
               rows of df as values
    '''
    # distinct Ids per classification
    ids_per_class = df.groupby('classification')['Id'].nunique()

    # labels that exceed the threshold (strictly greater than n_min)
    frequent_labels = ids_per_class[ids_per_class > n_min].index

    # one entry per kept label; the 'UNKNOWN' test only applies when
    # unknown classifications are not wanted
    return {
        label: df[df['classification'] == label]
        for label in frequent_labels
        if include_unknown or 'UNKNOWN' not in label
    }

In [12]:
# group the sequences by classification with the default settings
# (n_min = 1000, include_unknown = False)
seqs_dict = seq_dict(df)

In [13]:
# classifications that were kept by seq_dict
seqs_dict.keys()

dict_keys(['HYDROLASE', 'IMMUNE SYSTEM', 'ISOMERASE', 'LIGASE', 'LYASE', 'MEMBRANE PROTEIN', 'OXIDOREDUCTASE', 'PROTEIN BINDING', 'SIGNALING PROTEIN', 'TRANSCRIPTION', 'TRANSFERASE', 'TRANSPORT PROTEIN', 'VIRAL PROTEIN'])

In [14]:
# preview the first rows of one classification instead of rendering the whole sub-frame
seqs_dict['IMMUNE SYSTEM'].head(10)

Unnamed: 0,Id,classification,sequence
1398,1B09,IMMUNE SYSTEM,QTDMSRKAFVFPKESDTSYVSLKAPLTKPLKAFTVCLHFYTELSST...
1421,1B0W,IMMUNE SYSTEM,DIQMTQSPSSLSASVGDRVTITCQASQDISDYLIWYQQKLGKAPNL...
1477,1B2W,IMMUNE SYSTEM,EVQLVQSGGGVVQPGRSLKLSCLASGYIFTSSWINWVKQRPGRGLE...
1500,1B3J,IMMUNE SYSTEM,EPHSLRYNLTVLSWDGSVQSGFLTEVHLDGQPFLRCDRQKCRAKPQ...
1660,1B88,IMMUNE SYSTEM,MQQVRQSPQSLTVWEGETAILNCSYENSAFDYFPWYQQFPGEGPAL...
1734,1BAF,IMMUNE SYSTEM,DVQLQESGPGLVKPSQSQSLTCTVTGYSITSDYAWNWIRQFPGNKL...
2101,1BLN,IMMUNE SYSTEM,DVLMTQTPVSLSVSLGDQASISCRSSQSIVHSTGNTYLEWYLQKPG...
2113,1BM3,IMMUNE SYSTEM,EVQLVQSGGGLVNPGRSLKLSCAASGFTFSSYGMSWVRQTPEKRLE...
2249,1BQH,IMMUNE SYSTEM,GPHSLRYFVTAVSRPGLGEPRYMEVGYVDDTEFVRFDSDAENPRYE...
2453,1BWM,IMMUNE SYSTEM,AVTQSPRNKVAVTGGKVTLSCNQTNNHNNMYWYRQDTGHGLRLIHY...
