In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the labeled training table; the first CSV column becomes the row index.
df = pd.read_csv(
    './data/af2_dataset_training_labeled.csv',
    index_col=0,
)
df

Unnamed: 0,annotation_sequence,feat_A,feat_C,feat_D,feat_E,feat_F,feat_G,feat_H,feat_I,feat_K,...,feat_DSSP_10,feat_DSSP_11,feat_DSSP_12,feat_DSSP_13,coord_X,coord_Y,coord_Z,entry,entry_index,y_Ligand
0,M,False,False,False,False,False,False,False,False,False,...,0,0.0,47,-0.0,-26.499001,-4.742000,-35.189999,GEMI5_HUMAN,0,False
1,G,False,False,False,False,False,True,False,False,False,...,0,0.0,0,0.0,-25.158001,-1.342000,-34.104000,GEMI5_HUMAN,1,False
2,Q,False,False,False,False,False,False,False,False,False,...,1,-0.0,-1,-0.0,-21.926001,-1.641000,-32.175999,GEMI5_HUMAN,2,False
3,E,False,False,False,True,False,False,False,False,False,...,706,-0.1,705,-0.0,-22.073999,0.654000,-29.171000,GEMI5_HUMAN,3,False
4,P,False,False,False,False,False,False,False,False,False,...,0,0.0,705,-0.2,-19.783001,2.670000,-26.858999,GEMI5_HUMAN,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,S,False,False,False,False,False,False,False,False,False,...,-3,-0.1,2,-0.4,-19.742001,20.796000,-12.319000,AOC3_HUMAN,755,False
756,H,False,False,False,False,False,False,True,False,False,...,-358,-0.1,-330,-0.1,-16.299000,19.153999,-12.640000,AOC3_HUMAN,756,False
757,G,False,False,False,False,False,True,False,False,False,...,-360,-0.2,-1,-0.1,-13.404000,19.502001,-10.121000,AOC3_HUMAN,757,False
758,G,False,False,False,False,False,True,False,False,False,...,0,0.0,0,0.0,-10.986000,20.320000,-13.016000,AOC3_HUMAN,758,False


1. Drop annotation columns

In [75]:
# Remove the two annotation columns; `columns=` is the keyword form of axis=1.
df = df.drop(columns=['annotation_atomrec', 'annotation_sequence'])

2. Convert the one-hot boolean amino-acid columns into a single column holding a one-hot list of ints per row

In [78]:
# With the annotation columns gone, the first 20 columns are the
# per-residue amino-acid flags (feat_A ... feat_Y).
amino_cols = df.columns[:20]
amino_cols

Index(['feat_A', 'feat_C', 'feat_D', 'feat_E', 'feat_F', 'feat_G', 'feat_H',
       'feat_I', 'feat_K', 'feat_L', 'feat_M', 'feat_N', 'feat_P', 'feat_Q',
       'feat_R', 'feat_S', 'feat_T', 'feat_V', 'feat_W', 'feat_Y'],
      dtype='object')

In [100]:
def add_feature_one_hot_column(df, one_hot_columns):
    """Collapse a group of one-hot columns into one list of ints per row.

    Despite its name, this function does not modify ``df``; it only builds
    and returns the encoded values. The caller is responsible for inserting
    the result as a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the one-hot columns.
    one_hot_columns : list-like of column labels
        Columns to pack, in the order they should appear in each encoding.

    Returns
    -------
    list[list[int]]
        One inner list per row of ``df`` (in row order) holding ``int(value)``
        for each of ``one_hot_columns``.
    """
    # Iterate over just the selected columns as plain tuples. iterrows()
    # would build a Series of the *entire* row (every column of df) for each
    # record and upcast mixed dtypes before we subset it.
    selected = df[one_hot_columns]
    return [
        [int(value) for value in row]
        for row in selected.itertuples(index=False, name=None)
    ]

In [79]:
# Pack the 20 amino-acid flags into one list per row.
aminoacid = add_feature_one_hot_column(df, amino_cols)
# DataFrame.drop returns a new frame rather than mutating in place, so the
# result must be rebound; otherwise the raw one-hot columns stay in df and
# every later positional column index is shifted.
df = df.drop(amino_cols, axis=1)
df.insert(0, "aminoacid", aminoacid)
df

3. Combine the one-hot DSSP columns into a single list-valued column

In [114]:
# Select the DSSP one-hot flags by name instead of by position: a positional
# slice silently picks up unrelated columns whenever the upstream column
# layout changes.
dssp_cols = pd.Index(['feat_DSSP_H', 'feat_DSSP_B', 'feat_DSSP_E', 'feat_DSSP_G',
                      'feat_DSSP_I', 'feat_DSSP_T', 'feat_DSSP_S'])
# Fail loudly here rather than further down if the expected columns are absent.
assert dssp_cols.isin(df.columns).all(), "expected DSSP one-hot columns missing from df"
dssp_cols

Index(['feat_DSSP_H', 'feat_DSSP_B', 'feat_DSSP_E', 'feat_DSSP_G',
       'feat_DSSP_I', 'feat_DSSP_T', 'feat_DSSP_S'],
      dtype='object')

In [116]:
# Pack the DSSP flags into one list per row, then replace the individual
# flag columns with the packed column at position 9.
dssp = add_feature_one_hot_column(df, dssp_cols)
df = df.drop(columns=dssp_cols)
df.insert(loc=9, column="DSSP_feats", value=dssp)
df

Unnamed: 0,aminoacid,feat_PHI,feat_PSI,feat_TAU,feat_THETA,feat_BBSASA,feat_SCSASA,feat_pLDDT,feat_DSSP_6,DSSP_feats,...,feat_DSSP_10,feat_DSSP_11,feat_DSSP_12,feat_DSSP_13,coord_X,coord_Y,coord_Z,entry,entry_index,y_Ligand
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0.000000,2.257610,-2.375020,1.956201,80.020602,137.023818,38.49,0,"[0, 0, 0, 0, 0, 0, 0]",...,0,0.0,47,-0.0,-26.499001,-4.742000,-35.189999,GEMI5_HUMAN,0,False
1,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-1.100680,2.224168,-2.654037,1.900792,69.542382,0.000000,46.08,1,"[0, 0, 0, 0, 0, 0, 0]",...,0,0.0,0,0.0,-25.158001,-1.342000,-34.104000,GEMI5_HUMAN,1,False
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",-1.295398,2.676551,-1.696727,2.458310,23.387401,88.587659,65.42,-2,"[0, 0, 0, 0, 0, 0, 0]",...,1,-0.0,-1,-0.0,-21.926001,-1.641000,-32.175999,GEMI5_HUMAN,2,False
3,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-2.352796,2.665542,-2.810012,2.054226,4.908812,44.521273,85.17,-2,"[0, 0, 0, 0, 0, 0, 0]",...,706,-0.1,705,-0.0,-22.073999,0.654000,-29.171000,GEMI5_HUMAN,3,False
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",-1.134474,2.612150,-2.754863,2.272191,9.742674,52.613700,93.24,0,"[0, 0, 0, 0, 0, 0, 0]",...,0,0.0,705,-0.2,-19.783001,2.670000,-26.858999,GEMI5_HUMAN,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-2.378927,2.608671,-2.290233,2.192222,7.313673,48.994428,95.81,-2,"[0, 0, 0, 0, 0, 0, 0]",...,-3,-0.1,2,-0.4,-19.742001,20.796000,-12.319000,AOC3_HUMAN,755,False
756,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",-2.122860,2.441583,-2.331874,1.570277,20.300374,46.103273,96.62,-2,"[0, 0, 0, 0, 0, 0, 0]",...,-358,-0.1,-330,-0.1,-16.299000,19.153999,-12.640000,AOC3_HUMAN,756,False
757,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-1.124856,-0.248235,-1.292085,2.315429,2.763823,0.000000,93.61,-2,"[0, 0, 0, 0, 0, 0, 0]",...,-360,-0.2,-1,-0.1,-13.404000,19.502001,-10.121000,AOC3_HUMAN,757,False
758,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.651085,-2.916255,-0.280431,2.332004,15.092203,0.000000,93.17,0,"[0, 0, 0, 0, 0, 0, 0]",...,0,0.0,0,0.0,-10.986000,20.320000,-13.016000,AOC3_HUMAN,758,False


In [118]:
# Persist the processed table without the row index.
# NOTE: CSV has no list type, so the list-valued columns are written as their
# string form (e.g. "[0, 0, 1]"); a reader has to parse them back.
df.to_csv(path_or_buf='./data/processed_train.csv', index=False)