In [21]:
import pkg_resources
from pkg_resources import DistributionNotFound, VersionConflict
from platform import python_version
import numpy as np
import pandas as pd
import time
import gc
import random
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import warnings
warnings.simplefilter("ignore")
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

## Trained on S5926 and predicted S350

In [22]:
df = pd.read_csv('S5296.csv')
print(df.head(5))

  PDB Id Mutated Chain Mutation_PDB    Label UniProt Mutation_UNP  DDGexp  \
0   1A43             A        C218S  forward  P12497        C350S    3.70   
1   1A43             A        C218S  reverse  P12497        S350C   -3.70   
2   1A43             A        E159D  forward  P12497        E291D    4.55   
3   1A43             A        E159D  reverse  P12497        D291E   -4.55   
4   1A43             A        G156A  forward  P12497        G288A    2.40   

  Location    DCS  DOMH  PSSM       P_L  P_RKDE     P_FWY  N_Hydro  N_Charg  \
0      COR -7.158 -0.72    -1  0.069444     0.0  0.069444        4        2   
1      COR  7.156  0.72     8  0.069444     0.0  0.069444        3        2   
2      SUR -2.631 -0.09     2  0.069444     0.0  0.069444        8        9   
3      SUR  2.630  0.09     4  0.069444     0.0  0.069444        8        9   
4      COR -5.275  0.27     0  0.069444     0.0  0.069444        6        8   

   SASA_pro  SASA_sol    PremPS similar proteins  
0        20

In [40]:
descriptive_features = df[['Mutated Chain','Mutation_PDB','Label', "UniProt", "Mutation_UNP",  "DDGexp",  "Location"]]
descriptive_features

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,A,C218S,forward,P12497,C350S,3.70,COR
1,A,C218S,reverse,P12497,S350C,-3.70,COR
2,A,E159D,forward,P12497,E291D,4.55,SUR
3,A,E159D,reverse,P12497,D291E,-4.55,SUR
4,A,G156A,forward,P12497,G288A,2.40,COR
...,...,...,...,...,...,...,...
5291,A,Y35F,reverse,P00974,F70Y,-0.60,COR
5292,A,Y35G,forward,P00974,Y70G,5.00,COR
5293,A,Y35G,reverse,P00974,G70Y,-5.00,COR
5294,A,Y35N,forward,P00974,Y70N,3.70,COR


In [41]:
descriptive_features['Mutated Chain'] = pd.factorize(descriptive_features['Mutated Chain'])[0]
descriptive_features['Mutation_PDB'] = pd.factorize(descriptive_features['Mutation_PDB'])[0]
descriptive_features['Label'] = pd.factorize(descriptive_features['Label'])[0]
descriptive_features['UniProt'] = pd.factorize(descriptive_features['UniProt'])[0]
descriptive_features['Mutation_UNP'] = pd.factorize(descriptive_features['Mutation_UNP'])[0]
descriptive_features['Location'] = pd.factorize(descriptive_features['Location'])[0]
descriptive_features

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,0,0,0,0,0,3.70,0
1,0,0,1,0,1,-3.70,0
2,0,1,0,0,2,4.55,1
3,0,1,1,0,3,-4.55,1
4,0,2,0,0,4,2.40,0
...,...,...,...,...,...,...,...
5291,0,2440,1,128,4911,-0.60,0
5292,0,2441,0,128,4912,5.00,0
5293,0,2441,1,128,4913,-5.00,0
5294,0,2442,0,128,4914,3.70,0


In [25]:
target_feature = df[['PremPS']]
target_feature

Unnamed: 0,PremPS
0,1.487019
1,-1.739746
2,0.766227
3,-0.723130
4,1.187639
...,...
5291,-1.909816
5292,3.698962
5293,-3.421731
5294,4.263901


In [26]:
descriptive_features.isnull().sum()

Mutated Chain    0
Mutation_PDB     0
Label            0
UniProt          0
Mutation_UNP     0
DDGexp           0
Location         0
dtype: int64

In [27]:
testing_data = pd.read_excel('350.xlsx')
testing_data

Unnamed: 0,PDB Id,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,Label,DDGexp,Location,PremPS,PremPS_M,PremPS_P
0,9999999999999999920903862628363385082275612169...,A,I7S,P00282,I27S,1,3.44,COR,3.186056,2.769578,2.768110
1,9999999999999999920903862628363385082275612169...,A,I20T,P00282,I40T,1,2.39,COR,2.169456,2.039257,2.430897
2,9999999999999999920903862628363385082275612169...,A,V31T,P00282,V51T,1,1.08,COR,1.360860,1.625162,2.380536
3,9999999999999999920903862628363385082275612169...,A,L50V,P00282,L70V,1,0.36,COR,0.664295,1.037976,1.189084
4,9999999999999999920903862628363385082275612169...,A,V60G,P00282,V80G,1,3.11,COR,2.373845,1.227922,1.428366
...,...,...,...,...,...,...,...,...,...,...,...
345,5DFR,A,G121H,P0ABQ4,G121H,1,0.56,SUR,0.597748,0.863704,1.057653
346,5DFR,A,I155T,P0ABQ4,I155T,1,2.53,COR,1.876904,1.230930,1.052630
347,5PTI,A,A16V,P00974,A51V,1,1.30,SUR,1.361258,1.516875,-0.404462
348,5PTI,A,A16T,P00974,A51T,1,1.70,SUR,1.631536,1.591198,0.299480


In [28]:
testing_descriptive = testing_data[['Mutated Chain','Mutation_PDB','Label', "UniProt", "Mutation_UNP",  "DDGexp",  "Location"]]
testing_descriptive

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,A,I7S,1,P00282,I27S,3.44,COR
1,A,I20T,1,P00282,I40T,2.39,COR
2,A,V31T,1,P00282,V51T,1.08,COR
3,A,L50V,1,P00282,L70V,0.36,COR
4,A,V60G,1,P00282,V80G,3.11,COR
...,...,...,...,...,...,...,...
345,A,G121H,1,P0ABQ4,G121H,0.56,SUR
346,A,I155T,1,P0ABQ4,I155T,2.53,COR
347,A,A16V,1,P00974,A51V,1.30,SUR
348,A,A16T,1,P00974,A51T,1.70,SUR


In [29]:
testing_descriptive['Mutated Chain'] = pd.factorize(testing_descriptive['Mutated Chain'])[0]
testing_descriptive['Mutation_PDB'] = pd.factorize(testing_descriptive['Mutation_PDB'])[0]
testing_descriptive['UniProt'] = pd.factorize(testing_descriptive['UniProt'])[0]
testing_descriptive['Mutation_UNP'] = pd.factorize(testing_descriptive['Mutation_UNP'])[0]
testing_descriptive['Location'] = pd.factorize(testing_descriptive['Location'])[0]
testing_descriptive

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,0,0,1,0,0,3.44,0
1,0,1,1,0,1,2.39,0
2,0,2,1,0,2,1.08,0
3,0,3,1,0,3,0.36,0
4,0,4,1,0,4,3.11,0
...,...,...,...,...,...,...,...
345,0,342,1,64,344,0.56,1
346,0,343,1,64,345,2.53,0
347,0,344,1,65,346,1.30,1
348,0,345,1,65,347,1.70,1


In [30]:
testing_target = testing_data[['PremPS']]
testing_target

Unnamed: 0,PremPS
0,3.186056
1,2.169456
2,1.360860
3,0.664295
4,2.373845
...,...
345,0.597748
346,1.876904
347,1.361258
348,1.631536


In [31]:

regressor_RF = RandomForestRegressor(n_estimators=500,criterion='squared_error', 
                                    max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features=1.0)

regressor_RF.fit(descriptive_features,target_feature) # Training dataset 2*3*4, 24
predicted_values_RF = regressor_RF.predict(testing_descriptive) # 5*6*1 , 48(wrong) 30(actual) 
predicted_values_RF

array([ 1.20504421e+00,  1.54053571e-01, -1.15607912e+00, -2.20258390e-01,
        1.00488190e+00,  1.00488190e+00, -1.72594277e+00, -1.07782592e-01,
       -5.22710591e-01,  1.30607504e+00, -5.33900765e-01, -1.13960178e+00,
       -9.93417197e-01, -5.48738701e-01, -3.63384579e-01, -7.45830913e-01,
       -3.59169346e-01, -4.67341659e-01, -2.14561260e-01, -1.17640126e-01,
       -3.93500805e-01, -3.19575197e-01,  8.18032151e-01, -4.80276216e-01,
       -4.48654919e-01, -3.20458704e-01, -3.59398727e-01,  1.36595907e+00,
       -1.77904378e-01, -8.85743343e-01, -4.77618631e-01, -4.67570434e-01,
       -4.83641075e-01,  3.40391147e-01, -3.75791240e-02, -3.84083791e-01,
       -4.57420303e-01, -4.20702330e-01, -5.09672285e-01, -2.75127063e-01,
        4.20804541e-01, -5.38089679e-01, -6.20432441e-01, -5.28056657e-02,
        3.31823380e-01, -1.64927585e+00, -2.06235333e+00, -2.21638736e+00,
       -5.27205421e-01, -5.98506357e-01, -7.61184396e-01, -2.35036874e-01,
       -4.02284709e-01,  

In [32]:
testing_data['Predicted PremPS'] = predicted_values_RF

In [33]:
testing_data

Unnamed: 0,PDB Id,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,Label,DDGexp,Location,PremPS,PremPS_M,PremPS_P,Predicted PremPS
0,9999999999999999920903862628363385082275612169...,A,I7S,P00282,I27S,1,3.44,COR,3.186056,2.769578,2.768110,1.205044
1,9999999999999999920903862628363385082275612169...,A,I20T,P00282,I40T,1,2.39,COR,2.169456,2.039257,2.430897,0.154054
2,9999999999999999920903862628363385082275612169...,A,V31T,P00282,V51T,1,1.08,COR,1.360860,1.625162,2.380536,-1.156079
3,9999999999999999920903862628363385082275612169...,A,L50V,P00282,L70V,1,0.36,COR,0.664295,1.037976,1.189084,-0.220258
4,9999999999999999920903862628363385082275612169...,A,V60G,P00282,V80G,1,3.11,COR,2.373845,1.227922,1.428366,1.004882
...,...,...,...,...,...,...,...,...,...,...,...,...
345,5DFR,A,G121H,P0ABQ4,G121H,1,0.56,SUR,0.597748,0.863704,1.057653,-0.054930
346,5DFR,A,I155T,P0ABQ4,I155T,1,2.53,COR,1.876904,1.230930,1.052630,0.307641
347,5PTI,A,A16V,P00974,A51V,1,1.30,SUR,1.361258,1.516875,-0.404462,-0.149930
348,5PTI,A,A16T,P00974,A51T,1,1.70,SUR,1.631536,1.591198,0.299480,0.158420


In [34]:
import math
mse = mean_squared_error(testing_target, predicted_values_RF)
rmse = math.sqrt(mse)
rmse

1.3252978436715392

## Retrain on S5926 without Labels and predicted S921

In [42]:
# error will occur if no label exists
descriptive_features_without_label = descriptive_features.drop('Label', axis =1) 


In [43]:
descriptive_features_without_label

Unnamed: 0,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,DDGexp,Location
0,0,0,0,0,3.70,0
1,0,0,0,1,-3.70,0
2,0,1,0,2,4.55,1
3,0,1,0,3,-4.55,1
4,0,2,0,4,2.40,0
...,...,...,...,...,...,...
5291,0,2440,128,4911,-0.60,0
5292,0,2441,128,4912,5.00,0
5293,0,2441,128,4913,-5.00,0
5294,0,2442,128,4914,3.70,0


In [45]:
target_feature_without_label = target_feature # S5296
target_feature_without_label

Unnamed: 0,PremPS
0,1.487019
1,-1.739746
2,0.766227
3,-0.723130
4,1.187639
...,...
5291,-1.909816
5292,3.698962
5293,-3.421731
5294,4.263901


In [46]:
testing_data_s921 = pd.read_excel('S921.xlsx')
testing_data_s921

Unnamed: 0,PDB Id,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,DDGexp,Location,PremPS,INPS3D,INPS,PoPMuSiC,mCSM,FoldX
0,107L,A,G44S,P00720,G44S,-0.500,SUR,-0.566552,0.182653,-0.280489,0.23,1.144,-0.733663
1,108L,A,I44S,P00720,I44S,0.300,SUR,0.202305,0.835389,1.165360,0.00,0.867,0.252008
2,109L,A,K44S,P00720,K44S,0.200,SUR,-0.145149,0.254589,0.005043,0.60,0.610,0.138615
3,110L,A,L44S,P00720,L44S,0.400,SUR,0.116301,0.824006,0.806203,0.30,0.854,0.365047
4,111L,A,N44S,P00720,N44S,-0.100,SUR,-0.159980,-0.026243,-0.110165,0.06,0.331,-0.164694
...,...,...,...,...,...,...,...,...,...,...,...,...,...
916,451C,A,Q37R,P00099,Q59R,-0.500,SUR,0.332474,-0.015074,-0.231641,0.29,-0.267,-0.401466
917,451C,A,V13M,P00099,V35M,-0.400,SUR,-0.438337,0.288000,-0.134577,-0.21,0.421,-0.225037
918,451C,A,V78I,P00099,V100I,-1.055,COR,-0.286931,-0.287536,-0.452964,0.19,0.080,-1.001650
919,4LYZ,A,K13D,P00698,K31D,6.700,SUR,0.777545,0.835183,0.699028,1.39,0.256,1.494280


In [47]:
testing_descriptive_s921 = testing_data_s921[['Mutated Chain','Mutation_PDB', "UniProt", "Mutation_UNP",  "DDGexp",  "Location"]]
testing_descriptive_s921

Unnamed: 0,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,DDGexp,Location
0,A,G44S,P00720,G44S,-0.500,SUR
1,A,I44S,P00720,I44S,0.300,SUR
2,A,K44S,P00720,K44S,0.200,SUR
3,A,L44S,P00720,L44S,0.400,SUR
4,A,N44S,P00720,N44S,-0.100,SUR
...,...,...,...,...,...,...
916,A,Q37R,P00099,Q59R,-0.500,SUR
917,A,V13M,P00099,V35M,-0.400,SUR
918,A,V78I,P00099,V100I,-1.055,COR
919,A,K13D,P00698,K31D,6.700,SUR


In [48]:
testing_descriptive_s921['Mutated Chain'] = pd.factorize(testing_descriptive_s921['Mutated Chain'])[0]
testing_descriptive_s921['Mutation_PDB'] = pd.factorize(testing_descriptive_s921['Mutation_PDB'])[0]
testing_descriptive_s921['UniProt'] = pd.factorize(testing_descriptive_s921['UniProt'])[0]
testing_descriptive_s921['Mutation_UNP'] = pd.factorize(testing_descriptive_s921['Mutation_UNP'])[0]
testing_descriptive_s921['Location'] = pd.factorize(testing_descriptive_s921['Location'])[0]
testing_descriptive_s921

Unnamed: 0,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,DDGexp,Location
0,0,0,0,0,-0.500,0
1,0,1,0,1,0.300,0
2,0,2,0,2,0.200,0
3,0,3,0,3,0.400,0
4,0,4,0,4,-0.100,0
...,...,...,...,...,...,...
916,0,894,52,901,-0.500,0
917,0,895,52,902,-0.400,0
918,0,367,52,372,-1.055,1
919,0,896,53,903,6.700,0


In [49]:
testing_target_s921 = testing_data_s921[['PremPS']]
testing_target_s921

Unnamed: 0,PremPS
0,-0.566552
1,0.202305
2,-0.145149
3,0.116301
4,-0.159980
...,...
916,0.332474
917,-0.438337
918,-0.286931
919,0.777545


In [52]:

regressor_RF_without_label = RandomForestRegressor(n_estimators=500,criterion='squared_error', 
                                    max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features=1.0)

regressor_RF_without_label.fit(descriptive_features_without_label,target_feature_without_label) # Training dataset 2*3*4, 24
predicted_values_RF_without_label = regressor_RF_without_label.predict(testing_descriptive_s921) # 5*6*1 , 48(wrong) 30(actual) 
predicted_values_RF_without_label

array([ 0.33617642,  0.5157724 ,  0.36576688,  0.29548716, -0.21969538,
       -1.94783861,  0.36576688,  0.53387501,  0.7002923 ,  0.53244527,
       -0.2195745 , -0.85234891,  0.3071215 ,  0.01834992, -1.84053354,
       -1.70485311, -1.05297158, -0.56295147,  0.54679714,  0.49837348,
       -1.27076467,  0.01515292,  0.94313882,  0.85595777,  0.63920352,
        0.27881397,  0.02321782, -1.39180166,  0.50250656,  0.39754936,
        0.53555662,  1.14509788,  0.37509592,  2.24789277,  1.17795131,
        1.22401697,  1.30671158, -0.13397736,  1.06460467, -0.99708441,
        0.26833368,  0.63615964,  0.46850172,  0.20577166,  0.67991125,
       -0.25370137,  0.88854346,  0.57318913,  0.53441748,  0.4449048 ,
        0.06192395,  1.28538099,  1.90882099,  1.17902902,  1.2269597 ,
        1.2269597 ,  1.23124285,  1.23124285,  2.50824345,  1.23124285,
        1.22879492,  1.23307807,  1.97724508,  0.15028808,  1.03067022,
        1.54847816,  2.79313853,  2.51823888,  1.23531929,  1.23

In [53]:
testing_data_s921['Predicted PremPS'] = predicted_values_RF_without_label

In [54]:
testing_data_s921

Unnamed: 0,PDB Id,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,DDGexp,Location,PremPS,INPS3D,INPS,PoPMuSiC,mCSM,FoldX,Predicted PremPS
0,107L,A,G44S,P00720,G44S,-0.500,SUR,-0.566552,0.182653,-0.280489,0.23,1.144,-0.733663,0.336176
1,108L,A,I44S,P00720,I44S,0.300,SUR,0.202305,0.835389,1.165360,0.00,0.867,0.252008,0.515772
2,109L,A,K44S,P00720,K44S,0.200,SUR,-0.145149,0.254589,0.005043,0.60,0.610,0.138615,0.365767
3,110L,A,L44S,P00720,L44S,0.400,SUR,0.116301,0.824006,0.806203,0.30,0.854,0.365047,0.295487
4,111L,A,N44S,P00720,N44S,-0.100,SUR,-0.159980,-0.026243,-0.110165,0.06,0.331,-0.164694,-0.219695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
916,451C,A,Q37R,P00099,Q59R,-0.500,SUR,0.332474,-0.015074,-0.231641,0.29,-0.267,-0.401466,-0.435449
917,451C,A,V13M,P00099,V35M,-0.400,SUR,-0.438337,0.288000,-0.134577,-0.21,0.421,-0.225037,-0.493528
918,451C,A,V78I,P00099,V100I,-1.055,COR,-0.286931,-0.287536,-0.452964,0.19,0.080,-1.001650,-0.470677
919,4LYZ,A,K13D,P00698,K31D,6.700,SUR,0.777545,0.835183,0.699028,1.39,0.256,1.494280,2.391302


In [55]:
import math
mse_s921 = mean_squared_error(testing_target_s921, predicted_values_RF_without_label)
rmse_s921 = math.sqrt(mse)
rmse_s921

1.3252978436715392