In [65]:
import pkg_resources
from pkg_resources import DistributionNotFound, VersionConflict
from platform import python_version
import numpy as np
import pandas as pd
import time
import gc
import random
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import warnings
warnings.simplefilter("ignore")
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('S5296.csv')
print(df.head(5))

  PDB Id Mutated Chain Mutation_PDB    Label UniProt Mutation_UNP  DDGexp  \
0   1A43             A        C218S  forward  P12497        C350S    3.70   
1   1A43             A        C218S  reverse  P12497        S350C   -3.70   
2   1A43             A        E159D  forward  P12497        E291D    4.55   
3   1A43             A        E159D  reverse  P12497        D291E   -4.55   
4   1A43             A        G156A  forward  P12497        G288A    2.40   

  Location    DCS  DOMH  PSSM       P_L  P_RKDE     P_FWY  N_Hydro  N_Charg  \
0      COR -7.158 -0.72    -1  0.069444     0.0  0.069444        4        2   
1      COR  7.156  0.72     8  0.069444     0.0  0.069444        3        2   
2      SUR -2.631 -0.09     2  0.069444     0.0  0.069444        8        9   
3      SUR  2.630  0.09     4  0.069444     0.0  0.069444        8        9   
4      COR -5.275  0.27     0  0.069444     0.0  0.069444        6        8   

   SASA_pro  SASA_sol    PremPS similar proteins  
0        20

In [38]:
descriptive_features = df[['Mutated Chain','Mutation_PDB','Label', "UniProt", "Mutation_UNP",  "DDGexp",  "Location"]]
descriptive_features

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,0,C218S,forward,P12497,C350S,3.70,COR
1,0,C218S,reverse,P12497,S350C,-3.70,COR
2,0,E159D,forward,P12497,E291D,4.55,SUR
3,0,E159D,reverse,P12497,D291E,-4.55,SUR
4,0,G156A,forward,P12497,G288A,2.40,COR
...,...,...,...,...,...,...,...
5291,0,Y35F,reverse,P00974,F70Y,-0.60,COR
5292,0,Y35G,forward,P00974,Y70G,5.00,COR
5293,0,Y35G,reverse,P00974,G70Y,-5.00,COR
5294,0,Y35N,forward,P00974,Y70N,3.70,COR


In [39]:
descriptive_features['Mutated Chain'] = pd.factorize(descriptive_features['Mutated Chain'])[0]
descriptive_features['Mutation_PDB'] = pd.factorize(descriptive_features['Mutation_PDB'])[0]
descriptive_features['Label'] = pd.factorize(descriptive_features['Label'])[0]
descriptive_features['UniProt'] = pd.factorize(descriptive_features['UniProt'])[0]
descriptive_features['Mutation_UNP'] = pd.factorize(descriptive_features['Mutation_UNP'])[0]
descriptive_features['Location'] = pd.factorize(descriptive_features['Location'])[0]
descriptive_features

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,0,0,0,0,0,3.70,0
1,0,0,1,0,1,-3.70,0
2,0,1,0,0,2,4.55,1
3,0,1,1,0,3,-4.55,1
4,0,2,0,0,4,2.40,0
...,...,...,...,...,...,...,...
5291,0,2440,1,128,4911,-0.60,0
5292,0,2441,0,128,4912,5.00,0
5293,0,2441,1,128,4913,-5.00,0
5294,0,2442,0,128,4914,3.70,0


In [41]:
target_feature = df[['PremPS']]
target_feature

Unnamed: 0,PremPS
0,1.487019
1,-1.739746
2,0.766227
3,-0.723130
4,1.187639
...,...
5291,-1.909816
5292,3.698962
5293,-3.421731
5294,4.263901


In [42]:
descriptive_features.isnull().sum()

Mutated Chain    0
Mutation_PDB     0
Label            0
UniProt          0
Mutation_UNP     0
DDGexp           0
Location         0
dtype: int64

In [43]:
testing_data = pd.read_excel('350.xlsx')
testing_data

Unnamed: 0,PDB Id,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,Label,DDGexp,Location,PremPS,PremPS_M,PremPS_P
0,9999999999999999920903862628363385082275612169...,A,I7S,P00282,I27S,1,3.44,COR,3.186056,2.769578,2.768110
1,9999999999999999920903862628363385082275612169...,A,I20T,P00282,I40T,1,2.39,COR,2.169456,2.039257,2.430897
2,9999999999999999920903862628363385082275612169...,A,V31T,P00282,V51T,1,1.08,COR,1.360860,1.625162,2.380536
3,9999999999999999920903862628363385082275612169...,A,L50V,P00282,L70V,1,0.36,COR,0.664295,1.037976,1.189084
4,9999999999999999920903862628363385082275612169...,A,V60G,P00282,V80G,1,3.11,COR,2.373845,1.227922,1.428366
...,...,...,...,...,...,...,...,...,...,...,...
345,5DFR,A,G121H,P0ABQ4,G121H,1,0.56,SUR,0.597748,0.863704,1.057653
346,5DFR,A,I155T,P0ABQ4,I155T,1,2.53,COR,1.876904,1.230930,1.052630
347,5PTI,A,A16V,P00974,A51V,1,1.30,SUR,1.361258,1.516875,-0.404462
348,5PTI,A,A16T,P00974,A51T,1,1.70,SUR,1.631536,1.591198,0.299480


In [44]:
testing_descriptive = testing_data[['Mutated Chain','Mutation_PDB','Label', "UniProt", "Mutation_UNP",  "DDGexp",  "Location"]]
testing_descriptive

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,A,I7S,1,P00282,I27S,3.44,COR
1,A,I20T,1,P00282,I40T,2.39,COR
2,A,V31T,1,P00282,V51T,1.08,COR
3,A,L50V,1,P00282,L70V,0.36,COR
4,A,V60G,1,P00282,V80G,3.11,COR
...,...,...,...,...,...,...,...
345,A,G121H,1,P0ABQ4,G121H,0.56,SUR
346,A,I155T,1,P0ABQ4,I155T,2.53,COR
347,A,A16V,1,P00974,A51V,1.30,SUR
348,A,A16T,1,P00974,A51T,1.70,SUR


In [45]:
testing_descriptive['Mutated Chain'] = pd.factorize(testing_descriptive['Mutated Chain'])[0]
testing_descriptive['Mutation_PDB'] = pd.factorize(testing_descriptive['Mutation_PDB'])[0]
testing_descriptive['UniProt'] = pd.factorize(testing_descriptive['UniProt'])[0]
testing_descriptive['Mutation_UNP'] = pd.factorize(testing_descriptive['Mutation_UNP'])[0]
testing_descriptive['Location'] = pd.factorize(testing_descriptive['Location'])[0]
testing_descriptive

Unnamed: 0,Mutated Chain,Mutation_PDB,Label,UniProt,Mutation_UNP,DDGexp,Location
0,0,0,1,0,0,3.44,0
1,0,1,1,0,1,2.39,0
2,0,2,1,0,2,1.08,0
3,0,3,1,0,3,0.36,0
4,0,4,1,0,4,3.11,0
...,...,...,...,...,...,...,...
345,0,342,1,64,344,0.56,1
346,0,343,1,64,345,2.53,0
347,0,344,1,65,346,1.30,1
348,0,345,1,65,347,1.70,1


In [46]:
testing_target = testing_data[['PremPS']]
testing_target

Unnamed: 0,PremPS
0,3.186056
1,2.169456
2,1.360860
3,0.664295
4,2.373845
...,...
345,0.597748
346,1.876904
347,1.361258
348,1.631536


In [None]:

regressor_RF = RandomForestRegressor(n_estimators=500,criterion='squared_error', 
                                    max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features=1.0)

regressor_RF.fit(descriptive_features,target_feature) # Training dataset 2*3*4, 24
predicted_values_RF = regressor_RF.predict(testing_descriptive) # 5*6*1 , 48(wrong) 30(actual) 
predicted_values_RF

In [72]:

regressor_RF = RandomForestRegressor(n_estimators=500,criterion='squared_error', 
                                    max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features=1.0)

regressor_RF.fit(descriptive_features,target_feature) # Training dataset 2*3*4, 24
predicted_values_RF = regressor_RF.predict(testing_descriptive) # 5*6*1 , 48(wrong) 30(actual) 
predicted_values_RF

array([ 1.25915902,  0.07277404, -1.16825747, -0.24704791,  1.02563188,
        1.02563188, -1.69688299, -0.29117345, -0.55268799,  1.36057411,
       -0.5307537 , -1.10215613, -0.94752944, -0.55301321, -0.34211607,
       -0.73429328, -0.35793494, -0.47387541, -0.21214804, -0.2815099 ,
       -0.39761397, -0.30641034,  0.77145272, -0.50968253, -0.427035  ,
       -0.30641034, -0.34061498,  1.4487863 , -0.19162231, -0.86971852,
       -0.59571875, -0.48571886, -0.51421643,  0.2387251 , -0.05003133,
       -0.37449307, -0.49332326, -0.45107172, -0.52112461, -0.32371707,
        0.43739845, -0.53888346, -0.61313116, -0.09987805,  0.27598731,
       -1.71397659, -2.11357417, -2.21189769, -0.56714652, -0.61487217,
       -0.7342167 , -0.25902382, -0.61733885,  0.38868513,  0.64657345,
        0.26137963,  0.68785155, -0.29299017,  0.10990338,  0.87346762,
       -0.38017094, -0.04927796, -0.47248717, -0.04927796, -0.2770489 ,
        0.15196016, -0.2476108 ,  0.28684191,  0.74059418, -0.66

In [73]:
testing_data['Predicted PremPS'] = predicted_values_RF

In [74]:
testing_data

Unnamed: 0,PDB Id,Mutated Chain,Mutation_PDB,UniProt,Mutation_UNP,Label,DDGexp,Location,PremPS,PremPS_M,PremPS_P,Predicted PremPS
0,9999999999999999920903862628363385082275612169...,A,I7S,P00282,I27S,1,3.44,COR,3.186056,2.769578,2.768110,1.259159
1,9999999999999999920903862628363385082275612169...,A,I20T,P00282,I40T,1,2.39,COR,2.169456,2.039257,2.430897,0.072774
2,9999999999999999920903862628363385082275612169...,A,V31T,P00282,V51T,1,1.08,COR,1.360860,1.625162,2.380536,-1.168257
3,9999999999999999920903862628363385082275612169...,A,L50V,P00282,L70V,1,0.36,COR,0.664295,1.037976,1.189084,-0.247048
4,9999999999999999920903862628363385082275612169...,A,V60G,P00282,V80G,1,3.11,COR,2.373845,1.227922,1.428366,1.025632
...,...,...,...,...,...,...,...,...,...,...,...,...
345,5DFR,A,G121H,P0ABQ4,G121H,1,0.56,SUR,0.597748,0.863704,1.057653,-0.087947
346,5DFR,A,I155T,P0ABQ4,I155T,1,2.53,COR,1.876904,1.230930,1.052630,0.387545
347,5PTI,A,A16V,P00974,A51V,1,1.30,SUR,1.361258,1.516875,-0.404462,-0.169218
348,5PTI,A,A16T,P00974,A51T,1,1.70,SUR,1.631536,1.591198,0.299480,0.131233


In [75]:
import math
mse = mean_squared_error(testing_target, predicted_values_RF)
rmse = math.sqrt(mse)
rmse

1.344198136381793