In [None]:
# predicting log cm values for MC_DB by training on 80% of MC_DB

In [32]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.utils import shuffle 

In [33]:
# Read dataset
dataset = pd.read_csv('df_modred_new_chems.csv')
dataset = dataset.select_dtypes([np.number]) # only select numerical columns for x and y values 

In [34]:
dataset.head()

Unnamed: 0,OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED,C:M Ratio Master,C:M Ratio Lipid Normalized,Log C:M Ratio Lipid Normalized,C:M Ratio Non-Lipid Normalized,Log C:M Ratio Non-Lipid Normalized,# Total Atoms,# Atoms w/o Salt,ABC,ABCGG,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,3.86903,0.71,2.2578,0.353685,0.71,-0.148742,20.0,20.0,15.234431,12.462543,...,9.768984,53.323569,268.157563,6.703939,899,30,100,115,7.777778,4.444444
1,2.98359,0.923077,2.935385,0.467665,0.923077,-0.034762,14.0,14.0,10.146784,9.272626,...,9.182866,44.83479,193.110279,6.658975,331,18,64,71,5.944444,3.305556
2,3.86294,1.0625,3.37875,0.528756,1.0625,0.026329,20.0,20.0,14.389425,12.507389,...,9.435163,52.355966,277.204179,5.897961,970,26,90,99,8.055556,4.805556
3,3.93021,11.888889,37.806667,1.577568,11.888889,1.075141,24.0,24.0,17.625299,14.440378,...,9.913982,58.055175,324.220163,6.235003,1535,38,116,135,8.777778,5.777778
4,1.85996,56.275862,178.957241,2.252749,56.275862,1.750322,12.0,12.0,8.663621,8.0647,...,8.952217,41.87666,165.078979,7.177347,211,14,54,59,4.833333,2.861111


In [35]:
# drop values that have 0 as the ratio
dataset = dataset[dataset['C:M Ratio Lipid Normalized'] != 0]

In [36]:
dataset['C:M Ratio Lipid Normalized'] = dataset['C:M Ratio Lipid Normalized'].replace(0, 0.01)
dataset['logCM'] = np.log10(dataset['C:M Ratio Lipid Normalized'])

In [37]:
dataset['logCM_non_adjusted'] = np.log10(dataset['C:M Ratio Non-Lipid Normalized'])

In [38]:
dataset.shape

(264, 1135)

In [39]:
dataset.columns

Index(['OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED', 'C:M Ratio Master',
       'C:M Ratio Lipid Normalized', 'Log C:M Ratio Lipid Normalized',
       'C:M Ratio Non-Lipid Normalized', 'Log C:M Ratio Non-Lipid Normalized',
       '# Total Atoms', '# Atoms w/o Salt', 'ABC', 'ABCGG',
       ...
       'MW', 'AMW', 'WPath', 'WPol', 'Zagreb1', 'Zagreb2', 'mZagreb1',
       'mZagreb2', 'logCM', 'logCM_non_adjusted'],
      dtype='object', length=1135)

In [40]:
# designate x and y values for the model 
X = dataset.loc[:, 'ABC':'mZagreb2'].values
y = dataset.loc[:, 'logCM_non_adjusted'].values

In [41]:
y.shape

(264,)

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor


for i in range(25):
    # y-randomization - use only in y-randomization analysis
    #y = shuffle(y, random_state=4)
    # Split dataset into training and testing datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
    
    # Normalize data
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Compile RF regressor
    regressor = RandomForestRegressor(n_estimators= 100, bootstrap = True, 
                                      max_depth = 30, max_features = 'sqrt',
                                      min_samples_leaf = 1, min_samples_split = 2)
    regressor.fit(X_train, y_train)

    # Use model to make predictions for training set
    y_pred = regressor.predict(X_train).flatten()

    # Export data into a dataframe
    df = pd.DataFrame({'predicted_values':y_pred, 'true_values':y_train,})

    # Print data into a csv file
    df.to_csv('forest_plus_tr_' + str(i) + '.csv')

    # Use model to make predictions for the testing set
    y_pred = regressor.predict(X_test).flatten()

    # Export data into a dataframe
    df = pd.DataFrame({'predicted_values':y_pred, 'true_values':y_test,})

    # Print data to a csv file
    df.to_csv('forestplus_ts_' + str(i) + '.csv')

In [43]:
#Summarize results into one csv file
ts1 = pd.read_csv('forestplus_ts_0.csv')
ts2 = pd.read_csv('forestplus_ts_1.csv')
ts3 = pd.read_csv('forestplus_ts_2.csv')
ts4 = pd.read_csv('forestplus_ts_3.csv')
ts5 = pd.read_csv('forestplus_ts_4.csv')

ts6 = pd.read_csv('forestplus_ts_5.csv')
ts7 = pd.read_csv('forestplus_ts_6.csv')
ts8 = pd.read_csv('forestplus_ts_7.csv')
ts9 = pd.read_csv('forestplus_ts_8.csv')
ts10 = pd.read_csv('forestplus_ts_9.csv')
ts11 = pd.read_csv('forestplus_ts_10.csv')
ts12 = pd.read_csv('forestplus_ts_11.csv')
ts13 = pd.read_csv('forestplus_ts_12.csv')
ts14 = pd.read_csv('forestplus_ts_13.csv')
ts15 = pd.read_csv('forestplus_ts_14.csv')
ts16 = pd.read_csv('forestplus_ts_15.csv')
ts17 = pd.read_csv('forestplus_ts_16.csv')
ts18 = pd.read_csv('forestplus_ts_17.csv')
ts19 = pd.read_csv('forestplus_ts_18.csv')
ts20 = pd.read_csv('forestplus_ts_19.csv')
ts21 = pd.read_csv('forestplus_ts_20.csv')
ts22 = pd.read_csv('forestplus_ts_21.csv')
ts23 = pd.read_csv('forestplus_ts_22.csv')
ts24 = pd.read_csv('forestplus_ts_23.csv')
ts25 = pd.read_csv('forestplus_ts_24.csv')
ts = pd.concat([ts1,ts2,ts3,ts4,ts5, 
                ts6,ts7,ts8,ts9,ts10, 
                ts11,ts12,ts13,ts14,ts15,
                ts16,ts17,ts18,ts19,ts20,
                ts21,ts22,ts23,ts24,ts25], axis=0)

tr1 = pd.read_csv('forest_plus_tr_0.csv')
tr2 = pd.read_csv('forest_plus_tr_1.csv')
tr3 = pd.read_csv('forest_plus_tr_2.csv')
tr4 = pd.read_csv('forest_plus_tr_3.csv')
tr5 = pd.read_csv('forest_plus_tr_4.csv')

tr6 = pd.read_csv('forest_plus_tr_5.csv')
tr7 = pd.read_csv('forest_plus_tr_6.csv')
tr8 = pd.read_csv('forest_plus_tr_7.csv')
tr9 = pd.read_csv('forest_plus_tr_8.csv')
tr10 = pd.read_csv('forest_plus_tr_9.csv')
tr11 = pd.read_csv('forest_plus_tr_10.csv')
tr12 = pd.read_csv('forest_plus_tr_11.csv')
tr13 = pd.read_csv('forest_plus_tr_12.csv')
tr14 = pd.read_csv('forest_plus_tr_13.csv')
tr15 = pd.read_csv('forest_plus_tr_14.csv')
tr16 = pd.read_csv('forest_plus_tr_15.csv')
tr17 = pd.read_csv('forest_plus_tr_16.csv')
tr18 = pd.read_csv('forest_plus_tr_17.csv')
tr19 = pd.read_csv('forest_plus_tr_18.csv')
tr20 = pd.read_csv('forest_plus_tr_19.csv')
tr21 = pd.read_csv('forest_plus_tr_20.csv')
tr22 = pd.read_csv('forest_plus_tr_21.csv')
tr23 = pd.read_csv('forest_plus_tr_22.csv')
tr24 = pd.read_csv('forest_plus_tr_23.csv')
tr25 = pd.read_csv('forest_plus_tr_24.csv')
tr = pd.concat([tr1,tr2,tr3,tr4,tr5, 
                tr6,tr7,tr8,tr9,tr10, 
                tr11,tr12,tr13,tr14,tr15,
                tr16,tr17,tr18,tr19,tr20,
                tr21,tr22,tr23,tr24,tr25], axis=0)

ts.to_csv('forestplus_ts.csv')
tr.to_csv('forestplus_tr.csv')

In [44]:
# rename with a more descriptive name 
full_test = ts
full_train = tr

In [45]:
full_test

Unnamed: 0.1,Unnamed: 0,predicted_values,true_values
0,0,-0.397879,-0.066947
1,1,-0.263239,-0.322491
2,2,-0.276269,0.000000
3,3,0.705556,-0.352202
4,4,-0.328592,-0.490350
...,...,...,...
48,48,-0.267698,0.462398
49,49,-0.470420,0.531479
50,50,-0.346488,-0.482099
51,51,-0.442404,-0.300000


In [46]:
# set up true and pred values from the test dataset 
full_y_test_true = full_test['true_values']
full_y_test_pred = full_test['predicted_values']

In [47]:
# set up true and pred values from the train dataset 
full_y_train_true = full_train['true_values']
full_y_train_pred = full_train['predicted_values']

In [None]:
# plot true vs predicted values for the train set
plt.scatter(full_y_train_true, full_y_train_pred)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.axis('equal')
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
_ = plt.plot([-100, 100], [-100, 100]) # make a straight line showing 1-1 relationship 
#plt.show()
plt.savefig('random_forest_mordred_train.png', dpi=300)

In [None]:
# plot true vs predicted values for the test set
plt.scatter(full_y_test_true, full_y_test_pred)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.axis('equal')
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
_ = plt.plot([-100, 100], [-100, 100]) # make a straight line showing 1-1 relationship 
#plt.show()
plt.savefig('random_forest_mordred_test.png', dpi=300)

In [48]:
print("Train MAE: " + str(mean_absolute_error(full_train['true_values'], full_train['predicted_values'])))

Train MAE: 0.15971937184918553


In [49]:
train_r = np.corrcoef(full_y_train_true, full_y_train_pred)
print("Train R2 Score: " + str(train_r[0, 1]**2))

Train R2 Score: 0.9069100996022176


In [50]:
print("Test MAE: " + str(mean_absolute_error(full_test['true_values'], full_test['predicted_values'])))

Test MAE: 0.41404726165978306


In [51]:
test_r = np.corrcoef(full_y_test_true, full_y_test_pred)
print("Test R2 Score: " + str(test_r[0, 1]**2))

Test R2 Score: 0.0002481724609502936
