# Installing and Importing packages

In [2]:
# Use if you have R libraries installed on a different path

#.libPaths( c( "/tempr" , .libPaths() ) )

# CRAN packages this notebook needs.
requiredPackages <- c("dplyr", # Data manipulation library
                      "data.table", # Fast processing of large data
                      "protr", # Generating numerical representation of protein sequences
                      "ggplot2", # Plotting Data package
                      "gplots",  # Plotting Data package
                      "RColorBrewer",  # Ready-to-use color palettes for graphics
                      "ggpubr", # ggscatter() / ggarrange() for the summary figures
                      "caret",
                      "gridExtra",
                      "BiocManager", # Installer for Bioconductor packages (pcaMethods below)
                      "ggfortify")

# TRUE when `pkg` can be located in the current library paths.
# system.file() returns "" for a package that is not installed; this avoids
# rebuilding the full installed.packages() matrix for every single lookup.
isPackageInstalled <- function(pkg) {
    nzchar(system.file(package = pkg))
}

# Install missing CRAN packages one at a time; the check is repeated per
# package so anything pulled in as a dependency of an earlier install is skipped.
for (pkg in requiredPackages) {
    if (!isPackageInstalled(pkg)) {
        install.packages(pkg)
    }
}

# pcaMethods is distributed through Bioconductor, so it goes through BiocManager.
# NOTE(review): `warning` is not a named argument of BiocManager::install(); it
# is forwarded through `...`. If the intent was to turn install warnings into
# errors, that needs a different mechanism -- confirm before changing.
if (!isPackageInstalled("pcaMethods")) {
    BiocManager::install("pcaMethods", warning=stop)
}

In [4]:
library(dplyr)
library(data.table)
library(protr)
library(ggplot2)
library(gplots)
library(RColorBrewer)
library(ggpubr)
library(caret)
library(gridExtra)
library(pcaMethods)
library(ggfortify)

# Load the project-local helper definitions.
source("functions.R")

# Fixed seed for the random number generator.
set.seed(123)

# Session options: do not convert strings to factors, and set the
# repr.matrix.* column/row limits.
options(
    stringsAsFactors = FALSE,
    repr.matrix.max.cols = 150,
    repr.matrix.max.rows = 200
)

# ML prediction results plots (dual model)

#### The results of the first model predicting wild-type protein-ligand binding affinity using 6 different split scenarios

In [5]:
# Read a two-column prediction CSV, name the columns "observed" and
# "predicted", and keep only rows whose predicted value is >= `minPredicted`.
readPredictionTable <- function(csvPath, minPredicted = -16.0) {
    tbl <- fread(csvPath)
    colnames(tbl) <- c("observed", "predicted")
    tbl[tbl$predicted >= minPredicted, ]
}

# Scatter plot of predicted against observed values with a regression line and
# its confidence band; `axisLimits` is applied to both axes.
scatterObservedVsPredicted <- function(tbl, axisLimits) {
    ggscatter(tbl, x = "observed", y = "predicted",
              add = "reg.line", conf.int = TRUE,
              cor.coef = FALSE, cor.method = "pearson",
              color = rgb(0, 0, 1, 1/4), shape = 20,
              xlim = axisLimits, ylim = axisLimits,
              xlab = "Experimental binding affinity", ylab = "Predicted binding affinity")
}

# Axis range shared by all six panels of this figure.
axisLimits <- c(-12, -4)

png("images/predictions/rf-2-1-summary.png", width = 1600, height = 1200, res = 120, units = "px")

# Everything that can fail runs inside tryCatch() so the PNG device is closed
# even when a CSV is missing or a plot cannot be built.
invisible(tryCatch({
    predictions <- readPredictionTable("work/predictions/dual_model_pl_split_random.csv")
    plot1 <- scatterObservedVsPredicted(predictions, axisLimits)

    predictions <- readPredictionTable("work/predictions/dual_model_pl_split_protein.csv")
    plot2 <- scatterObservedVsPredicted(predictions, axisLimits)

    predictions <- readPredictionTable("work/predictions/dual_model_pl_split_pocket.csv")
    plot3 <- scatterObservedVsPredicted(predictions, axisLimits)

    predictions <- readPredictionTable("work/predictions/dual_model_pl_split_ligand_weight.csv")
    plot4 <- scatterObservedVsPredicted(predictions, axisLimits)

    # Note the numbering: the ligand-volume split is plot6 and the
    # ligand-diversity split is plot5; the grid below places plot6 first.
    predictions <- readPredictionTable("work/predictions/dual_model_pl_split_ligand_volume.csv")
    plot6 <- scatterObservedVsPredicted(predictions, axisLimits)

    predictions <- readPredictionTable("work/predictions/dual_model_pl_split_ligand_diversity.csv")
    plot5 <- scatterObservedVsPredicted(predictions, axisLimits)

    # Explicit print(): the grid is drawn on the PNG device without relying on
    # top-level autoprinting, so this also works when the code is source()d.
    print(ggarrange(plot2, plot3,
                    plot1, plot4,
                    plot6, plot5, ncol = 2, nrow = 3))
}, finally = dev.off()))


`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'



#### The results of the second model predicting mutated protein-ligand binding affinity

In [9]:
# Read a two-column prediction CSV, name the columns "observed" and
# "predicted", and keep only rows whose predicted value is >= `minPredicted`.
readPredictionTable <- function(csvPath, minPredicted = -16.0) {
    tbl <- fread(csvPath)
    colnames(tbl) <- c("observed", "predicted")
    tbl[tbl$predicted >= minPredicted, ]
}

# Scatter plot of predicted against observed values with a regression line and
# its confidence band; `axisLimits` is applied to both axes.
scatterObservedVsPredicted <- function(tbl, axisLimits) {
    ggscatter(tbl, x = "observed", y = "predicted",
              add = "reg.line", conf.int = TRUE,
              cor.coef = FALSE, cor.method = "pearson",
              color = rgb(0, 0, 1, 1/4), shape = 20,
              xlim = axisLimits, ylim = axisLimits,
              xlab = "Experimental binding affinity", ylab = "Predicted binding affinity")
}

# Axis range shared by all four panels of this figure.
axisLimits <- c(-16, -2)

png("images/predictions/rf-2-2-summary.png", width = 1600, height = 1200, res = 120, units = "px")

# Everything that can fail runs inside tryCatch() so the PNG device is closed
# even when a CSV is missing or a plot cannot be built.
invisible(tryCatch({
    predictions <- readPredictionTable("work/predictions/mutation_model_real_wt_ba_test_real_wt_ba.csv")
    plot1 <- scatterObservedVsPredicted(predictions, axisLimits)

    predictions <- readPredictionTable("work/predictions/mutation_model_real_wt_ba_test_pred_wt_ba.csv")
    plot2 <- scatterObservedVsPredicted(predictions, axisLimits)

    predictions <- readPredictionTable("work/predictions/mutation_model_pred_wt_ba_test_real_wt_ba.csv")
    plot3 <- scatterObservedVsPredicted(predictions, axisLimits)

    predictions <- readPredictionTable("work/predictions/mutation_model_pred_wt_ba_test_pred_wt_ba.csv")
    plot4 <- scatterObservedVsPredicted(predictions, axisLimits)

    # Explicit print(): the grid is drawn on the PNG device without relying on
    # top-level autoprinting, so this also works when the code is source()d.
    print(ggarrange(plot2, plot3,
                    plot1, plot4,
                    ncol = 2, nrow = 2))
}, finally = dev.off()))


`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'

