In [1]:
#
# Load R packages
#

library("reshape2")
library("ggplot2")
library("viridis")
library("ggpubr")
library("RColorBrewer")
library("tidyverse")
library("SDMTools")
library("scales")
library("ggpmisc")
library("dplyr")
library("ggExtra")
library("scales")
library("scico")
library("tidyr")

Loading required package: viridisLite

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1
[32m✔[39m [34mpurrr  [39m 0.3.4     

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘scales’


The following object is masked from ‘package:purrr’:

    discard


The following object is masked from ‘package:readr’:

    col_factor


The following object is masked from ‘package:viridis’:

    viridis_pal


Loading required package: ggpp


Attaching package: ‘ggpp’


The following object is masked from ‘package:gg

In [2]:
#
# Change to the working directory
#

# All "./..." input paths and the output files written by later cells are
# resolved relative to the directory selected here.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves one further level up each time.
setwd("../")


In [3]:
#
# Function to compute the deltas for a given vector 
#

intra_element_diff = function(myvector){
  # Pairwise differences myvector[i] - myvector[j] for every pair i < j.
  #
  # myvector: atomic vector whose elements support subtraction.
  # Returns:  vector of length n*(n-1)/2 ordered as (1,2), (1,3), ..., (1,n),
  #           (2,3), ... ; NULL when fewer than two elements are supplied,
  #           because no pair can be formed.
  n_elem = length(myvector)
  if (n_elem < 2) {
    # Explicit guard: looping over 1:length(x) counts backwards (1, 0) for an
    # empty input and would produce NA differences instead of "no pairs".
    return(c())
  }
  # combn() enumerates index pairs in the same order as the nested i < j loop;
  # seq_len() prevents combn() from interpreting a scalar as "1..x".
  pair_idx = combn(seq_len(n_elem), 2)
  return(myvector[pair_idx[1,]] - myvector[pair_idx[2,]])
}

#
# Function to get all the possible row combinations for a given dataframe
#

rows_comb = function(mydf){
  # Build every unordered pair of rows of mydf, side by side.
  #
  # mydf:    data frame (one row per item).
  # Returns: data frame with nrow(mydf) choose 2 rows; the columns of the first
  #          member of each pair are followed by the columns of the second
  #          member, so column names appear twice. Pairs are ordered as
  #          (1,2), (1,3), ..., (2,3), ...
  #          With fewer than two rows there are no pairs and a zero-row result
  #          with the same doubled columns is returned (combn() would raise
  #          an error in that case).
  n_rows = nrow(mydf)
  if (n_rows < 2) {
    return(cbind(mydf[integer(0),], mydf[integer(0),]))
  }
  cmb = combn(seq_len(n_rows), 2)
  comb_df = cbind(mydf[cmb[1,],], mydf[cmb[2,],])
  return(comb_df)
}


In [4]:
#
# Function to compute the Kendall tau-b correlation coefficient for a given number of pairs of differences
#

compute_tau_b = function(mydf,var1,var2) {
    # Kendall tau-b computed from a table of pairwise differences.
    #
    # mydf:  data frame with one row per pair of observations.
    # var1,
    # var2:  names of the two columns holding the signed differences of the
    #        two variables for each pair.
    # Returns: tau-b rounded to two digits. The result is NaN when the
    #          denominator is zero (no pair is untied in one of the variables).
    #
    # Pairs are classified by the signs of the two differences:
    #   concordant - both negative or both positive
    #   discordant - opposite signs
    #   tied only in var1 / only in var2 - exactly one difference is zero
    # Rows with a missing difference are not counted in any class.
    d1 = mydf[[var1]]
    d2 = mydf[[var2]]
    # Counting with sum() works even when one sign never occurs (a
    # cross-tabulation would then not be 2x2). Counts are kept as doubles so
    # the product in the denominator cannot overflow integer range.
    count_pairs = function(flag) as.numeric(sum(flag, na.rm = TRUE))
    num_conc = count_pairs(d1 < 0 & d2 < 0) + count_pairs(d1 > 0 & d2 > 0)
    num_disc = count_pairs(d1 < 0 & d2 > 0) + count_pairs(d1 > 0 & d2 < 0)
    ties_only_1 = count_pairs(d1 == 0 & d2 != 0)
    ties_only_2 = count_pairs(d1 != 0 & d2 == 0)
    tau_b = round((num_conc-num_disc)/sqrt((num_conc+num_disc+ties_only_1)*(num_conc+num_disc+ties_only_2)),digits=2)
    return(tau_b)
}


In [5]:
#
# Function to carry out the analysis based on pairs of sequences using AF2
#

pairwise_plots = function(gdt_ts_df,sop_df,tcs_df,plddt_df,ref_aligner,included_struct){
  # Pairwise (sequence-pair) analysis: combine per-sequence GDT-TS, TCS and
  # pLDDT scores into per-pair values, join them with the per-pair SoP scores
  # and save the SoP vs geometric-mean-GDT-TS scatter plot (Fig2).
  #
  # gdt_ts_df, tcs_df, plddt_df:
  #     per-sequence tables with a Family column. Column names of the paired
  #     tables are assigned by position below, which implies the column orders
  #     (Sequence, score, Family) for gdt_ts_df and plddt_df and
  #     (Sequence, Family, score) for tcs_df.
  # sop_df:
  #     per-pair table with Sequence_1, Sequence_2, Family and SoP columns.
  # ref_aligner, included_struct:
  #     labels used only to build the output file name.
  # Returns: the merged per-pair data frame including the GM_GDT_TS, GM_TCS
  #          and GM_pLDDT geometric-mean columns.
  # Side effect: writes "Fig2_sop_vs_gdt_ts_pairwise_with_gm_*.png" in the
  #              working directory.

  # All within-family row pairs of a per-sequence table. The result is built
  # from a local list, so a data frame with the same name left over in the
  # global environment can never be picked up and appended to.
  pairs_within_families = function(score_df) {
    per_family = lapply(levels(factor(score_df$Family)), function(fam) {
      rows_comb(score_df[which(score_df$Family == fam),])
    })
    do.call(rbind, per_family)
  }

  pair_df = pairs_within_families(gdt_ts_df)
  pair_df_tcs = pairs_within_families(tcs_df)
  pair_df_plddt = pairs_within_families(plddt_df)

  # rows_comb() repeats every column; columns whose content duplicates an
  # earlier column (the second Family column) are dropped before naming.
  pair_df = pair_df[!duplicated(as.list(pair_df))]
  colnames(pair_df) = c("Sequence_1","GDT_TS_1","Family","Sequence_2","GDT_TS_2")
  pair_df$GDT_TS_1 = pair_df$GDT_TS_1*100
  pair_df$GDT_TS_2 = pair_df$GDT_TS_2*100

  pair_df_tcs = pair_df_tcs[!duplicated(as.list(pair_df_tcs))]
  colnames(pair_df_tcs) = c("Sequence_1","Family","TCS_1","Sequence_2","TCS_2")
  pair_df_tcs$TCS_1 = pair_df_tcs$TCS_1/10
  pair_df_tcs$TCS_2 = pair_df_tcs$TCS_2/10

  pair_df_plddt = pair_df_plddt[!duplicated(as.list(pair_df_plddt))]
  colnames(pair_df_plddt) = c("Sequence_1","pLDDT_1","Family","Sequence_2","pLDDT_2")
  merged_df_left = merge(merge(merge(sop_df,pair_df,by=c("Sequence_1","Sequence_2","Family")),pair_df_tcs,by=c("Sequence_1","Sequence_2","Family")),pair_df_plddt,by=c("Sequence_1","Sequence_2","Family"))
  # Second pass with the pLDDT pairs in swapped orientation, so pairs stored
  # as (B, A) in the pLDDT table are matched as well.
  colnames(pair_df_plddt) = c("Sequence_2","pLDDT_2","Family","Sequence_1","pLDDT_1")
  merged_df_right = merge(merge(merge(sop_df,pair_df,by=c("Sequence_1","Sequence_2","Family")),pair_df_tcs,by=c("Sequence_1","Sequence_2","Family")),pair_df_plddt,by=c("Sequence_1","Sequence_2","Family"))
  merged_df = rbind(merged_df_left,merged_df_right)

  # Geometric mean of the two members of each pair. Columns are selected by
  # name so that extra columns in sop_df cannot shift the selection.
  geometric_mean = function(x) exp(mean(log(x)))
  merged_df$GM_GDT_TS = apply(merged_df[,c("GDT_TS_1","GDT_TS_2")],1, geometric_mean)
  merged_df$GM_TCS = apply(merged_df[,c("TCS_1","TCS_2")],1, geometric_mean)
  merged_df$GM_pLDDT = apply(merged_df[,c("pLDDT_1","pLDDT_2")],1, geometric_mean)

  sop_title = paste0("SoP score on pairs of sequences MSA-AF2 vs MSA-PDB (%) ")
  gdt_title_gm = paste0("Geometric mean of GDT-TS scores on pairs of sequences (%) ")

  p = ggplot(merged_df,aes(x=SoP,y=GM_GDT_TS,color=GM_TCS)) +
    geom_point(alpha = 0.8) +
    stat_cor(inherit.aes = FALSE,data=merged_df,aes(x=SoP,y=GM_GDT_TS),method = "pearson") +
    geom_smooth(method='lm',inherit.aes = FALSE,data=merged_df,aes(x=SoP,y=GM_GDT_TS)) +
    xlim(0,100) + ylim(0,100) +
    scale_color_viridis_c(option="viridis",limits=c(0,100)) +
    guides(color = guide_colorbar(title.vjust = .8),shape = guide_legend(title.vjust = .8)) +
    theme(legend.position = "top",legend.title.align=0.5) +
    xlab(sop_title) + ylab(gdt_title_gm) + labs(color='TCS')
  ggsave(paste0("Fig2_sop_vs_gdt_ts_pairwise_with_gm_",included_struct,"_ref_",ref_aligner,"_alphafold.png"),ggMarginal(p, type = "density",margins = "y",fill = "darkblue"),dpi = 700)

  return(merged_df)

}


In [6]:
#
# Function to carry out the analysis per sequence using AF2
#

delta_analysis_AF2 = function (tcs_df,gdt_ts_df,plddts_df,sop_df,nirmsd_df,ref_aligner,included_struct,plots) {
  # Per-sequence delta analysis: within every family, take the difference of
  # each score between all pairs of sequences, compute Kendall tau-b between
  # the SoP deltas and the TCS / GDT-TS / pLDDT deltas, and attach a
  # permutation p-value (SoP shuffled within family, 1000 permutations).
  #
  # tcs_df, gdt_ts_df, plddts_df, sop_df, nirmsd_df:
  #     per-sequence tables sharing the key columns Sequence and Family. After
  #     merging, the columns TCS, pLDDT, SoP and niRMSD are read by name; the
  #     4th merged column is renamed to GDT_TS (presumably the single score
  #     column of gdt_ts_df - confirm against the caller).
  # ref_aligner, included_struct:
  #     labels used in axis titles and output file names.
  # plots:
  #     TRUE or "TRUE" to write the Fig3a/3b/3c density plots and
  #     "Supp_table_3.tsv" into the working directory.
  # Returns: data frame with one row per within-family sequence pair and the
  #          columns delta_tcs, delta_sop, delta_gdt_ts, delta_plddt,
  #          delta_nirmsd (sign flipped) and Family.
  # Side effect: set.seed() resets the global random number generator.
  n_perm = 1000

  merged_df = merge(merge(merge(merge(tcs_df,gdt_ts_df,by=c("Sequence","Family")),plddts_df,by=c("Sequence","Family")),sop_df,by=c("Sequence","Family")),nirmsd_df,by=c("Sequence","Family"))
  merged_df = unique(merged_df)
  colnames(merged_df)[4] = "GDT_TS"
  merged_df$GDT_TS = merged_df$GDT_TS*100
  merged_df$TCS = merged_df$TCS/10
  merged_df = merged_df[order(merged_df$Family),]
  families = levels(factor(merged_df$Family))

  # Within-family deltas of every score, stacked over all families. With
  # permute_sop = TRUE the SoP values are shuffled inside each family first.
  # The table is built from scratch on every call, so nothing is carried over
  # between permutations or from objects of the same name in the global
  # environment.
  deltas_per_family = function(permute_sop) {
    per_family = lapply(families, function(fam) {
      tmp_df = merged_df[which(merged_df$Family == fam),]
      if (permute_sop) {
        # Index-based shuffle: draws the same permutation as sample(x) for
        # more than one value and stays correct for a single value, where
        # sample(x) would sample from 1:x instead.
        tmp_df$SoP = tmp_df$SoP[sample.int(nrow(tmp_df))]
      }
      delta_tcs = intra_element_diff(tmp_df$TCS)
      delta_sop = intra_element_diff(tmp_df$SoP)
      delta_gdt_ts = intra_element_diff(tmp_df$GDT_TS)
      delta_plddt = intra_element_diff(tmp_df$pLDDT)
      delta_nirmsd = intra_element_diff(tmp_df$niRMSD)*(-1)
      data.frame(delta_tcs,delta_sop,delta_gdt_ts,delta_plddt,delta_nirmsd,Family=rep(fam,length(delta_gdt_ts)))
    })
    do.call(rbind, per_family)
  }

  #
  # Permutation: null distribution of tau-b, one value per permutation
  #

  set.seed(44969)
  collect_tau_b_perm_sop_vs_tcs = numeric(n_perm)
  collect_tau_b_perm_sop_vs_gdt_ts = numeric(n_perm)
  collect_tau_b_perm_sop_vs_plddt = numeric(n_perm)
  for (perm in seq_len(n_perm)){
    # Fresh table for each permutation; pooling the deltas of earlier
    # permutations would shrink the spread of the null distribution.
    delta_df_perm = deltas_per_family(permute_sop = TRUE)
    collect_tau_b_perm_sop_vs_tcs[perm] = compute_tau_b(delta_df_perm,"delta_sop","delta_tcs")
    collect_tau_b_perm_sop_vs_gdt_ts[perm] = compute_tau_b(delta_df_perm,"delta_sop","delta_gdt_ts")
    collect_tau_b_perm_sop_vs_plddt[perm] = compute_tau_b(delta_df_perm,"delta_sop","delta_plddt")
  }

  #
  # Deltas per family (observed data)
  #

  delta_df = deltas_per_family(permute_sop = FALSE)

  # One-sided permutation p-value label: share of permuted tau-b values that
  # are at least as large as the observed one; reported as "p < 1/n_perm"
  # when no permutation reaches the observed value.
  perm_pval_label = function(perm_taus, observed_tau) {
    n_extreme = length(which(perm_taus >= observed_tau))
    if (n_extreme == 0) {
      paste0("p < ", formatC(1/n_perm, format = "e", digits = 0))
    } else {
      paste0("p = ", formatC(n_extreme/n_perm, format = "e", digits = 2))
    }
  }

  tau_b_sop_vs_tcs = compute_tau_b(delta_df,"delta_sop","delta_tcs")
  pval_sop_vs_tcs = perm_pval_label(collect_tau_b_perm_sop_vs_tcs,tau_b_sop_vs_tcs)

  tau_b_sop_vs_gdt_ts = compute_tau_b(delta_df,"delta_sop","delta_gdt_ts")
  pval_sop_vs_gdt_ts = perm_pval_label(collect_tau_b_perm_sop_vs_gdt_ts,tau_b_sop_vs_gdt_ts)

  tau_b_sop_vs_plddt = compute_tau_b(delta_df,"delta_sop","delta_plddt")
  pval_sop_vs_plddt = perm_pval_label(collect_tau_b_perm_sop_vs_plddt,tau_b_sop_vs_plddt)

  sop_title = paste0("Δ Sum-of-Pairs scores between ", included_struct," pairs within family - MSA-AF2 vs MSA-PDB")
  tcs_title = paste0("Δ TCS scores between ",included_struct, " pairs within family - MSA-AF2")
  gdt_title = paste0("Δ GDT_TS scores between ",included_struct, " pairs within family")
  plddt_title = paste0("Δ pLDDT scores between ", included_struct," pairs within family")

  if (plots == "TRUE") {
    # Save one delta-SoP vs delta-score density panel and return its row of
    # the summary table (concordant, discordant, tied, tau-b, p-value).
    # y_var names the delta column; mapping is the matching aes().
    delta_density_panel = function(y_var, mapping, y_title, tau_b, pval, out_file) {
      dx = delta_df$delta_sop
      dy = delta_df[[y_var]]
      # Explicit sums stay valid when one sign combination never occurs.
      num_conc = as.numeric(sum(dx < 0 & dy < 0, na.rm = TRUE)) + as.numeric(sum(dx > 0 & dy > 0, na.rm = TRUE))
      num_disc = as.numeric(sum(dx < 0 & dy > 0, na.rm = TRUE)) + as.numeric(sum(dx > 0 & dy < 0, na.rm = TRUE))
      num_ties = length(which(dx == 0 | dy == 0))
      mylabel_disc = paste0(num_disc," (",round(100*num_disc/nrow(delta_df),digits=2),"%)")
      mytau = paste0("τ = ",tau_b,", ",pval,"\n","Discordant pairs = ",mylabel_disc)
      p = ggplot(delta_df,mapping) +
          geom_density_2d_filled(contour_var = "count",show.legend = FALSE) +
          scale_fill_scico_d(palette = "bilbao") +
          geom_hline(yintercept=0,size=0.1) +
          geom_vline(xintercept=0,size=0.1) +
          # Shade the two concordant quadrants.
          annotate("rect", xmin = -Inf, xmax = 0, ymin = -Inf, ymax = 0, fill= "green",alpha = 0.1) +
          annotate("rect", xmin = Inf, xmax = 0, ymin = Inf, ymax = 0, fill= "green",alpha = 0.1) +
          annotate(geom="text", x=-10, y=60, label=mytau) +
          scale_x_continuous(limits = c(-100,100),expand = c(0.01, 0.01),trans=scales::pseudo_log_trans(base = 10)) +
          scale_y_continuous(limits = c(-100,100),expand = c(0.01, 0.01),trans=scales::pseudo_log_trans(base = 10)) +
          theme_classic() + xlab(sop_title) + ylab(y_title) +
          theme(axis.title = element_text(size = 10))
      # The plot is passed explicitly instead of relying on the last plot.
      ggsave(out_file,plot=p,dpi="retina")
      data.frame("V1"=num_conc,"V2"=num_disc,"V3"=num_ties,"V4"=tau_b,"V5"=pval)
    }

    row_tcs = delta_density_panel("delta_tcs",aes(x=delta_sop,y=delta_tcs),tcs_title,tau_b_sop_vs_tcs,pval_sop_vs_tcs,
                                  paste0("Fig3a_delta_sop_vs_delta_tcs_",included_struct,"_ref_",ref_aligner,"_alphafold.png"))
    row_plddt = delta_density_panel("delta_plddt",aes(x=delta_sop,y=delta_plddt),plddt_title,tau_b_sop_vs_plddt,pval_sop_vs_plddt,
                                    paste0("Fig3c_delta_sop_vs_delta_plddt_",included_struct,"_ref_",ref_aligner,"_alphafold.png"))
    row_gdt_ts = delta_density_panel("delta_gdt_ts",aes(x=delta_sop,y=delta_gdt_ts),gdt_title,tau_b_sop_vs_gdt_ts,pval_sop_vs_gdt_ts,
                                     paste0("Fig3b_delta_sop_vs_delta_gdt_ts_",included_struct,"_ref_",ref_aligner,"_alphafold.png"))

    sum_tab = rbind(row_tcs,row_plddt,row_gdt_ts)
    colnames(sum_tab) = c("Concordant pairs", "Discordant pairs", "Tied pairs", "Tau-b", "p-value")
    row.names(sum_tab) = c("SoP vs TCS","SoP vs pLDDT","SoP vs GDT-TS")
    write.table(sum_tab, file="Supp_table_3.tsv",quote=FALSE, sep="\t",row.names=TRUE)
  }

  return(delta_df)
}


In [7]:
#
# Load NiRMSD values
#

# Per-family NiRMSD, one column per aligner. The file has no header, so the
# column names are assigned by position.
nirmsd_ref = read.table("./selected_comparisons_nirmsd.txt",header = F,stringsAsFactors = F)
colnames(nirmsd_ref)=c("Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2_REF_NAT","3DCoffee_TMalign_AF2_REF_NAT","mTMalign_AF2_REF_NAT","3DCoffee_AF2_REF_AF2","3DCoffee_TMalign_AF2_REF_AF2","mTMalign_AF2_REF_AF2")
# Append an "Average" row holding the column means. c() mixes a string with
# numbers, so the appended row is character and rbind() turns every column
# into character; the sapply() below converts the score columns back to numeric.
nirmsd_ref = rbind(nirmsd_ref,c("Average",colMeans(nirmsd_ref[,-1])))
# Row names are the (uniquified) family names; the last row is "Average".
row.names(nirmsd_ref) = make.unique(nirmsd_ref$Family)
nirmsd_ref[colnames(nirmsd_ref)[-1]] <- sapply(nirmsd_ref[colnames(nirmsd_ref)[-1]],as.numeric)
write.table(nirmsd_ref, file="Supp_table_4_NiRMSD_per_aligner.tsv",quote=FALSE, sep="\t",row.names=FALSE)

# Per-sequence NiRMSD.
nirmsd_ref_avg = read.table("./selected_comparisons_nirmsd_avg.txt",header = F,stringsAsFactors = F)
# Keep only the first of any columns with identical content.
# NOTE(review): presumably the input repeats identifier columns - confirm. Two
# score columns that happen to hold identical values would be dropped as well,
# and the positional names assigned below would then no longer line up.
nirmsd_ref_avg = nirmsd_ref_avg[!duplicated(as.list(nirmsd_ref_avg))]
colnames(nirmsd_ref_avg)=c("Sequence","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2_REF_NAT","3DCoffee_TMalign_AF2_REF_NAT","mTMalign_AF2_REF_NAT","3DCoffee_AF2_REF_AF2","3DCoffee_TMalign_AF2_REF_AF2","mTMalign_AF2_REF_AF2")

# Per-pair NiRMSD; same duplicate-column removal and positional naming.
nirmsd_ref_pair = read.table("./selected_comparisons_nirmsd_pair.txt",header = F,stringsAsFactors = F)
nirmsd_ref_pair = nirmsd_ref_pair[!duplicated(as.list(nirmsd_ref_pair))]
colnames(nirmsd_ref_pair)=c("Sequence_1","Sequence_2","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2_REF_NAT","3DCoffee_TMalign_AF2_REF_NAT","mTMalign_AF2_REF_NAT","3DCoffee_AF2_REF_AF2","3DCoffee_TMalign_AF2_REF_AF2","mTMalign_AF2_REF_AF2")


In [8]:
#
# Load percent identity (PID) values
#

# Per-family percent identity, one column per aligner. Columns with content
# identical to an earlier column are dropped, then names are set by position.
# NOTE(review): the de-duplication compares column contents, so two aligners
# with identical values in every row would lose a column - confirm inputs.
pid_ref = read.table("./selected_comparisons_pid.txt",header = F,stringsAsFactors = F)
pid_ref = pid_ref[!duplicated(as.list(pid_ref))]
colnames(pid_ref)=c("Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")

# Per-sequence percent identity. Note the positional names place Family third,
# after the first aligner column.
pid_ref_avg = read.table("./selected_comparisons_pid.avg.txt",header = F,stringsAsFactors = F)
pid_ref_avg = pid_ref_avg[!duplicated(as.list(pid_ref_avg))]
colnames(pid_ref_avg)=c("Sequence","Famsa","Family","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")

# Per-pair percent identity. Here Family is the fourth column.
pid_ref_pair = read.table("./selected_comparisons_pid.pair.txt",header = F,stringsAsFactors = F)
pid_ref_pair = pid_ref_pair[!duplicated(as.list(pid_ref_pair))]
colnames(pid_ref_pair)=c("Sequence_1","Sequence_2","Famsa","Family","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")



In [9]:
#
# Load alignment lengths
#

# Per-family alignment length, one column per aligner (headerless file).
aln_len = read.table("./selected_comparisons_aln_length.txt",header = F,stringsAsFactors = F)
# Keep only the first of any columns with identical content before naming by
# position. NOTE(review): two aligners producing the same length for every
# family would lose a column here - confirm against the input file.
aln_len = aln_len[!duplicated(as.list(aln_len))]
colnames(aln_len)=c("Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")



In [10]:
#
# Load structural comparison metrics - AF2 vs NAT structures
#

# Per-sequence structural comparison of the AF2 model against the reference
# PDB structure (headerless file, names assigned by position).
alphafold = read.table("./alphafold_vs_ref_pdb_comparison_selected.tsv",header = F,stringsAsFactors = F)
colnames(alphafold)=c("Sequence","RMSD","TMscore","GDT_TS","Family")

# Working copy with "_AF2"-suffixed metric names. Columns whose content
# duplicates an earlier column are removed first; the five names below assume
# that nothing was removed.
complete_merged= alphafold
complete_merged = complete_merged[!duplicated(as.list(complete_merged))]
colnames(complete_merged) = c("Sequence","RMSD_AF2","TMscore_AF2","GDT_TS_AF2","Family")

# TM-score table: columns 1, 3 and 5 are Sequence, TMscore_AF2 and Family.
tmscore_complete_with_seqs = complete_merged[,c(1,3,5)]
colnames(tmscore_complete_with_seqs) = c("Sequence","alphafold","Family")
tmscore_complete_with_seqs = tmscore_complete_with_seqs[order(tmscore_complete_with_seqs$Family),]

# GDT_TS table in the same (Sequence, alphafold, Family) layout, sorted by family.
gdt_ts_complete_with_seqs = complete_merged[,c("Sequence","GDT_TS_AF2","Family")]
colnames(gdt_ts_complete_with_seqs) = c("Sequence","alphafold","Family")
gdt_ts_complete_with_seqs = gdt_ts_complete_with_seqs[order(gdt_ts_complete_with_seqs$Family),]

# Copy with GDT_TS multiplied by 100, used for the histogram and the
# per-family geometric mean.
gdt_ts_complete_with_seqs_rescale = gdt_ts_complete_with_seqs
gdt_ts_complete_with_seqs_rescale$alphafold = gdt_ts_complete_with_seqs_rescale$alphafold * 100
# ggsave() is called without a plot argument, so it saves the most recently
# created plot, i.e. p. geom_histogram() runs with its default number of bins.
p = ggplot(gdt_ts_complete_with_seqs_rescale, aes(x=alphafold,fill=Family)) + geom_histogram(color="black",position="stack") + theme_light() + xlab("AF2 GDT_TS")
ggsave("Supp_Fig2_gdt_ts_alphafold_histogram.png",dpi=700)

# Geometric mean of GDT_TS per family, rounded to two digits. as.numeric()
# strips the family names, leaving a plain vector in the order of the
# factor levels of Family.
gdt_ts_alphafold_geometric_mean = round(as.numeric(by(gdt_ts_complete_with_seqs_rescale$alphafold,factor(gdt_ts_complete_with_seqs_rescale$Family), function(x) exp(mean(log(x))))),digits=2)
                                                
# Per-sequence pLDDT values (headerless file, names assigned by position).
alphafold_plddts = read.table("./alphafold_plddts_selected.tsv",header = F, stringsAsFactors = F)
colnames(alphafold_plddts)=c("Sequence","pLDDT","Family")

                                                      

Saving 6.67 x 6.67 in image

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



In [11]:
#
# Load SoP scores with 3DCoffee_NAT as reference
#

# Per-family SoP scores, one column per aligner. Columns whose content
# duplicates an earlier column are dropped, then names are set by position.
# NOTE(review): the de-duplication compares contents, so two aligners with
# identical scores in every row would lose a column - confirm inputs.
ref_3dcoffee_sp = read.table("./selected_comparisons_ref_3dcoffee_sp.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_sp = ref_3dcoffee_sp[!duplicated(as.list(ref_3dcoffee_sp))]
colnames(ref_3dcoffee_sp)=c("Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2") #,"Deepblast_vs_REF"
#write.table(ref_3dcoffee_sp, file="selected_comparisons_ref_3dcoffee_sp.tsv",quote=FALSE, sep="\t",row.names=FALSE)

# Percentage of pairs included per family; its second column is attached to
# the SoP table by row position.
# NOTE(review): assumes both files list the families in the same order - confirm.
ref_3dcoffee_sp_perc = read.table("./selected_comparisons_ref_3dcoffee_sp.perc.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_sp_perc = ref_3dcoffee_sp_perc[!duplicated(as.list(ref_3dcoffee_sp_perc))]
colnames(ref_3dcoffee_sp_perc)=c("Family","Percentage of pairs included")
ref_3dcoffee_sp$perc_pairs = ref_3dcoffee_sp_perc[[2]]

# Per-family geometric mean of AF2 GDT_TS, also attached by row position.
# NOTE(review): gdt_ts_alphafold_geometric_mean is an unnamed vector, so this
# is only correct if the rows here are in the same family order - confirm.
ref_3dcoffee_sp$GDT_TS_AF2 = gdt_ts_alphafold_geometric_mean
write.table(ref_3dcoffee_sp, file="Supp_table_1b_selected_comparisons_ref_3dcoffee_sp.tsv",quote=FALSE, sep="\t",row.names=FALSE)


# Per-sequence SoP scores.
ref_3dcoffee_avg_sp = read.table("./selected_comparisons_ref_3dcoffee_avg_sp.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_avg_sp = ref_3dcoffee_avg_sp[!duplicated(as.list(ref_3dcoffee_avg_sp))]
colnames(ref_3dcoffee_avg_sp)= c("Sequence","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")
#write.table(ref_3dcoffee_avg_sp, file="selected_comparisons_ref_3dcoffee_avg_sp.tsv",quote=FALSE, sep="\t",row.names=FALSE)

# Per-pair SoP scores.
ref_3dcoffee_pair_sp = read.table("./selected_comparisons_ref_3dcoffee_pair_sp.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_pair_sp = ref_3dcoffee_pair_sp[!duplicated(as.list(ref_3dcoffee_pair_sp))]
colnames(ref_3dcoffee_pair_sp)=c("Sequence_1","Sequence_2","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")

  #
  # SoP without loops
  #
# Per-family scores from the ".without_loops" file; columns with content
# identical to an earlier column are dropped and names are set by position.
ref_3dcoffee_sp_without_loops = read.table("./selected_comparisons_ref_3dcoffee_sp.without_loops.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_sp_without_loops = ref_3dcoffee_sp_without_loops[!duplicated(as.list(ref_3dcoffee_sp_without_loops))]
colnames(ref_3dcoffee_sp_without_loops)=c("Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")
#write.table(ref_3dcoffee_sp_without_loops, file="selected_comparisons_ref_3dcoffee_sp.without_loops.tsv",quote=FALSE, sep="\t",row.names=FALSE)

# Per-sequence scores from the ".without_loops" file.
ref_3dcoffee_avg_sp_without_loops = read.table("./selected_comparisons_ref_3dcoffee_avg_sp.without_loops.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_avg_sp_without_loops = ref_3dcoffee_avg_sp_without_loops[!duplicated(as.list(ref_3dcoffee_avg_sp_without_loops))]
colnames(ref_3dcoffee_avg_sp_without_loops)= c("Sequence","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")
#write.table(ref_3dcoffee_avg_sp_without_loops, file="selected_comparisons_ref_3dcoffee_avg_sp_without_loops.tsv",quote=FALSE, sep="\t",row.names=FALSE)

# Per-pair scores from the ".without_loops" file.
ref_3dcoffee_pair_sp_without_loops = read.table("./selected_comparisons_ref_3dcoffee_pair_sp.without_loops.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_pair_sp_without_loops = ref_3dcoffee_pair_sp_without_loops[!duplicated(as.list(ref_3dcoffee_pair_sp_without_loops))]
colnames(ref_3dcoffee_pair_sp_without_loops)=c("Sequence_1","Sequence_2","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")

  #
  # SoP same state - Generate Supp Table 1
  #
# Per-family scores from the ".same_state" file; columns with content
# identical to an earlier column are dropped and names are set by position.
ref_3dcoffee_sp_same_state = read.table("./selected_comparisons_ref_3dcoffee_sp.same_state.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_sp_same_state = ref_3dcoffee_sp_same_state[!duplicated(as.list(ref_3dcoffee_sp_same_state))]
colnames(ref_3dcoffee_sp_same_state)=c("Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")

# Percentage of pairs included per family. Columns 11-16 and 19-22 are removed
# by position before the duplicate-column filter; two columns are expected to
# remain for the names assigned below.
# NOTE(review): the hard-coded positions depend on the layout of the input
# file, which is not visible here - confirm.
ref_3dcoffee_sp_same_state_perc = read.table("./selected_comparisons_ref_3dcoffee_sp.same_state.perc.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_sp_same_state_perc = ref_3dcoffee_sp_same_state_perc[,-c(11:16,19:22)]
ref_3dcoffee_sp_same_state_perc = ref_3dcoffee_sp_same_state_perc[!duplicated(as.list(ref_3dcoffee_sp_same_state_perc))]
colnames(ref_3dcoffee_sp_same_state_perc)=c("Family","Percentage of pairs included")
# Attached by row position. NOTE(review): assumes the same family order in
# both files - confirm.
ref_3dcoffee_sp_same_state$perc_pairs = ref_3dcoffee_sp_same_state_perc[[2]]

# Per-family geometric mean of AF2 GDT_TS, attached by row position.
# NOTE(review): gdt_ts_alphafold_geometric_mean is an unnamed vector, so this
# is only correct if the rows here are in the same family order - confirm.
ref_3dcoffee_sp_same_state$GDT_TS_AF2 = gdt_ts_alphafold_geometric_mean
write.table(ref_3dcoffee_sp_same_state, file="Supp_table_1a_selected_comparisons_ref_3dcoffee_sp.same_state.tsv",quote=FALSE, sep="\t",row.names=FALSE)

# Per-sequence scores from the ".same_state" file.
ref_3dcoffee_avg_sp_same_state = read.table("./selected_comparisons_ref_3dcoffee_avg_sp.same_state.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_avg_sp_same_state = ref_3dcoffee_avg_sp_same_state[!duplicated(as.list(ref_3dcoffee_avg_sp_same_state))]
colnames(ref_3dcoffee_avg_sp_same_state)= c("Sequence","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")
#write.table(ref_3dcoffee_avg_sp_same_state, file="selected_comparisons_ref_3dcoffee_avg_sp.same_state.tsv",quote=FALSE, sep="\t",row.names=FALSE)

# Per-pair scores from the ".same_state" file.
ref_3dcoffee_pair_sp_same_state = read.table("./selected_comparisons_ref_3dcoffee_pair_sp.same_state.txt",header = F,stringsAsFactors = F)
ref_3dcoffee_pair_sp_same_state = ref_3dcoffee_pair_sp_same_state[!duplicated(as.list(ref_3dcoffee_pair_sp_same_state))]
colnames(ref_3dcoffee_pair_sp_same_state)=c("Sequence_1","Sequence_2","Family","Famsa","Ginsi","MSAProbs","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")


In [21]:
#
# Two-sample Wilcoxon test on the per-family SoP scores of MSA-Seq (Ginsi) and MSA-AF2 (3DCoffee_AF2)
#

# wilcox.test() is called with its defaults, i.e. the two columns are treated as unpaired samples.
# NOTE(review): both columns come from the same families - confirm that an unpaired test is intended.
res_wilcox = wilcox.test(x = ref_3dcoffee_sp_same_state$Ginsi, y = ref_3dcoffee_sp_same_state$`3DCoffee_AF2`)

# p-value, shown to four decimals
round(res_wilcox[["p.value"]], digits = 4)

In [29]:
#
# Generate Fig1
#
# Scatter plot of the pairwise SoP score of MSA-AF2 (x) against MSA-Seq (y), coloured by
# family, with marginal density plots, saved as a PNG.
#

# Family order follows the row names of nirmsd_ref, leaving out its last row.
ref_3dcoffee_pair_sp_same_state$Family = factor(ref_3dcoffee_pair_sp_same_state$Family,levels=row.names(nirmsd_ref)[-nrow(nirmsd_ref)])

# Share of pairs where MSA-AF2 scores strictly higher than MSA-Seq; the fraction is rounded
# to two decimals before scaling, so the value is a whole percentage.
perc_of_pairs_msa_af2_superior_to_msa_seq = 100*round(length(which(ref_3dcoffee_pair_sp_same_state$`3DCoffee_AF2` > ref_3dcoffee_pair_sp_same_state$Ginsi))/nrow(ref_3dcoffee_pair_sp_same_state),digits=2)

# One colour per family level, interpolated from the 8-colour "Set1" palette. The count is
# taken from the factor instead of being hard-coded, so the palette always matches the
# number of families (same colours as before when there are 12).
color_codes <- colorRampPalette(brewer.pal(8, "Set1"))(nlevels(ref_3dcoffee_pair_sp_same_state$Family))

o=ggplot(ref_3dcoffee_pair_sp_same_state,aes(x=`3DCoffee_AF2`,y=Ginsi,color=Family)) +
    theme_light() + theme(legend.position = "bottom",plot.title = element_text(size = 10, face = "bold",vjust = 0.8)) +
    geom_point(alpha=0.6)+xlim(0,100)+ylim(0,100)+geom_abline(intercept =0 , slope = 1,lwd=0.2) +
    xlab("SoP score on pairs of sequences - MSA-AF2 vs MSA-PDB (%)") +
    ylab("SoP score on pairs of sequences - MSA-Seq vs MSA-PDB (%)") +
    scale_color_manual(values=color_codes)
ggsave("Fig1_ref_3dcoffee_pair_sp_score_per_family_alphafold_vs_ginsi.png",ggMarginal(o, type = "density",fill = "darkblue",kernel = "epanechnikov",xparams = list(adjust = 7),yparams = list(adjust = 1/4)),dpi = 700)



Saving 6.67 x 6.67 in image



In [13]:
#
# Load TCS scores
#

# Score column names shared by the per-family and per-sequence TCS tables (assigned by position).
tcs_method_columns = c("Ginsi","TCoffee","PSIcoffee","3DCoffee_NAT","3DCoffee_TMalign_NAT","mTMalign_NAT","3DCoffee_AF2","3DCoffee_TMalign_AF2","mTMalign_AF2")

# Per-family TCS.
tcs_score = read.table("./selected_comparisons_tcs.txt", header = FALSE, stringsAsFactors = FALSE)
colnames(tcs_score) = c("Family", tcs_method_columns)

# Per-sequence TCS: columns repeating an earlier column are dropped before naming.
tcs_score_per_seq = read.table("./selected_comparisons_tcs_avg.txt", header = FALSE, stringsAsFactors = FALSE)
tcs_score_per_seq = tcs_score_per_seq[!duplicated(as.list(tcs_score_per_seq))]
colnames(tcs_score_per_seq) = c("Sequence", "Family", tcs_method_columns)

# Multiply every score column (everything except Sequence and Family) by 10.
tcs_score_per_seq[, -(1:2)] = 10 * tcs_score_per_seq[, -(1:2)]



In [14]:
#
# Analysis per sequence using alignments based on AF2 models and 3DCoffee as reference
#

# Per-sequence SoP, TCS and niRMSD of the AF2-based alignment, each reduced to
# (Sequence, Family, value) with a generic value name.
ref_3dcoffee_avg_sp_3DCoffee_AF2 = ref_3dcoffee_avg_sp_same_state[,c("Sequence","Family","3DCoffee_AF2")]
colnames(ref_3dcoffee_avg_sp_3DCoffee_AF2)[3] = "SoP"
tcs_score_per_seq_3DCoffee_AF2 = tcs_score_per_seq[,c("Sequence","Family","3DCoffee_AF2")]
colnames(tcs_score_per_seq_3DCoffee_AF2)[3] = "TCS"
nirmsd_ref_avg_3DCoffee_AF2_ref_NAT = nirmsd_ref_avg[c("Sequence","Family","3DCoffee_AF2_REF_NAT")]
colnames(nirmsd_ref_avg_3DCoffee_AF2_ref_NAT)[3] = "niRMSD"

# delta_analysis_AF2() is defined in an earlier cell; its result is used below through the
# columns delta_sop, delta_tcs, delta_plddt and delta_gdt_ts.
ref_3dcoffee_AF2_avg_df = delta_analysis_AF2(tcs_score_per_seq_3DCoffee_AF2,gdt_ts_complete_with_seqs,alphafold_plddts,ref_3dcoffee_avg_sp_3DCoffee_AF2,nirmsd_ref_avg_3DCoffee_AF2_ref_NAT,"3DCoffee","all",TRUE)

# Percentage of rows in which delta_sop and the other delta agree on being >= 0 (both are
# or both are not). Rows with a missing delta are ignored. This is computed directly on the
# logical vectors rather than by indexing cells [1] and [4] of a table(), which is only
# correct when the table happens to be 2x2.
ref_3dcoffee_AF2_avg_df_sop_vs_tcs = with(ref_3dcoffee_AF2_avg_df, 100 * mean((delta_sop >= 0) == (delta_tcs >= 0), na.rm = TRUE))
ref_3dcoffee_AF2_avg_df_sop_vs_plddt = with(ref_3dcoffee_AF2_avg_df, 100 * mean((delta_sop >= 0) == (delta_plddt >= 0), na.rm = TRUE))
ref_3dcoffee_AF2_avg_df_sop_vs_gdt = with(ref_3dcoffee_AF2_avg_df, 100 * mean((delta_sop >= 0) == (delta_gdt_ts >= 0), na.rm = TRUE))
ref_3dcoffee_AF2_avg_df_delta_perc = c(ref_3dcoffee_AF2_avg_df_sop_vs_tcs,ref_3dcoffee_AF2_avg_df_sop_vs_plddt,ref_3dcoffee_AF2_avg_df_sop_vs_gdt)




Saving 6.67 x 6.67 in image

Saving 6.67 x 6.67 in image

Saving 6.67 x 6.67 in image



In [15]:
#
# Analysis per sequence using MAFFT-GINSI alignments and 3DCoffee as reference
#

# Per-sequence SoP, TCS and niRMSD of the Ginsi alignment, each reduced to
# (Sequence, Family, value) with a generic value name.
ref_3dcoffee_avg_sp_ginsi = ref_3dcoffee_avg_sp_same_state[,c("Sequence","Family","Ginsi")]
colnames(ref_3dcoffee_avg_sp_ginsi)[3] = "SoP"
tcs_score_per_seq_ginsi = tcs_score_per_seq[,c("Sequence","Family","Ginsi")]
colnames(tcs_score_per_seq_ginsi)[3] = "TCS"
nirmsd_ref_avg_ginsi = nirmsd_ref_avg[c("Sequence","Family","Ginsi")]
colnames(nirmsd_ref_avg_ginsi)[3] = "niRMSD"

# delta_analysis_AF2() is defined in an earlier cell; only delta_sop and delta_tcs of its
# result are used here.
ref_ginsi_avg_df = delta_analysis_AF2(tcs_score_per_seq_ginsi,gdt_ts_complete_with_seqs,alphafold_plddts,ref_3dcoffee_avg_sp_ginsi,nirmsd_ref_avg_ginsi,"3DCoffee","ginsi",FALSE)

# Percentage of rows in which delta_sop and delta_tcs agree on being >= 0; rows with a
# missing delta are ignored. Computed on the logical vectors directly instead of indexing
# cells [1] and [4] of a table(), which is only correct when the table is 2x2.
ref_ginsi_avg_df_sop_vs_tcs = with(ref_ginsi_avg_df, 100 * mean((delta_sop >= 0) == (delta_tcs >= 0), na.rm = TRUE))

# Same layout as the AF2 vector (TCS, pLDDT, GDT_TS); the last two are left as NA here.
ref_ginsi_avg_df_delta_perc = c(ref_ginsi_avg_df_sop_vs_tcs,NA,NA)



In [21]:
#
# Analysis per sequence using PSICoffee alignments and 3DCoffee as reference
#

# Per-sequence SoP, TCS and niRMSD of the PSIcoffee alignment, each reduced to
# (Sequence, Family, value) with a generic value name.
ref_3dcoffee_avg_sp_psicoffee = ref_3dcoffee_avg_sp_same_state[,c("Sequence","Family","PSIcoffee")]
colnames(ref_3dcoffee_avg_sp_psicoffee)[3] = "SoP"
tcs_score_per_seq_psicoffee = tcs_score_per_seq[,c("Sequence","Family","PSIcoffee")]
colnames(tcs_score_per_seq_psicoffee)[3] = "TCS"
nirmsd_ref_avg_psicoffee = nirmsd_ref_avg[c("Sequence","Family","PSIcoffee")]
colnames(nirmsd_ref_avg_psicoffee)[3] = "niRMSD"

# delta_analysis_AF2() is defined in an earlier cell; only delta_sop and delta_tcs of its
# result are used here.
ref_psicoffee_avg_df = delta_analysis_AF2(tcs_score_per_seq_psicoffee,gdt_ts_complete_with_seqs,alphafold_plddts,ref_3dcoffee_avg_sp_psicoffee,nirmsd_ref_avg_psicoffee,"3DCoffee","psicoffee",FALSE)

# Percentage of rows in which delta_sop and delta_tcs agree on being >= 0; rows with a
# missing delta are ignored. Computed on the logical vectors directly instead of indexing
# cells [1] and [4] of a table(), which is only correct when the table is 2x2.
ref_psicoffee_avg_df_sop_vs_tcs = with(ref_psicoffee_avg_df, 100 * mean((delta_sop >= 0) == (delta_tcs >= 0), na.rm = TRUE))

# Same layout as the AF2 vector (TCS, pLDDT, GDT_TS); the last two are left as NA here.
ref_psicoffee_avg_df_delta_perc = c(ref_psicoffee_avg_df_sop_vs_tcs,NA,NA)



In [22]:
#
# Analysis per sequence using alignments with NAT models and 3DCoffee as reference
#

# Per-sequence SoP, TCS and niRMSD of the 3DCoffee_NAT alignment, each reduced to
# (Sequence, Family, value) with a generic value name.
ref_3dcoffee_avg_sp_3DCoffee_NAT = ref_3dcoffee_avg_sp_same_state[,c("Sequence","Family","3DCoffee_NAT")]
colnames(ref_3dcoffee_avg_sp_3DCoffee_NAT)[3] = "SoP"
tcs_score_per_seq_3DCoffee_NAT = tcs_score_per_seq[,c("Sequence","Family","3DCoffee_NAT")]
colnames(tcs_score_per_seq_3DCoffee_NAT)[3] = "TCS"
nirmsd_ref_avg_3DCoffee_NAT = nirmsd_ref_avg[c("Sequence","Family","3DCoffee_NAT")]
colnames(nirmsd_ref_avg_3DCoffee_NAT)[3] = "niRMSD"

# delta_analysis_AF2() is defined in an earlier cell; only delta_sop and delta_tcs of its
# result are used here.
ref_3dcoffee_NAT_avg_df = delta_analysis_AF2(tcs_score_per_seq_3DCoffee_NAT,gdt_ts_complete_with_seqs,alphafold_plddts,ref_3dcoffee_avg_sp_3DCoffee_NAT,nirmsd_ref_avg_3DCoffee_NAT,"3DCoffee","3DCoffee_NAT",FALSE)

# Percentage of rows in which delta_sop and delta_tcs agree on being >= 0; rows with a
# missing delta are ignored. The previous version read cell [2] of table(sop, tcs), which is
# the agreeing cell only when every delta_sop is >= 0 (a 1x2 table); with a 2x2 table it
# would be a disagreeing cell. Comparing the logical vectors gives the same value in the
# 1x2 case and stays correct otherwise, consistent with the other per-sequence analyses.
ref_3dcoffee_NAT_avg_df_sop_vs_tcs = with(ref_3dcoffee_NAT_avg_df, 100 * mean((delta_sop >= 0) == (delta_tcs >= 0), na.rm = TRUE))

# Same layout as the AF2 vector (TCS, pLDDT, GDT_TS); the last two are left as NA here.
ref_3dcoffee_NAT_avg_df_delta_perc = c(ref_3dcoffee_NAT_avg_df_sop_vs_tcs,NA,NA)


In [23]:
#
# Analysis for each pair of sequences using alignments based on AF2 models and 3DCoffee as reference
#

# Pairwise SoP of the AF2-based alignment as (Sequence_1, Sequence_2, Family, SoP).
ref_3dcoffee_pair_sp_3dcoffee_AF2 = ref_3dcoffee_pair_sp_same_state[, c("Sequence_1", "Sequence_2", "Family", "3DCoffee_AF2")]
colnames(ref_3dcoffee_pair_sp_3dcoffee_AF2)[4] = "SoP"

# pairwise_plots() is defined in an earlier cell; its result is used below through the
# columns GM_GDT_TS and GM_TCS.
ref_3dcoffee_AF2_pair_df = pairwise_plots(gdt_ts_complete_with_seqs, ref_3dcoffee_pair_sp_3dcoffee_AF2, tcs_score_per_seq_3DCoffee_AF2, alphafold_plddts, "3DCoffee", "all")

# Percentage of pairs with GM_GDT_TS below 75 (missing values count as "not below").
ref_3dcoffee_AF2_pair_df_gm_gdt_below_75_percentage = round(
  100 * sum(ref_3dcoffee_AF2_pair_df$GM_GDT_TS < 75, na.rm = TRUE) / nrow(ref_3dcoffee_AF2_pair_df),
  digits = 2
)
ref_3dcoffee_AF2_pair_df_gm_gdt_below_75_percentage

# Distribution of GM_TCS among those pairs.
ref_3dcoffee_AF2_pair_df_gm_gdt_below_75 = subset(ref_3dcoffee_AF2_pair_df, GM_GDT_TS < 75)
summary(ref_3dcoffee_AF2_pair_df_gm_gdt_below_75[["GM_TCS"]])


Saving 6.67 x 6.67 in image

`geom_smooth()` using formula 'y ~ x'

`geom_smooth()` using formula 'y ~ x'



   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  54.50   85.32   89.95   87.33   92.50   95.50 

In [24]:

# Summary of SoP, TCS and NiRMSD for the Ginsi, PSIcoffee, 3DCoffee_NAT and 3DCoffee_AF2 alignments.

# Mean SoP per method over the families.
sop_sum = as.data.frame(colMeans(ref_3dcoffee_sp_same_state[,c("Ginsi","PSIcoffee","3DCoffee_NAT","3DCoffee_AF2")]))
colnames(sop_sum)[1] = "SoP"

# NiRMSD per method, read from row 13 of nirmsd_ref.
# NOTE(review): row 13 is presumably the summary (average) row of nirmsd_ref - confirm.
nirmsd_sum = as.data.frame(t(nirmsd_ref[13,c("Ginsi","PSIcoffee","3DCoffee_NAT","3DCoffee_AF2_REF_NAT")]))
colnames(nirmsd_sum)[1] = "NiRMSD"
# Give the AF2 row the same name it has in the SoP and TCS tables so the merge lines it up.
row.names(nirmsd_sum)[row.names(nirmsd_sum) == "3DCoffee_AF2_REF_NAT"] = "3DCoffee_AF2"

# Mean TCS per method over the families.
tcs_sum = as.data.frame(colMeans(tcs_score[,c("Ginsi","PSIcoffee","3DCoffee_NAT","3DCoffee_AF2")]))
colnames(tcs_sum)[1] = "TCS"

# Merge on row names. The helper "Row.names" column created by merge(by = 0) is removed
# right after each merge; leaving it in place made the second merge emit
# "column name 'Row.names' is duplicated in the result".
sum_table = merge(sop_sum,tcs_sum,by=0,all=TRUE)
row.names(sum_table) = sum_table$Row.names
sum_table$Row.names = NULL

sum_table = merge(sum_table,nirmsd_sum,by=0,all=TRUE)
row.names(sum_table) = sum_table$Row.names
sum_table$Row.names = NULL

sum_table$TCS = sum_table$TCS/10
sum_table


“column name ‘Row.names’ is duplicated in the result”


Unnamed: 0_level_0,SoP,TCS,NiRMSD
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
3DCoffee_AF2,93.95,83.43333,1.525
3DCoffee_NAT,100.0,83.175,1.464167
Ginsi,70.33333,53.08333,2.003333
PSIcoffee,77.3,70.975,1.768333


In [25]:
#
# Generate Table 1
#
# One row per alignment approach with average alignment length, average and SD of PID,
# SoP, TCS and NiRMSD; shown and written to Table1.tsv.
#

# Columns summarised in the table and the label each one gets. Labels are looked up by
# column name, so they no longer depend on the order in which merge() sorts the rows.
table1_methods = c("Ginsi","PSIcoffee","3DCoffee_NAT","3DCoffee_AF2")
table1_labels = c("Ginsi"="MSA-Seq","PSIcoffee"="MSA-PSI","3DCoffee_NAT"="MSA-PDB","3DCoffee_AF2"="MSA-AF2")

aln_len_sum = as.data.frame(round(colMeans(aln_len[,table1_methods]),digits=2))
colnames(aln_len_sum) = "Avg aln length"
row.names(aln_len_sum) = unname(table1_labels[row.names(aln_len_sum)])

pid_sum = as.data.frame(round(colMeans(pid_ref[,table1_methods]),digits=2))
colnames(pid_sum) = "Avg PID"
pid_sd = as.data.frame(round(apply(pid_ref[,table1_methods],2,sd),digits=2))
colnames(pid_sd) = "SD PID"
pid_avg_sd = merge(pid_sum,pid_sd,by=0)
row.names(pid_avg_sd) = unname(table1_labels[as.character(pid_avg_sd$Row.names)])
pid_avg_sd$Row.names = NULL

sop_sum = as.data.frame(colMeans(ref_3dcoffee_sp_same_state[,table1_methods]))
colnames(sop_sum)[1] = "SoP"
# NOTE(review): row 13 is presumably the summary (average) row of nirmsd_ref - confirm.
nirmsd_sum = as.data.frame(t(nirmsd_ref[13,c("Ginsi","PSIcoffee","3DCoffee_NAT","3DCoffee_AF2_REF_NAT")]))
colnames(nirmsd_sum)[1] = "NiRMSD"
# Give the AF2 row the same name it has in the SoP and TCS tables so the merge lines it up.
row.names(nirmsd_sum)[row.names(nirmsd_sum) == "3DCoffee_AF2_REF_NAT"] = "3DCoffee_AF2"
tcs_sum = as.data.frame(colMeans(tcs_score[,table1_methods]))
colnames(tcs_sum)[1] = "TCS"

# Merge on row names, removing the helper "Row.names" column after every merge; carrying
# it into the next merge(by = 0) triggered the duplicated-column warning.
sum_table = merge(sop_sum,tcs_sum,by=0,all=TRUE)
row.names(sum_table) = sum_table$Row.names
sum_table$Row.names = NULL

sum_table = merge(sum_table,nirmsd_sum,by=0,all=TRUE)
row.names(sum_table) = unname(table1_labels[as.character(sum_table$Row.names)])
sum_table$Row.names = NULL
sum_table$TCS = sum_table$TCS/10

sum_table = merge(pid_avg_sd,sum_table,by=0)
row.names(sum_table) = sum_table$Row.names
sum_table$Row.names = NULL

sum_table = merge(aln_len_sum,sum_table,by=0)
row.names(sum_table) = sum_table$Row.names
sum_table$Row.names = NULL

sum_table = round(sum_table,digits=2)
# Final row order, selected by label rather than by position.
sum_table = sum_table[c("MSA-Seq","MSA-PSI","MSA-AF2","MSA-PDB"),]
sum_table

write.table(sum_table,"Table1.tsv",quote=F,row.names=T,col.names=T,sep="\t")



“column name ‘Row.names’ is duplicated in the result”


Unnamed: 0_level_0,Avg aln length,Avg PID,SD PID,SoP,TCS,NiRMSD
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
MSA-Seq,292.67,23.61,4.98,70.33,53.08,2.0
MSA-PSI,299.5,21.23,5.08,77.3,70.97,1.77
MSA-AF2,292.33,21.31,5.23,93.95,83.43,1.52
MSA-PDB,295.42,21.26,5.05,100.0,83.17,1.46


In [26]:
#
# Generate Supp Table 2
#
# 2x2 contingency table of sequence pairs: GM_GDT_TS (>= 75 or < 75) against whether the
# MSA-AF2 SoP is greater than the MSA-Seq (Ginsi) SoP, plus row totals.
#

ref_3dcoffee_AF2_pair_merged_with_sop = merge(ref_3dcoffee_AF2_pair_df,ref_3dcoffee_pair_sp_same_state,by=c("Sequence_1","Sequence_2","Family"))

# Each count is the number of pairs for which the condition is TRUE. sum() is used instead
# of table(condition)[2]: when a condition is TRUE for every pair or for none, the table
# has a single entry and [2] is NA rather than the count. Pairs with a missing value are
# not counted, as before.
ref_3dcoffee_AF2_pair_df_gm_gdt_below_75_msa_af2_less_than_seq = with(ref_3dcoffee_AF2_pair_merged_with_sop, sum(GM_GDT_TS < 75 & SoP <= Ginsi, na.rm = TRUE))
ref_3dcoffee_AF2_pair_df_gm_gdt_above_75_msa_af2_greater_than_seq = with(ref_3dcoffee_AF2_pair_merged_with_sop, sum(GM_GDT_TS >= 75 & SoP > Ginsi, na.rm = TRUE))

ref_3dcoffee_AF2_pair_df_gm_gdt_below_75_msa_af2_greater_than_seq = with(ref_3dcoffee_AF2_pair_merged_with_sop, sum(GM_GDT_TS < 75 & SoP > Ginsi, na.rm = TRUE))
ref_3dcoffee_AF2_pair_df_gm_gdt_above_75_msa_af2_less_than_seq = with(ref_3dcoffee_AF2_pair_merged_with_sop, sum(GM_GDT_TS >= 75 & SoP <= Ginsi, na.rm = TRUE))

# matrix() fills column by column: first column is GDT-TS >= 75, second is GDT-TS < 75.
supp_table_2 = as.data.frame(matrix(c(ref_3dcoffee_AF2_pair_df_gm_gdt_above_75_msa_af2_greater_than_seq,ref_3dcoffee_AF2_pair_df_gm_gdt_above_75_msa_af2_less_than_seq,ref_3dcoffee_AF2_pair_df_gm_gdt_below_75_msa_af2_greater_than_seq,ref_3dcoffee_AF2_pair_df_gm_gdt_below_75_msa_af2_less_than_seq),nrow=2))

colnames(supp_table_2) = c("GDT-TS >= 75","GDT-TS < 75")
row.names(supp_table_2) = c("SoP(MSA-AF2) > SoP(MSA-Seq)","SoP(MSA-AF2) <= SoP(MSA-Seq)")
supp_table_2$Total = rowSums(supp_table_2)
write.table(supp_table_2,"Supp_table_2.tsv",quote=FALSE,row.names=TRUE,col.names=TRUE,sep="\t")



In [27]:
#
# Generate a big table with all the data using 3DCoffee_NAT as reference
#
# For each alignment approach (MSA-Seq, MSA-PSI, MSA-AF2, MSA-PDB) three tables are built
# and written as TSV: per family, per sequence and per pair of sequences.
# Every score column is given its final name before merging, so the merges no longer
# produce duplicated column names (and the warnings that came with them) and the
# columns do not have to be renamed by position afterwards.
#

# The first two columns of the pairwise niRMSD table are labelled Sequence_2 / Sequence_1
# (in that order) so that they can be merged with the other pairwise tables.
tmp_nirmsd_ref_pair = nirmsd_ref_pair
colnames(tmp_nirmsd_ref_pair)[1:2] = c("Sequence_2","Sequence_1")

# Name of the value column of alphafold_plddts (everything except the two key columns).
plddt_value_col = setdiff(colnames(alphafold_plddts), c("Family","Sequence"))
stopifnot(length(plddt_value_col) == 1)

# Select the key columns plus one score column of `tbl` and rename the score to `label`.
pick_score = function(tbl, keys, score_col, label) {
  picked = tbl[, c(keys, score_col)]
  colnames(picked)[length(keys) + 1] = label
  picked
}

# Merge a list of tables on `keys`, keeping every row of the first table (left merges).
merge_scores = function(tables, keys) {
  Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = keys, all.x = TRUE), tables)
}

# Write a table as a tab-separated file with a header and without row names.
export_tsv = function(tbl, path) {
  write.table(tbl, path, quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
}

# Per-family table (Family, SoP, TCS, NiRMSD, PID) for the score column `score_col`;
# `nirmsd_col` is the corresponding column name in the niRMSD tables.
build_family_table = function(score_col, nirmsd_col = score_col) {
  keys = "Family"
  merge_scores(list(
    pick_score(ref_3dcoffee_sp_same_state, keys, score_col, "SoP"),
    pick_score(tcs_score, keys, score_col, "TCS"),
    # row 13 of nirmsd_ref is left out, as in the summary tables (presumably the average row)
    pick_score(nirmsd_ref[-13,], keys, nirmsd_col, "NiRMSD"),
    pick_score(pid_ref, keys, score_col, "PID")
  ), keys)
}

# Per-sequence table (Family, Sequence, SoP, TCS, NiRMSD, PID, GDT_TS, pLDDT).
# GDT_TS is multiplied by 100; GDT_TS and pLDDT are rounded to two decimals.
build_sequence_table = function(score_col, nirmsd_col = score_col) {
  keys = c("Family","Sequence")
  tbl = merge_scores(list(
    pick_score(ref_3dcoffee_avg_sp_same_state, keys, score_col, "SoP"),
    pick_score(tcs_score_per_seq, keys, score_col, "TCS"),
    pick_score(nirmsd_ref_avg, keys, nirmsd_col, "NiRMSD"),
    pick_score(pid_ref_avg, keys, score_col, "PID"),
    pick_score(gdt_ts_complete_with_seqs, keys, "alphafold", "GDT_TS"),
    pick_score(alphafold_plddts, keys, plddt_value_col, "pLDDT")
  ), keys)
  tbl$GDT_TS = round(tbl$GDT_TS*100,digits=2)
  tbl$pLDDT = round(tbl$pLDDT,digits=2)
  tbl
}

# Per-pair table (Family, Sequence_1, Sequence_2, SoP, NiRMSD, PID) extended with the
# per-sequence TCS, GDT_TS and pLDDT of each member of the pair (suffixes _1 and _2).
build_pair_table = function(score_col, nirmsd_col = score_col) {
  pair_keys = c("Family","Sequence_1","Sequence_2")
  seq_keys = c("Family","Sequence")
  tbl = merge_scores(list(
    pick_score(ref_3dcoffee_pair_sp_same_state, pair_keys, score_col, "SoP"),
    pick_score(tmp_nirmsd_ref_pair, pair_keys, nirmsd_col, "NiRMSD"),
    pick_score(pid_ref_pair, pair_keys, score_col, "PID")
  ), pair_keys)

  per_sequence = list(
    TCS = pick_score(tcs_score_per_seq, seq_keys, score_col, "TCS"),
    GDT_TS = pick_score(gdt_ts_complete_with_seqs, seq_keys, "alphafold", "GDT_TS"),
    pLDDT = pick_score(alphafold_plddts, seq_keys, plddt_value_col, "pLDDT")
  )
  # Join order gives the columns TCS_1, TCS_2, GDT_TS_1, GDT_TS_2, pLDDT_1, pLDDT_2.
  for (value in names(per_sequence)) {
    for (member in 1:2) {
      member_key = paste0("Sequence_", member)
      member_scores = per_sequence[[value]]
      colnames(member_scores) = c("Family", member_key, paste0(value, "_", member))
      tbl = left_join(tbl, member_scores, by = c("Family", member_key))
    }
  }

  tbl$GDT_TS_1 = round(tbl$GDT_TS_1*100,digits=2)
  tbl$GDT_TS_2 = round(tbl$GDT_TS_2*100,digits=2)
  tbl$pLDDT_1 = round(tbl$pLDDT_1,digits=2)
  tbl$pLDDT_2 = round(tbl$pLDDT_2,digits=2)
  tbl
}

# MSA-Seq (Ginsi)
msa_seq = build_family_table("Ginsi")
export_tsv(msa_seq,"msa_seq.tsv")
msa_seq_avg = build_sequence_table("Ginsi")
export_tsv(msa_seq_avg,"msa_seq_avg.tsv")
msa_seq_pair = build_pair_table("Ginsi")
export_tsv(msa_seq_pair,"msa_seq_pair.tsv")

# MSA-PSI (PSIcoffee)
msa_psi = build_family_table("PSIcoffee")
export_tsv(msa_psi,"msa_psi.tsv")
msa_psi_avg = build_sequence_table("PSIcoffee")
export_tsv(msa_psi_avg,"msa_psi_avg.tsv")
msa_psi_pair = build_pair_table("PSIcoffee")
export_tsv(msa_psi_pair,"msa_psi_pair.tsv")

# MSA-AF2 (3DCoffee_AF2; its niRMSD column is named 3DCoffee_AF2_REF_NAT)
msa_af2 = build_family_table("3DCoffee_AF2", nirmsd_col = "3DCoffee_AF2_REF_NAT")
export_tsv(msa_af2,"msa_af2.tsv")
msa_af2_avg = build_sequence_table("3DCoffee_AF2", nirmsd_col = "3DCoffee_AF2_REF_NAT")
export_tsv(msa_af2_avg,"msa_af2_avg.tsv")
msa_af2_pair = build_pair_table("3DCoffee_AF2", nirmsd_col = "3DCoffee_AF2_REF_NAT")
export_tsv(msa_af2_pair,"msa_af2_pair.tsv")

# MSA-PDB (3DCoffee_NAT)
msa_pdb = build_family_table("3DCoffee_NAT")
export_tsv(msa_pdb,"msa_pdb.tsv")
msa_pdb_avg = build_sequence_table("3DCoffee_NAT")
export_tsv(msa_pdb_avg,"msa_pdb_avg.tsv")
msa_pdb_pair = build_pair_table("3DCoffee_NAT")
export_tsv(msa_pdb_pair,"msa_pdb_pair.tsv")


“column names ‘Ginsi.x’, ‘Ginsi.y’ are duplicated in the result”
“column names ‘Ginsi.x’, ‘Ginsi.y’ are duplicated in the result”
“column names ‘Ginsi.x’, ‘Ginsi.y’ are duplicated in the result”
“column names ‘Ginsi.x’, ‘Ginsi.y’ are duplicated in the result”
“column names ‘PSIcoffee.x’, ‘PSIcoffee.y’ are duplicated in the result”
“column names ‘PSIcoffee.x’, ‘PSIcoffee.y’ are duplicated in the result”
“column names ‘PSIcoffee.x’, ‘PSIcoffee.y’ are duplicated in the result”
“column names ‘PSIcoffee.x’, ‘PSIcoffee.y’ are duplicated in the result”
“column names ‘3DCoffee_NAT.x’, ‘3DCoffee_NAT.y’ are duplicated in the result”
“column names ‘3DCoffee_NAT.x’, ‘3DCoffee_NAT.y’ are duplicated in the result”
“column names ‘3DCoffee_NAT.x’, ‘3DCoffee_NAT.y’ are duplicated in the result”
“column names ‘3DCoffee_NAT.x’, ‘3DCoffee_NAT.y’ are duplicated in the result”


In [28]:
# Runtime per family and alignment method, read from the trace file and written as Supp Table 5.
exec_time = read.table("./trace.txt", header = TRUE, stringsAsFactors = FALSE)
exec_time = exec_time[, c("name", "realtime")]

# Strip every parenthesis from the task names.
exec_time$name = gsub("[()]", "", exec_time$name)

# Split each name at its first "_" into two parts, labelled Family and Method below.
df = data.frame(exec_time$name)
exec_time_per_fam = data.frame(str_split_fixed(df$exec_time.name, "_", 2))
# realtime is divided by 1000 (presumably milliseconds to seconds - confirm against the trace format).
exec_time_per_fam$realtime = exec_time$realtime / 1000
colnames(exec_time_per_fam) = c("Family", "Method", "realtime")

# One row per family, one realtime.<Method> column per method.
exec_time_per_fam = reshape(exec_time_per_fam, idvar = "Family", timevar = "Method", direction = "wide")

# Keep the columns reported in the table and give them their display names.
exec_time_per_fam = exec_time_per_fam[, c("Family", "realtime.famsa", "realtime.ginsi", "realtime.msaprobs", "realtime.tcoffee", "realtime.psicoffee_aln", "realtime.3DCoffee_PDB", "realtime.3DCoffee_AF2")]
colnames(exec_time_per_fam) = c("Family", "FAMSA", "G-INS-I (MSA-Seq)", "MSAProbs", "TCoffee", "PSICoffee", "MSA-PDB", "MSA-AF2")
exec_time_per_fam = exec_time_per_fam[order(exec_time_per_fam$Family), ]

# Mean of column V4 per value of column V3 of the AlphaFold timing file, sorted by family.
af2_time = read.table("./seq_realtime_alphafold.txt", header = FALSE, stringsAsFactors = FALSE)
af2_time = af2_time[, 2:4]
af2_time_avg_per_fam = aggregate(af2_time$V4, list(af2_time$V3), FUN = mean)
colnames(af2_time_avg_per_fam) = c("Family", "AF2")
af2_time_avg_per_fam = af2_time_avg_per_fam[order(af2_time_avg_per_fam$Family), ]

# Attached by position after sorting both tables by Family.
# NOTE(review): this assumes both tables contain exactly the same families - confirm.
exec_time_per_fam$AF2 = af2_time_avg_per_fam$AF2 / 1000

write.table(exec_time_per_fam, file = "Supp_table_5_alignment_runtime.tsv", quote = FALSE, sep = "\t", row.names = FALSE)



“multiple rows match for Method=: first taken”
