Directory: ~/hayai_v3.2/zen_odb_v11

# Script Name: evaluate_orthoDBv11.ipynb
# Purpose: Evaluate Zen mapping of UniProtKB-Plants accessions to OrthoDB v11 against the OrthoDB cross-references in UniProtKB
# Author: Andrea Ghelfi
# Date: November 20, 2024
# License: GNU GPL-3.0 License
# Software: R with the data.table package (Jupyter notebook; header originally listed Python 3.10)

In [None]:
library(data.table)

In [None]:
# Load the OrthoDB v11 group-pair table (headerless, tab-separated): one
# column of group IDs and one column of the corresponding parental group IDs.
og_pairs <- fread(
  "odb11v0_OG_pairs.tab",
  header = FALSE,
  sep = "\t",
  col.names = c("OrthoDB", "parental_OrthoDB")
)

In [None]:
# Keep only the pairs whose parental group sits at the Eukaryota level
# (IDs ending in "at2759"), matching the taxon level of the UniProtKB OrthoDB
# cross-references.
# The pattern is anchored to the end of the ID so that a level whose taxon ID
# merely starts with 2759 (e.g. "at27592") cannot be picked up by accident.
# NOTE(review): assumes group IDs have the form <number>at<taxid> -- confirm
# against the OrthoDB file description.
og_pairs_euk <- og_pairs[grepl("at2759$", og_pairs$parental_OrthoDB), ]

In [None]:
# OrthoDB groups inferred by Zen (OrthoDB v.11 and UniProtKB dataset
# downloaded on 2024.4.29). The file has a header row and is tab-separated.
inferred_ogs <- fread("uniprot2orthodb.tsv", header = TRUE, sep = "\t")
# Drop the accessions for which Zen assigned no OrthoDB group.
inferred_ogs <- inferred_ogs[!is.na(OrthoDB)]

In [None]:
# Swiss-Prot part of UniProt-Plants: accession and the OrthoDB ID extracted
# from UniProtKB (headerless, tab-separated).
all_sp <- fread(
  "sp_acc2orthodb.txt",
  header = FALSE,
  sep = "\t",
  col.names = c("AC", "UK_OrthoDB")
)
print("Total accession in SwissProt")
nrow(all_sp)

In [None]:
[1] "Total accession in SwissProt"
[1] 44592

In [None]:
# TrEMBL part of UniProt-Plants: accession and the OrthoDB ID extracted from
# UniProtKB (headerless, tab-separated).
all_tr <- fread(
  "tr_acc2orthodb.txt",
  header = FALSE,
  sep = "\t",
  col.names = c("AC", "UK_OrthoDB")
)

In [None]:
# Report the number of rows (one per line of tr_acc2orthodb.txt) in the
# TrEMBL table; the bare nrow() call relies on top-level auto-printing.
print("Total accession in TrEMBL")
nrow(all_tr)

In [None]:
[1] "Total accession in TrEMBL"
[1] 19023625

In [None]:
# Attach the Eukaryota-level parental group to each Zen-inferred group.
# This is an inner join on "OrthoDB": inferred groups without an entry in
# og_pairs_euk are dropped from the result.
par_inf_ogs_euk <- merge(x = inferred_ogs, y = og_pairs_euk, by = "OrthoDB")

In [None]:
# Tag every row with its source database ("sp" = Swiss-Prot, "tr" = TrEMBL)
# so the origin is still known after the two tables are stacked.
# set() adds the column by reference and returns invisibly; `$<-` on a
# data.table copies the whole table first, which is wasteful for the large
# TrEMBL table.
set(all_sp, j = "db", value = "sp")
set(all_tr, j = "db", value = "tr")

In [None]:
# Stack the Swiss-Prot and TrEMBL tables into one UniProtKB-Plants table.
all_uni <- rbind(all_sp, all_tr)
# Re-assert the expected column names (done in place).
setnames(all_uni, c("AC", "UK_OrthoDB", "db"))

In [None]:
# Report the row count of the stacked Swiss-Prot + TrEMBL table.
print("Total accession in UniProtKB-Plants")
print(nrow(all_uni))

In [None]:
[1] "Total accession in UniProtKB-Plants"
[1] 19068217

In [None]:
# Keep only the UniProtKB entries that carry an OrthoDB ID (non-NA
# UK_OrthoDB) and report how many remain.
uni <- all_uni[!is.na(UK_OrthoDB)]
print("Total of UniProt entries with OrthoDB ID")
print(nrow(uni))

In [None]:
[1] "Total of UniProt entries with OrthoDB ID"
[1] 2729135

In [None]:
# Share (in %) of all UniProtKB-Plants rows that have an OrthoDB ID:
# rows kept in `uni` divided by rows in `all_uni`.
print("Percentage of UniProt accessions with OrthoDB in UniProtKB-Plants")
print(nrow(uni) / nrow(all_uni) * 100)

In [None]:
[1] "Percentage of UniProt accessions with OrthoDB in UniProtKB-Plants"
[1] 14.31248

In [None]:
# Tally the OrthoDB-annotated rows per source database; the result has one
# row per distinct `db` value with the count in column N.
db_counts <- uni[, .(N = .N), by = "db"]

In [None]:
# Show the per-database counts of entries with an OrthoDB ID
# (columns: db, N).
print("Number of occurrences per database")
print(db_counts)

In [None]:
[1] "Number of occurrences per database"
       db       N
   <char>   <int>
1:     sp   22658
2:     tr 2706477

In [None]:
# Percentage of Swiss-Prot accessions that have an OrthoDB ID in UniProtKB.
# The count is looked up by its database label instead of by row position,
# so the result no longer depends on the row order of db_counts.
print("Percentage of OrthoDB in SwissProt")
print(db_counts[db == "sp", N] / nrow(all_sp) * 100)
[1] "Percentage of OrthoDB in SwissProt"
[1] 50.8118

In [None]:
# Percentage of TrEMBL accessions that have an OrthoDB ID in UniProtKB.
# The count is looked up by its database label instead of by row position,
# so the result no longer depends on the row order of db_counts.
print("Percentage of OrthoDB in TrEMBL")
print(db_counts[db == "tr", N] / nrow(all_tr) * 100)

In [None]:
[1] "Percentage of OrthoDB in TrEMBL"
[1] 14.22693

In [None]:
# Put the Zen-inferred groups (with their Eukaryota-level parent) next to the
# OrthoDB IDs recorded in UniProtKB, matching on accession.
# all.y = TRUE keeps every row of `uni`; rows without a Zen result get NA in
# the Zen-derived columns.
all_compare <- merge(
  x = par_inf_ogs_euk,
  y = uni,
  by = "AC",
  all.y = TRUE
)

In [None]:
# Dimensions of the joined table restricted to rows with a parental group.
dim(all_compare[!is.na(parental_OrthoDB)])
[1] 1950290       5

In [None]:
# Remove the accessions that have no parental-level OrthoDB group from Zen
# (NA after the join), then show how many rows remain.
compare <- all_compare[!is.na(parental_OrthoDB)]
nrow(compare)
[1] 1950290

In [None]:
# Rows where the Zen-derived parental group equals the OrthoDB ID recorded in
# UniProtKB.
same <- compare[parental_OrthoDB == UK_OrthoDB]

In [None]:
# Agreement between Zen and UniProtKB, computed on rows with no NAs.

# Coverage: rows with a Zen parental group relative to all UniProtKB entries
# that have an OrthoDB ID.
print("Percentage of accessions with parental OrthoDB in UniProtKB")
print(nrow(compare) / nrow(uni) * 100)

# Size of the comparable set.
print("Total number of accessions with OrthoDB accessions and Zen mapping")
print(nrow(compare))

# Agreement: rows where both sources give the same group, as a count and as
# a percentage of the comparable set.
print("Accessions were Zen mapping matched OrthoDB in UniProtKB, considering Eukarya level")
print(nrow(same))
print("Percentage of accessions were Zen mapping matched OrthoDB in UniProtKB, considering Eukarya level")
print(nrow(same) / nrow(compare) * 100)

In [None]:
[1] "Percentage of accessions with parental OrthoDB in UniProtKB"
[1] 71.46184
[1] "Total number of accessions with OrthoDB accessions and Zen mapping"
[1] 1950290
[1] "Accessions were Zen mapping matched OrthoDB in UniProtKB, considering Eukarya level"
[1] 1943023
[1] "Percentage of accessions were Zen mapping matched OrthoDB in UniProtKB, considering Eukarya level"
[1] 99.62739

In [None]:
# Coverage after Zen mapping, Swiss-Prot: inner join of the Zen output with
# the full Swiss-Prot table on accession.
# NOTE(review): the figures count joined rows; they equal accession counts
# only if AC is unique in both inputs -- confirm.
zen_sp <- merge(x = inferred_ogs, y = all_sp, by = "AC")
print("Zen mapping in SwissProt")
print(nrow(zen_sp))
print("Zen mapping in SwissProt% ()")
print(nrow(zen_sp) / nrow(all_sp) * 100)

In [None]:
[1] "Zen mapping in SwissProt"
[1] 38765
[1] "Zen mapping in SwissProt% ()"
[1] 86.93263

In [None]:
# Coverage after Zen mapping, TrEMBL: inner join of the Zen output with the
# full TrEMBL table on accession.
# NOTE(review): the figures count joined rows; they equal accession counts
# only if AC is unique in both inputs -- confirm.
zen_tr <- merge(x = inferred_ogs, y = all_tr, by = "AC")
print("Zen mapping in TrEMBL")
print(nrow(zen_tr))
print("Zen mapping in TrEMBL% ()")
print(nrow(zen_tr) / nrow(all_tr) * 100)

In [None]:
[1] "Zen mapping in TrEMBL"
[1] 9170680
[1] "Zen mapping in TrEMBL% ()"
[1] 48.2068

In [None]:
# Zen-mapped rows across both databases (Swiss-Prot + TrEMBL).
print("Zen mapping in UniProtKB")
print(nrow(zen_sp) + nrow(zen_tr))

In [None]:
[1] "Zen mapping in UniProtKB"
[1] 9209445

In [None]:
# Total rows across both databases (Swiss-Prot + TrEMBL).
print("Total accession in UniProtKB-Plants")
print(nrow(all_sp) + nrow(all_tr))

In [None]:
[1] "Total accession in UniProtKB-Plants"
[1] 19068217

In [None]:
# Percentage of all UniProtKB-Plants rows that received a Zen mapping:
# Zen-mapped rows (Swiss-Prot + TrEMBL) over total rows in both tables.
# NOTE(review): the printed label reads "Total accession ... (%)" although
# the value is the Zen-mapped share; label left unchanged here.
print("Total accession in UniProtKB-Plants (%)")
zen_mapped_rows <- nrow(zen_sp) + nrow(zen_tr)
print((zen_mapped_rows / (nrow(all_sp) + nrow(all_tr))) * 100)

In [None]:
[1] "Total accession in UniProtKB-Plants (%)"
[1] 48.29736

In [None]:
# Save the comparison table (rows with a Zen parental group) as a
# tab-separated file with a header row and no row names.
fwrite(
  x = compare,
  file = "Zen_OrthoDB_vs_UniProt_OrthoDB_20240429.tsv",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)