## Map compound target genes to Gene Ontology Pathways

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(topGO))
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(GO.db))


groupGOTerms: 	GOBPTerm, GOMFTerm, GOCCTerm environments built.



In [2]:
# Threshold used later when filtering GO terms
# (many GO terms have very low counts)
n_pert_filter <- 20

# Pull every GO term annotation out of GO.db as a plain list,
# so a single term can be looked up by its GO ID
go_annotations_list <- as.list(GOTERM)

In [3]:
# Read the compound / target table from the data directory
cpd_file <- file.path("data", "split_moas_targets_cpds.csv")
cpd_df <- cpd_file %>%
    readr::read_csv(show_col_types = FALSE)

# Number of distinct targets, then the table dimensions and a short preview
print(dplyr::n_distinct(cpd_df$target_unique))
print(dim(cpd_df))
head(cpd_df, n = 3)

[1] 744
[1] 3178    9


pert_iname,moa,train,test,marked,target_unique,clinical_phase,disease_area,indication
<chr>,<chr>,<lgl>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS1,Launched,rheumatology,rheumatoid arthritis|osteoarthritis
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS2,Launched,rheumatology,rheumatoid arthritis|osteoarthritis
ketoprofen,cyclooxygenase inhibitor,True,False,True,SLC5A8,Launched,rheumatology,rheumatoid arthritis|osteoarthritis


In [4]:
# For each GO ontology, map target genes to pathways.
#
# Result: `go_mapping_df`, a list with one tibble per ontology ("BP", "CC",
# "MF"). Each tibble has one row per (gene, GO term) pair and the columns
# go_term, gene, term_label, term_synonym and go_ontology. Synonyms of a term
# are collapsed into a single "|"-separated string.
go_ontologies <- c("BP", "CC", "MF")

# The candidate gene set does not depend on the ontology, so compute it once
target_genes <- unique(cpd_df$target_unique)

go_mapping_df <- list()
for (go_ont in go_ontologies) {
    # Identify total gene map per ontology
    geneMap <- topGO::annFUN.org(
        whichOnto = go_ont,
        feasibleGenes = NULL,
        mapping = "org.Hs.eg.db",
        ID = "symbol"
    )

    # Pull pathway assignment per target: after inverting, the list is
    # indexed by gene and each element holds that gene's GO IDs
    target_pathways <- topGO::inverseList(
        topGO::annFUN.GO2genes(
            whichOnto = go_ont,
            feasibleGenes = target_genes,
            GO2genes = geneMap
        )
    )

    # Flatten the gene -> GO IDs list into two parallel vectors (one entry
    # per gene/term pair) instead of growing vectors inside nested loops
    gene_per_row <- as.character(
        rep(names(target_pathways), lengths(target_pathways))
    )
    go_term_per_row <- as.character(
        unlist(target_pathways, use.names = FALSE)
    )

    # Look up each distinct GO ID only once, however many genes share it
    ont_go_terms <- unique(go_term_per_row)
    unknown_terms <- setdiff(ont_go_terms, names(go_annotations_list))
    if (length(unknown_terms) > 0) {
        # Fail with a readable message rather than a slot-access error on NULL
        stop(
            "GO terms missing from GO.db annotations: ",
            paste(unknown_terms, collapse = ", "),
            call. = FALSE
        )
    }
    ont_annotations <- go_annotations_list[ont_go_terms]
    ont_labels <- vapply(
        ont_annotations,
        function(annotation) annotation@Term,
        character(1)
    )
    ont_synonyms <- vapply(
        ont_annotations,
        # A term without synonyms collapses to the empty string
        function(annotation) paste0(annotation@Synonym, collapse = "|"),
        character(1)
    )

    # Expand the per-term lookups back to one value per gene/term pair
    row_idx <- match(go_term_per_row, ont_go_terms)

    # Clean the data for easier input analysis downstream
    go_mapping_df[[go_ont]] <- dplyr::tibble(
        go_term = go_term_per_row,
        gene = gene_per_row,
        term_label = unname(ont_labels[row_idx]),
        term_synonym = unname(ont_synonyms[row_idx]),
        go_ontology = go_ont
    )
}

In [5]:
# Stack the per-ontology tables into a single tibble
full_go_mapping_df <- dplyr::as_tibble(do.call(rbind, go_mapping_df))

# Number of distinct GO terms across all ontologies
print(dplyr::n_distinct(full_go_mapping_df$go_term))

# Number of distinct genes that received at least one GO term
print(dplyr::n_distinct(full_go_mapping_df$gene))

# GO term count per gene, most heavily annotated genes first
terms_per_gene <- table(full_go_mapping_df$gene)
sort(terms_per_gene, decreasing = TRUE)

[1] 5822
[1] 732



    AKT1     TP53      SRC      TNF     ABL1     MTOR    PSEN1     EGFR 
     179      170      166      160      143      142      138      137 
    BCL2     DRD2     CDK5     JAK2    STAT3     RELA    PPARG     PKD2 
     135      123      115      115      109      108      105      103 
     SYK    ANXA1     IL1B      IL6      KIT    GSK3B     MDM2   PDGFRB 
      99       96       96       96       94       93       93       93 
   GPER1   PRKAA1    HDAC1    HDAC6     RARA    ROCK1    KCNQ1    PTGS2 
      89       88       85       85       84       84       83       80 
  HSPA1A    PPARD       AR    ITGB3   MAPK14    ROCK2      SMO    PARP1 
      79       79       78       78       78       78       78       77 
   PRKCD     ESR1      INS    CALM1     CDK1    ITGAV    PPARA    ABCA1 
      77       76       76       75       75       75       75       74 
   HSPA5   PDGFRA     PTK2     CCR2     INSR    HDAC2    HDAC4 HSP90AA1 
      74       74       74       73       73      

In [6]:
# Inspect the combined mapping: dimensions first, then the leading rows
print(c(nrow(full_go_mapping_df), ncol(full_go_mapping_df)))
head(full_go_mapping_df, n = 6)

[1] 23072     5


go_term,gene,term_label,term_synonym,go_ontology
<chr>,<chr>,<chr>,<chr>,<chr>
GO:0002790,ABCA1,peptide secretion,,BP
GO:0006497,ABCA1,protein lipidation,GO:0042050|lipid:protein modification|protein amino acid lipidation,BP
GO:0006869,ABCA1,lipid transport,,BP
GO:0006911,ABCA1,"phagocytosis, engulfment",phagosome biosynthesis|phagosome formation,BP
GO:0007040,ABCA1,lysosome organization,lysosome organisation|lysosome organization and biogenesis,BP
GO:0007186,ABCA1,G protein-coupled receptor signaling pathway,GO:0038042|G protein coupled receptor protein signaling pathway|G protein coupled receptor protein signalling pathway|G-protein coupled receptor protein signal transduction|G-protein coupled receptor protein signaling pathway|G-protein coupled receptor signalling pathway|G-protein-coupled receptor protein signalling pathway|GPCR signaling pathway|GPCR signalling pathway|dimeric G-protein coupled receptor signaling pathway|dimeric G-protein coupled receptor signalling pathway|G-protein coupled receptor signaling pathway via GPCR dimer,BP


In [7]:
# Attach GO terms to every compound/target row. This is a left join, so
# compound rows whose target has no GO mapping are kept with NA GO columns.
cpd_go_df <- dplyr::left_join(
    cpd_df,
    full_go_mapping_df,
    by = c("target_unique" = "gene")
)

# Save the complete (unfiltered) compound-to-pathway table
output_file <- file.path("data", "split_moas_targets_pathways_cpds_full.csv")
readr::write_csv(cpd_go_df, output_file)

# Number of distinct perturbations
print(dplyr::n_distinct(cpd_go_df$pert_iname))

# Number of distinct go_term values in the merged table
print(dplyr::n_distinct(cpd_go_df$go_term))

print(dim(cpd_go_df))
head(cpd_go_df, n = 3)

[1] 1258
[1] 5823
[1] 83825    13


pert_iname,moa,train,test,marked,target_unique,clinical_phase,disease_area,indication,go_term,term_label,term_synonym,go_ontology
<chr>,<chr>,<lgl>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS1,Launched,rheumatology,rheumatoid arthritis|osteoarthritis,GO:0001516,prostaglandin biosynthetic process,prostaglandin anabolism|prostaglandin biosynthesis|prostaglandin formation|prostaglandin synthesis,BP
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS1,Launched,rheumatology,rheumatoid arthritis|osteoarthritis,GO:0006954,inflammatory response,inflammation,BP
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS1,Launched,rheumatology,rheumatoid arthritis|osteoarthritis,GO:0006979,response to oxidative stress,,BP


In [8]:
# Filter go terms based on low counts.
#
# Result: `cpd_go_counts`, an ungrouped tibble with one row per retained GO
# term (go_term, term_label, n_pert, go_ontology), sorted by decreasing
# n_pert and restricted to terms with n_pert >= n_pert_filter.
cpd_go_counts <- cpd_go_df %>%
    # Rows whose target has no GO annotation carry NA in the GO columns after
    # the left join; they are not a pathway and must not be counted as one
    dplyr::filter(!is.na(go_term)) %>%
    # Drop the target so a compound hitting one term through several targets
    # collapses to a single record
    dplyr::select(!target_unique) %>%
    dplyr::distinct() %>%
    dplyr::group_by(term_label) %>%
    # NOTE(review): n() counts distinct compound records per term, so a
    # compound listed under several MOAs contributes one row per MOA.
    # Confirm whether dplyr::n_distinct(pert_iname) was intended instead.
    dplyr::mutate(n_pert = dplyr::n()) %>%
    # Return a plain tibble so the grouping does not leak into later steps
    dplyr::ungroup() %>%
    dplyr::select(go_term, term_label, n_pert, go_ontology) %>%
    dplyr::distinct() %>%
    dplyr::arrange(desc(n_pert)) %>%
    # The majority of GO terms had very low representation,
    # which would make ML training very difficult
    dplyr::filter(n_pert >= n_pert_filter)

print(dim(cpd_go_counts))

tail(cpd_go_counts, 10)

[1] 773   4


go_term,term_label,n_pert,go_ontology
<chr>,<chr>,<int>,<chr>
GO:0032412,regulation of ion transmembrane transporter activity,20,BP
GO:0019221,cytokine-mediated signaling pathway,20,BP
GO:0015872,dopamine transport,20,BP
GO:0043130,ubiquitin binding,20,MF
GO:0006935,chemotaxis,20,BP
GO:0005765,lysosomal membrane,20,CC
GO:0005819,spindle,20,CC
GO:0098609,cell-cell adhesion,20,BP
GO:0043433,negative regulation of DNA-binding transcription factor activity,20,BP
GO:0032091,negative regulation of protein binding,20,BP


In [9]:
# Keep only the rows whose GO term passed the count filter
common_go_terms <- unique(cpd_go_counts$go_term)
cpd_go_subset_df <- dplyr::filter(cpd_go_df, go_term %in% common_go_terms)

# Save the filtered compound-to-pathway table
output_file <- file.path("data", "split_moas_targets_pathways_cpds.csv")
readr::write_csv(cpd_go_subset_df, output_file)

# Number of distinct perturbations left after filtering
print(dplyr::n_distinct(cpd_go_subset_df$pert_iname))

# Number of distinct go_term values left after filtering
print(dplyr::n_distinct(cpd_go_subset_df$go_term))

print(dim(cpd_go_subset_df))
head(cpd_go_subset_df, n = 3)

[1] 1258
[1] 773
[1] 57224    13


pert_iname,moa,train,test,marked,target_unique,clinical_phase,disease_area,indication,go_term,term_label,term_synonym,go_ontology
<chr>,<chr>,<lgl>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS1,Launched,rheumatology,rheumatoid arthritis|osteoarthritis,GO:0001516,prostaglandin biosynthetic process,prostaglandin anabolism|prostaglandin biosynthesis|prostaglandin formation|prostaglandin synthesis,BP
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS1,Launched,rheumatology,rheumatoid arthritis|osteoarthritis,GO:0006954,inflammatory response,inflammation,BP
ketoprofen,cyclooxygenase inhibitor,True,False,True,PTGS1,Launched,rheumatology,rheumatoid arthritis|osteoarthritis,GO:0006979,response to oxidative stress,,BP
