# Notebook for R (Biomart Homolog download)

# R Notebook

In [1]:
# Show today's date formatted as day-month abbreviation-year
format(Sys.Date(), format = "%d-%b-%Y")

In [3]:
# NOTE: run this notebook from the "R" conda environment.

# Log the retrieval date so the downloaded snapshot can be traced later
print(paste0("Retrieving on: ", format(Sys.Date(), format = "%d-%b-%Y")))

# Fetch the complete HGNC gene set into the working directory
download.file(
  destfile = "hgnc_complete_set.txt",
  method = "auto",
  url = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt"
)

# Load the downloaded tab-separated file; its first line holds column names
hgnc_data <- read.delim(
  file = "hgnc_complete_set.txt",
  header = TRUE,
  sep = "\t"
)
# head(hgnc_data)

[1] "Retrieving on: 06-Jun-2025"


In [4]:
# Display the HGNC table loaded above (the notebook renders a preview)
hgnc_data

hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,⋯,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,⋯,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,19q13.43,19q13.43,FLJ23569,,⋯,,,,,URS00007E4F6E,A1BG-AS1,,HGNC:37133,,
HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,⋯,,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,
HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,⋯,,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7
HGNC:27057,A2M-AS1,A2M antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,12p13.31,12p13.31,,,⋯,,,,,URS00001F234A,A2M-AS1,,HGNC:27057,,
HGNC:23336,A2ML1,alpha-2-macroglobulin like 1,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FLJ25179|p170,,⋯,,,,,,,,HGNC:23336,ENST00000299698.12|NM_144670.6,HGNC:23336
HGNC:41022,A2ML1-AS1,A2ML1 antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,12p13.31,12p13.31,,,⋯,,,,,URS00005F9A07,A2ML1-AS1,,,,
HGNC:41523,A2ML1-AS2,A2ML1 antisense RNA 2,non-coding RNA,"RNA, long non-coding",Approved,12p13.31,12p13.31,,,⋯,,,,,URS00001BFF15,A2ML1-AS2,,,,
HGNC:8,A2MP1,alpha-2-macroglobulin pseudogene 1,pseudogene,pseudogene,Approved,12p13.31,12p13.31,,,⋯,,,,,,,,HGNC:8,,
HGNC:30005,A3GALT2,"alpha 1,3-galactosyltransferase 2",protein-coding gene,gene with protein product,Approved,1p35.1,01p35.1,IGBS3S|IGB3S,iGb3 synthase|isoglobotriaosylceramide synthase,⋯,,,,,,,,HGNC:30005,ENST00000442999.3|NM_001080438.1,


In [5]:
# Persist the HGNC table as an unquoted TSV without row names.
# The output directory is created first because write.table() cannot open a
# file inside a directory that does not exist.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
write.table(
  hgnc_data,
  file = "data/HGNC_gene_info_full.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)

In [8]:
library(biomaRt)

# Main
print(paste0("Retrieving on: ", format(Sys.Date(), "%d-%b-%Y")))

# Connect to the human gene dataset on Ensembl BioMart
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Pull HGNC ID, Ensembl protein ID and the MANE Select transcript annotation
id_cols <- c("hgnc_id", "ensembl_peptide_id", "transcript_mane_select")
human_genes <- getBM(attributes = id_cols, mart = ensembl)
human_genes <- unique(human_genes)

# Keep only rows in which all three identifiers are present.
# A single mask treats NA and "" alike: comparing NA with "" yields NA, and
# subsetting a data frame with an NA index produces an all-NA row, so the NA
# test must be part of the mask rather than a later clean-up step.
missing_id <- is.na(human_genes[id_cols]) | human_genes[id_cols] == ""
human_genes <- human_genes[rowSums(missing_id) == 0, ]

# Save to file (make sure the output directory exists first)
dir.create("data", showWarnings = FALSE, recursive = TRUE)
readr::write_csv(human_genes, "data/hgnc_ensp_biomart.csv")

[1] "Retrieving on: 06-Jun-2025"


In [None]:
# List the distinct HGNC IDs present in human_genes
unique(human_genes$hgnc_id)

In [15]:

# Drop rows without an HGNC ID. The column is named `hgnc_id`; `$` on a
# column that does not exist returns NULL, which gives a zero-length mask
# and therefore selects no rows at all.
human_genes <- human_genes[!is.na(human_genes$hgnc_id), ]

In [28]:
# Number of distinct HGNC IDs left in human_genes
length(unique(human_genes[["hgnc_id"]]))

In [31]:
# Write the current human_genes table to CSV.
# NOTE(review): the file name says "ensg_ensp", but the BioMart query shown
# in this notebook fetches hgnc_id / ensembl_peptide_id /
# transcript_mane_select -- confirm the intended file name or contents.
readr::write_csv(human_genes, "data/ensg_ensp_biomart.csv")

In [14]:
# List the attributes (name, description, page) offered by the `ensembl` mart
listAttributes(ensembl)

name,description,page
<chr>,<chr>,<chr>
ensembl_gene_id,Gene stable ID,feature_page
ensembl_gene_id_version,Gene stable ID version,feature_page
ensembl_transcript_id,Transcript stable ID,feature_page
ensembl_transcript_id_version,Transcript stable ID version,feature_page
ensembl_peptide_id,Protein stable ID,feature_page
ensembl_peptide_id_version,Protein stable ID version,feature_page
ensembl_exon_id,Exon stable ID,feature_page
description,Gene description,feature_page
chromosome_name,Chromosome/scaffold name,feature_page
start_position,Gene start (bp),feature_page


In [3]:
#' Retrieve human genes together with their orthologs in another species
#'
#' Queries the Ensembl BioMart human gene dataset twice: once for the human
#' identifiers (`hgnc_id`, `hgnc_symbol`, `ensembl_gene_id`) and once for the
#' homolog attributes of `species_name`. The two tables are merged on
#' `ensembl_gene_id` with `all.x = TRUE`, so every row of the human gene table
#' is kept even when no homolog row matches. The merged table is also written
#' to `data/<species_name>_ID_biomart.csv`.
#'
#' @param species_name Single string: the Ensembl dataset prefix of the target
#'   species (e.g. `"mmusculus"`). It is used to build the attribute names
#'   `<species_name>_homolog_ensembl_gene` and
#'   `<species_name>_homolog_associated_gene_name`.
#' @return The merged data frame (human identifiers plus the two homolog
#'   columns).
get_species_orthologs <- function(species_name) {
  # Fail early with a clear message instead of sending a malformed attribute
  # name to BioMart.
  if (!is.character(species_name) || length(species_name) != 1L ||
        is.na(species_name) || !nzchar(species_name)) {
    stop(
      "`species_name` must be a single non-empty string, e.g. \"mmusculus\".",
      call. = FALSE
    )
  }

  # Connect to Ensembl BioMart (human gene dataset)
  ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

  # Retrieve all human genes with their Ensembl IDs and HGNC identifiers
  human_genes <- getBM(
    attributes = c("hgnc_id", "hgnc_symbol", "ensembl_gene_id"),
    mart = ensembl
  )

  # Build the species-specific homolog attribute names
  species_column <- paste0(species_name, "_homolog_ensembl_gene")
  species_gene_name <- paste0(species_name, "_homolog_associated_gene_name")

  # Fetch the homolog columns in a separate query, keyed by human gene ID
  orthologs <- getBM(
    attributes = c("ensembl_gene_id", species_column, species_gene_name),
    mart = ensembl
  )

  # Left join: keep every human gene row, matched or not
  final_result <- merge(
    human_genes,
    orthologs,
    by = "ensembl_gene_id",
    all.x = TRUE
  )

  # The output directory must exist before the CSV can be written
  dir.create("data", showWarnings = FALSE, recursive = TRUE)
  readr::write_csv(
    final_result,
    file.path("data", paste0(species_name, "_ID_biomart.csv"))
  )

  final_result
}

# Return the attribute table of one Ensembl dataset.
# `species_dataset` is the dataset name passed to useMart(), e.g.
# "hsapiens_gene_ensembl"; the result is whatever listAttributes() returns
# for that mart.
list_species_attributes <- function(species_dataset) {
  mart <- useMart(biomart = "ensembl", dataset = species_dataset)
  listAttributes(mart)
}

In [5]:
# Marmoset (Callithrix jacchus): fetch orthologs and write
# data/cjacchus_ID_biomart.csv. The result is not assigned, so the notebook
# displays the returned data frame.
get_species_orthologs("cjacchus")

ensembl_gene_id,hgnc_id,hgnc_symbol,cjacchus_homolog_ensembl_gene,cjacchus_homolog_associated_gene_name
<chr>,<chr>,<chr>,<chr>,<chr>
ENSG00000000003,HGNC:11858,TSPAN6,ENSCJAG00000004288,TSPAN6
ENSG00000000005,HGNC:17757,TNMD,ENSCJAG00000004270,TNMD
ENSG00000000419,HGNC:3005,DPM1,ENSCJAG00000000551,DPM1
ENSG00000000457,HGNC:19285,SCYL3,ENSCJAG00000010050,SCYL3
ENSG00000000460,HGNC:25565,FIRRM,ENSCJAG00000010088,FIRRM
ENSG00000000938,HGNC:3697,FGR,ENSCJAG00000009293,FGR
ENSG00000000971,HGNC:4883,CFH,ENSCJAG00000008415,CFH
ENSG00000001036,HGNC:4008,FUCA2,ENSCJAG00000019246,FUCA2
ENSG00000001084,HGNC:4311,GCLC,ENSCJAG00000011063,GCLC
ENSG00000001167,HGNC:7804,NFYA,ENSCJAG00000004074,NFYA


In [4]:
# Rhesus macaque (dataset prefix "mmulatta"): fetch orthologs and write
# data/mmulatta_ID_biomart.csv. The result is not assigned, so the notebook
# displays the returned data frame.
get_species_orthologs("mmulatta")

ensembl_gene_id,hgnc_id,hgnc_symbol,mmulatta_homolog_ensembl_gene,mmulatta_homolog_associated_gene_name
<chr>,<chr>,<chr>,<chr>,<chr>
ENSG00000000003,HGNC:11858,TSPAN6,,
ENSG00000000005,HGNC:17757,TNMD,ENSMMUG00000008212,TNMD
ENSG00000000419,HGNC:3005,DPM1,ENSMMUG00000002759,DPM1
ENSG00000000457,HGNC:19285,SCYL3,ENSMMUG00000016583,SCYL3
ENSG00000000460,HGNC:25565,FIRRM,ENSMMUG00000016582,FIRRM
ENSG00000000938,HGNC:3697,FGR,ENSMMUG00000014434,FGR
ENSG00000000971,HGNC:4883,CFH,ENSMMUG00000045497,CFH
ENSG00000001036,HGNC:4008,FUCA2,ENSMMUG00000005366,FUCA2
ENSG00000001084,HGNC:4311,GCLC,ENSMMUG00000008684,GCLC
ENSG00000001167,HGNC:7804,NFYA,ENSMMUG00000021173,NFYA


In [None]:
# Chimpanzee (dataset prefix "ptroglodytes"); result is not assigned, so it
# is displayed by the notebook
get_species_orthologs("ptroglodytes")

In [None]:
# Chicken (Gallus gallus); also writes data/ggallus_ID_biomart.csv
chicken_orthologs <- get_species_orthologs("ggallus")

In [None]:
# Pig (Sus scrofa); also writes data/sscrofa_ID_biomart.csv
pig_orthologs <- get_species_orthologs("sscrofa")

In [None]:
# Cow (Bos taurus); also writes data/btaurus_ID_biomart.csv
cow_orthologs <- get_species_orthologs("btaurus")

In [None]:
# Dog (Canis lupus familiaris); also writes data/clfamiliaris_ID_biomart.csv
dog_orthologs <- get_species_orthologs("clfamiliaris")

In [None]:
# Horse (Equus caballus); also writes data/ecaballus_ID_biomart.csv
horse_orthologs <- get_species_orthologs("ecaballus")

In [None]:
# Sheep (Ovis aries rambouillet); also writes
# data/oarambouillet_ID_biomart.csv
sheep_orthologs <- get_species_orthologs("oarambouillet")

In [None]:
# Mouse (Mus musculus) -- marked "test" by the author
mouse_orthologs <- get_species_orthologs("mmusculus")

In [None]:
# Rat (Rattus norvegicus) -- marked "test" by the author
rat_orthologs <- get_species_orthologs("rnorvegicus")

In [None]:
# Zebrafish (Danio rerio) -- marked "test" by the author
zebra_orthologs <- get_species_orthologs("drerio")

# Check homologs available

In [None]:
# Open the human gene dataset and fetch its attribute table
ensembl_human <- useMart(
  biomart = "ensembl",
  dataset = "hsapiens_gene_ensembl"
)
attributes_human <- listAttributes(mart = ensembl_human)

In [None]:
# Keep the attribute rows whose name contains "homolog_ensembl_gene"
homologs_available <- attributes_human[
  grepl("homolog_ensembl_gene", attributes_human[["name"]]),
]

In [None]:
# Save the homolog attribute table. The output directory is created first
# because the CSV cannot be written into a directory that does not exist.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
readr::write_csv(homologs_available, "data/human_homologs_biomart.csv")

In [8]:
# Fetch the attribute table of the human dataset and preview its first rows
human_attributes <- list_species_attributes(
  species_dataset = "hsapiens_gene_ensembl"
)
head(human_attributes, n = 6L)

Unnamed: 0_level_0,name,description,page
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,ensembl_gene_id,Gene stable ID,feature_page
2,ensembl_gene_id_version,Gene stable ID version,feature_page
3,ensembl_transcript_id,Transcript stable ID,feature_page
4,ensembl_transcript_id_version,Transcript stable ID version,feature_page
5,ensembl_peptide_id,Protein stable ID,feature_page
6,ensembl_peptide_id_version,Protein stable ID version,feature_page


In [15]:
# Download the complete HGNC gene set into the working directory
download.file(
  url = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt",
  destfile = "hgnc_complete_set.txt",
  method = "auto"
)

# Read the tab-separated file into R (needed for the export below)
hgnc_data <- read.delim("hgnc_complete_set.txt", sep = "\t", header = TRUE)
head(hgnc_data)

# Save to file. The output directory is created first because write.table()
# cannot open a file inside a directory that does not exist.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
write.table(
  hgnc_data,
  file = "data/HGNC_gene_info_full.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)

Unnamed: 0_level_0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,⋯,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,⋯,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
2,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,19q13.43,19q13.43,FLJ23569,,⋯,,,,,URS00007E4F6E,A1BG-AS1,,HGNC:37133,,
3,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,⋯,,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,
4,HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,⋯,,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7
5,HGNC:27057,A2M-AS1,A2M antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,12p13.31,12p13.31,,,⋯,,,,,URS00001F234A,A2M-AS1,,HGNC:27057,,
6,HGNC:23336,A2ML1,alpha-2-macroglobulin like 1,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FLJ25179|p170,,⋯,,,,,,,,HGNC:23336,ENST00000299698.12|NM_144670.6,HGNC:23336


In [3]:
# Column names of the HGNC table
names(hgnc_data)

In [2]:
#listAttributes(ensembl)

In [9]:
# Save the HGNC table as an unquoted TSV without row names.
# The output directory is created first because write.table() cannot open a
# file inside a directory that does not exist.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
write.table(
  hgnc_data,
  file = "data/HGNC_gene_info_full.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)