# Convert Ensembl gene IDs to gene symbols for macaque

## Load libraries

In [1]:
suppressPackageStartupMessages(library(MuSiC))
suppressPackageStartupMessages(library(xbioc))
suppressPackageStartupMessages(library(Biobase))
suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(Biobase))

## Load prepared single-cell reference (Seq-Well)

In [2]:
# Read the prepared single-cell reference object from disk.
sc_reference <- readRDS(
  file = "/home/nbarkas/disk3/ebov_bulk_rna_seq/proc_20210524/18-CompositionDeconvolution/00-reference_data/01-EBOV_sc/01-prepare-eset/eset_sc.rds"
)

In [3]:
# The gene names are the row names of the reference expression matrix.
genenames_in_single_cell <- rownames(x = exprs(sc_reference))

In [4]:
# Preview the first few gene names.
head(x = genenames_in_single_cell)

In [5]:
# Do we have any non-SYMBOL identifiers, such as Ensembl IDs?
# Ensembl stable gene IDs have the shape "ENS" + optional species code + "G" +
# digits (e.g. ENSMMUG00000000009). Matching only on the "EN" prefix would also
# report genuine gene symbols such as ENO1 or ENG, so anchor on the full ID
# shape. There is no end anchor, so versioned IDs (".4") are still reported.
grep("^ENS[A-Z]*G[0-9]+", genenames_in_single_cell, value = TRUE)

## Load bulk data and convert to Eset

In [6]:
# Location of the bulk RNA-seq object saved by the sample QC step.
bulk_qc_summarized_experiment_path <- "/home/nbarkas/disk3/ebov_bulk_rna_seq/proc_20210524/03-sample_qc/se.qc.rds"

In [7]:
# Load the bulk data (presumably a SummarizedExperiment; assay() is called on it later).
bulk.se.qc <- readRDS(file = bulk_qc_summarized_experiment_path)

## Convert

In [8]:
library(biomaRt)

In [9]:
# List the BioMart databases offered by Ensembl together with their versions.
listEnsembl()

biomart,version
<chr>,<chr>
genes,Ensembl Genes 104
mouse_strains,Mouse strains 104
snps,Ensembl Variation 104
regulation,Ensembl Regulation 104


In [10]:
# Connect to the Ensembl "genes" BioMart.
# NOTE(review): no Ensembl release is pinned here, so the mart presumably
# follows the current release; versioned gene IDs may stop matching on a later
# release -- confirm whether `version = 104` should be set.
ensembl <- useEnsembl(biomart = "genes")

In [11]:
# Keep the catalogue of datasets available in this mart.
datasets <- listDatasets(mart = ensembl)

In [12]:
# Find the dataset for Macaca mulatta by searching on the species name.
searchDatasets(pattern = "mulatta", mart = ensembl)

Unnamed: 0_level_0,dataset,description,version
Unnamed: 0_level_1,<I<chr>>,<I<chr>>,<I<chr>>
104,mmulatta_gene_ensembl,Macaque genes (Mmul_10),Mmul_10


In [13]:
# Select the macaque gene dataset on the mart connection.
ensembl_mmul10 <- useDataset(
  dataset = "mmulatta_gene_ensembl",
  mart = ensembl
)

### Convert the Ensembl gene identifiers to gene symbols

In [22]:
# Load the project helper functions (presumably including strpart(), used below -- verify).
source(file = "/home/nbarkas/disk3/ebov_bulk_rna_seq/proc_20210524/prj_helpers.R")

In [33]:
# Remove the gene version suffix (".N") from the Ensembl gene IDs.
# The bulk row names are versioned gene IDs (e.g. ENSMMUG00000000009.4).
gene_ids <- rownames(assay(bulk.se.qc))
# strpart() is not defined in this file (presumably from prj_helpers.R); it is
# assumed to split on the literal "." and return the first piece -- verify.
gene_ids_no_version <- strpart(gene_ids, ".", fixed = TRUE, 1)

In [24]:
# Fetch every attribute that can be retrieved from the macaque dataset.
attr_list <- listAttributes(mart = ensembl_mmul10)

In [25]:
# Preview the attribute table.
head(x = attr_list)

Unnamed: 0_level_0,name,description,page
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,ensembl_gene_id,Gene stable ID,feature_page
2,ensembl_gene_id_version,Gene stable ID version,feature_page
3,ensembl_transcript_id,Transcript stable ID,feature_page
4,ensembl_transcript_id_version,Transcript stable ID version,feature_page
5,ensembl_peptide_id,Protein stable ID,feature_page
6,ensembl_peptide_id_version,Protein stable ID version,feature_page


In [26]:
# Show the attributes whose name contains "symbol".
attr_list[grep(pattern = "symbol", x = attr_list$name), ]

Unnamed: 0_level_0,name,description,page
Unnamed: 0_level_1,<chr>,<chr>,<chr>
50,hgnc_symbol,HGNC symbol,feature_page
73,uniprot_gn_symbol,UniProtKB Gene Name symbol,feature_page


In [36]:
# Map the versioned Ensembl gene IDs of the bulk data to gene names via BioMart.
# The key is to request 'external_gene_name' rather than 'hgnc_symbol', because
# the latter column is very sparsely populated.
# Alternative (not used): filter on the unversioned "ensembl_gene_id" with
# values = gene_ids_no_version.
gene_conversion <- getBM(
  attributes = c("ensembl_gene_id_version", "ensembl_gene_id", "external_gene_name"),
  filters = "ensembl_gene_id_version",
  values = gene_ids,
  mart = ensembl_mmul10
)


Batch submitting query [===>---------------------------]  12% eta:  9s






                                                                      



In [37]:
# Preview the ID-to-name conversion table.
head(x = gene_conversion)

Unnamed: 0_level_0,ensembl_gene_id_version,ensembl_gene_id,external_gene_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,ENSMMUG00000000009.4,ENSMMUG00000000009,PTPRC
2,ENSMMUG00000000015.4,ENSMMUG00000000015,C1H1orf100
3,ENSMMUG00000000016.4,ENSMMUG00000000016,ADSS2
4,ENSMMUG00000000021.4,ENSMMUG00000000021,CATSPERE
5,ENSMMUG00000000037.4,ENSMMUG00000000037,ISG20L2
6,ENSMMUG00000000038.4,ENSMMUG00000000038,RRNAD1


In [38]:
# Report how many rows and columns the conversion table has.
dim(gene_conversion)

In [39]:
# Save the conversion table to the current working directory.
# Row names are written as an unnamed first column (write.csv default).
write.csv(gene_conversion, file = "gene_conversion.csv")