In [None]:
install.packages("readr")

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")


BiocManager::install("clValid")
BiocManager::install("org.Mm.eg.db")
BiocManager::install("org.Hs.eg.db")

BiocManager::install("clusterProfiler")
BiocManager::install("GOSemSim")

In [None]:
library("readr")
library("clValid")
library('org.Mm.eg.db')
library('org.Hs.eg.db')

set.seed(12345)

In [None]:
# Compute the BHI z-score for one partition: the BHI of the given clustering
# is compared with the BHI of randomly permuted cluster labels.
# @param clusters: clustering ids, one per gene
# @param annotation: forwarded unchanged as the `annotation` argument of BHI()
#   (the callers in this file pass the annotation package name "org.Hs.eg.db")
# @param n_permute: number of random clusterings
# @param gene_names: gene identifiers forwarded as the `names` argument of
#   BHI(). Defaults to the first column of the global `valid_genes`, which is
#   what this function always used; pass it explicitly when `clusters` was
#   built from another gene table so labels and cluster ids stay aligned.
#   The argument is evaluated lazily, i.e. only if BHI() actually reads it.
# @return (observed BHI - mean of permuted BHIs) / sd of permuted BHIs
bhi_zscore <- function(clusters, annotation, n_permute = 100,
                       gene_names = as.character(valid_genes[, 1])) {
  cluster_bhi <- BHI(clusters, annotation, names = gene_names, category = "BP")
  # seq_len() keeps the permutation count exact (1:0 would iterate twice).
  random_bhi <- vapply(
    seq_len(n_permute),
    function(i) {
      BHI(sample(clusters), annotation, names = gene_names, category = "BP")
    },
    numeric(1)
  )
  (cluster_bhi - mean(random_bhi)) / sd(random_bhi)
}


# Load the clustering table: rows are genes (gene symbols as row names) and
# each column holds the cluster ids of one partition.
## The gene matrix built can be shared between methods for the same dataset
partitions <- read.table("PCA_10Xmouse_gene_cluster_v5.csv", sep = ",", header = TRUE, row.names = 1)
genes <- rownames(partitions)
# Positions of gene names that are non-empty and not a repeat of an earlier one.
id_used <- which(genes != "" & !duplicated(genes))

# Symbol -> ENTREZ id lookup; symbols without a mapping come back as NA and
# are dropped by na.omit().
# NOTE(review): the input file name mentions mouse while the human database
# (org.Hs.eg.db) is queried -- confirm this is intended.
valid_genes <- na.omit(as.data.frame(as.matrix(
  mapIds(org.Hs.eg.db, keys = genes, column = 'ENTREZID', keytype = 'SYMBOL')
)))

print('Percentage of valid genes')
print(nrow(valid_genes) / length(genes) * 100)


# compute z_scores
n_time <- ncol(partitions)
zscores <- numeric(n_time)
# Only the second partition column is scored here; every other entry of
# zscores keeps its initial 0.
for (i in 2:2) {
  clusters <- partitions[rownames(valid_genes), i]
  zscores[i] <- bhi_zscore(clusters, "org.Hs.eg.db", n_permute = 100)
}

In [None]:
# Map the non-empty, de-duplicated gene symbols to ENTREZ ids.
# bitr() lives in clusterProfiler, which is not attached by the library()
# calls that load clValid, so it is called through its namespace.
genes_eid <- clusterProfiler::bitr(geneID = genes[id_used], fromType = 'SYMBOL', toType = 'ENTREZID', OrgDb = 'org.Hs.eg.db', drop = TRUE)

# compute z_scores, one per partition column
# NOTE(review): bhi_zscore() labels genes with the global `valid_genes` unless
# told otherwise, while `clusters` below follows `genes_eid`; confirm both
# tables list the same genes in the same order.
n_time <- length(names(partitions))
zscores <- rep(0, n_time)
# seq_len() avoids the 1:0 trap when the table has no partition columns.
for (i in seq_len(n_time)) {
  clusters <- partitions[genes_eid$SYMBOL, i]
  zscores[i] <- bhi_zscore(clusters, "org.Hs.eg.db", n_permute = 100)
}
View(zscores)

### No longer used

In [1]:
# Shell step, not R code: run the following command in a terminal before
# starting the R kernel (as a bare line in an R cell it cannot be parsed).
# conda install -c r r-essentials


install.packages("readr")

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")


BiocManager::install("clValid")

BiocManager::install("clusterProfiler")
BiocManager::install("GOSemSim")
BiocManager::install("org.Mm.eg.db")
BiocManager::install("org.Hs.eg.db")
BiocManager::install("genebabel")

SyntaxError: invalid syntax (<ipython-input-1-95c0845d4ae5>, line 3)

In [1]:
install.packages("readr")

also installing the dependencies ‘glue’, ‘cli’, ‘vroom’, ‘tzdb’


“installation of package ‘glue’ had non-zero exit status”
“installation of package ‘tzdb’ had non-zero exit status”
“installation of package ‘cli’ had non-zero exit status”
“installation of package ‘vroom’ had non-zero exit status”
“installation of package ‘readr’ had non-zero exit status”
Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [3]:
library("readr")
library('clusterProfiler')
library('GOSemSim')
library('org.Mm.eg.db')
library('org.Hs.eg.db')


# Convert HGNC name/ID to HGNC symbols
library('genebabel')

set.seed(12345)

##### functions

NameError: name 'library' is not defined

In [4]:
# Pairwise gene similarity matrix, modified from mgeneSim in GOSemSim.
# Each gene is mapped to its GO terms with gene2GO(), the term-by-term
# similarity table is computed once with mgoSim(), and every gene pair is
# scored by combining the matching sub-table with combineScores().
# Rows/columns that are entirely NA are not removed from the result.
# @param genes: gene ids; coerced to character and de-duplicated
# @param semData: forwarded to gene2GO() (as godata) and to mgoSim()
# @param measure: forwarded to mgoSim()
# @param drop: evidence codes forwarded to gene2GO() as dropCodes
# @param combine: forwarded to combineScores()
# @param verbose: show a text progress bar while the pairs are scored
# @return symmetric matrix with the unique gene ids as row and column names
geneSimMatirx <- function (genes, semData, measure="Resnik", drop="IEA", combine="max", verbose=FALSE) {
  gene_ids <- unique(as.character(genes))
  n_genes <- length(gene_ids)
  sim <- matrix(NA, nrow = n_genes, ncol = n_genes)
  rownames(sim) <- gene_ids
  colnames(sim) <- gene_ids

  # GO terms per gene, then one similarity table over all distinct terms.
  go_terms <- lapply(gene_ids, gene2GO, godata = semData, dropCodes = drop)
  all_terms <- unique(unlist(go_terms))
  term_sim <- mgoSim(all_terms, all_terms, semData, measure = measure, combine = NULL)

  if (verbose) {
    step <- 1
    progress <- txtProgressBar(min = 0, max = sum(1:n_genes), style = 3)
  }
  # Score the lower triangle (diagonal included) and mirror each value.
  for (gi in seq_along(gene_ids)) {
    for (gj in seq_len(gi)) {
      if (verbose) {
        setTxtProgressBar(progress, step)
        step <- step + 1
      }
      sim[gi, gj] <- combineScores(term_sim[go_terms[[gi]], go_terms[[gj]]], combine = combine)
      sim[gj, gi] <- sim[gi, gj]
    }
  }
  if (verbose) {
    close(progress)
  }
  sim
}

# GO terms annotated to a single gene (copied from GOSemSim).
# @param gene: gene id, compared against the first column of godata@geneAnno
# @param godata: object with a `geneAnno` slot holding a table with a GO
#   column and, optionally, an EVIDENCE column
# @param dropCodes: evidence codes whose annotations are discarded; ignored
#   (with a warning) when the table has no EVIDENCE column
# @return character vector of unique, non-NA GO ids for the gene
gene2GO <- function(gene, godata, dropCodes) {
  annotation <- godata@geneAnno
  if ("EVIDENCE" %in% colnames(annotation)) {
    keep <- !annotation$EVIDENCE %in% dropCodes
    annotation <- annotation[keep, ]
  } else {
    warning("Evidence codes not found, 'drop' parameter will be ignored...")
  }
  gene_rows <- annotation[, 1] == gene
  terms <- as.character(unique(annotation[gene_rows, "GO"]))
  terms[!is.na(terms)]
}

# Compute the semantic BHI for one partition: the mean, over clusters, of the
# mean pairwise similarity between the genes inside each cluster.
# @param clusters: clustering ids, position k belonging to row/column k of
#   bhi_matrix. Ids may be any labels (0-based, non-contiguous, character);
#   NA entries are treated as "no cluster".
# @param bhi_matrix: may contain NA as there are unmapped genes or genes with no GO terms.
# @return mean of the per-cluster means; clusters whose sub-matrix has no
#   non-NA entry are left out
cluster_BHI <- function(clusters, bhi_matrix) {
  # Iterate over the ids that actually occur instead of assuming 1..n_c,
  # which silently skipped clusters for 0-based or non-contiguous labels.
  cluster_ids <- unique(clusters[!is.na(clusters)])
  per_cluster <- vapply(
    cluster_ids,
    function(id) {
      idx <- which(clusters == id)
      mean(bhi_matrix[idx, idx], na.rm = TRUE)
    },
    numeric(1)
  )
  mean(per_cluster, na.rm = TRUE)
}

# Compute the BHI z-score for one partition: the BHI of the given clustering
# is compared with the BHI of randomly permuted cluster labels.
# @param clusters: clustering ids
# @param bhi_matrix: may contain NA as there are unmapped genes or genes with no GO terms.
# @param n_permute: number of random clusterings
# @return (observed BHI - mean of permuted BHIs) / sd of permuted BHIs
bhi_zscore <- function(clusters, bhi_matrix, n_permute = 500) {
  cluster_bhi <- cluster_BHI(clusters, bhi_matrix)
  # seq_len() keeps the permutation count exact (1:0 would iterate twice).
  random_bhi <- vapply(
    seq_len(n_permute),
    function(i) cluster_BHI(sample(clusters), bhi_matrix),
    numeric(1)
  )
  (cluster_bhi - mean(random_bhi)) / sd(random_bhi)
}


# Load the clustering table: rows are genes (gene symbols as row names) and
# each column holds the cluster ids of one partition.
## The gene matrix built can be shared between methods for the same dataset
partitions <- read.table("output/IHPF_10Xpbmc_gene_cluster.csv",sep=",", header = TRUE, row.names=1)
genes <- rownames(partitions)
id_used <- intersect(which(genes!=""),which(duplicated(genes)==0))



# Convert HGNC symbols to ENTREZID


######### BHI
#mmGO <- godata('org.Mm.eg.db', ont="BP")
# Despite the variable name, the human (org.Hs.eg.db) annotation is loaded.
mmGO <- godata('org.Hs.eg.db', ont="BP")

# get ENTREZID
#genes_eid <- bitr(geneID = genes[id_used], fromType = 'SYMBOL', toType = 'ENTREZID',OrgDb = 'org.Mm.eg.db',drop = TRUE)
genes_eid <- bitr(geneID = genes[id_used], fromType = 'SYMBOL', toType = 'ENTREZID',OrgDb = 'org.Hs.eg.db',drop = TRUE)
genes_ENTREZID <- genes_eid$ENTREZID


# build semantic gene similarity matrix, very slow
g_matrix <- geneSimMatirx(genes_ENTREZID, mmGO, measure = "Resnik", 
                          drop = "IEA",combine = "max", verbose = FALSE)


## Re-index g_matrix so that row/column k corresponds to genes_ENTREZID[k],
## keeping each gene pair once (strict lower triangle).
n_g <- length(genes_ENTREZID)
scores <-  matrix(NA_real_, nrow=n_g, ncol=n_g)
rownames(scores) <- genes_ENTREZID
colnames(scores) <- genes_ENTREZID
# One vectorised copy instead of an element-by-element double loop.
# Missing ids and later copies of a repeated id stay NA, so a gene is never
# compared with itself through a duplicate entry.
usable <- !is.na(genes_ENTREZID) & !duplicated(genes_ENTREZID)
scores[usable, usable] <- g_matrix[genes_ENTREZID[usable], genes_ENTREZID[usable], drop = FALSE]
# Blank the diagonal (self-similarity) and the mirrored upper triangle.
scores[upper.tri(scores, diag = TRUE)] <- NA
g_matrix <- scores

# compute z_scores, one per partition column
n_time <- length(names(partitions))
zscores <- rep(0,n_time)
# seq_len() avoids the 1:0 trap when the table has no partition columns.
for (i in seq_len(n_time)) {
  clusters <- partitions[genes_eid$SYMBOL,i]
  zscores[i] <- bhi_zscore(clusters,g_matrix,n_permute=1000)
}
View(zscores)

## Save zscores 



SyntaxError: invalid syntax (<ipython-input-4-259ec9a8bb99>, line 2)

In [None]:
zscores