In [5]:
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# BiocManager::install("fgsea")

In [6]:
library(fgsea)
library(dplyr)

In [7]:
# Gene-set collection in GMT format (per the file name: MSigDB v7.4, gene symbols).
GO_file <- "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/msigdb.v7.4.symbols.gmt.txt"
# Parse the GMT file with fgsea's reader; the result is kept in `myGO`.
myGO <- fgsea::gmtPathways(gmt.file = GO_file)

In [8]:
# Cluster labels per eQTL; the "label" and "gene" columns are the ones used
# further down. The first CSV column is taken as row names.
filename <- "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/genetic_effect/scaled_absolute_eff_sizes_20mofas_as_spatial_downsample_20pct_cluster_labels.csv"
df_clusters <- read.csv(file = filename, row.names = 1)

In [9]:
# Preview the first two rows of the cluster label / eQTL identifier columns.
head(df_clusters[, c("label", "gene")], n = 2)

Unnamed: 0,label,gene
0,16,ENSG00000005059_CCDC109B_-4_110648632_T_A
1,16,ENSG00000006016_CRLF1_-19_18735221_G_T


In [10]:
# load cluster summaries
# The first CSV column is used as row names; each remaining column is the
# summary of one cluster.
cluster_summaries <- read.csv(
  "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/genetic_effect/scaled_absolute_eff_sizes_20mofas_as_spatial_downsample_20pct_cluster_summaries.csv",
  row.names = 1
)
# Name the columns cluster0, cluster1, ... based on the actual number of
# columns rather than a hard-coded 0:16, so a file with a different number of
# clusters does not make the assignment fail.
colnames(cluster_summaries) <- paste0(
  "cluster", seq_len(ncol(cluster_summaries)) - 1L
)
head(cluster_summaries, 2)

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6,cluster7,cluster8,cluster9,cluster10,cluster11,cluster12,cluster13,cluster14,cluster15,cluster16
21241_6#101,0.2507479,0.4205593,-0.5014652,0.2928452,-0.1241073,1.1162674,0.686837,-0.4045079,-0.08604902,0.3840519,1.203841701,0.4945531,1.1934878,0.86092678,0.3522783,0.4010664,0.3738378
21241_6#104,-0.1288142,1.0546755,0.139884,0.7488747,0.3678931,0.8531244,0.983805,1.2775621,-0.32772498,1.092781,0.007358882,-0.05449019,-0.0328414,-0.09641719,0.6181342,0.2365938,0.3239782


In [11]:
###################################################### 
### approach 1: enrichment of genes (eQTL) in the clusters

In [12]:
## load individual eQTL genetic effects
# Tab-separated table; the first column is used as row names and each remaining
# column is one eQTL identifier.
# check.names = FALSE keeps the header exactly as written in the file. With the
# default (TRUE) read.csv turns characters such as "-" into ".", and converting
# every "." back to "-" afterwards would also rewrite identifiers that
# genuinely contain a dot, so they would no longer match df_clusters$gene.
df <- read.csv(
  "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/genetic_effect/MOFA10/eqtl_tot_genetic_effect_all.txt",
  sep = "\t",
  row.names = 1,
  check.names = FALSE
)
head(df, 2)
nrow(df)

Unnamed: 0,ENSG00000005059_CCDC109B_-4_110648632_T_A,ENSG00000006016_CRLF1_-19_18735221_G_T,ENSG00000006459_KDM7A_-7_139913409_G_A,ENSG00000013297_CLDN11_-3_170402765_T_C,ENSG00000021300_PLEKHB1_-11_73339784_G_A,ENSG00000029639_TFB1M_-6_155635808_C_G,ENSG00000033178_UBA6_-4_68480525_C_T,ENSG00000035115_SH3YL1_-2_256116_G_T,ENSG00000059377_TBXAS1_-7_139547370_G_A,ENSG00000063761_ADCK1_-14_78311319_G_A,...,ENSG00000248098_BCKDHA_-19_41937095_G_A,ENSG00000248124_RRN3P1_-16_21818292_A_C,ENSG00000250317_SMIM20_-4_25918516_A_G,ENSG00000253203_GUSBP3_-5_68922087_T_G,ENSG00000254184_TYW1B_-7_72200923_G_A,ENSG00000256018_HIST1H3G_-6_26198845_G_C,ENSG00000256073_C21orf119_-21_33728588_C_A,ENSG00000259024_TVP23C-CDRT4_-17_15434991_C_T,ENSG00000259803_SLC22A31_-16_89234110_G_C,ENSG00000267323_SLC25A1P5_-19_28298186_T_C
21843_1#10,-0.02336156,0.037167263,-0.02962768,0.1829698,-0.03793941,-0.02932441,0.0313985,-0.039623,0.08013833,0.02749838,...,-0.08185803,-0.1086237,-0.10317566,-0.04854321,0.095111678,-0.00257107,-0.05560833,0.1931245,-0.06168149,0.05468625
21843_1#100,-0.04797379,-0.002567715,-0.05030368,0.1420454,-0.06425603,-0.01789411,-0.06468377,-0.08820444,0.23273882,0.0403759,...,-0.16133495,-0.4557181,-0.09858982,-0.0956413,0.007662211,-0.001386594,-0.09411877,0.261424,-0.16271379,-0.03705185


In [13]:
# For one cluster, correlate the genetic effect of each eQTL assigned to that
# cluster with the cluster's summary values, then rank eQTLs by correlation.
cluster <- 14
genes <- df_clusters[df_clusters$label == cluster, "gene"]

# Loop invariants: the row names of cluster_summaries (used to pick and order
# the rows of df) and the summary column of the chosen cluster. Cluster labels
# start at 0 while columns start at 1, hence the +1.
summary_rows <- rownames(cluster_summaries)
cluster_profile <- cluster_summaries[, cluster + 1]

# vapply returns a preallocated numeric vector (instead of growing one with
# c() on every iteration) and is well-defined when the cluster has no genes.
gene_list <- vapply(
  as.character(genes),
  function(gene) cor(df[summary_rows, gene], cluster_profile),
  numeric(1)
)
names(gene_list) <- genes
gene_list <- gene_list[order(gene_list, decreasing = TRUE)]
gene_list

In [14]:
## for gprofiler https://biit.cs.ut.ee/gprofiler/gost

In [15]:
# cluster 14
# Reduce each eQTL identifier to the gene symbol, e.g.
# "ENSG00000005059_CCDC109B_-4_110648632_T_A" -> "CCDC109B":
# the inner gsub drops everything from "_-" onwards, the outer one drops
# everything up to the last remaining underscore. noquote() prints the
# symbols without quotation marks.
noquote(
  gsub(
    ".*_", "",
    gsub("_-.*", "", names(gene_list))
  )
)

[1] GLIPR1L1 NKD2     FAM86B3P NQO2    

In [16]:
##########################################################################################
### approach 2: enrichment of genes correlated with each cluster (using the cluster summaries)

In [17]:
# Directory with the analysis input files (the trailing slash is relied upon
# by the paste0() call below).
input_files_dir <- "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/new/input_files/"
# Phenotype
phenotype_file <- paste0(input_files_dir, "phenotype.csv")

In [18]:
# Phenotype table: the first CSV column is used as row names, the remaining
# columns are named by the identifiers in the header (e.g. "21843_1#10").
# check.names = FALSE keeps the header exactly as written in the file. With the
# default (TRUE) read.csv prefixes names that start with a digit with "X" and
# replaces "#" with "."; undoing that by deleting the first character and
# turning every "." into "#" silently damages any name that was not mangled in
# exactly that way.
# NOTE(review): assumes the file header already uses the "#" form of the
# identifiers, as the row names of cluster_summaries do -- confirm.
df0 <- read.csv(phenotype_file, row.names = 1, check.names = FALSE)
df0[1:5, 1:5]

Unnamed: 0,21843_1#10,21843_1#100,21843_1#101,21843_1#102,21843_1#103
ENSG00000000003_TSPAN6,5.5207770563,6.4562078,5.878671,4.8608241,5.90364
ENSG00000000419_DPM1,5.3924605058,6.0659226,6.838769,6.6142685,6.512403
ENSG00000000457_SCYL3,0.0001741556,0.3525966,0.0,0.8259555,2.201697
ENSG00000000460_C1orf112,1.4719275194,4.5369683,4.318528,5.373009,4.636175
ENSG00000001036_FUCA2,2.9088018801,3.8673273,3.321747,3.736476,4.917576


In [19]:
# Keep only the columns of df0 named by the row names of cluster_summaries,
# in that order, so the two tables line up for the correlations below.
df1 <- df0[, rownames(cluster_summaries)]
# Column counts before and after the selection.
ncol(df0)
ncol(df1)
# Preview the top-left corner of the selected table.
df1[seq_len(5), seq_len(5)]

Unnamed: 0,21241_6#101,21241_6#104,21241_6#105,21241_6#111,21241_6#112
ENSG00000000003_TSPAN6,3.8440581,6.1634316,4.6724579,4.0896162,6.112655
ENSG00000000419_DPM1,6.044142,6.1924969,6.7683899,5.8303792,7.272333
ENSG00000000457_SCYL3,0.2319908,0.3103204,0.7722554,0.2570971,3.277097
ENSG00000000460_C1orf112,1.012337,4.1553163,3.8089331,1.3274994,3.035646
ENSG00000001036_FUCA2,0.0,0.0,0.0,3.231996,5.531081


In [20]:
# Row names of the selected phenotype table (one per gene), with their count
# and a short preview.
genes <- row.names(df1)
length(genes)
head(genes)

In [None]:
# Correlation of every gene (row of df1) with every cluster summary (column of
# cluster_summaries), giving a genes x clusters matrix.
# cor() on two matrices correlates each column of the first with each column of
# the second, so one call replaces the gene-by-gene, cluster-by-cluster loop
# (which extracted a data-frame row per iteration and hard-coded 17 clusters).

# Both inputs must describe the same observations in the same order.
stopifnot(identical(colnames(df1), rownames(cluster_summaries)))

# Transpose so that genes become the columns cor() iterates over.
mat <- cor(t(as.matrix(df1)), as.matrix(cluster_summaries))
rownames(mat) <- genes
colnames(mat) <- colnames(cluster_summaries)

In [None]:
# Convert the correlation matrix to a data frame, preview it, and write it to
# CSV (row names are written as the first column).
df_mat <- as.data.frame(mat)
head(df_mat)
write.csv(
  x = df_mat,
  file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/genetic_effect/gene_expression_correlation_abs_cluster_summaries.csv"
)


In [None]:
# One histogram per column of `mat`, showing the distribution of that column's
# correlation values.
# seq_len() avoids the 1:0 trap when `mat` has no columns. hist() draws the
# plot itself, so it is not wrapped in print(), which would additionally dump
# the histogram object (breaks, counts, ...) as text for every column.
for (i in seq_len(ncol(mat))) {
  hist(mat[, i], main = colnames(mat)[i], xlab = "correlation")
}