# Loading Packages and Datasets

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import scipy.io
import matplotlib.pyplot as plt
import os
import scvelo as scv
import harmonypy as hm
from matplotlib.pyplot import rc_context
import re

In [2]:
# Global plotting defaults (150 dpi, 8 pt font) applied to every figure produced below.
sc.settings.set_figure_params(dpi=150,fontsize=8) #change global figure resolution


In [3]:
# Load the six per-replicate AnnData files (Donor1-3, rep1/rep2) of GSE112013.
DATA_DIR = r'/nfs_master/prakrithi/abhay/testis_allfiles'


def _read_replicate(file_name):
    """Read one replicate's .h5ad file from DATA_DIR and return the AnnData."""
    return sc.read_h5ad(DATA_DIR + '/' + file_name)


donor1_r1 = _read_replicate('GSE112013_SRR6860519_Donor1_rep1.h5ad')
donor1_r2 = _read_replicate('GSE112013_SRR6860520_Donor1_rep2.h5ad')
donor2_r1 = _read_replicate('GSE112013_SRR6860521_Donor2_rep1.h5ad')
donor2_r2 = _read_replicate('GSE112013_SRR6860522_Donor2_rep2.h5ad')
donor3_r1 = _read_replicate('GSE112013_SRR6860523_Donor3_rep1.h5ad')
donor3_r2 = _read_replicate('GSE112013_SRR6860524_Donor3_rep2.h5ad')


In [4]:
# Merge all replicates into a single AnnData. join="outer" keeps the union of
# features, and each cell's replicate label is stored in obs['samples'].
sample_labels = ['Donor1_r1', 'Donor1_r2', 'Donor2_r1',
                 'Donor2_r2', 'Donor3_r1', 'Donor3_r2']
other_replicates = [donor1_r2, donor2_r1, donor2_r2, donor3_r1, donor3_r2]
adata = donor1_r1.concatenate(
    *other_replicates,
    join="outer",
    batch_categories=sample_labels,
    batch_key="samples",
)

In [5]:
# Display a summary (dimensions and annotation columns) of the merged object.
adata

AnnData object with n_obs × n_vars = 45488 × 58389
    obs: 'samples'

# Add metadata for transposable elements (TEs)

In [6]:
# Load the CSV listing human transposable-element (TE) names.
# NOTE(review): absolute path on a shared filesystem; the notebook will not run elsewhere without editing it.
all_te = pd.read_csv('/nfs_master/prakrithi/abhay/testis_scripts/extra_files/all_TE.csv')

In [7]:
# Replace the DataFrame with a plain Python list of the names in its 'All_TEs' column.
all_te = all_te['All_TEs'].tolist()


In [8]:
# All feature names (index of adata.var) present in the merged object.
gene_list = list(adata.var.index)


In [9]:
# TE names that are actually present as features in adata.
# The result is sorted because set iteration order for strings is not stable
# between Python sessions, and later cells slice the lists derived from
# te_list by position (e.g. LTR[0:60], L1[5:10], AluY[0:5]); sorting makes
# those slices select the same elements on every run.
te_list = sorted(set(all_te).intersection(gene_list))

In [10]:
# Split the detected TEs into families by substring match on the feature name.
# Matching is by substring, so every name in AluY is also contained in Alu.
Alu  = [te_name for te_name in te_list if 'Alu' in te_name]
AluY = [te_name for te_name in te_list if 'AluY' in te_name]
L1   = [te_name for te_name in te_list if 'L1' in te_name]
LINE = [te_name for te_name in te_list if 'LINE' in te_name]
LTR  = [te_name for te_name in te_list if 'LTR' in te_name]
SVA  = [te_name for te_name in te_list if 'SVA' in te_name]


In [11]:
# Per-cell share of total counts contributed by each TE family.
# Despite the 'percent_' prefix the stored values are fractions (family sum
# divided by the cell total), not percentages.
# The per-cell total is computed once and reused instead of being re-summed
# over the whole matrix for every family.
# `.A1` flattens the matrix returned by summing adata.X into a 1-D array.
total_counts_per_cell = np.sum(adata.X, axis=1).A1

te_families = {
    'percent_TE': te_list,
    'percent_Alu': Alu,
    'percent_AluY': AluY,
    'percent_L1': L1,
    'percent_LINE': LINE,
    'percent_LTR': LTR,
    'percent_SVA': SVA,
}
for obs_column, family_features in te_families.items():
    family_counts = np.sum(adata[:, family_features].X, axis=1).A1
    adata.obs[obs_column] = family_counts / total_counts_per_cell

In [12]:
# Inspect the per-cell annotations, now including the TE fraction columns.
adata.obs

Unnamed: 0,samples,percent_TE,percent_Alu,percent_AluY,percent_L1,percent_LINE,percent_LTR,percent_SVA
AAACCTGAGAAACCTA-1-Donor1_r1,Donor1_r1,0.310769,0.215581,0.034718,0.043784,0.000100,0.007920,0.001893
AAACCTGAGAAAGTGG-1-Donor1_r1,Donor1_r1,0.158619,0.110893,0.012914,0.020307,0.000094,0.001591,0.001778
AAACCTGAGACAAAGG-1-Donor1_r1,Donor1_r1,0.088608,0.070323,0.002813,0.004219,0.000000,0.000000,0.000000
AAACCTGAGACTGGGT-1-Donor1_r1,Donor1_r1,0.131544,0.100671,0.005369,0.005369,0.000000,0.001342,0.000000
AAACCTGAGATCCTGT-1-Donor1_r1,Donor1_r1,0.111072,0.065378,0.003163,0.006327,0.000000,0.003515,0.000000
...,...,...,...,...,...,...,...,...
TTTGTCAGTTCGGCAC-1-Donor3_r2,Donor3_r2,0.230022,0.142644,0.021658,0.041822,0.000000,0.003734,0.002240
TTTGTCAGTTCTCATT-1-Donor3_r2,Donor3_r2,0.064646,0.046465,0.002020,0.006061,0.000000,0.000000,0.000000
TTTGTCATCCTCTAGC-1-Donor3_r2,Donor3_r2,0.062385,0.055046,0.007339,0.001835,0.000000,0.000000,0.000000
TTTGTCATCCTGCCAT-1-Donor3_r2,Donor3_r2,0.266294,0.175047,0.018622,0.042365,0.000000,0.005587,0.000466


In [13]:
# List the feature annotations (index = feature names) of the Alu-matching TEs found in the data.
adata[:,Alu].var

AluSq10
AluSg7
AluY
AluYg6
AluSq4
AluYf1
AluYb9
AluSc
AluYe5
AluSp
AluSx1


# Doublet Removal

In [14]:
# Score every cell for being a doublet with Scrublet.
counts_matrix = adata.X
genes = adata.var_names
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)

# Preprocessing / embedding settings forwarded to scrub_doublets.
scrublet_settings = dict(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=30,
)
doublet_scores, predicted_doublets = scrub.scrub_doublets(**scrublet_settings)


Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Elapsed time: 116.2 seconds


In [15]:
# Re-call doublets with a manual score threshold and drop the flagged cells.
# call_doublets returns the predictions for the given threshold; they are
# captured here because the `predicted_doublets` returned earlier by
# scrub_doublets came from the automatically chosen threshold and would
# disagree with the 0.25 cut-off used for filtering.
DOUBLET_SCORE_THRESHOLD = 0.25
predicted_doublets = scrub.call_doublets(threshold=DOUBLET_SCORE_THRESHOLD)
adata.obs['doublet'] = doublet_scores
adata.obs['predicted'] = predicted_doublets
scrub.plot_histogram();
# .copy() turns the filtered view into an independent AnnData, so the in-place
# filtering and obs assignments in the following cells do not act on a view.
adata = adata[adata.obs['doublet'] < DOUBLET_SCORE_THRESHOLD, :].copy()

Detected doublet rate = 0.6%
Estimated detectable doublet fraction = 7.0%
Overall doublet rate:
	Expected   = 6.0%
	Estimated  = 8.5%


# Calculating QC 

#### Basic filtering

In [None]:
# Drop cells with fewer than 500 detected genes, then genes detected in fewer than 3 cells.
# Both calls modify adata in place.
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Boolean mask over features whose name starts with 'MT-' (mitochondrial genes).
mito_genes = adata.var_names.str.startswith('MT-')
# Per-cell share of total counts coming from those genes (a fraction, not a percentage).
adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [None]:
# Boolean mask over ribosomal protein genes: names starting with 'RPS' or 'RPL'.
# The two prefixes are tested separately and OR-ed: passing them as two
# positional arguments to str.startswith would treat the second one as the
# `na` fill value rather than as a second pattern. ('RBS' in the original was
# a typo for 'RPS'.)
ribo_genes = adata.var_names.str.startswith('RPS') | adata.var_names.str.startswith('RPL')
# Per-cell share of total counts coming from ribosomal genes (a fraction, not a percentage).
adata.obs['percent_ribo'] = np.sum(adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [None]:
# Compute scanpy's QC metrics and store them on adata (inplace=True); the plots
# below read the resulting 'n_genes_by_counts' and 'total_counts' columns.
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

In [None]:
# Violin plots of the QC metrics before filtering, saved to disk.
# show=False keeps the figure open so that plt.savefig writes the plot itself;
# saving after scanpy has already shown the figure would write an empty canvas.
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True, show=False)

# The filename must be a string literal (unquoted it is parsed as an expression and fails).
plt.savefig('pre-filter_violin.png')

In [None]:
# Scatter plots of total counts against mitochondrial fraction and against
# number of detected genes, used to choose the filtering thresholds.
sc.pl.scatter(adata, x='total_counts', y='percent_mito')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')


In [None]:
# Text summary of the object after basic filtering and QC annotation.
print(adata)

In [None]:
# Inspect the per-cell annotations including the QC columns.
adata.obs

# Filtering Data

#### Filtering based on gene count and mitochondrial content

In [None]:
# Keep only cells with fewer than 4000 detected genes.
# NOTE(review): 'n_genes' is presumably the column written by sc.pp.filter_cells earlier — confirm.
adata2 = adata[adata.obs.n_genes < 4000, :]
adata2 

In [None]:
# Keep only cells whose mitochondrial fraction is below 0.2.
# .copy() materialises the subset as an independent AnnData: the following
# cells normalise and log-transform adata2 in place, which should not be done
# on a view of adata.
adata2 = adata2[adata2.obs.percent_mito < 0.2, :].copy()
adata2

In [None]:
# Violin plots of the same QC metrics after filtering.
qc_metrics = ['n_genes_by_counts', 'total_counts', 'percent_mito', 'percent_ribo']
sc.pl.violin(
    adata2,
    qc_metrics,
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Scale every cell to a total of 10,000 counts (modifies adata2 in place).
sc.pp.normalize_total(adata2, target_sum=1e4)

In [None]:
# Log-transform the normalised counts, log(1 + x), in place.
sc.pp.log1p(adata2)

In [None]:
# Keep the normalised, log-transformed data for all features in .raw before
# the object is restricted to highly variable genes.
adata2.raw=adata2

# Dimensionality Reduction

#### Calculate highly variable genes

In [None]:
# Flag the 3000 most variable features; the flag is read from
# adata2.var['highly_variable'] in the next cell.
sc.pp.highly_variable_genes(adata2,n_top_genes=3000)

In [None]:
# Independent copy restricted to the highly variable features; all downstream
# embedding, clustering and plotting uses this object.
adata_hvg = adata2[:, adata2.var['highly_variable']].copy()

In [None]:
# PCA with 100 components using the ARPACK solver.
sc.tl.pca(adata_hvg, svd_solver='arpack', n_comps=100)

In [None]:
# Build the neighbourhood graph with 30 neighbours per cell.
sc.pp.neighbors(adata_hvg,n_neighbors=30)

In [None]:
# Compute the UMAP embedding from the neighbourhood graph.
sc.tl.umap(adata_hvg,spread=0.7,min_dist=0.2)

In [None]:
# Leiden clustering at resolution 0.30; labels are stored in obs['leiden'].
# n_iterations=-1: per the scanpy documentation, iterate until the clustering stops improving.
sc.tl.leiden(adata_hvg,resolution=0.30,n_iterations=-1)

In [None]:
# Variance explained per principal component (linear scale).
sc.pl.pca_variance_ratio(adata_hvg, log=False)

In [None]:
# One UMAP per annotation: Leiden cluster, then sample of origin.
for obs_key in ("leiden", "samples"):
    sc.pl.umap(adata_hvg, color=[obs_key], cmap="plasma_r")

In [None]:
# Rank the top 100 marker features per Leiden cluster with a Wilcoxon test,
# Bonferroni-corrected; pts=True also requests the fraction of cells expressing each feature.
sc.tl.rank_genes_groups(adata_hvg,pts=True,groupby='leiden',n_genes=100,method='wilcoxon',corr_method='bonferroni')

In [None]:
# Hierarchical clustering of the Leiden clusters, used to order groups in the dot plots.
sc.tl.dendrogram(adata_hvg,groupby="leiden")

In [None]:
# Dot plot of the 7 top-ranked features per Leiden cluster, drawn on an 8x8 inch canvas.
figure_overrides = {'figure.figsize': (8, 8)}
with rc_context(figure_overrides):
    sc.pl.rank_genes_groups_dotplot(
        adata_hvg,
        n_genes=7,
        groupby="leiden",
        color_map="plasma_r",
    )

# Cluster Annotation

In [None]:
# Main markers for clustering at resolution = 0.30

macrophage_markers = ['CD14', 'CD163', 'S100A4'] # Cluster 6
endothelial_markers = ['PALMD', 'VWF', 'CDH5'] # Cluster 7
other=['RBP1', 'INSL3', 'MYL9'] #cluster 9 missing
lydig_markers = ['DLK1','IGF1','CFD'] # Cluster 1
myoid_markers = ['MYH11','ACTA2', "TPM1"] # Cluster 4
spermatid_markers = ['SPATA18', 'HOOK1', 'SPATA12'] #cluster 0 and 3
sertoli_markers = ['SOX9', 'WT1', 'HMGN5'] # cluster 8
immature_spermatid_markers = ['ZPBP', 'ZPBP2', 'SPAG6'] # Cluster 2
spermatogonia_markers = ['UTF1', 'ID4', 'SOHLH1'] # Cluster 4

# spermatocytes ??
# what is cluster 9 ??

In [None]:
# Combined marker panels: everything, somatic niche cells only, germ cells only.
all_markers = [
    *macrophage_markers,
    *endothelial_markers,
    *other,
    *lydig_markers,
    *myoid_markers,
    *spermatid_markers,
    *sertoli_markers,
    *immature_spermatid_markers,
    *spermatogonia_markers,
]

niche_markers = [
    *macrophage_markers,
    *endothelial_markers,
    *other,
    *lydig_markers,
    *myoid_markers,
    *sertoli_markers,
]

germcell_markers = [
    *spermatid_markers,
    *immature_spermatid_markers,
    *spermatogonia_markers,
]

In [None]:
# Marker dot plot per Leiden cluster, saved to disk.
# show=False keeps the figure open so that plt.savefig writes the plot itself;
# saving after scanpy has already shown the figure would write an empty canvas.
sc.pl.dotplot(adata_hvg, all_markers, groupby='leiden', dendrogram=True, show=False)
plt.savefig('GSE112013-test-umap.png')

# Cluster assignments noted from the dot plot.
# NOTE(review): these numbers disagree with label_dict (e.g. label_dict maps
# '0' to Spermatids and '1' to Lydig_cells) — confirm which set is current.
#spermatids    : 1,3
#macrophage    : 6
#endothelial   : 7
#leydig        : 0
#myoid         : 0
#sertoli       : 8
#imm_sperm     : 2
#spermatogonia : 4
#unknown       : 5



In [None]:
# Map each Leiden cluster id (as a string) to a cell-type label.
# NOTE(review): 'Lydig_cells' is presumably a misspelling of Leydig; it is a
# runtime label also used when subsetting niche cells, so rename both together.
label_dict = {'0': 'Spermatids',
              '1': 'Lydig_cells',
              '2': 'Immature_sperm_cells',
              '3': "Spermatids_2",
              '4': "Myoid_cells",
              '5': "Spermatogonia",
              '6': "Macrophages",
              '7': "Endothelial_cells",
              '8': "Sertoli_cells",
              '9': "Unknown"}
# Store the labels as a categorical obs column; any cluster id missing from
# label_dict becomes NaN.
adata_hvg.obs['clusters'] = adata_hvg.obs['leiden'].map(label_dict).astype('category')


In [None]:
# Hand-picked SVA subfamily names.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
SVA_DEG = ['SVA_B', 'SVA_A', 'SVA_F', 'SVA_E', 'SVA_D', 'SVA_C']

In [None]:
# UMAPs coloured by annotated cluster and by Alu / AluY, drawn as panels of a
# single figure and saved to disk.
# show=False keeps that figure open so plt.savefig writes the actual plot;
# previously each sc.pl.umap call showed its own figure first, so savefig
# was left with an empty canvas.
sc.pl.umap(adata_hvg, color=["clusters", "Alu", "AluY"], cmap="plasma_r", show=False)
# Other TE families, currently disabled:
#sc.pl.umap(adata_hvg,color=["L1"],cmap="plasma_r")
#sc.pl.umap(adata_hvg,color=["LINE"],cmap="plasma_r")
#sc.pl.umap(adata_hvg,color=["LTR" ],cmap="plasma_r")
#sc.pl.umap(adata_hvg,color=["SVA" ],cmap="plasma_r")
plt.savefig('GSE112013-umap.png', bbox_inches='tight')


In [None]:
# Marker dot plot per annotated cluster, saved to disk.
# show=False keeps the figure open so that plt.savefig writes the plot itself;
# saving after scanpy has already shown the figure would write an empty canvas.
sc.pl.dotplot(adata_hvg, all_markers, groupby='clusters', dendrogram=True, show=False)
plt.savefig('GSE112013-dotplot.png')

In [None]:
# Correlation matrix between the annotated clusters.
sc.pl.correlation_matrix(adata_hvg, 'clusters', figsize=(5,3.5))

In [None]:
# Expression of every Alu-matching TE across the annotated clusters.
sc.pl.dotplot(adata_hvg, Alu, groupby='clusters', dendrogram=True)

In [None]:
# Expression of the AluY-matching TEs across the annotated clusters.
sc.pl.dotplot(adata_hvg, AluY, groupby='clusters', dendrogram=True)

In [None]:
# Hand-picked Alu subfamilies plus most of the AluY list.
# NOTE(review): AluY[0:5] + AluY[6:-1] leaves out index 5 and the last element;
# which names those are depends on the order of AluY — confirm the omission is
# deliberate and consider selecting by name instead of by position.
Alu_int = ['AluJr', 'AluJo', 'AluJb', 'AluSz6', 'AluSx1', "AluSz", "AluSq", "AluSx4"] + AluY[0:5] + AluY[6:-1]
sc.pl.dotplot(adata_hvg, Alu_int, groupby='clusters', dendrogram=True)

In [None]:
# Expression of all L1-matching TEs, then a closer look at elements 5-9 of that list.
# NOTE(review): the positional slice depends on the order of L1.
sc.pl.dotplot(adata_hvg, L1, groupby='clusters', dendrogram=True)
sc.pl.dotplot(adata_hvg, L1[5:10], groupby='clusters', dendrogram=True)

In [None]:
# Split the LTR list into four consecutive chunks so each dot plot stays readable.
# Slice ends are exclusive, so each chunk starts exactly where the previous one
# stopped; the original start indices (61, 121, 181) silently skipped elements
# 60, 120 and 180. The last chunk runs to the end of the list so nothing past
# index 241 is dropped either.
LTR_1 = LTR[0:60]
LTR_2 = LTR[60:120]
LTR_3 = LTR[120:180]
LTR_4 = LTR[180:]

In [None]:
# One dot plot per LTR chunk, in order.
for ltr_chunk in (LTR_1, LTR_2, LTR_3, LTR_4):
    sc.pl.dotplot(adata_hvg, ltr_chunk, groupby='clusters', dendrogram=True)

In [None]:
# Expression of the LINE-matching TEs across the annotated clusters.
sc.pl.dotplot(adata_hvg, LINE, groupby='clusters', dendrogram=True)

In [None]:
# Expression of the SVA-matching TEs across the annotated clusters.
sc.pl.dotplot(adata_hvg, SVA, groupby='clusters', dendrogram=True)

In [None]:
# UMAPs coloured by annotated cluster, then by Alu and AluY (one figure each).
for colour_key in ("clusters", "Alu", "AluY"):
    sc.pl.umap(adata_hvg, color=[colour_key], cmap="plasma_r")
# Other TE families ("L1", "LINE", "LTR", "SVA") are currently not plotted.


In [None]:
# Down-sampled copy of adata_hvg.
# sc.pp.subsample is a function and must be called; the original subscripted
# it with [] which raises TypeError. copy=True returns a new object and leaves
# adata_hvg untouched; random_state makes the draw repeatable.
# NOTE(review): the intended subsample size is not stated anywhere in the
# notebook — 10% of cells is a placeholder, adjust as needed.
SUBSAMPLE_FRACTION = 0.1
sub_adata = sc.pp.subsample(adata_hvg, fraction=SUBSAMPLE_FRACTION, random_state=0, copy=True)
sub_adata

# Save objects for subclustering

In [None]:
#Subsetting for subclustering

# Split the annotated object into germ-line and somatic-niche subsets for subclustering.
germ_labels = ['Spermatids','Immature_sperm_cells', 'Spermatids_2', 'Spermatogonia']
niche_labels = ['Lydig_cells','Myoid_cells', 'Macrophages', 'Endothelial_cells', 'Sertoli_cells', 'Unknown']

is_germ = adata_hvg.obs['clusters'].isin(germ_labels)
germ_cells = adata_hvg[is_germ]

is_niche = adata_hvg.obs['clusters'].isin(niche_labels)
niche_cells = adata_hvg[is_niche]

In [None]:
# Summary of the germ-cell subset.
germ_cells

In [None]:
# Print the kernel's current working directory (shell escape).
!pwd

In [None]:
# Save the germ-cell subset for the subclustering notebook.
# NOTE(review): absolute, user-specific path — the target directory must already exist.
germ_cells.write_h5ad("/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_germ_cells.h5ad")

In [None]:
# Save the niche-cell subset for the subclustering notebook.
# NOTE(review): absolute, user-specific path — the target directory must already exist.
niche_cells.write_h5ad("/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_niche_cells.h5ad")

# Marker Analysis

In [None]:
# Marker panels grouped by biological theme.
# NOTE(review): somatic_markers is not used in the visible cells.
somatic_markers = ['VIM']

stemcell_markers = ['GFRA1', 'FGFR3', 'ETV5', 'ID4', 'UTF1', 'ZBTB16']

# Differentiation / proliferation markers.
diff_prolif = ['KIT', 'DMRT1', 'MKI67', 'SOHLH1', 'SOHLH2']

# Meiosis markers (the variable name keeps the original spelling because later cells use it).
meosis = ['CHEK1', 'BRCA1', 'SPO11', 'DMC1', 'ATM'] #, 'SYCP1', 'SYCP2'

mitochondrial_translation = ['MRPL2','MRPL3','MRPL14','MRPL17','MRPL21','MRPL22']

# Combined panel in stem cell -> differentiation -> meiosis order.
gametogenesis = stemcell_markers + diff_prolif + meosis 

In [None]:
# Expression of the gametogenesis panel across the annotated clusters.
sc.pl.dotplot(adata_hvg, gametogenesis, groupby='clusters', dendrogram=True)

In [None]:
# One multi-panel UMAP per marker group, each panel coloured by one gene.
marker_groups = (stemcell_markers, diff_prolif, meosis, mitochondrial_translation)
for marker_group in marker_groups:
    sc.pl.umap(adata_hvg, color=marker_group, cmap="plasma_r")

In [None]:
# Reference UMAP coloured by annotated cluster, for comparison with the marker UMAPs.
sc.pl.umap(adata_hvg,color=["clusters"],cmap="plasma_r")

# Marker lit review

In [None]:
# More relevant markers

# markers = ['CD163','S100A4','CD14',  #Macrophages
#            'PECAM1','VWF','CDH5',    #Endothelial cells
#            'MYH11','ACTA2','TPM4',   #Myoid cells
#            'SOX9','WFDC2','BEX2',    #Sertoli cells
#            'DLK1','IGF1','CFD',      #Leydig cells
#            'UTF1','ID4','FGFR3',
#            'KIT', 'STRA8',
#            'TNP1', 'PRM2', 'ZPBP',
#            'SYCP3', 'SPO11', 'MLH3']
   

In [None]:
#Germ Cells

# transcription = ['GFRA1', 'FGFR3', 'ETV5', 'ID4', 'UTF1', 'ZBTB16']
# diff_prolif = ['KIT', 'DMRT1', 'MKI67', 'SOHLH1', 'SOHLH2']
# meosis = ['CHEK1', 'BRCA1', 'SPO11', 'DMC1', 'ATM', 'SYCP1', 'SYCP2']
# late_pachynema = ['MLH3']
# round_spermatid = ['SPAG6']
# elongating_spermatid = ['ZPBP', 'ZPBP2', 'DNAH6', 'DNAH7', 'DNAH14', 'CATSPER1', 'CATSPER4', 'CAMK4', 'CREM', 'MYO1D']
# sperm = ['TNP2', 'HOOK1', 'SPATA7', 'SPATA32', 'SPATA33', 'PRM3', 'SPATA12', 'SPATA18', 'SPATA20']
# late_spermatid = ['CREM', 'MYO1D']

# total_germ = transcription + diff_prolif + meosis + late_pachynema + round_spermatid + elongating_spermatid + sperm

# all_sperm = round_spermatid + elongating_spermatid + sperm + late_spermatid



In [None]:
# Global DGE dotplot

# sc.pl.dotplot(adata_hvg, all_markers, groupby='leiden', dendrogram=True)
# sc.pl.dotplot(adata_hvg, niche_markers, groupby='leiden', dendrogram=True)
# sc.pl.dotplot(adata_hvg, germcell_markers, groupby='leiden', dendrogram=True)


In [None]:
# Plotting DGEs on UMAP

# sc.pl.umap(adata_hvg,color=macrophage_markers,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=endothelial_markers,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=other,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=lydig_markers,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=myoid_markers,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=spermatid_markers,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=sertoli_markers,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=immature_spermatid_markers,cmap="plasma_r")
# sc.pl.umap(adata_hvg,color=spermatogonia_markers,cmap="plasma_r")
