# Loading Packages and Datasets

In [45]:
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import scipy.io
from scipy import io
import matplotlib.pyplot as plt
import os
import bbknn as bk
import scvelo as scv
import harmonypy as hm
from matplotlib.pyplot import rc_context
import anndata


In [46]:
sc.settings.set_figure_params(dpi=150,fontsize=8) #change global figure resolution


In [47]:
#load all dataset

donor1_r1 = sc.read_h5ad(r'/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_SRR6860519_Donor1_rep1.h5ad')
donor1_r2 = sc.read_h5ad(r'/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_SRR6860520_Donor1_rep2.h5ad')
donor2_r1 = sc.read_h5ad(r'/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_SRR6860521_Donor2_rep1.h5ad')
donor2_r2 = sc.read_h5ad(r'/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_SRR6860522_Donor2_rep2.h5ad')
donor3_r1 = sc.read_h5ad(r'/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_SRR6860523_Donor3_rep1.h5ad')
donor3_r2 = sc.read_h5ad(r'/home/user/abhay/scTE_gonads/GSE112013/donor/GSE112013_SRR6860524_Donor3_rep2.h5ad')


In [48]:
adata=donor1_r1.concatenate(donor1_r2,donor2_r1,donor2_r2,donor3_r1,donor3_r2,
                            join="outer",
                            batch_categories=['Donor1_r1', 'Donor1_r2', 'Donor2_r1',
                                              'Donor2_r2', 'Donor3_r1', 'Donor3_r2'],
                           batch_key = "samples")

# Add column for TE elements

In [49]:
#load list of human TEs
df = pd.read_csv('/home/user/abhay/scTE_gonads/Ancestrial_TE.csv')

In [50]:
tename = list(df.iloc[:, 0])
tename = [k for k in tename if k in adata.var.index]
len(tename)

667

In [54]:
Alu = adata.var_names.str.startswith('Alu')
AluY = adata.var_names.str.startswith('AluY')


58389

In [55]:
Alu1 = tename.str.stratswith('Alu')

AttributeError: 'list' object has no attribute 'str'

# Calculating QC 

In [24]:
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=3)

In [25]:
mito_genes = adata.var_names.str.startswith('MT-')
adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [26]:
ribo_genes = adata.var_names.str.startswith('RBS','RPL')
adata.obs['percent_ribo'] = np.sum(adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [27]:
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

In [30]:
adata.var


Unnamed: 0,n_cells,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
(CATTC)n,436,436,0.087004,98.176266,2080.0
(GAATG)n,455,455,0.040197,98.096792,961.0
A1BG,562,562,0.027900,97.649224,667.0
A1BG-AS1,128,128,0.005480,99.464592,131.0
A1CF,45,45,0.002259,99.811771,54.0
...,...,...,...,...,...
Zaphod,4855,4855,0.422219,79.692140,10094.0
Zaphod2,2115,2115,0.122182,91.153219,2921.0
Zaphod3,3349,3349,0.211152,85.991551,5048.0
hAT-16_Crp,8,8,0.000335,99.966537,8.0


In [35]:
adata.obs['percent_TE'] = np.sum(adata[:, tename].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [36]:
adata.obs['percent_Alu'] = np.sum(adata[:, Alu].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [37]:
adata.obs['percent_AluY'] = np.sum(adata[:, AluY].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

# AnnData Tutorial

In [22]:
df = adata.to_df()

In [39]:
adata[:,["GAPDH"]].var

Unnamed: 0,n_cells,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
GAPDH,16167,16167,3.382817,32.375455,80873.0


In [38]:
adata[:,["AluYa5","AluYb8","AluYb9","BLACKJACK"]].var

Unnamed: 0,n_cells,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
AluYa5,16103,16103,7.400427,32.643159,176922.0
AluYb8,13173,13173,4.515372,44.898984,107949.0
AluYb9,6194,6194,0.526791,74.09127,12594.0
BLACKJACK,4748,4748,0.397624,80.139708,9506.0


In [40]:
adata.obs

Unnamed: 0,samples,n_genes,percent_mito,percent_ribo,n_genes_by_counts,total_counts,percent_TE,percent_Alu,percent_AluY
AAACCTGAGAAACCTA-1-Donor1_r1,Donor1_r1,5493,0.054094,0.00005,5493,20076.0,0.310769,0.220711,0.039550
AAACCTGAGAAAGTGG-1-Donor1_r1,Donor1_r1,2923,0.032753,0.00000,2923,10686.0,0.158619,0.113045,0.014879
AAACCTGAGATCCTGT-1-Donor1_r1,Donor1_r1,1488,0.025659,0.00000,1488,2845.0,0.111072,0.065378,0.003163
AAACCTGAGATGGGTC-1-Donor1_r1,Donor1_r1,1684,0.133518,0.00000,1684,3243.0,0.061363,0.045945,0.002467
AAACCTGAGGTGCACA-1-Donor1_r1,Donor1_r1,574,0.436893,0.00000,574,2678.0,0.069828,0.050784,0.005601
...,...,...,...,...,...,...,...,...,...
TTTGTCACACCAGCAC-1-Donor3_r2,Donor3_r2,1351,0.094779,0.00000,1351,3007.0,0.143665,0.090123,0.011640
TTTGTCACAGGTGCCT-1-Donor3_r2,Donor3_r2,557,0.111374,0.00000,556,1266.0,0.328594,0.201422,0.030016
TTTGTCAGTTCGGCAC-1-Donor3_r2,Donor3_r2,685,0.116505,0.00000,685,1339.0,0.230022,0.144137,0.023152
TTTGTCATCCTGCCAT-1-Donor3_r2,Donor3_r2,761,0.159218,0.00000,761,2148.0,0.266294,0.179702,0.023277


In [41]:
adata.uns

OverloadedDict, wrapping:
	OrderedDict()
With overloaded keys:
	['neighbors'].

In [None]:
# Calculating QC 