In [None]:
import numpy as np
import pandas as pd

import scanpy as sc
import episcanpy as epi
import anndata as ad

import matplotlib.pyplot as plt

import time
import pickle
import os

In [None]:
# Input files, referenced by the cells below:
#   peak_file      - BED file whose 4th column is read as the TF name
#   fragments_file - gzipped fragments table passed to the episcanpy calls
peak_file = "ChEA3_TFBS_remap2022_20230109.bed"
fragments_file = "fragments.tsv.gz"

In [None]:
# Build the count matrix from the fragments file over the regions listed in
# the peak file. No barcode whitelist is passed, and peaks keep their
# original size (normalized_peak_size=None).
adata = epi.ct.peak_mtx(
    fragments_file,
    peak_file,
    normalized_peak_size=None,
    fast=False,
)

# Display the resulting AnnData summary.
adata

In [None]:
# Collect the transcription-factor name (4th tab-separated column) of every
# line in the peak file, in file order, so it can be attached to adata.var.
#
# Only the line terminator is stripped before splitting: slicing the last
# character off the 4th field would truncate the TF name whenever the file
# has more than four columns or the final line has no trailing newline.
TFs = []
with open(peak_file) as f:
    for line in f:
        fields = line.rstrip("\r\n").split("\t")
        TFs.append(fields[3])

In [None]:
# Attach the TF names read from the peak file to the features.
# assumes adata.var has one row per line of the peak file, in the same order — TODO confirm
adata.var["TF"] = TFs

In [None]:
# Binarize the matrix in place; the maximum is printed before and after as a
# sanity check on the effect of the call.
print(f"Max before:\t{np.max(adata.X)}")
epi.pp.binarize(adata)
print(f"Max after:\t{np.max(adata.X)}")

In [None]:
# Compute QC statistics on adata.
# presumably this adds the "n_features" / "n_cells" columns used by the plots
# and filters below — verify against the episcanpy documentation
epi.pp.qc_stats(adata, verbose=True)

In [None]:
# Nucleosome signal computed from the fragments file; the
# "nucleosome_signal" column is plotted and filtered on further down.
epi.pp.nucleosome_signal(
    adata,
    fragments_file,
    n=10000,
)

In [None]:
# Gene annotation (GTF) passed to the TSS-enrichment computation below.
gtf_file = "gencode.v42.annotation.gtf"

In [None]:
# TSS enrichment computed from the fragments file and the GTF annotation;
# the "tss_enrichment_score" column is used for QC further down.
epi.pp.tss_enrichment(
    adata,
    gtf=gtf_file,
    fragments=fragments_file,
    n=5000,
    score="avg_score_of_center_region",
    distance_to_tss=1000,
    bp_per_flank=100,
)

In [None]:
# Cell-level QC thresholds on the number of features per cell; the same
# values are reused by the set_filter call later in the notebook.
min_features = 2000
max_features = None

# Violin plot of features per cell with the thresholds drawn in.
epi.pl.violin(
    adata,
    "n_features",
    min_threshold=min_features,
    max_threshold=max_features,
    show_log=True,
    show_mean=True,
    show_median=True,
    print_statistics=False,
    save="scATAC_DMSO_QC_filter_cells_20230126.pdf",
)

# Same quantity as a histogram, this time with the statistics printed.
epi.pl.histogram(
    adata,
    "n_features",
    bins=40,
    min_threshold=min_features,
    max_threshold=max_features,
    show_log=True,
    show_mean=True,
    show_median=True,
    print_statistics=True,
    save="scATAC_DMSO_QC_filter_cells_hist_20230126.pdf",
)

In [None]:
# Feature-level QC thresholds on the number of cells per feature; min_cells
# is reused by the set_filter call later in the notebook.
min_cells = 30
max_cells = None

epi.pl.violin(
    adata,
    "n_cells",
    min_threshold=min_cells,
    max_threshold=max_cells,
    show_log=True,
    show_mean=True,
    show_median=True,
    print_statistics=False,
    save="scATAC_DMSO_QC_filter_features_20230126.pdf",
)

In [None]:
# Upper bound on the nucleosome signal; reused by the set_filter call later.
max_nucleosome_signal = 5

# Fragment length distribution, with the threshold passed to the plot.
epi.pl.fragment_length(
    adata,
    n=5000,
    threshold=max_nucleosome_signal,
    show_n=True,
    save="scATAC_DMSO_fragment_length_distribution_20230126.pdf",
)

# Distribution of the per-cell nucleosome signal with the cutoff drawn in.
epi.pl.violin(
    adata,
    "nucleosome_signal",
    max_threshold=max_nucleosome_signal,
    show_log=True,
    show_mean=True,
    show_median=True,
    print_statistics=True,
    save="scATAC_DMSO_nucleosome_signal_20230126.pdf",
)

In [None]:
# Lower bound on the TSS enrichment score; reused by the set_filter call later.
min_tss_enrichment = 2

# Label each cell "high" when its score is strictly above the cutoff and
# "low" otherwise (missing scores compare False and therefore become "low").
adata.obs["tss"] = np.where(
    adata.obs["tss_enrichment_score"].to_numpy() > min_tss_enrichment,
    "high",
    "low",
).tolist()

# Enrichment profile split by the high/low label.
epi.pl.tss_enrichment(
    adata,
    group_by="tss",
    show_n=True,
    save="scATAC_DMSO_TSS_enrichment_20230126.pdf",
)

# Distribution of the score itself with the cutoff drawn in.
epi.pl.violin(
    adata,
    "tss_enrichment_score",
    min_threshold=min_tss_enrichment,
    show_log=True,
    show_mean=True,
    show_median=True,
    print_statistics=True,
    save="scATAC_DMSO_TSS_enrichment_violin_20230126.pdf",
)

In [None]:
# Register the QC thresholds chosen above. The first three entries are
# per-cell metrics, the last one ("n_cells") is the per-feature metric.
# Nothing is removed here; the filters are applied in the next cell.
for _column, _limits in (
    ("n_features", {"min_threshold": min_features, "max_threshold": max_features}),
    ("nucleosome_signal", {"max_threshold": max_nucleosome_signal}),
    ("tss_enrichment_score", {"min_threshold": min_tss_enrichment}),
    ("n_cells", {"min_threshold": min_cells}),
):
    epi.pp.set_filter(adata, _column, **_limits)

In [None]:
# Apply the filters registered with set_filter and rebind adata to the result.
# NOTE(review): adata is overwritten, so the unfiltered matrix is no longer
# reachable after this cell.
adata = epi.pp.apply_filters(adata, verbose=True)

In [None]:
# Flag highly variable features by score cutoff rather than by a fixed
# number of features (n_features=None).
# NOTE(review): the rationale for min_score=0.515 is not visible in this notebook.
epi.pp.highly_variable(
    adata,
    min_score=0.515,
    n_features=None,
)

In [None]:
# Keep the result of the highly-variable selection as a separate object
# (adata1); all downstream cells work on adata1.
adata1 = epi.pp.select_highly_variable(adata, verbose=True)

In [None]:
# Checkpoint the filtered, highly-variable matrix to disk.
# NOTE(review): the date suffix in this filename (20220110) differs from the
# 2023 dates used for the other outputs — confirm it is intended.
adata1.write("ChEA3_predicted_TFBS_remap2022_filtered_most_variable_20220110.h5ad")

In [None]:
# Display the AnnData summary of the highly-variable subset.
adata1

In [None]:
# Drop the full matrix to save memory; only adata1 is used from here until
# the name adata is rebound when the saved .h5ad file is read back in.
del adata # to save memory

In [None]:
# Annotate each feature with nearby genes from the GTF; the result is read
# from adata1.var["gene_annotation"] in the cells below.
# Reuses gtf_file (the same annotation that was passed to the TSS-enrichment
# step) instead of repeating the literal path, so both steps cannot drift apart.
epi.tl.find_genes(
    adata1,
    gtf_file,
    upstream=1000,
    downstream=100,
    feature_type="gene",
)

In [None]:
# Table of subgroup marker genes: one column per subgroup
# ("bile_sterols", "carbs_phaseII_stress", "lipids_phaseIII",
# "losing_expression" are the columns read below).
subgroupgenes = pd.read_csv(
    "subgroups_DMSO_top500.csv",
    index_col=0,
)

In [None]:
# Concatenate the gene lists of the four subgroups, in column order, into
# one flat list (duplicates across subgroups are kept).
genes = []
for _column in (
    "bile_sterols",
    "carbs_phaseII_stress",
    "lipids_phaseIII",
    "losing_expression",
):
    genes.extend(subgroupgenes[_column].tolist())

In [None]:
# Collect the names of all features annotated with at least one subgroup
# gene. "gene_annotation" may hold several gene names separated by ";".
# A feature is appended once per matching gene, so it can appear more than
# once; downstream only membership is tested.
#
# The feature names and the gene lookup are built once up front: rebuilding
# var_names.tolist() and scanning the gene list for every feature made the
# loop quadratic.
gene_lookup = set(genes)
feature_names = adata1.var_names.tolist()

relevant_TFBS = []
for feature_name, annotation in zip(feature_names, adata1.var["gene_annotation"].tolist()):
    for gene in annotation.split(";"):
        if gene in gene_lookup:
            relevant_TFBS.append(feature_name)

In [None]:
# Flag every feature as "yes"/"no" depending on whether it was collected in
# relevant_TFBS. A set is used for the lookup so that the pass over all
# features stays linear instead of scanning the list for each feature.
relevant_lookup = set(relevant_TFBS)
annot = [
    "yes" if feature_name in relevant_lookup else "no"
    for feature_name in adata1.var_names.tolist()
]
adata1.var["relevant"] = annot

In [None]:
# Restrict to the features flagged as relevant. An explicit copy is taken
# because the following cells modify adata2 (embedding, clustering, new
# var/obs columns); modifying a view of adata1 would otherwise trigger an
# implicit copy.
adata2 = adata1[:, adata1.var["relevant"] == "yes"].copy()

In [None]:
# Display the AnnData summary of the relevant-feature subset.
adata2

In [None]:
# Run episcanpy's convenience preprocessing on adata2.
# presumably this computes the neighbours graph and the UMAP embedding that
# the plotting and clustering cells below rely on — verify against the
# episcanpy documentation
epi.pp.lazy(adata2)

In [None]:
# Plot the UMAP of adata2 without any colouring.
epi.pl.umap(adata2)

In [None]:
# Assign a subgroup label (I-IV) to every feature from the genes in its
# "gene_annotation" entry (several genes are separated by ";").
#
#   * one gene     -> the label of that gene, or "this shouldn't exist" when
#                     the gene belongs to no subgroup
#   * several genes -> a list holding the label of each gene that belongs to
#                     a subgroup (genes without a subgroup are skipped)
#
# The lookup reads the subgroupgenes table loaded from
# "subgroups_DMSO_top500.csv"; the name "subgroupstuff" that was used here
# before is not defined anywhere in the notebook and raised a NameError on a
# fresh kernel.
subgroup_labels = {
    "bile_sterols": "I",
    "carbs_phaseII_stress": "II",
    "lipids_phaseIII": "III",
    "losing_expression": "IV",
}

# Build the gene -> label lookup once. setdefault keeps the first subgroup a
# gene appears in, matching the I > II > III > IV priority of an if/elif chain.
gene_to_label = {}
for column, label in subgroup_labels.items():
    for gene in subgroupgenes[column].tolist():
        gene_to_label.setdefault(gene, label)

annot = []
for annotation in adata2.var["gene_annotation"].tolist():
    gene_names = annotation.split(";")
    if len(gene_names) > 1:
        annot.append([gene_to_label[gene] for gene in gene_names if gene in gene_to_label])
    else:
        annot.append(gene_to_label.get(gene_names[0], "this shouldn't exist"))

In [None]:
# Store the per-feature subgroup labels. Entries are either a single label
# string or a list of labels (features annotated with several genes); the
# list entries are replaced by "NA" before the object is written to disk.
adata2.var["subgroup"] = annot

In [None]:
# Three-step clustering:
#   1. Louvain on all cells                         -> obs["louvain"]
#   2. re-cluster only cluster "3" of "louvain"     -> obs["test"]
#   3. re-cluster only cluster "1" of "test"        -> obs["test2"]
# The sub-cluster labels of "test2" (e.g. "3,1", "1,4") are mapped to
# subgroups in a later cell.
epi.tl.louvain(
    adata2,
    resolution=0.5,
)
epi.tl.louvain(
    adata2,
    resolution=0.2,
    restrict_to=("louvain", ["3"]),
    key_added="test",
)
epi.tl.louvain(
    adata2,
    resolution=0.2,
    restrict_to=("test", ["1"]),
    key_added="test2",
)

In [None]:
# UMAP coloured by the refined clustering and by four individual features.
sc.pl.umap(
    adata2,
    color=[
        "test2",
        "chr5_154682469_154682925",
        "chr8_101205373_101206298",
        "chr7_6374186_6374568",
        "chr1_202589184_202589536",
    ],
    cmap="YlGnBu",
    wspace=0.4,
    ncols=2,
)

In [None]:
# Manual assignment of the "test2" cluster labels to subgroups I-IV; every
# cluster that is not listed is labelled "nothing".
cluster_to_subgroup = {
    "6": "I",
    "1,4": "I",
    "1,5": "I",
    "10": "I",
    "3,1": "II",
    "1,3": "II",
    "4": "II",
    "3,2": "III",
    "1,1": "III",
    "1,0": "IV",
    "12": "IV",
}

annot = [cluster_to_subgroup.get(cluster, "nothing") for cluster in adata2.obs["test2"]]

In [None]:
# Store the per-cell subgroup label ("I".."IV" or "nothing") derived from
# the "test2" clusters.
adata2.obs["subgroups"] = annot

In [None]:
# Fixed palette for the "subgroups" annotation.
# presumably scanpy assigns these in category order, i.e. I, II, III, IV,
# nothing — verify against the rendered legend
adata2.uns["subgroups_colors"] = [
    "#ff7f0e",
    "#1f77b4",
    "#17b2b1",
    "#9b1c03",
    "#f0f0f0",
]

In [None]:
# UMAP coloured by the subgroup annotation and by a panel of individual
# features, saved as a PDF.
epi.pl.umap(
    adata2,
    color=[
        "subgroups",
        "chr5_154682469_154682925",
        "chr1_150971221_150971606",
        "chr7_73622917_73623468",
        "chr2_54630007_54630263",
        "chr8_101205373_101206298",
        "chr7_6374186_6374568",
        "chr1_1374971_1375434",
        "chr1_202589184_202589536",
        "chr1_202348091_202348922",
        "chr1_118963414_118963884",
        "chr2_218395915_218396552",
    ],
    cmap="YlGnBu",
    ncols=3,
    save="_annotation_attempt_by_specific_TFBS_20230119.pdf",
)

In [None]:
# Keep only the cells that were assigned to one of the subgroups; adata3 is
# only read from (for the export below), never modified.
adata3 = adata2[adata2.obs["subgroups"] != "nothing", :]

In [None]:
# Export one tab-separated line per annotated cell: barcode, subgroup and
# the cell's "n_features" value.
# NOTE(review): the third column is headed "total_reads" but holds
# obs["n_features"] — confirm that is what the consumer of this file expects.
#
# The file is opened in a with-block so it is closed even if a write fails,
# and the obs columns are walked positionally via to_numpy(): integer
# indexing with Series[idx] on a barcode-labelled index is deprecated
# label/position-ambiguous access.
with open("subgroup_annotation_for_R.txt", "w") as outfile:
    outfile.write("cell" + "\t" + "group" + "\t" + "total_reads" + "\n")
    for cell, group, n_features in zip(
        adata3.obs_names.tolist(),
        adata3.obs["subgroups"].to_numpy(),
        adata3.obs["n_features"].to_numpy(),
    ):
        outfile.write(cell + "\t" + group + "\t" + str(n_features) + "\n")

In [None]:
# Features annotated with several genes carry a list of labels; their string
# form starts with "[". Replace those entries by "NA" and keep the
# single-label entries unchanged.
annot = [
    "NA" if str(label)[0] == "[" else label
    for label in adata2.var["subgroup"].tolist()
]

In [None]:
# Overwrite the subgroup column with the cleaned labels (list entries
# replaced by "NA") before the object is written to disk.
adata2.var["subgroup"] = annot

In [None]:
# Checkpoint the annotated object; the correlation analysis below starts by
# reading this file back in.
adata2.write("rough_annotation_subgroups_TFBS_in_proximity_to_genes.h5ad")

In [None]:
import seaborn as sns
from scipy.cluster.hierarchy import linkage

In [None]:
# Reload the annotated object written above. read_h5ad is the explicit
# reader for .h5ad files; the generic ad.read alias is deprecated.
adata = ad.read_h5ad("rough_annotation_subgroups_TFBS_in_proximity_to_genes.h5ad")

In [None]:
# Keep only the features with a single, unambiguous subgroup label. An
# explicit copy is taken because a new var column is assigned to adatax in a
# later cell; assigning to a view would otherwise trigger an implicit copy.
adatax = adata[:, adata.var["subgroup"] != "NA"].copy()

In [None]:
# Mark the features whose gene annotation is exactly one of the hand-picked
# genes with "keep"; every other feature gets "no".
genes_to_plot = frozenset(
    [
        "CTSB", "LARP1", "SPTBN1", "IQGAP2", "DIAPH1",
        "RAC1", "UGP2", "ZNF706", "RPL5", "RPS12",
        "PPP1R12B", "CCNL1", "ALB", "NRBP2", "OGT",
    ]
)

annot = [
    "keep" if gene in genes_to_plot else "no"
    for gene in adatax.var["gene_annotation"].tolist()
]

In [None]:
# Store the keep/no flag so the features of the selected genes can be
# subset in the next cell.
adatax.var["to_plot"] = annot

In [None]:
# Restrict to the features of the selected genes. An explicit copy is taken
# because the next cell filters adata2 in place; filtering a view of adatax
# would otherwise trigger an implicit copy.
# NOTE(review): this rebinds adata2, replacing the object of the same name
# from the first half of the notebook.
adata2 = adatax[:, adatax.var["to_plot"] == "keep"].copy()

In [None]:
# Drop cells with fewer than one count across the selected features
# (min_counts=1), i.e. cells that have none of them.
sc.pp.filter_cells(adata2, min_counts=1)

In [None]:
# Dense features x cells table: rows are labelled with the feature names,
# columns with the cell index of adata2.
df = pd.DataFrame(
    adata2.X.todense().T,
    index=adata2.var_names.tolist(),
    columns=adata2.obs.index,
)

In [None]:
from scipy.spatial.distance import pdist, squareform

In [None]:
# Pairwise correlation distance between the rows of df (one row per
# feature) after a log1p transform.
x = pdist(
    np.log1p(df),
    metric="correlation",
)

# Expand the condensed distance vector into a square matrix.
x2 = squareform(x)

In [None]:
# Square distance matrix as a labelled table: rows and columns both carry
# the feature names taken from df.index.
df1 = pd.DataFrame(x2, index=df.index, columns=df.index)

In [None]:
# Label every feature (row of df1) as "<gene>_<subgroup>" so that the rows
# can be averaged per gene in the next cell.
#
# The two var columns are walked in parallel instead of using Series[idx]:
# integer indexing on a Series with a string (feature-name) index is
# deprecated label/position-ambiguous access.
annot = [
    gene + "_" + subgroup
    for gene, subgroup in zip(
        adata2.var["gene_annotation"].tolist(),
        adata2.var["subgroup"].tolist(),
    )
]
df1["gene"] = annot

In [None]:
# Average the distance rows of all features that share the same
# "<gene>_<subgroup>" label; the result has one row per label.
df2 = df1.groupby("gene").mean()

In [None]:
# Row order for the heatmap: subgroup III genes first, then II, then I.
gene_list = [
    # subgroup III
    "OGT_III", "ALB_III", "CCNL1_III", "NRBP2_III", "PPP1R12B_III",
    # subgroup II
    "RAC1_II", "UGP2_II", "RPL5_II", "RPS12_II", "ZNF706_II",
    # subgroup I
    "IQGAP2_I", "LARP1_I", "CTSB_I", "DIAPH1_I", "SPTBN1_I",
]

In [None]:
# Reorder the rows of df2 to follow gene_list. Labels that are not in
# gene_list get a missing rank and are sorted to the end.
#
# The helper column is removed with drop(columns=...): passing the axis
# positionally (drop("gene_rank", 1)) raises a TypeError in pandas >= 2.0.
# The frame is rebuilt by chaining instead of being mutated in place, so the
# cell can be re-run safely.
sorterIndex = dict(zip(gene_list, range(len(gene_list))))
df2 = (
    df2.assign(gene_rank=df2.index.map(sorterIndex))
    .sort_values(["gene_rank"])
    .drop(columns="gene_rank")
)

In [None]:
# Colour of each subgroup in the column annotation bar.
#
# The mapping is written out explicitly: pairing the colours with
# set(...) of the labels depends on set iteration order, which for strings
# can differ between Python sessions, so the colours could silently swap
# from run to run.
# NOTE(review): the assignment below reuses the colours listed in
# adata2.uns["subgroups_colors"] earlier in the notebook, assuming that
# palette is applied in the order I, II, III, IV — confirm against the UMAP legend.
lut2 = {
    "I": "#ff7f0e",
    "II": "#1f77b4",
    "III": "#17b2b1",
    "IV": "#9b1c03",
}
row_colors2 = adata2.var["subgroup"].map(lut2)

# Heatmap of the per-gene mean correlation distances. Rows keep the order
# set above (row_cluster=False); columns are clustered and annotated with
# the subgroup colour of each feature.
sns.clustermap(
    df2,
    cmap="Blues_r",
    xticklabels=1,
    yticklabels=1,
    col_colors=row_colors2,
    row_cluster=False,
)
plt.savefig(
    "scATAC_DMSO_TFBS_openness_correlation_gene_wise_subgroup_specific_genes_20230126.pdf",
    bbox_inches="tight",
)