In [None]:
from pathlib import Path
import pandas as pd
import copy
file = Path("hg38_coding_exons_bilan_ovlpTF_NR_cCREs_FANTOM5+Enh_NonCoding_TSS_TES_UTR5_UTR3_peakDensity_density50pb_AllSummitsPeaksDensitySum_correl_DnaseEncodeChipAtlas_ATAC.bed")


def _parse_exon_line(line):
    """Turn one line of the annotation table into a record dict.

    The line is split on whitespace and read by 0-based column position.

    Returns:
        None for a blank line or when column 10 (stored as "Nb_TF") is "NA";
        otherwise a dict holding the exon coordinates as strings plus one
        boolean flag per annotation. Most flags are simply "the matching
        column is not 'NA'".

    Raises:
        IndexError: the line has fewer than the 41 columns read here.
        ValueError: column 10, 25 or 26 is not an integer.
    """
    # Split once; str.split() with no argument already ignores the
    # surrounding whitespace and the trailing newline.
    fields = line.split()
    if not fields or fields[10] == "NA":
        return None

    nb_tf = int(fields[10])

    # Column 16 is a comma-separated list of labels; "NA" entries do not
    # count as an overlap.
    ccre_labels = fields[16].split(",")

    # Columns 25 and 26 are compared as integers.
    # NOTE(review): presumably peak density inside the exon vs. in the 50 bp
    # around it (going by the flag names) -- confirm against the file header.
    exon_density = fields[25]
    exon_has_max_density = int(exon_density) >= int(fields[26])

    # Key order is kept stable because it becomes the DataFrame column order.
    return {
        "Exon ID": fields[3],
        "Chrom": fields[0],
        "Start": fields[1],
        "End": fields[2],
        "Nb_TF": fields[10],
        "Density": exon_density,
        "CDS Start": fields[21] != "NA",
        "CDS End": fields[22] != "NA",
        # Either UTR column (23 or 24) being set is enough.
        "UTR": fields[23] != "NA" or fields[24] != "NA",
        "Number of TF < 10": nb_tf < 10,
        "Number of TF >= 10": nb_tf >= 10,
        "Encode cCREs": any(label != "NA" for label in ccre_labels),
        "CTCF": "CTCF" in ccre_labels,
        "enhD": "enhD" in ccre_labels,
        "enhP": "enhP" in ccre_labels,
        "K4m3": "K4m3" in ccre_labels,
        "prom": "prom" in ccre_labels,
        "Fantom5 TSS": fields[17] != "NA",
        "Fantom enhancer": fields[19] != "NA",
        "Non-coding first exon": fields[20] != "NA",
        # This used to be written under the undeclared key
        # "Exon with peak density", which left this flag always False and
        # added a stray True/NaN column to the DataFrame.
        "Exon with max density": exon_has_max_density,
        "Max mapping density around exons (50bp)": not exon_has_max_density,
        "DNase ENCODE": fields[33] != "NA",
        "DNase ChipAtlas": fields[35] != "NA",
        "ATAC ChipAtlas": fields[40] != "NA",
    }


# Keep one record per line that carries a TF count (column 10 is not "NA").
dic_list = []
with open(file) as f:
    for line in f:
        dic = _parse_exon_line(line)
        if dic is not None:
            dic_list.append(dic)


# One row per parsed record, one column per key of the record dicts.
df = pd.DataFrame(dic_list)


def _count_true(column):
    """Return the number of rows of ``df`` whose boolean ``column`` is True.

    Summing the flags gives 0 when no row is True, whereas
    ``value_counts().loc[True]`` raises KeyError in that case.
    """
    return int(df[column].sum())


# One dict per subplot: bar labels, bar lengths and bar colours, in matching
# order. The plotting cell pairs these panels with its titles by position.
# NOTE(review): the literal counts below (42747, 103314, 134460, 54375) are
# hard-coded and not computed from `df`; their source is not shown in this
# notebook -- TODO document where they come from.
data = [
    # Panel 1: seven bars.
    {
        "labels": ["Promoter","Enhancer proximal","Enhancer distal","H3K4me2","DNAse-seq","H3K4me1","H3K27ac"],
        "values": [
            _count_true('prom'),
            _count_true('enhP'),
            _count_true('enhD'),
            42747,
            _count_true('DNase ENCODE'),
            103314,
            134460,
        ],
        "colors": ["red","orange","yellow","cyan","limegreen","deepskyblue","blue"],
    },
    # Panel 2: one bar.
    {"labels": ["H3K27ac"], "values": [54375], "colors": ["blue"]},
    # Panel 3: two bars.
    {
        "labels": ["DNAse-seq","ATAC-seq"],
        "values": [_count_true('DNase ChipAtlas'), _count_true('ATAC ChipAtlas')],
        "colors": ["limegreen","green"],
    },
    # Panel 4: one bar.
    {"labels": ["CAGE TSS"], "values": [_count_true('Fantom5 TSS')], "colors": ["darkorchid"]},
]

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.ticker as ticker
import seaborn as sns
matplotlib.rcParams['pdf.fonttype'] = 42
from matplotlib.gridspec import GridSpec

# Panel titles, matched to the entries of `data` by position.
titles = ["ENCODE","GTEx","CHIP-ATLAS","FANTOM"]

# Row heights are proportional to the number of bars in each panel.
heights = [len(panel["labels"]) for panel in data]

# Single-column grid with one row per panel.
fig = plt.figure(figsize=(5, 5))
gs = GridSpec(len(data), 1, height_ratios=heights)

for row, panel in enumerate(data):
    ax = fig.add_subplot(gs[row])
    bars = ax.barh(panel["labels"], panel["values"], color=panel["colors"])
    ax.set_title(titles[row])

    # Same padding above and below the bars in every panel.
    ax.set_ylim(-0.5, len(panel["labels"]) - 0.5)

    # Shared x range so bar lengths are comparable between panels.
    ax.set_xlim(0, 165000)
    # Flip the y axis so the first label is drawn at the top.
    ax.invert_yaxis()
    # No ticks here; only the last panel gets them, after the loop.
    ax.set_xticks([])

    # Write each count just past the end of its bar, vertically centred on it.
    for bar, value in zip(bars, panel["values"]):
        y_mid = bar.get_y() + bar.get_height() / 2
        ax.text(value, y_mid, f" {value}", va='center', ha='left')

# `ax` still refers to the last (bottom) panel: it alone carries the x ticks
# and the axis label.
ax.set_xticks([0,40000,80000,120000,160000])
ax.set_xlabel("Count")
fig.tight_layout()

# Added after tight_layout and placed above the axes area (y > 1);
# bbox_inches="tight" keeps it inside the saved PDF.
fig.suptitle("Count of hg38 merged exons overlapping ressources", y=1.05)
fig.savefig("/home/mouren/Images/tls1/overlap_exons_with_all_catalogs/all_exons_ovlp_bilan_fig1_2.pdf", format="pdf", bbox_inches="tight")

# Render the figure in the notebook.
plt.show()
