In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import csv
import pandas as pd
from scipy import stats
import re
from scipy.stats import pearsonr,spearmanr,fisher_exact,binom_test
#import rpy2.robjects as robjects
import random
from statsmodels.stats.multitest import fdrcorrection
import os
import seaborn as sns
#import gseapy as gs
from scipy.stats import norm
import gseapy as gs
from collections import Counter
import scanpy as sc

In [None]:
# Hex colours shared by the figures: one per cell species and one per
# divergence category.
mouse, rat = "#F2C911", "#65B0AC"
intrinsic, extrinsic = "#F55F00", "#7D9AF4"
reinforcing, opposing = "#9B00F5", "#F50901"
interaction = "#1E771A"

# Global seaborn look: enlarged fonts on a plain white background.
sns.set(font_scale=1.5)
sns.set_style("white")

# Long cell type name -> short label used on plot axes and titles.
# The "Toss" label marks a cell type that the plotting code filters out.
d_ct_abrev = {
    "chondrocytes": "chondrocyte",
    "Chondrocyte": "chondrocyte",
    "Forebrain glutamatergic progenitors": "brain.glut.prog",
    "Forebrain GABAergic progenitors": "brain.GABA.prog",
    "Intermediate progenitors": "inter.prog",
    "Forebrain glutamatergic neurons": "brain.glut.neu",
    "Forebrain GABAergic neurons": "brain.GABA.neu",
    "Spinal GABAergic neurons": "spine.GABA.neu",
    "Spinal glutamatergic neurons": "spine.glut.neu",
    "Chondrocytes": "chondrocyte",
    "Mesenchyme 0": "mesen.0",
    "Mesenchyme 2": "mesen.2",
    "Mesenchyme cycling": "mesen.cyc",
    "Forebrain GABAergic neurons 0": "Toss",
}

#Define file to cell type mapping
def file_to_celltype(x):
    """Turn a result file or folder path into the long cell type name.

    Only the last "/"-separated component of ``x`` is used. A fixed list of
    substring substitutions is then applied strictly in order: it strips the
    gene-set-library suffixes, the per-tissue file prefixes, the ".csv"
    extension and the " all" suffix, turns underscores into spaces, and
    prefixes glutamatergic/GABAergic names with "Forebrain" (undone again for
    spinal cell types).

    Parameters
    ----------
    x : str
        File or folder path.

    Returns
    -------
    str
        The cell type name, e.g. "Forebrain GABAergic neurons".
    """
    # Order matters: later substitutions rely on the output of earlier ones
    # (e.g. "Spinal Forebrain" only exists after the "Forebrain" prefixing).
    substitutions = (
        ("_GO_Molecular_Function_2023", ""),
        ("_GO_Biological_Process_2023", ""),
        ("Mesechyme", "Mesenchyme"),
        ("Brain_Or_2010_Div_New4_NewNorm_", ""),
        ("Mesenchymal_Or_2010_Div_New4_NewNorm_", ""),
        ("Chondrocyte_Or_2010_Div_New4_NewNorm_", ""),
        (".csv", ""),
        ("_", " "),
        (" all", ""),
        ("Glutamatergic", "Forebrain glutamatergic"),
        ("GABAergic", "Forebrain GABAergic"),
        ("Spinal Forebrain", "Spinal"),
        ("Chondrocytes", "Chondrocyte"),
    )
    name = x.split("/")[-1]
    for old, new in substitutions:
        name = name.replace(old, new)
    return name

# Ask matplotlib to render all figure text in Arial.
# NOTE(review): the later sns.set(...) calls may reset font.family to seaborn's
# default - confirm Arial is actually what ends up in the figures.
plt.rcParams["font.family"] = "Arial"

#Summarized enrichments
def summarize_enrichment(fold, out_file):
    """Collect the significant GSEApy prerank results of one analysis folder.

    Every sub-directory of ``Final/<fold>`` is expected to contain a
    ``gseapy.gene_set.prerank.report.csv``. Rows with ``FDR q-val`` below 0.25
    are gathered across all sub-directories, sorted by FDR and written to
    ``Final/Summarized_Enrichments_<out_file>.csv``.

    Parameters
    ----------
    fold : str
        Name of the folder inside ``Final/`` holding one sub-directory per
        enrichment run.
    out_file : str
        Suffix used to build the output CSV file name.

    Returns
    -------
    None
        The summary is written to disk. When nothing passes the cutoff, a
        header-only CSV is written.

    Raises
    ------
    FileNotFoundError
        If ``Final/<fold>`` does not exist or a sub-directory has no report.
    """
    columns = ["Cluster_Category", "Term", "FDR", "NES", "Ledge genes"]
    base = "Final/" + fold
    out = []

    # Sorted listing keeps the output reproducible; os.listdir order is arbitrary.
    for folder in sorted(os.listdir(base)):
        run_dir = base + "/" + folder
        # Ignore stray files (e.g. .DS_Store) that are not result folders.
        if not os.path.isdir(run_dir):
            continue
        v = pd.read_csv(run_dir + "/" + "gseapy.gene_set.prerank.report.csv")

        #Keep nominally significant
        sig = v[v["FDR q-val"] < 0.25]
        for term, q_val, nes, lead in zip(sig["Term"], sig["FDR q-val"], sig["NES"], sig["Lead_genes"]):
            out.append([folder, term, q_val, nes, lead])

    #Write to out_file
    # Passing the column names to the constructor keeps this valid even when
    # `out` is empty (assigning df.columns afterwards raises on an empty frame).
    df = pd.DataFrame(out, columns = columns)
    df = df.sort_values("FDR", kind = "stable")
    df.to_csv("Final/Summarized_Enrichments_" + out_file + ".csv", index = False)
    
# Names of the three divergence categories.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
vals = ["Extrinsic", "Intrinsic", "Interaction"]

In [None]:
#Make a function to plot the various expression levels
def bp(gene, file, extra_title = ""):
    """Bar-plot the normalized counts of one gene per environment and species.

    The file is read with its "Unnamed: 0" column as index and the row for
    ``gene`` is selected. Six values are plotted in three x-axis groups
    ("Host", "Rat-like env.", "Mouse-like env."), coloured by cell species.
    The rat-like environment and the rat host always come from the MR1
    columns. The mouse-like environment and the mouse host come from the RM1
    columns when "Brain" occurs in ``file`` and from the RM2 columns otherwise.

    Parameters
    ----------
    gene : str
        Index label of the gene to plot.
    file : str
        Path to a CSV with "<sample>_mi Norm CPM" / "<sample>_ri Norm CPM"
        columns. It is also passed through ``file_to_celltype`` and
        ``d_ct_abrev`` to obtain the cell type label shown in the title.
    extra_title : str, optional
        If non-empty, shown as a first title line above the gene/cell type.

    Returns
    -------
    pandas.Series
        The row of ``file`` for ``gene``.

    Raises
    ------
    KeyError
        If the gene, a required column, or the cell type label is missing.
    """
    #Read in file and restrict to gene
    vv = pd.read_csv(file).set_index("Unnamed: 0")
    vp = vv.loc[gene]

    # Fail before drawing anything if the file does not map to a known cell type.
    ct_label = d_ct_abrev[file_to_celltype(file)]

    #Reformat for seaborn and convert names to those in the paper
    rat_env_mouse = ["Rat-like env.", "Mouse", vp["MR1_mi Norm CPM"]]
    rat_env_rat = ["Rat-like env.", "Rat", vp["MR1_ri Norm CPM"]]
    host_rat = ["Host", "Rat", vp["MR1_ri Norm CPM"]]

    #Select correct sample based on tissue
    if "Brain" in file:
        mouse_env_mouse = ["Mouse-like env.", "Mouse", vp["RM1_mi Norm CPM"]]
        mouse_env_rat = ["Mouse-like env.", "Rat", vp["RM1_ri Norm CPM"]]
        host_mouse = ["Host", "Mouse", vp["RM1_mi Norm CPM"]]
        rows = [rat_env_mouse, rat_env_rat, mouse_env_mouse, mouse_env_rat, host_mouse, host_rat]
    else:
        mouse_env_mouse = ["Mouse-like env.", "Mouse", vp["RM2_mi Norm CPM"]]
        mouse_env_rat = ["Mouse-like env.", "Rat", vp["RM2_ri Norm CPM"]]
        host_mouse = ["Host", "Mouse", vp["RM2_mi Norm CPM"]]
        rows = [host_mouse, host_rat, rat_env_mouse, rat_env_rat, mouse_env_rat, mouse_env_mouse]
    tp = pd.DataFrame(rows, columns = ["Chimera (Donor-Host)", "Cell species", "Norm CPM"])

    sns.set(font_scale=1.5)
    sns.set_style("white")

    #Create barplot
    sns.barplot(data = tp, y = "Norm CPM", x = "Chimera (Donor-Host)", hue = "Cell species", palette = {"Mouse":mouse, "Rat":rat}, alpha = 1, order = ["Host", "Rat-like env.", "Mouse-like env."])
    plt.ylabel("Pseudobulked normalized counts")
    plt.xlabel("Extrinsic environment")

    # Raw strings: "\i" is not a valid Python escape sequence, so the mathtext
    # italic command must not be written in a regular string literal.
    gene_in_ct = r"$\it{" + gene + "}$ in " + ct_label
    if not extra_title:
        plt.title("Expression of " + gene_in_ct)
    else:
        plt.title(extra_title + "\n" + gene_in_ct)

    # Dashed separator between the "Host" group and the two environment groups.
    plt.axvline(0.5, color = "black", linewidth = 2.5, alpha = 1, linestyle="dashed")
    # Hide the legend.
    plt.legend([],[], frameon=False)
    plt.show()
    return vp

# Disabled example calls, kept for reference: Psma7 in the GABAergic neuron
# file and in the Mesenchyme 2 file.
#file = "Final/Div/Brain_Or_2010_Div_New4_NewNorm_GABAergic_neurons_all.csv"
#gene = "Psma7"
#vp = bp(gene, file)
#file = "Final/Div/Mesenchymal_Or_2010_Div_New4_NewNorm_Mesenchyme_2.csv"
#gene = "Psma7"
#vp = bp(gene, file)



# Plot Grb10 from the GABAergic neuron file; bp returns the gene's row of the
# file, which is displayed as the cell output.
file = "Final/Div/Brain_Or_2010_Div_New4_NewNorm_GABAergic_neurons_all.csv"
gene = "Grb10"
vp = bp(gene, file)
vp

In [None]:
#Read in high confidence imprinted genes
# Each kept record is the list of tab-separated fields of one line. The fourth
# field (index 3) is the status used for filtering: it must mention
# "Imprinted" or "Predicted" and must not contain "Not".
# Downstream cells use column 0 as the gene name and column 4 as the expressed
# allele ("Paternal" / "Maternal").
imp_conf_list = []
# The context manager closes the file even if parsing fails part-way.
with open("mouse_imprinted_genes.txt") as handle:
    for line in handle:
        fields = line.replace("\xa0", "").replace("\n", "").split("\t")
        # Blank or truncated lines have no status field; skip them instead of
        # failing with an IndexError.
        if len(fields) <= 3:
            continue
        status = fields[3]
        if ("Imprinted" in status or "Predicted" in status) and "Not" not in status:
            imp_conf_list.append(fields)
df_imp_info = pd.DataFrame(imp_conf_list)
df_imp_info

In [None]:
#Plot log2 fold-enrichment
# For every cell type, compare how many imprinted genes fall in the top `cut`
# genes ranked by absolute interaction divergence with how many fall in the
# bottom `cut` genes, where `cut` is the rank of the last lead gene reported by
# GSEApy for the 'Imprinted' term. The log2 ratio (with a pseudocount of 1) is
# drawn as one bar per cell type, coloured by lineage.
term = 'Imprinted'
gene_set = list(df_imp_info[0])
connective_cell_types = ["chondrocyte", "mesen.0", "mesen.2", "mesen.cyc"]
enrich = []
fdr = []
cell_types = []
genes = []

#Iterate through
# Sorted listing makes the collection order reproducible (os.listdir order is arbitrary).
for file in sorted(os.listdir("Final/GSEAPY_New4_Imprinting")):
    #Get only the absolute interaction divergence enrichments
    if "PatMat" not in file and "Heart" not in file:
        v = pd.read_csv("Final/GSEAPY_New4_Imprinting/" + file + "/gseapy.gene_set.prerank.report.csv", sep = ",")
        #Save potentially relevant information for term of interest
        for index, row in v[v["Term"] == term].iterrows():
            enrich.append(row["ES"])
            fdr.append(row["FDR q-val"])
            cell_types.append(file_to_celltype(file).replace(" ", "."))
            genes.append(row["Lead_genes"])

#Create dictionaries to map cell type to lead genes and to FDR
d_en = dict(zip(cell_types, genes))
d_fdr = dict(zip(cell_types, fdr))

ct = []
enrichment = []
fdr_by_ct = []
# Rank from the largest absolute interaction divergence downwards.
asc = False
#For all cell types
for file in sorted(os.listdir("Final/Div")):

    #Read in file
    v = pd.read_csv("Final/Div/" + file, sep = ",")

    #Compute absolute interaction divergence and sort
    v["Interaction magnitude"] = np.abs(v["Interaction"])
    v = v.sort_values("Interaction magnitude", ascending = asc)
    v["Gene"] = v["Unnamed: 0"]
    ranked_genes = list(v["Gene"])

    #Get genes driving GSEAPY enrichment
    ct_key = file_to_celltype(file).replace(" ", ".")
    lead_genes = d_en[ct_key].split(";")

    #Figure out rank cutoff used by GSEAPY
    # 1-based rank of the last lead gene; the whole list if it is not found.
    if lead_genes[-1] in ranked_genes:
        cut = ranked_genes.index(lead_genes[-1]) + 1
    else:
        cut = len(ranked_genes)

    #Get number of genes driving enrichment and number of genes at equivalent cutoff
    genes_agree = len(np.intersect1d(ranked_genes[:cut], gene_set))
    genes_disagree = len(np.intersect1d(ranked_genes[::-1][:cut], gene_set))
    log2_fe = np.log2((genes_agree + 1)/(genes_disagree + 1))
    ct.append(d_ct_abrev[file_to_celltype(file)])
    enrichment.append(-log2_fe if asc else log2_fe)
    # Look the FDR up by cell type: the two directory listings are not
    # guaranteed to be in the same order or to have the same entries, so the
    # `fdr` list cannot be paired with `enrichment` by position.
    fdr_by_ct.append(d_fdr[ct_key])

sns.set(font_scale = 1.5)
sns.set_style("white")

#Create dataframe to input
to_plot = pd.DataFrame({"Log$_2$ fold-enrichment": enrichment, "FDR": fdr_by_ct, "Cell type": ct})

#Remove cell types we filtered out earlier
to_plot = to_plot[~to_plot["Cell type"].isin(["Toss"])].copy()

#Get connective tissue and brain subsets, use to compute plotting order
tpcon = to_plot[to_plot["Cell type"].isin(connective_cell_types)].sort_values("Log$_2$ fold-enrichment", ascending = asc)
tpbra = to_plot[~to_plot["Cell type"].isin(connective_cell_types)].sort_values("Log$_2$ fold-enrichment", ascending = asc)
order = list(tpbra['Cell type']) + list(tpcon['Cell type'])

#Label cell types by their lineage
# Derived from the cell type itself rather than from row position, so the
# labels stay correct for any file order and any number of cell types.
to_plot["Lineage"] = ["Connective" if c in connective_cell_types else "Nervous" for c in to_plot["Cell type"]]

#Make barplot in right order colored by lineage
sns.barplot(data = to_plot, x = "Cell type", y = "Log$_2$ fold-enrichment", hue = "Lineage", order = order, palette = {"Nervous":"#E87620", "Connective":"#1489F4"})
plt.title("Imprinted genes")
plt.legend([],[], frameon=False)
plt.xticks(rotation=90)

In [None]:
#Newer version of plot, USED
# For every cell type and for each of the two gene sets (paternally /
# maternally expressed imprinted genes), compute a log2 fold-enrichment from
# the signed interaction ranking and plot it against -log10 FDR.
d_imp2 = {"Paternal":list(df_imp_info[df_imp_info[4].isin(["Paternal"])][0]), "Maternal":list(df_imp_info[df_imp_info[4].isin(["Maternal"])][0])}

#Replace zero q-values with lowest possible q-value based on number of permutations
MIN_FDR = 0.001

fdr = []
enrichment = []
ct_list = []
term_list = []

#For each cell type
# NOTE(review): unlike the unsigned 'Imprinted' bar plot, cell types labelled
# "Toss" are not removed here - confirm that is intended.
for file in os.listdir("Final/Div"):

    #Read in normalized counts and divergence
    v = pd.read_csv("Final/Div/" + file, sep = ",")
    v = v[["Unnamed: 0", "Interaction"]].rename(columns = {"Unnamed: 0": "Gene"})

    #Get cell type abbreviation
    cell_type = d_ct_abrev[file_to_celltype(file)]

    #Read in enrichment results
    vv = pd.read_csv("Final/GSEAPY_New4_Imprinting_Signed/" + file.replace(".csv", "") + "_" + "PatMat" + "/gseapy.gene_set.prerank.report.csv", sep = ",")

    #Go through paternally and maternally expressed genes
    for _, report_row in vv.iterrows():
        term = report_row["Term"]

        #Get gene set and genes driving enrichment
        gene_set = d_imp2[term]
        genes = report_row["Lead_genes"].split(";")

        #Sort dataframe
        # Negative enrichment score: rank from the most negative interaction upwards.
        asc = bool(report_row["ES"] < 0)
        v = v.sort_values("Interaction", ascending = asc)
        ranked_genes = list(v["Gene"])

        #Figure out rank cutoff used by GSEAPY
        # 1-based rank of the last lead gene; the whole list if it is not found.
        if genes[-1] in ranked_genes:
            cut = ranked_genes.index(genes[-1]) + 1
        else:
            cut = len(ranked_genes)

        #Get number of genes driving enrichment and number of genes at equivalent cutoff
        genes_agree = len(np.intersect1d(ranked_genes[:cut], gene_set))
        genes_disagree = len(np.intersect1d(ranked_genes[::-1][:cut], gene_set))
        log2_fe = np.log2((genes_agree + 1)/(genes_disagree + 1))

        term_list.append(term)
        ct_list.append(cell_type)
        fdr.append(max(report_row["FDR q-val"], MIN_FDR))
        # The sign follows the direction of the enrichment score.
        enrichment.append(-log2_fe if asc else log2_fe)

#Create dataframe, compute -log FDR and log fold-enrichment
df_plot_new = pd.DataFrame({"Gene set": term_list, "Cell type": ct_list, "Log$_{2}$ fold-enrichment": enrichment, "FDR": fdr})
df_plot_new["-Log$_{10}$ FDR"] = -np.log10(df_plot_new["FDR"].astype(float))
df_plot_new["Log$_{2}$ fold-enrichment"] = df_plot_new["Log$_{2}$ fold-enrichment"].astype(float)

# Create the figure
fig, ax = plt.subplots(sharex=True, sharey=True)

# Separate data by gene set
maternal_data = df_plot_new[df_plot_new['Gene set'] != 'Paternal']
paternal_data = df_plot_new[df_plot_new['Gene set'] == 'Paternal']

# Scatter plots with zorder (paternal points drawn on top)
sns.scatterplot(x=maternal_data['Log$_{2}$ fold-enrichment'], y=maternal_data['-Log$_{10}$ FDR'], c='orange', label='Maternally expressed', zorder=1, ax=ax)
sns.scatterplot(x=paternal_data['Log$_{2}$ fold-enrichment'], y=paternal_data['-Log$_{10}$ FDR'], c='blue', label='Paternally expressed', zorder=2, ax=ax)

# Customize plot settings
plt.title("Imprinted gene enrichments split by expr. allele")
ax.set_xlabel('Log$_{2}$ fold-enrichment for imprinted genes')
ax.set_ylabel('-Log$_{10}$ FDR')

# Show the legend
ax.legend(bbox_to_anchor= (1.1, 1))

# Display the plot
plt.show()

In [None]:
### The cells below were not used, so they are not commented ###

In [None]:
#Older version of plot using enrichment score, not used
# Collects every row of the signed ("PatMat") GSEApy reports and plots the raw
# enrichment score against -log10 FDR, coloured by expressed allele.
term = 'Imprinted'
gene_set = list(df_imp_info[0])
enrich, fdr, cell_types, genes, matpat = [], [], [], [], []

for file in os.listdir("Final/GSEAPY_New4_Imprinting_Signed"):
    # Only the PatMat result folders, excluding Heart and Imprinting ones.
    if "PatMat" not in file or "Heart" in file or "Imprinting" in file:
        continue
    v = pd.read_csv("Final/GSEAPY_New4_Imprinting_Signed/" + file + "/gseapy.gene_set.prerank.report.csv", sep = ",")
    cell_type_label = file_to_celltype(file).replace(" ", ".").replace(".PatMat", "")
    for index, row in v.iterrows():
        enrich.append(row["ES"])
        fdr.append(row["FDR q-val"])
        cell_types.append(cell_type_label)
        genes.append(row["Lead_genes"])
        matpat.append(row["Term"])

# Cell type -> lead genes; when a cell type occurs more than once the last row wins.
d_en = dict(zip(cell_types, genes))

sns.set(font_scale = 1.5)
sns.set_style("white")

# FDR values are floored at 0.001 before taking the logarithm.
floored_fdr = np.maximum(0.001, fdr)
to_plot = pd.DataFrame([enrich, floored_fdr, cell_types, matpat]).T
to_plot.columns = ["ES", "FDR", "Cell type", "Expressed allele"]
to_plot["-Log$_{10}$ FDR"] = -np.log10(to_plot["FDR"].astype(float))
to_plot["Enrichment score for imprinted genes"] = to_plot["ES"]

allele_palette = {"Paternal":"blue", "Maternal":"orange"}
sns.scatterplot(data = to_plot, x = "Enrichment score for imprinted genes", y = "-Log$_{10}$ FDR", hue = "Expressed allele", palette = allele_palette)
plt.title("Imprinted gene enrichments split by allele")
plt.legend(frameon=True)
