In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import csv
import pandas as pd
from scipy import stats
import re
from scipy.stats import pearsonr,spearmanr,fisher_exact,binom_test
#import rpy2.robjects as robjects
import random
from statsmodels.stats.multitest import fdrcorrection
import os
import seaborn as sns
#import gseapy as gs
from scipy.stats import norm
import gseapy as gs
from collections import Counter
import scanpy as sc

#Colors (hex codes, one per plotted category)
mouse, rat = "#F2C911", "#65B0AC"
intrinsic, extrinsic = "#F55F00", "#7D9AF4"
reinforcing, opposing = "#9B00F5", "#F50901"
interaction = "#1E771A"

#Global seaborn look: larger fonts on a plain white background
sns.set(font_scale=1.5)
sns.set_style("white")

#Long cell-type names (as produced by file_to_celltype) -> short labels used in
#plots and in output folder names
d_ct_abrev = {
    "chondrocytes": "chondrocyte",
    "Chondrocyte": "chondrocyte",
    "Forebrain glutamatergic progenitors": "brain.glut.prog",
    "Forebrain GABAergic progenitors": "brain.GABA.prog",
    "Intermediate progenitors": "inter.prog",
    "Forebrain glutamatergic neurons": "brain.glut.neu",
    "Forebrain GABAergic neurons": "brain.GABA.neu",
    "Spinal GABAergic neurons": "spine.GABA.neu",
    "Spinal glutamatergic neurons": "spine.glut.neu",
    "Chondrocytes": "chondrocyte",
    "Mesenchyme 0": "mesen.0",
    "Mesenchyme 2": "mesen.2",
    "Mesenchyme cycling": "mesen.cyc",
    "Forebrain GABAergic neurons 0": "Toss",
}

#Function to convert file names to cell types for plotting purposes
def file_to_celltype(x):
    """Turn a divergence/enrichment file (or folder) name into a readable cell-type name.

    Only the last "/"-separated component of ``x`` is used. The substitutions
    below are applied strictly in order; later rules rely on the output of
    earlier ones (e.g. "Spinal Forebrain" only exists after "GABAergic" has
    been expanded), so the order must not be changed.

    Parameters
    ----------
    x : str
        File name or path.

    Returns
    -------
    str
        The cleaned-up cell-type name.
    """
    substitutions = (
        #Strip gene-set suffixes and fix a misspelling present in some names
        ("_GO_Molecular_Function_2023", ""),
        ("_GO_Biological_Process_2023", ""),
        ("Mesechyme", "Mesenchyme"),
        #Strip tissue-specific file prefixes and the extension
        ("Brain_Or_2010_Div_New4_NewNorm_", ""),
        ("Mesenchymal_Or_2010_Div_New4_NewNorm_", ""),
        ("Chondrocyte_Or_2010_Div_New4_NewNorm_", ""),
        (".csv", ""),
        #Underscores become spaces, then the trailing " all" tag is dropped
        ("_", " "),
        (" all", ""),
        #Expand neuron class names; undo the expansion for spinal cell types
        ("Glutamatergic", "Forebrain glutamatergic"),
        ("GABAergic", "Forebrain GABAergic"),
        ("Spinal Forebrain", "Spinal"),
        ("Chondrocytes", "Chondrocyte"),
    )
    name = x.split("/")[-1]
    for old, new in substitutions:
        name = name.replace(old, new)
    return name

#Use Arial for all figure text
plt.rc("font", family="Arial")

#Summarizes GSEAPY enrichment results in a particular folder
def summarize_enrichment(fold, out_file):
    """Collect the enriched terms from every prerank report under ``Final/<fold>``.

    Each sub-folder of ``Final/<fold>`` is expected to contain a
    ``gseapy.gene_set.prerank.report.csv``. Rows with ``FDR q-val`` below 0.25
    are kept, sorted by FDR, and written to
    ``Final/Summarized_Enrichments_<out_file>.csv``.

    Parameters
    ----------
    fold : str
        Name of the results folder inside ``Final/``.
    out_file : str
        Suffix used for the summary CSV file name.

    Returns
    -------
    None
        The summary is written to disk. When no term passes the threshold a
        header-only CSV is written instead of raising.
    """
    columns = ["Cluster_Category", "Term", "FDR", "NES", "Ledge genes"]
    results_root = os.path.join("Final", fold)
    out = []

    #Iterate through the different results
    for folder in os.listdir(results_root):
        report_path = os.path.join(results_root, folder, "gseapy.gene_set.prerank.report.csv")

        #Stray files or runs that produced no report are skipped rather than aborting the summary
        if not os.path.isfile(report_path):
            print("No prerank report in " + os.path.join(results_root, folder) + ", skipping")
            continue
        v = pd.read_csv(report_path)

        #If it is nominally significant, save it
        hits = v[v["FDR q-val"] < 0.25]
        for term, fdr, nes, lead_genes in zip(hits["Term"], hits["FDR q-val"], hits["NES"], hits["Lead_genes"]):
            out.append([folder, term, fdr, nes, lead_genes])

    #Write out; passing the column names to the constructor keeps an empty result valid
    df = pd.DataFrame(out, columns=columns)
    df = df.sort_values("FDR")
    df.to_csv("Final/Summarized_Enrichments_" + out_file + ".csv", index = False)

#Read in dictionary to convert mouse gene names to human gene names
#Rows with any missing value are dropped, so every key maps to a real homolog name
m2h = pd.read_csv("Mouse_To_Human_Gene_Conversions.csv").dropna()
d = dict(zip(m2h["external_gene_name"], m2h["hsapiens_homolog_associated_gene_name"]))

#Gene-set libraries passed to gseapy
gene_sets = [
    'GO_Biological_Process_2023',
    'GO_Cellular_Component_2023',
    'GO_Molecular_Function_2023',
    'MGI_Mammalian_Phenotype_Level_4_2021',
    "CORUM",
]

In [None]:
### This cell is the only one that comments on the general enrichment strategy in detail ###

#Doing enrichment analysis for signed extrinsic, intrinsic and interaction divergence (USED)
#Negative is higher in mouse, positive is higher in rat

vals = ["Extrinsic", "Intrinsic", "Interaction"]
for divergence in vals:
    signed_col = "Signed " + divergence.lower()

    #Write results to the folder summarize_enrichment() is later pointed at, using a path
    #relative to the working directory (like every read in this notebook) instead of a
    #machine-specific absolute path
    out_root = "Final/GSEAPY_New4_Signed_Proportion_" + divergence.lower()
    os.makedirs(out_root, exist_ok=True)

    #For each cell type
    for file in os.listdir("Final/Div"):
        try:
            ct_label = d_ct_abrev[file_to_celltype(file)].replace(".", "_")
        except KeyError:
            print("Skipping " + file + ": no abbreviation for cell type '" + file_to_celltype(file) + "'")
            continue
        v = pd.read_csv("Final/Div/" + file, sep = ",")

        #Compute the signed proportion divergence
        v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]

        #Sort by that
        v = v.sort_values(signed_col, ascending = False)

        #Convert to human gene names; genes without a human homolog in d are dropped
        out = [[d[gene], score] for gene, score in zip(v["Unnamed: 0"], v[signed_col]) if gene in d]
        ranking = pd.DataFrame(out)
        if ranking.empty:
            print("Skipping " + file + ": no genes could be converted to human names")
            continue

        #The ranking does not depend on the gene set, so it is built once and reused
        for gene_set in gene_sets:
            try:
                gs.prerank(rnk=ranking, gene_sets=gene_set, threads=4, permutation_num=1000, outdir= out_root + '/' + ct_label + "_" + gene_set, format='png', seed=6, min_size = 10, max_size = 300)
            except Exception as err:
                #Best effort: report the failure and move on to the next gene set
                print("prerank failed for " + ct_label + " / " + gene_set + ": " + repr(err))

In [None]:
#Write one summary CSV per divergence type for the signed-proportion enrichments
for div_kind in ("intrinsic", "interaction", "extrinsic"):
    summarize_enrichment(
        "GSEAPY_New4_Signed_Proportion_" + div_kind,
        "GSEAPY_Signed_Proportion_" + div_kind,
    )

In [None]:
#Doing exact same thing again, this time without Xbp1 and only GO_biological_process
#Did not affect results at all
vals = ["Extrinsic", "Intrinsic"]
gene_sets = ["GO_Biological_Process_2023"]
for divergence in vals:
    signed_col = "Signed " + divergence.lower()

    #Folder name is kept exactly as before (no separator after "NoXbp1") so existing results
    #are still found; the path is relative to the working directory like every read here
    out_root = "Final/GSEAPY_New4_Signed_Proportion_NoXbp1" + divergence.lower()
    os.makedirs(out_root, exist_ok=True)

    for file in os.listdir("Final/Div"):
        try:
            ct_label = d_ct_abrev[file_to_celltype(file)].replace(".", "_")
        except KeyError:
            print("Skipping " + file + ": no abbreviation for cell type '" + file_to_celltype(file) + "'")
            continue
        v = pd.read_csv("Final/Div/" + file, sep = ",")
        n_before = v.shape[0]

        #Drop Xbp1; .copy() so the new column below is written to an independent frame
        v = v[~v["Unnamed: 0"].isin(["Xbp1"])].copy()
        print(file + ": removed " + str(n_before - v.shape[0]) + " row(s) for Xbp1")

        v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
        v = v.sort_values(signed_col, ascending = False)

        #Convert to human gene names; genes without a human homolog in d are dropped
        out = [[d[gene], score] for gene, score in zip(v["Unnamed: 0"], v[signed_col]) if gene in d]
        ranking = pd.DataFrame(out)
        if ranking.empty:
            print("Skipping " + file + ": no genes could be converted to human names")
            continue

        for gene_set in gene_sets:
            try:
                gs.prerank(rnk=ranking, gene_sets=gene_set, threads=4, permutation_num=1000, outdir= out_root + '/' + ct_label + "_" + gene_set, format='png', seed=6, min_size = 10, max_size = 300)
            except Exception as err:
                #Best effort: report the failure and move on
                print("prerank failed for " + ct_label + " / " + gene_set + ": " + repr(err))

In [None]:
### Enrichment strategy is identical to the second cell (per-cell-type signed proportion) ###

gene_sets = ['GO_Biological_Process_2023', 'GO_Cellular_Component_2023', 'GO_Molecular_Function_2023', 'MGI_Mammalian_Phenotype_Level_4_2021', "CORUM"]

#Aggregating by tissue and doing enrichment analysis for signed things
vals = ["Extrinsic", "Intrinsic", "Interaction"]

#Require a gene be measured in at least 5 cell types if all, at least 3 cell types if connective tissue or nervous only
min_cell_types = {"All": 5, "Brain": 3, "Conn": 3}

for keep in ["All", "Brain", "Conn"]:
    for divergence in vals:
        signed_col = "Signed " + divergence.lower()
        mean_col = "Mean signed " + divergence.lower()
        df = pd.DataFrame()
        ind = 1

        #Iterate through the files
        for file in os.listdir("Final/Div"):
            #"Brain" keeps only files with "Brain" in the name, "Conn" keeps only the others,
            #"All" keeps everything regardless of tissue
            is_brain = "Brain" in file
            if (keep == "Brain" and not is_brain) or (keep == "Conn" and is_brain):
                continue

            v = pd.read_csv("Final/Div/" + file).set_index("Unnamed: 0")
            v[signed_col] = np.sign(v[divergence])*v["Proportion " + divergence.lower()]
            v = v[[signed_col]]
            v.columns = [d_ct_abrev[file_to_celltype(file)].replace(".", "_")]

            #One column per cell type, outer-joined on gene name
            if ind:
                df = v
                ind = 0
            else:
                df = df.join(v, how="outer")

        #Average over the cell types in which the gene was measured (NaNs are ignored)
        n_measured = df.count(axis=1)
        df2 = df[n_measured >= min_cell_types[keep]].mean(axis=1).to_frame(mean_col)

        #The original sort result was discarded; keep it, in the same descending order as the
        #other enrichment cells. NOTE(review): gseapy appears to re-sort the ranking itself,
        #so this should only matter for the order of exact ties - confirm.
        df2 = df2.sort_values(mean_col, ascending = False)

        #Same enrichment strategy as above: convert to human names, dropping unmapped genes
        out = [[d[gene], score] for gene, score in zip(df2.index, df2[mean_col]) if gene in d]
        ranking = pd.DataFrame(out, columns=[0, 1])
        if ranking.empty:
            print("No genes to rank for " + keep + " / " + divergence + ", skipping")
            continue

        #Path is relative to the working directory, like every read in this notebook
        out_root = "Final/GSEAPY_New4_Signed_Proportion_Mean_Signed_" + divergence.lower()
        for gene_set in gene_sets:
            try:
                gs.prerank(rnk=ranking, gene_sets=gene_set, threads=4, permutation_num=1000, outdir= out_root + '/' + keep + "_" + gene_set, format='png', seed=6, min_size = 10, max_size = 300)
            except Exception as err:
                #Best effort: report the failure and move on to the next gene set
                print("prerank failed for " + keep + " / " + divergence + " / " + gene_set + ": " + repr(err))

In [None]:
#Make a function to plot the various expression levels

#Copied from the Fig_2A-F.ipynb notebook for convenience (explained in more detail there)
sns.set(font_scale=1.3)
sns.set_style("white")
def bp(gene, file, extra_title = ""):
    """Bar plot of one gene's normalized counts per environment and cell species.

    Parameters
    ----------
    gene : str
        Gene to plot; must be a row label in the ``Unnamed: 0`` column of ``file``.
    file : str
        Path to a divergence CSV containing ``<sample> Norm CPM`` columns. If the
        path contains "Brain" the RM1 columns are used for the mouse-like
        environment, otherwise the RM2 columns.
    extra_title : str, optional
        If given, it is placed on its own line above the gene/cell-type title.

    Returns
    -------
    pandas.Series
        The full row of ``file`` for ``gene``.

    Raises
    ------
    KeyError
        If ``gene`` or one of the required columns is missing, or if the cell
        type derived from ``file`` has no entry in ``d_ct_abrev``.
    """
    vv = pd.read_csv(file).set_index("Unnamed: 0")
    vp = vv.loc[gene]

    #Which chimera supplies the mouse-like environment depends on the tissue
    mouse_like = "RM1" if "Brain" in file else "RM2"
    mouse_in_mouse_like = vp[mouse_like + "_mi Norm CPM"]
    rat_in_mouse_like = vp[mouse_like + "_ri Norm CPM"]
    mouse_in_rat_like = vp["MR1_mi Norm CPM"]
    rat_in_rat_like = vp["MR1_ri Norm CPM"]

    #The species-matched bars reuse the mouse-in-mouse-like and rat-in-rat-like values
    tp = pd.DataFrame(
        [
            ["Species-matched env.", "Mouse", mouse_in_mouse_like],
            ["Species-matched env.", "Rat", rat_in_rat_like],
            ["Rat-like env.", "Mouse", mouse_in_rat_like],
            ["Rat-like env.", "Rat", rat_in_rat_like],
            ["Mouse-like env.", "Mouse", mouse_in_mouse_like],
            ["Mouse-like env.", "Rat", rat_in_mouse_like],
        ],
        columns=["Chimera (Donor-Species-matched env.)", "Cell species", "Norm CPM"],
    )

    #hue_order pins Mouse before Rat instead of relying on row order
    sns.barplot(data = tp, y = "Norm CPM", x = "Chimera (Donor-Species-matched env.)", hue = "Cell species", hue_order = ["Mouse", "Rat"], palette = {"Mouse":mouse, "Rat":rat}, alpha = 1, order = ["Species-matched env.", "Rat-like env.", "Mouse-like env."])
    plt.ylabel("Pseudobulked normalized counts")
    plt.xlabel("Extrinsic environment")

    #Raw strings keep the mathtext backslash without an invalid-escape warning
    gene_in_ct = r"$\it{" + gene + "}$ in " + d_ct_abrev[file_to_celltype(file)]
    if not extra_title:
        plt.title("Expression of " + gene_in_ct)
    else:
        plt.title(extra_title + "\n" + gene_in_ct)

    #Dashed line separates the species-matched bars from the two chimeric environments
    plt.axvline(0.5, color = "black", linewidth = 2.5, alpha = 1, linestyle="dashed")

    #Hide the legend
    plt.legend([],[], frameon=False)
    plt.show()
    return vp

#file = "Final/Div/Brain_Or_2010_Div_New4_NewNorm_GABAergic_neurons_all.csv"
#gene = "Psma7"
#vp = bp(gene, file)
#file = "Final/Div/Mesenchymal_Or_2010_Div_New4_NewNorm_Mesenchyme_2.csv"
#gene = "Psma7"
#vp = bp(gene, file)



#Example: plot Setbp1 in the glutamatergic progenitor file and show its full row
gene = "Setbp1"
file = "Final/Div/Brain_Or_2010_Div_New4_NewNorm_Glutamatergic_progenitors.csv"
vp = bp(gene=gene, file=file)
vp

In [None]:
#Term to collect; exactly one assignment should be active
#term = "Response To Endoplasmic Reticulum Stress (GO:0034976)"
term = "Histone H3 Methyltransferase Activity (GO:0140938)"
#term = "Receptor Signaling Pathway Via STAT (GO:0097696)"
#term = "Positive Regulation Of Programmed Cell Death (GO:0043068)"
#gene_set = d_BP[term]
enrich = []
fdr = []
cell_types = []

#For a particular term, go through and collect the results across all cell types for it
results_dir = "Final/GSEAPY_New4_Signed_Proportion_extrinsic"
for file in os.listdir(results_dir):
    if "neurons_0" in file:
        continue
    v = pd.read_csv(results_dir + "/" + file + "/gseapy.gene_set.prerank.report.csv", sep = ",")

    #Keep only the report rows for the chosen term (none if this report does not contain it)
    hits = v[v["Term"] == term]
    enrich.extend(hits["ES"].tolist())
    fdr.extend(hits["FDR q-val"].tolist())
    cell_types.extend([file_to_celltype(file).replace(" ", ".")] * len(hits))
                          

In [None]:
#Make a barplot of the enrichments, not used
sns.set(font_scale = 1.5)
sns.set_style("white")

#Build from a dict so the numeric columns get numeric dtypes
to_plot = pd.DataFrame({"GSEA enrichment score": enrich, "FDR": fdr, "Cell type": cell_types})

#Drop the discarded cluster and the connective-tissue cell types
to_plot = to_plot[~to_plot["Cell type"].isin(["Toss", "chondrocyte", "mesen.0", "mesen.2", "mesen.cyc"])]
to_plot = to_plot.sort_values("GSEA enrichment score", ascending = False)

ax = sns.barplot(data = to_plot, x = "Cell type", y = "GSEA enrichment score", color = "grey")

#Title follows the term that was actually collected (GO accession stripped)
ax.set_title(term.split(" (GO:")[0])
plt.xticks(rotation=90)
plt.show()

In [None]:
#Making volcano plots with log2 fold-enrichment

d_MF = gs.get_library(name='GO_Molecular_Function_2023')
d_BP = gs.get_library(name='GO_Biological_Process_2023')

#Uncomment which term to use
#term = "Response To Endoplasmic Reticulum Stress (GO:0034976)"
#term = "Histone H3 Methyltransferase Activity (GO:0140938)"
#term = "Receptor Signaling Pathway Via STAT (GO:0097696)"
#term = "Positive Regulation Of Programmed Cell Death (GO:0043068)"

fdr = []
enrichment = []
ct_list = []
term_list = []
for file in os.listdir("Final/Div"):
    #Get the normalized counts and divergence estimates for the cell type
    v = pd.read_csv("Final/Div/" + file, sep = ",")
    ct_label = d_ct_abrev[file_to_celltype(file)]
    ct = ct_label.replace(".", "_")

    #Get the enrichment results for the cell type
    vv = pd.read_csv("Final/GSEAPY_New4_Signed_Proportion_extrinsic/" + ct + "_GO_Molecular_Function_2023" + "/gseapy.gene_set.prerank.report.csv", sep = ",")

    #Restrict to only genes that could be tested for enrichment (those with a human name in d)
    v["Signed extrinsic"] = np.sign(v["Extrinsic"])*v["Proportion extrinsic"]
    out = [[d[gene], score] for gene, score in zip(v["Unnamed: 0"], v["Signed extrinsic"]) if gene in d]
    v = pd.DataFrame(out, columns=["Gene", "Signed extrinsic"])

    #For every term, compute the log fold-change enrichment
    for _, report_row in vv.iterrows():
        term = report_row["Term"]
        gene_set = d_MF[term]
        genes = report_row["Lead_genes"].split(";")

        #Change sort order depending on whether it is negative enrichment or positive
        asc = bool(report_row["ES"] < 0)
        v = v.sort_values("Signed extrinsic", ascending = asc)
        ordered_genes = list(v["Gene"])

        #Find the gene rank that was used as a cutoff in GSEAPY: the 1-based position of the
        #last lead gene, or the whole list if that gene is not present
        try:
            cut = ordered_genes.index(genes[-1]) + 1
        except ValueError:
            cut = len(ordered_genes)

        #Using that rank, get the number of genes that drive the enrichment and the number of genes going against it at an equivalent cutoff
        genes_agree = len(np.intersect1d(ordered_genes[:cut], gene_set))
        genes_disagree = len(np.intersect1d(ordered_genes[::-1][:cut], gene_set))

        #+1 pseudocount on both sides avoids division by zero and log of zero
        log_fc = np.log2((genes_agree + 1)/(genes_disagree + 1))

        term_list.append(term)
        fdr.append(report_row["FDR q-val"])
        ct_list.append(ct_label)

        #Make sign of the fold-change properly match the sign of the original GSEAPY enrichment
        enrichment.append(-log_fc if asc else log_fc)

In [None]:
#Create a data frame for subplotting purposes: one row per (gene set, cell type) pair
df_plot_new = pd.DataFrame(
    [term_list, ct_list, enrichment, fdr],
    index=["Gene set", "Cell type", "Log$_{2}$ fold-enrichment", "FDR"],
).T

#Add -log10(FDR) and make the fold-enrichment column numeric
df_plot_new = df_plot_new.assign(**{
    "-Log$_{10}$ FDR": lambda frame: -np.log10(frame["FDR"].astype(float)),
    "Log$_{2}$ fold-enrichment": lambda frame: frame["Log$_{2}$ fold-enrichment"].astype(float),
})
df_plot_new

In [None]:
#Volcano plot: the highlighted gene set in blue on top of every other gene set in grey
#(numpy, pandas and matplotlib are already imported in the first cell)
highlight_term = 'Histone H3 Methyltransferase Activity (GO:0140938)'

fig, ax = plt.subplots()

# Separate data by color
is_highlight = df_plot_new['Gene set'] == highlight_term
grey_data = df_plot_new[~is_highlight]
blue_data = df_plot_new[is_highlight]

# Scatter plots with zorder so the highlighted points sit above the background
sns.scatterplot(x=grey_data['Log$_{2}$ fold-enrichment'], y=grey_data['-Log$_{10}$ FDR'], color='#BBBBBC', label='Other', zorder=1, ax=ax)
sns.scatterplot(x=blue_data['Log$_{2}$ fold-enrichment'], y=blue_data['-Log$_{10}$ FDR'], color='blue', label='Histone H3\nmethyltransferase activity', zorder=2, ax=ax)

# Customize plot settings
ax.set_title("Histone H3 methyltransferase activity")
ax.set_xlabel('Log$_{2}$ fold-enrichment for GO category')
ax.set_ylabel('-Log$_{10}$ FDR')

# Show the legend outside the plotting area
ax.legend(bbox_to_anchor= (1.1, 1))

# Display the plot
plt.show()

In [None]:
#Diamond plots, shown in Fig3, 4, Supp Figs 14, 15, 22, 23
gene_sets = ['GO_Biological_Process_2023', 'GO_Cellular_Component_2023', 'GO_Molecular_Function_2023', 'MGI_Mammalian_Phenotype_Level_4_2021', "CORUM"]
out2 = []

#Library used to look up the members of the plotted term; it has to be the library that
#contains to_plot
d_cat = gs.get_library('GO_Molecular_Function_2023')

#Add the correct term to plot (exactly one assignment should be active)
#to_plot = "Spliceosome (human)"
#to_plot = "26S proteasome (human)"
#to_plot = "Respiratory chain complex I, mitochondrial (mouse)"
#to_plot = "Response To Endoplasmic Reticulum Stress (GO:0034976)"
#to_plot = "Positive Regulation Of Programmed Cell Death (GO:0043068)"
to_plot = "Histone H3 Methyltransferase Activity (GO:0140938)"
sns.set_style("white")

#The enrichment summary is the same for every cell type, so read it once
enrichments = pd.read_csv("Final/Summarized_Enrichments_GSEAPY_Signed_Proportion_extrinsic.csv")

#Iterate through cell types
for file in os.listdir("Final/Div/"):

    #Read in data and compute signed extrinsic/intrinsic
    v = pd.read_csv("Final/Div/" + file)
    v["Signed extrinsic"] = np.sign(v["Extrinsic"])*v["Proportion extrinsic"]
    v["Signed intrinsic"] = np.sign(v["Intrinsic"])*v["Proportion intrinsic"]
    v["Signed interaction"] = np.sign(v["Interaction"])*v["Proportion interaction"]

    #Restrict to only genes that could be used for enrichment (those with a human name in d)
    out = [
        [d[gene], ext, intr, inter]
        for gene, ext, intr, inter in zip(v["Unnamed: 0"], v["Signed extrinsic"], v["Signed intrinsic"], v["Signed interaction"])
        if gene in d
    ]
    ranking = pd.DataFrame(out, columns=["Gene", "Signed extrinsic", "Signed intrinsic", "Signed interaction"])

    ct_key = d_ct_abrev[file_to_celltype(file)].replace(".", "_")
    short_name = file.replace("Brain_Or_2010_Div_New4_NewNorm_", "").replace("Mesenchymal_Or_2010_Div_New4_NewNorm_", "").replace("Chondrocyte_Or_2010_Div_New4_NewNorm_", "").replace(".csv", "")

    #Iterate through the enrichment terms to find the right term
    for index, row in enrichments.iterrows():

        #If the category was actually tested for the cell type of interest, proceed
        if ct_key not in row["Cluster_Category"]:
            continue

        #Get the genes driving the enrichment
        genes = row["Ledge genes"].split(";")
        granking = ranking[ranking["Gene"].isin(genes)]

        #Not used
        out2.append([short_name, row["Term"], np.mean(granking["Signed extrinsic"]), np.mean(granking["Signed intrinsic"]), np.mean(granking["Signed interaction"]), np.mean(np.abs(granking["Signed extrinsic"])), np.mean(np.abs(granking["Signed intrinsic"])), np.mean(np.abs(granking["Signed interaction"])), row["FDR"], row["NES"]])

        #If we have the right term
        if to_plot in row["Term"]:

            #Flag genes in the category so we can label them. Work on a new frame so ranking
            #itself stays unchanged for the remaining rows. Sorting puts gene-set rows last,
            #presumably so they are drawn on top of the background.
            plot_df = ranking.assign(**{"Gene set": ranking["Gene"].isin(d_cat[to_plot])}).sort_values("Gene set")

            #Need to change this based on the category
            plot_df["Gene set"] = ["Spliceosome" if x else "Background" for x in plot_df["Gene set"]]

            #Create diamond scatterplot
            sns.scatterplot(data = plot_df, x = "Signed intrinsic", y = "Signed extrinsic", hue = "Gene set", palette = {"Spliceosome":"blue", "Background":"#BBBBBC"})

            #Remove legend and add proper tick marks
            plt.legend([],[], frameon=False)
            plt.xticks([-1, -0.5, 0, 0.5, 1], [-1, -0.5, 0, 0.5, 1])
            plt.yticks([-1, -0.5, 0, 0.5, 1], [-1, -0.5, 0, 0.5, 1])

            #Readable cell-type name for the title
            z = file_to_celltype(file)

            #Change plurality
            if z == "Chondrocyte":
                z = "Chondrocytes"

            #Give correct title
            #plt.title("Resp. chain complex I in:\n"+ " " + z.lower())
            #plt.title("26S proteasome in:\n"+ " " + z.lower())
            plt.title("Histone H3 methyltransferase activity in:\n"+ " " + z[0].lower() + z[1:])
            plt.show()
            rank_keep = plot_df.copy()
#Not used
#df2 = pd.DataFrame(out2)
#df2

In [None]:
#Enrichment with proportion of each divergence (not used)
#Uses the gene_sets list defined by the most recently run cell that assigns it
props = ["Proportion extrinsic", "Proportion intrinsic", "Proportion interaction"]
for prop in props:
    #Path is relative to the working directory, matching where summarize_enrichment() reads from
    out_root = "Final/GSEAPY_" + prop.replace(" ", "_")
    os.makedirs(out_root, exist_ok=True)

    for file in os.listdir("Final/Div"):
        try:
            ct_label = d_ct_abrev[file_to_celltype(file)].replace(".", "_")
        except KeyError:
            print("Skipping " + file + ": no abbreviation for cell type '" + file_to_celltype(file) + "'")
            continue
        v = pd.read_csv("Final/Div/" + file, sep = ",").sort_values(prop, ascending = False)

        #Convert to human gene names; genes without a human homolog in d are dropped
        out = [[d[gene], score] for gene, score in zip(v["Unnamed: 0"], v[prop]) if gene in d]
        df = pd.DataFrame(out)
        if df.empty:
            print("Skipping " + file + ": no genes could be converted to human names")
            continue
        df = df.sort_values(1, ascending = False)

        #The ranking does not depend on the gene set, so it is built once and reused
        for gene_set in gene_sets:
            try:
                gs.prerank(rnk=df, gene_sets=gene_set, threads=4, permutation_num=1000, outdir= out_root + '/' + ct_label + "_" + gene_set, format='png', seed=6, min_size = 10, max_size = 300)
            except Exception as err:
                #Best effort: report the failure and move on to the next gene set
                print("prerank failed for " + ct_label + " / " + gene_set + ": " + repr(err))

#Summarize the enrichments
summarize_enrichment("GSEAPY_Proportion_intrinsic", "GSEAPY_Proportion_intrinsic")
summarize_enrichment("GSEAPY_Proportion_interaction", "GSEAPY_Proportion_interaction")
summarize_enrichment("GSEAPY_Proportion_extrinsic", "GSEAPY_Proportion_extrinsic")