In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import csv
import pandas as pd
from scipy import stats
import re
from scipy.stats.stats import pearsonr
#import rpy2.robjects as robjects
import random
from statsmodels.stats.multitest import fdrcorrection
import copy
from collections import Counter
import seaborn as sns
from matplotlib.patches import PathPatch
from matplotlib.path import Path
import os
from scipy.stats import spearmanr,pearsonr

In [None]:
#Colors
# Hex colours used as fixed palette entries by the plotting cells.
mouse = "#F2C911"
rat = "#65B0AC"
intrinsic = "#F55F00"
extrinsic = "#7D9AF4"
reinforcing = "#9B00F5"
opposing = "#F50901"
interaction = "#1E771A"

# Global seaborn look: larger fonts on a white background.
sns.set(font_scale=1.5)
sns.set_style("white")

# Long cell type name (as produced by file_to_celltype) -> short plot label.
# Entries mapped to "Toss" are filtered out by the cells that check for it.
d_ct_abrev = {
    "Chondrocyte": "chondrocyte",
    "Forebrain glutamatergic progenitors": "brain.glut.prog",
    "Forebrain GABAergic progenitors": "brain.GABA.prog",
    "Intermediate progenitors": "inter.prog",
    "Forebrain glutamatergic neurons": "brain.glut.neu",
    "Forebrain GABAergic neurons": "brain.GABA.neu",
    "Spinal GABAergic neurons": "spine.GABA.neu",
    "Spinal glutamatergic neurons": "spine.glut.neu",
    "Chondrocytes": "chondrocyte",
    "Mesenchyme 0": "mesen.0",
    "Mesenchyme 2": "mesen.2",
    "Mesenchyme cycling": "mesen.cyc",
    "Forebrain GABAergic neurons 0": "Toss",
}

def file_to_celltype(x):
    """Turn a result-file path into the long cell type name used as a key of d_ct_abrev.

    Only the last "/"-separated component of ``x`` is used. The substitutions
    below are applied to it one after another; their order matters because
    later ones act on the output of earlier ones (e.g. "Spinal Forebrain" only
    exists after the "GABAergic"/"Glutamatergic" rewrites have run).

    Parameters
    ----------
    x : str
        File name or path.

    Returns
    -------
    str
        The cleaned-up cell type name.
    """
    name = x.split("/")[-1]
    substitutions = (
        # Strip enrichment-library suffixes and fix the "Mesechyme" misspelling.
        ("_GO_Molecular_Function_2023", ""),
        ("_GO_Biological_Process_2023", ""),
        ("Mesechyme", "Mesenchyme"),
        # Strip the per-dataset file name prefixes.
        ("Brain_Or_2010_Div_New4_NewNorm_", ""),
        ("Mesenchymal_Or_2010_Div_New4_NewNorm_", ""),
        ("Chondrocyte_Or_2010_Div_New4_NewNorm_", ""),
        # Drop the extension, turn underscores into spaces, drop " all".
        (".csv", ""),
        ("_", " "),
        (" all", ""),
        # Prefix neurotransmitter classes with "Forebrain", then undo that for spinal types.
        ("Glutamatergic", "Forebrain glutamatergic"),
        ("GABAergic", "Forebrain GABAergic"),
        ("Spinal Forebrain", "Spinal"),
        ("Chondrocytes", "Chondrocyte"),
    )
    for old, new in substitutions:
        name = name.replace(old, new)
    return name

# Use Arial for all matplotlib text.
# NOTE(review): later sns.set(...) / sns.set_theme(...) calls may reset
# font.family to seaborn's default - confirm the figures really use Arial.
plt.rcParams.update({"font.family": "Arial"})

In [None]:
# Bar plot of the intrinsic / extrinsic / interaction proportions per cell type.
v = pd.read_csv("Final/Proportion_IntExtInter.csv")

# Map each file name to its short cell type label through the shared helper
# instead of repeating its replace chain inline, so the two cannot drift apart.
v["Cell type"] = [d_ct_abrev[file_to_celltype(x)] for x in v["File"]]
v = v[v["Cell type"] != "Toss"]

# Reshape to long format: one row per (cell type, divergence type).
divergence_types = ["Proportion intrinsic", "Proportion extrinsic", "Proportion interaction"]
out = []
for index, row in v.iterrows():
    for divergence_type in divergence_types:
        out.append([row["Cell type"], row[divergence_type], divergence_type])

# Passing the column names at construction also works when `out` is empty.
df = pd.DataFrame(out, columns = ["Cell type", "Proportion divergence", "Divergence type"])
# Sorting puts the cell types with the largest proportions first on the x axis.
df = df.sort_values("Proportion divergence", ascending = False)

ax = sns.barplot(data = df, x = "Cell type", y = "Proportion divergence", hue = "Divergence type", palette = {"Proportion intrinsic":intrinsic, "Proportion extrinsic":extrinsic, "Proportion interaction":interaction})
ax.legend(bbox_to_anchor=(1, 1))
plt.xticks(rotation = 90)
ax.set_title("Decomposition of gene expression divergence")

In [None]:
# Scatter of (intrinsic - extrinsic proportion) against the "HM/HR" column for
# the regression-input files whose name contains "Glutamatergic_neuron" but not "Spin".
directory = "Final/RegInput/"
out = []
for file in os.listdir(directory):
    # Only CSV files that are not "Moca" files are read.
    if ".csv" not in file or "Moca" in file:
        continue

    v = pd.read_csv(directory + file, sep = ",")

    is_target_file = "Glutamatergic_neuron" in file and "Spin" not in file
    if not is_target_file:
        continue

    v["log expression"] = np.log10(1+v["Total expression"])
    sns.set(font_scale = 1.25)
    sns.set_style('white')

    intrinsic_minus_extrinsic = v["Proportion intrinsic"]-v["Proportion extrinsic"]
    # NOTE(review): the x label says "Absolute", but no abs() is applied here -
    # presumably "HM/HR" is already an absolute value; confirm upstream.
    sns.scatterplot(data = v, x="HM/HR", y=intrinsic_minus_extrinsic, color = "grey")
    plt.xlabel("Absolute host log fold-change")
    plt.ylabel("[intrinsic proportion] - [extrinsic proportion]")

In [None]:
#Evaluate the correlation between the degree extrinsic across different cell types
#Generally more similar cell types have more correlated Degree Extr
#
# For every divergence type and every ordered pair of files in Final/Div, the
# signed proportion (sign of the divergence column times the proportion column)
# is Spearman-correlated across the genes present in both files, and one CSV of
# pairwise correlations is written per divergence type.
div_dir = "Final/Div/"
div_files = os.listdir("Final/Div")

# Parse each table once up front; the loops below only re-label columns.
div_tables = {f: pd.read_csv(div_dir + f, sep = ",").set_index("Unnamed: 0") for f in div_files}

for divergence in ["Extrinsic", "Intrinsic", "Interaction"]:
    # Start a fresh result list for each divergence type. Accumulating across
    # iterations would copy the rows of earlier divergence types into the
    # CSVs written for later ones.
    out = []
    prop = "Proportion " + divergence.lower()
    for file1 in div_files:
        # Suffix the column names with the file name so the two sides of the join stay distinct.
        v = div_tables[file1].add_suffix(file1)
        for file2 in div_files:
            vv = div_tables[file2].add_suffix(file2 + " 2")
            # Left join on the index, then keep only rows with no missing value in any column.
            vvv = v.join(vv).dropna()
            signed1 = np.sign(vvv[divergence + file1])*vvv[prop + file1]
            signed2 = np.sign(vvv[divergence + file2 + " 2"])*vvv[prop + file2 + " 2"]
            s = spearmanr(signed1, signed2)
            out.append([d_ct_abrev[file_to_celltype(file1)], d_ct_abrev[file_to_celltype(file2)], vvv.shape[0], s[0], s[1]])

            # Example scatter for one specific pair, selected by file name substrings.
            if "Glutamatergic_neurons" in file1 and "Spinal" not in file1 and "GABA" in file2 and "Spinal" not in file2 and "eur" in file2:
                print(file1, file2)
                x_col = "Signed proportion " + divergence.lower() + " brain.glut.neu"
                y_col = "Signed proportion " + divergence.lower() + " brain.GABA.neu"
                vvv[x_col] = signed1
                vvv[y_col] = signed2

                fig, ax = plt.subplots(figsize=(8, 8))
                ax.set_box_aspect(1)
                ax.set_xlim(-1, 1)
                ax.set_ylim(-1, 1)
                sns.scatterplot(data = vvv, x=x_col, y=y_col, color = "grey", ax = ax)
                plt.show()
    df = pd.DataFrame(out, columns = ["Cell type 1", "Cell type 2", "Number of genes", "Spearman rho", "Spearman p-value"])
    df.to_csv("Final/Proportion_Signed" + divergence + "_Cross_CellType_Correlations.csv", sep = ",", index = False)

In [None]:
#Make heatmap of correlation across cell types
# NOTE(review): sns.set_theme(style="white") further down appears to reset the
# font_scale set here - confirm which of the two is intended for this figure.
sns.set(font_scale=1.5)
df = pd.read_csv("Final/Proportion_Signed" + "Extrinsic" + "_Cross_CellType_Correlations.csv", sep = ",")

# Axis labels in order of first appearance in the table.
cols = list(dict.fromkeys(df["Cell type 1"]))
rows = list(dict.fromkeys(df["Cell type 2"]))

# out[i][j] holds rho for (Cell type 1 = cols[i], Cell type 2 = rows[j]).
# Pairs absent from the table stay 0; if a pair occurs more than once the last
# row wins; a cell type paired with itself is set to exactly 1.
out = [[0] * len(rows) for _ in cols]
for ct1, ct2, rho in zip(df["Cell type 1"], df["Cell type 2"], df["Spearman rho"]):
    out[cols.index(ct1)][rows.index(ct2)] = 1 if ct1 == ct2 else rho

# Label the frame to match how `out` was filled (rows = Cell type 1,
# columns = Cell type 2), so the labels stay right even if the two label lists
# differ in order or length.
new_df = pd.DataFrame(out, index = cols, columns = rows)

#The ordering was inferred from the clustering below
order = ["brain.glut.prog", "brain.GABA.neu", "brain.glut.neu", "brain.GABA.prog", "inter.prog", "spine.GABA.neu", "spine.glut.neu","chondrocyte", "mesen.cyc", "mesen.2", "mesen.0"]
# Select and order both axes; cell types not listed in `order` are dropped.
new_df = new_df[order].T[order]

# `mask` is the lower triangle (diagonal included); its transpose hides the
# upper triangle and the diagonal in the heatmap.
mask = np.tril(np.ones_like(new_df, dtype=bool))
sns.set_theme(style="white")
sns.heatmap(new_df, mask=mask.T)

In [None]:
# Display the current correlation matrix `new_df` as the cell output.
new_df

In [None]:
# Hierarchically cluster the rows and columns of the current `new_df`,
# drawing the dendrogram lines thicker than the default.
sns.clustermap(
    new_df,
    metric="Euclidean",
    tree_kws={"linewidths": 1.5},
)

In [None]:
#Make heatmap of correlation across cell types
# NOTE(review): sns.set_theme(style="white") further down appears to reset the
# font_scale set here - confirm which of the two is intended for this figure.
sns.set(font_scale=1.5)
df = pd.read_csv("Final/Proportion_Signed" + "Intrinsic" + "_Cross_CellType_Correlations.csv", sep = ",")

# Axis labels in order of first appearance in the table.
cols = list(dict.fromkeys(df["Cell type 1"]))
rows = list(dict.fromkeys(df["Cell type 2"]))

# out[i][j] holds rho for (Cell type 1 = cols[i], Cell type 2 = rows[j]).
# Pairs absent from the table stay 0; if a pair occurs more than once the last
# row wins; a cell type paired with itself is set to exactly 1.
out = [[0] * len(rows) for _ in cols]
for ct1, ct2, rho in zip(df["Cell type 1"], df["Cell type 2"], df["Spearman rho"]):
    out[cols.index(ct1)][rows.index(ct2)] = 1 if ct1 == ct2 else rho

# Label the frame to match how `out` was filled (rows = Cell type 1,
# columns = Cell type 2), so the labels stay right even if the two label lists
# differ in order or length.
new_df = pd.DataFrame(out, index = cols, columns = rows)

#The ordering was inferred from the clustering below
order = ["brain.GABA.prog", "brain.glut.prog", "brain.GABA.neu", "brain.glut.neu", "spine.GABA.neu", "spine.glut.neu", "mesen.2", "mesen.0", "mesen.cyc", "inter.prog", "chondrocyte"]
# Select and order both axes; cell types not listed in `order` are dropped.
new_df = new_df[order].T[order]

# `mask` is the lower triangle (diagonal included); its transpose hides the
# upper triangle and the diagonal in the heatmap.
mask = np.tril(np.ones_like(new_df, dtype=bool))
sns.set_theme(style="white")
sns.heatmap(new_df, mask=mask.T)

In [None]:
# Hierarchically cluster the rows and columns of the current `new_df`,
# drawing the dendrogram lines thicker than the default.
sns.clustermap(
    new_df,
    metric="Euclidean",
    tree_kws={"linewidths": 1.5},
)

In [None]:
# Reshape the per-cell-type predictor correlations into long format:
# one row per (cell type, predictor) with the predictor's display label.
df = pd.read_csv("Final/Spearman_Prop_Intrinsic-Extrinsic_Predictors.csv", sep = ",")

# (source column, display label) pairs, in the order they are emitted for each cell type.
predictor_labels = [
    ("Total expression rho", "Total expression\n in cell type"),
    ("LFC rho", "Log$_{2}$ fold-change\n in cell type"),
    ("Constraint rho", "Constraint on expression"),
    ("Number CREs rho", "Number of CREs\n within 100kb of TSS"),
    #("Nearest gene dist rho", "Nearest gene dist"),
    ("Tau us rho", "Cell type-specificity"),
    #("CRE density rho", "CRE density"),
    ("Tau kaes rho", "Tissue-specificity in\n rat and mouse embryos"),
    ("LFC var kaes rho", "Log$_{2}$ fold-change\n variance across tissues"),
]

out = [
    [record["Cell type"], record[column], label]
    for _, record in df.iterrows()
    for column, label in predictor_labels
]
new_df = pd.DataFrame(out)
new_df.columns = ["Cell type", "Spearman's rho", "Predictor"]

In [None]:
# Swarm plot of Spearman's rho per cell type, one colour per predictor.
sns.set(font_scale = 1.5, style = "white")
ax = sns.swarmplot(
    data = new_df,
    x = "Cell type",
    y = "Spearman's rho",
    hue = "Predictor",
    palette = "colorblind",
)
# Anchor the legend at the top-right corner of the axes, slightly above the top edge.
ax.legend(bbox_to_anchor=(1, 1.035))
plt.xticks(rotation=90)