In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import csv
import pandas as pd
from scipy import stats
import re
from scipy.stats import pearsonr,spearmanr,fisher_exact,binom_test
#import rpy2.robjects as robjects
import random
from statsmodels.stats.multitest import fdrcorrection
import os
import seaborn as sns
#import gseapy as gs
from scipy.stats import norm
import gseapy as gs
from collections import Counter
import scanpy as sc
import h5py
import anndata
import scanpy.external as sce

# Global scanpy configuration for the whole notebook.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of the loaded packages so the run is documented in the output.
sc.logging.print_versions()
# Default figure resolution for scanpy plots.
sc.settings.set_figure_params(dpi=80)

In [None]:
def _read_chimera(h5_path, gem_path):
    """Read one chimera's 10x matrix and join its per-barcode GEM classification onto .obs.

    Returns the AnnData object together with the classification table
    (indexed by its "barcode" column).
    """
    adata = sc.read_10x_h5(h5_path)
    gem = pd.read_csv(gem_path).set_index("barcode")
    adata.obs = adata.obs.join(gem)
    return adata, gem


# Two chimeric samples, each with a species call per barcode.
c1, gem_c1 = _read_chimera(
    "Heart/filtered_feature_bc_matrix_Chimera1.h5",
    "Heart/gem_classification_Chimera1.csv",
)
c2, gem_c2 = _read_chimera(
    "Heart/filtered_feature_bc_matrix_Chimera2.h5",
    "Heart/gem_classification_Chimera2.csv",
)

# Wild-type rat sample: every barcode gets the rat genome label.
# NOTE(review): this column is "Call" while the chimeras are filtered on "call" -- confirm the capitalisation is intended.
r1 = sc.read_10x_h5("Heart/filtered_feature_bc_matrix_WT_Rat_Heart.h5")
r1.obs["Call"] = np.repeat("mRatBN7.2", len(r1.obs))

In [None]:
#Comparable expression of Eif2s3y indicates that these are male
# Keep only the barcodes called as mouse, then transpose so that features become rows (.obs).
c1m = c1[c1.obs["call"].isin(["GRCm39"])].copy().T

# Keep only features carrying the mouse genome prefix. Filtering with a boolean
# mask on the feature names keeps the subset and the derived gene names aligned
# by construction (no second pass over gene_ids needed).
is_mouse_gene = np.array(["GRCm39" in name for name in c1m.obs.index], dtype=bool)
c1m = c1m[is_mouse_gene].copy()

# The mouse prefix is padded with underscores (see the "GRCm39____..." lookups
# below), so take everything after the first "_" and strip the padding.
# A plain split("_")[1] yields an empty string for such names.
c1m.obs["Gene name"] = [name.partition("_")[2].lstrip("_") for name in c1m.obs.index]

# Dense features x barcodes table used for the per-gene totals below.
c = c1m
c_X = pd.DataFrame(c.X.todense())
c_X.index = c.obs.index
c_X.columns = c.var.index
# Total counts of two Y-linked genes (Eif2s3y, Uty) and the X-linked homolog Eif2s3x.
print(np.sum(c_X.loc["GRCm39____Eif2s3y"]))
print(np.sum(c_X.loc["GRCm39____Eif2s3x"]))
print(np.sum(c_X.loc["GRCm39____Uty"]))

In [None]:
#Not enough cells to really infer anything for c2 though
c2m = c2[c2.obs["call"].isin(["GRCm39"])].copy().T
keep = []
gene_names = []
for index, row in c2m.obs.iterrows():
    if "GRCm39" in index:
        keep.append(row["gene_ids"])
        gene_names.append(index.split("_")[1])
c2m = c2m[c2m.obs["gene_ids"].isin(keep)]
c2m.obs["Gene name"] = gene_names

c = c2m
c_X = pd.DataFrame(c.X.todense())
c_X.index = c.obs.index
c_X.columns = c.var.index
print(np.sum(c_X.loc["GRCm39____Eif2s3y"]))
print(np.sum(c_X.loc["GRCm39____Eif2s3x"]))
print(np.sum(c_X.loc["GRCm39____Uty"]))


In [None]:
#Restrict only to rat cells
# For each chimera: keep barcodes whose species call is rat, materialise the
# subset, then transpose so that features become the rows (.obs).
# Note: this cell rebinds c1/c2, so it is meant to be run exactly once.
c1 = c1[c1.obs["call"].isin(["mRatBN7.2"])].copy().T
c2 = c2[c2.obs["call"].isin(["mRatBN7.2"])].copy().T

In [None]:
def _keep_rat_genes(adata):
    """Return a copy of ``adata`` (features as rows) restricted to rat features.

    A feature is kept when its name contains the rat genome prefix "mRatBN7.2".
    The bare gene name (everything after the first "_") is stored in the
    "Gene name" column. Splitting only once preserves gene names that
    themselves contain underscores, which ``split("_")[1]`` would truncate.
    """
    is_rat_gene = np.array(["mRatBN7.2" in name for name in adata.obs.index], dtype=bool)
    subset = adata[is_rat_gene].copy()
    subset.obs["Gene name"] = [name.partition("_")[2] for name in subset.obs.index]
    return subset


# Each sample is filtered on its own feature names, so the gene-name column can
# never be misaligned with the rows it annotates.
c1 = _keep_rat_genes(c1)
c2 = _keep_rat_genes(c2)

In [None]:
# Flip both chimeras back so that barcodes are rows (.obs) and features are columns (.var).
c1, c2 = c1.copy().T, c2.copy().T

In [None]:
# Remember the raw barcode, then tag every barcode with its sample suffix so the
# two chimeras can be told apart by name later on.
# (The stray `cm.index` expression that used to end this cell was removed: `cm`
# is not defined until the enrichment cell, so it raised NameError on a fresh kernel.)
c1.obs["Orig barcode"] = c1.obs.index
c1.obs.index = [barcode + "_c1" for barcode in c1.obs.index]
c2.obs["Orig barcode"] = c2.obs.index
c2.obs.index = [barcode + "_c2" for barcode in c2.obs.index]

In [None]:
# Use the bare gene name (without genome prefix) as the feature index of both chimeras.
for _chimera in (c1, c2):
    _chimera.var.index = _chimera.var["Gene name"]

In [None]:
# De-duplicate feature names in every sample before they are combined.
for _sample in (c1, c2, r1):
    _sample.var_names_make_unique()

In [None]:
# Stack the two chimeras and the wild-type rat sample into one object.
# join="inner" is anndata's default, spelled out here for the reader.
v = anndata.concat([c1, c2, r1], join="inner")

In [None]:
# Label every cell by the sample it came from, using the "_c1" / "_c2" suffixes
# that were appended to the chimera barcodes. Matching the suffix exactly
# (instead of searching for "c1"/"c2" anywhere in the name) cannot be fooled by
# a barcode that merely contains those characters.
_barcodes = v.obs.index
_sample_labels = np.where(
    _barcodes.str.endswith("_c1"),
    "Chimera 1",
    np.where(_barcodes.str.endswith("_c2"), "Chimera 2", "Host"),
)
# Cells from either chimera are "Donor"; everything else is "Host".
v.obs["DH"] = np.where(_sample_labels == "Host", "Host", "Donor")
v.obs["Sample"] = _sample_labels

In [None]:
# Attach the published per-cell metadata to the combined object.
meta = pd.read_csv("Final/Heart/Heart_Metadata.csv")

# Suffix appended to each barcode, chosen by the sample the row belongs to;
# these are the same suffixes that were added to the chimera barcodes.
d_map = {"Chimera 1":"_c1", "Chimera 2":"_c2", "Rat Heart E11.5":""}

# The barcode proper is whatever follows the last "_" in the "Barcode" column.
_bare_barcodes = [entry.split("_")[-1] for entry in meta["Barcode"]]
indices = [
    (barcode + d_map[sample]).replace(" ", "")
    for barcode, sample in zip(_bare_barcodes, meta["orig.ident"])
]
meta.index = indices
v.obs = v.obs.join(meta)

In [None]:
# Create the output directory (and any missing parent) if it is not there yet.
# exist_ok=True makes re-running the cell a no-op, while real failures such as
# permission errors are raised instead of being reported as "Dir exists".
os.makedirs("Final/Heart_TheirCT", exist_ok=True)

In [None]:
def pseudobulk2(c, ct, obs=None, key="CellType"):
    """Sum the counts of all cells of one annotated cell type (pseudobulk).

    Parameters
    ----------
    c : object exposing ``X`` (cells x genes matrix), ``obs.index`` (cell names)
        and ``var.index`` (gene names), e.g. an AnnData object.
    ct : label to select in the ``key`` column.
    obs : optional per-cell annotation table indexed by cell name. Defaults to
        the global ``v.obs``; passing it explicitly removes the dependency on
        whatever ``v`` is currently bound to.
    key : column of ``obs`` holding the labels (default "CellType").

    Returns
    -------
    DataFrame indexed by gene with a single column "Summed" holding the summed
    counts over the cells of ``c`` whose ``obs[key]`` equals ``ct``.
    """
    if obs is None:
        obs = v.obs
    # Accept both sparse matrices (which expose .todense()) and plain dense arrays.
    dense = c.X.todense() if hasattr(c.X, "todense") else c.X
    c_X = pd.DataFrame(dense)
    c_X.index = c.obs.index
    c_X.columns = c.var.index
    # Cells carrying the requested label, restricted to those present in this sample.
    labelled = list(obs[obs[key].isin([ct])].index)
    c_ct = c_X.loc[np.intersect1d(c_X.index, labelled)].T

    c_coll = c_ct.sum(axis=1).to_frame("Summed")
    return c_coll

def cpm(x):
    """Rescale ``x`` so that its values sum to one million (counts per million).

    ``x`` may be any iterable of numbers. It is materialised once, so one-shot
    iterables such as generators are handled correctly (previously they were
    consumed by the sum and an empty list was returned).

    Returns a new list; ``x`` itself is not modified. An empty input yields an
    empty list. A zero total is not special-cased: the division is performed
    as-is.
    """
    values = list(x)
    s = sum(values)
    return [i*1000000/s for i in values]

def cpt(x):
    """Rescale ``x`` so that its values sum to ten thousand (counts per ten thousand).

    ``x`` may be any iterable of numbers. It is materialised once, so one-shot
    iterables such as generators are handled correctly (previously they were
    consumed by the sum and an empty list was returned).

    Returns a new list; ``x`` itself is not modified. An empty input yields an
    empty list. A zero total is not special-cased: the division is performed
    as-is.
    """
    values = list(x)
    s = sum(values)
    return [i*10000/s for i in values]

def pseudobulk(c, ct, obs=None, key="leiden"):
    """Sum the counts of all cells of one cluster (pseudobulk).

    Parameters
    ----------
    c : object exposing ``X`` (cells x genes matrix), ``obs.index`` (cell names)
        and ``var.index`` (gene names), e.g. an AnnData object.
    ct : label to select in the ``key`` column.
    obs : optional per-cell annotation table indexed by cell name. Defaults to
        the global ``v.obs``; passing it explicitly removes the dependency on
        whatever ``v`` is currently bound to.
    key : column of ``obs`` holding the labels (default "leiden").

    Returns
    -------
    DataFrame indexed by gene with a single column "Summed" holding the summed
    counts over the cells of ``c`` whose ``obs[key]`` equals ``ct``.
    """
    if obs is None:
        obs = v.obs
    # Accept both sparse matrices (which expose .todense()) and plain dense arrays.
    dense = c.X.todense() if hasattr(c.X, "todense") else c.X
    c_X = pd.DataFrame(dense)
    c_X.index = c.obs.index
    c_X.columns = c.var.index

    # Cells carrying the requested label, restricted to those present in this sample.
    labelled = list(obs[obs[key].isin([ct])].index)
    c_ct = c_X.loc[np.intersect1d(c_X.index, labelled)].T

    c_coll = c_ct.sum(axis=1).to_frame("Summed")
    return c_coll

def to_ct(c, ct, obs=None, key="leiden"):
    """Return the genes x cells count table for the cells of one cluster.

    Parameters
    ----------
    c : object exposing ``X`` (cells x genes matrix), ``obs.index`` (cell names)
        and ``var.index`` (gene names), e.g. an AnnData object.
    ct : label to select in the ``key`` column.
    obs : optional per-cell annotation table indexed by cell name. Defaults to
        the global ``v.obs``; passing it explicitly removes the dependency on
        whatever ``v`` is currently bound to.
    key : column of ``obs`` holding the labels (default "leiden").

    Returns
    -------
    DataFrame with genes as rows and the selected cells as columns. Columns are
    in sorted cell-name order because the selection goes through
    ``np.intersect1d``.
    """
    if obs is None:
        obs = v.obs
    # Accept both sparse matrices (which expose .todense()) and plain dense arrays.
    dense = c.X.todense() if hasattr(c.X, "todense") else c.X
    c_X = pd.DataFrame(dense)
    c_X.index = c.obs.index
    c_X.columns = c.var.index

    # Cells carrying the requested label, restricted to those present in this sample.
    labelled = list(obs[obs[key].isin([ct])].index)
    c_ct = c_X.loc[np.intersect1d(c_X.index, labelled)]
    c_ct = c_ct.T
    return c_ct

# Mouse -> human gene-name lookup; rows with any missing value are dropped first.
# When a mouse gene appears more than once, the last row wins.
m2h = pd.read_csv("Mouse_To_Human_Gene_Conversions.csv").dropna()
d = dict(zip(m2h["external_gene_name"], m2h["hsapiens_homolog_associated_gene_name"]))


In [None]:
# Parse the tab-separated imprinted-gene table, keeping rows whose status
# (4th field) mentions "Imprinted" or "Predicted" but not "Not".
imp_conf_list = []
# The context manager guarantees the file handle is closed, even on error.
with open("mouse_imprinted_genes.txt") as o:
    for line in o:
        # Drop non-breaking spaces and the trailing newline before splitting into fields.
        l = line.replace("\xa0", "").replace("\n", "").split("\t")
        if len(l) < 4:
            # Blank or truncated line: there is no status field to inspect.
            continue
        if ("Imprinted" in l[3] or "Predicted" in l[3]) and "Not" not in l[3]:
            imp_conf_list.append(l)
# Columns are positional: 0 is used as the gene name and 4 as the expressed
# allele by later cells; 3 is the status tallied below.
df_imp_info = pd.DataFrame(imp_conf_list)
Counter(df_imp_info[3])

In [None]:
#Do the enrichment testing on their data

# GSEApy output root. Relative, like every other path in this notebook; the
# summary cell further down reads the reports back from this same relative
# directory, so a machine-specific absolute path would break that round trip.
gsea_out_dir = "Final/GSEAPY_New4_Imprinting_Heart_Rank_New_TheirCT"

# Gene sets do not depend on the cell type, so build them once.
d_imp = {"Imprinted":list(df_imp_info[0])}
d_imp2 = {"Paternal":list(df_imp_info[df_imp_info[4].isin(["Paternal"])][0]), "Maternal":list(df_imp_info[df_imp_info[4].isin(["Maternal"])][0])}

# When True, |l2fc| is replaced by a symmetric integer rank before prerank.
# (This used to be the string "True", which only worked because any non-empty
# string is truthy.)
rank = True

# dropna() removes cells without an annotation; unique() gives a deterministic
# order of cell types (a set does not).
for i in v.obs["CellType"].dropna().unique():
    if str(i) == "nan" or i == "Other":
        continue

    # Pseudobulk each sample for this cell type and line the three up by gene.
    c1_cm = pseudobulk2(c1, str(i))
    c2_cm = pseudobulk2(c2, str(i))
    r1_cm = pseudobulk2(r1, str(i))
    c1_cm.columns = ["Summed counts donor 1"]
    c2_cm.columns = ["Summed counts donor 2"]
    r1_cm.columns = ["Summed counts host"]
    cm = c1_cm.join(c2_cm).join(r1_cm).dropna()

    # Keep genes with at least 20 summed counts in at least one sample.
    # .copy() so the column assignments below act on an independent frame.
    cm = cm[(cm["Summed counts donor 1"] >= 20) | (cm["Summed counts donor 2"] >= 20) | (cm["Summed counts host"] >= 20)].copy()
    cm["Normalized counts donor 1"] = cpm(cm["Summed counts donor 1"])
    cm["Normalized counts donor 2"] = cpm(cm["Summed counts donor 2"])
    cm["Normalized counts host"] = cpm(cm["Summed counts host"])
    cm["Mean donor normalized counts"] = (cm["Normalized counts donor 1"] + cm["Normalized counts donor 2"])/2
    # Log2 fold change of mean donor vs host CPM, with a pseudocount of 1.
    cm["l2fc"] = np.log2((cm["Mean donor normalized counts"] + 1)/(cm["Normalized counts host"] + 1))
    #Commented out line that writes out the l2fc computations
    #cm.to_csv("Final/Heart_TheirCT/" + i.replace(" ", "_") + "_l2fc.csv")

    # --- Test 1: are imprinted genes enriched among large |l2fc|? ---
    cm["Abs l2fc"] = np.abs(cm["l2fc"])
    cm = cm.sort_values("Abs l2fc", ascending = False)
    cm["Gene"] = cm.index
    ranking = cm[["Gene", "Abs l2fc"]].copy()
    ranking.index = list(range(ranking.shape[0]))
    if rank:
        # Replace the metric by ranks n/2 .. 1, (0 if the count is odd), -1 .. -n/2.
        half = ranking.shape[0]//2
        front_half = list(range(half, 0, -1))
        back_half = [-x for x in range(1, half + 1)]
        middle = [0] if ranking.shape[0] % 2 else []
        ranking["Abs l2fc"] = front_half + middle + back_half
    try:
        gs.prerank(rnk=ranking, gene_sets=d_imp, threads=4, permutation_num=1000, outdir= gsea_out_dir + "/" + i + "_Conf", format='png', seed=6, min_size = 10, max_size = 30000)
    except Exception as err:
        # Best effort: keep going with the next cell type, but say what failed.
        print("GSEA prerank (Imprinted) failed for " + str(i) + ": " + repr(err))

    # --- Test 2: paternally vs maternally expressed genes on the signed l2fc ---
    cm["Gene"] = cm.index
    #v["Abs interaction"] = np.abs(v["Interaction"])
    cm = cm.sort_values("l2fc", ascending = False)
    ranking = cm[["Gene", "l2fc"]]
    try:
        gs.prerank(rnk=ranking, gene_sets=d_imp2, threads=4, permutation_num=1000, outdir= gsea_out_dir + "/" + i + "_PatMat_Conf", format='png', seed=6, min_size = 10, max_size = 30000)
    except Exception as err:
        print("GSEA prerank (Paternal/Maternal) failed for " + str(i) + ": " + repr(err))

In [None]:
plt.rcParams["font.family"] = "Arial"
i = "Ven-CMs"
# NOTE(review): this file is presumably produced by the to_csv line that is
# commented out in the enrichment cell -- it has to exist before this cell runs.
cm = pd.read_csv("Final/Heart_TheirCT/" + i + "_l2fc.csv").set_index("Gene name")

# Mathtext labels are raw strings: "\i" is not a valid escape sequence, so the
# non-raw spelling triggers a warning on current Python versions. The resulting
# text is unchanged.
gene_labels = {"Grb10": r"$\it{Grb10}$", "Igf2": r"$\it{Igf2}$"}
# Which normalised-count column is shown under which environment label.
environment_of = {"Normalized counts host": "Rat-like env.", "Mean donor normalized counts": "Mouse-like env."}

# One record per (column, gene), in the column order of the table.
x = pd.DataFrame(cm.loc[["Grb10", "Igf2"]]).T
out = []
for index, row in x.iterrows():
    environment = environment_of.get(index)
    if environment is None:
        continue
    for gene, label in gene_labels.items():
        out.append([row[gene], label, environment])
to_plot = pd.DataFrame(out, columns=["Counts per million", "Gene", "Environment"])

fig, ax = plt.subplots(figsize = (7, 5.5))
sns.set(font_scale=1.5)
sns.set_style("white")
# NOTE(review): both environments use the same colour and the legend is hidden
# below, so the two bars per gene are told apart only by position -- confirm intended.
sns.barplot(data = to_plot, y = "Counts per million", x = "Gene", hue = "Environment", palette = {"Rat-like env.":"#65B0AC", "Mouse-like env.":"#65B0AC"}, alpha = 1, ax = ax)
plt.ylabel("Pseudobulked counts per million")
#plt.xlabel("Chimera (donor" + r'$\rightarrow$' + "host)")
plt.xlabel("Gene")
plt.title(r"$\it{Grb10}$ and $\it{Igf2}$ expression in cardiomyocytes")
xmin, xmax, ymin, ymax = plt.axis()
#plt.ylim(0, 1)
#plt.axvline(0.5, color = "black", linewidth = 2.5, alpha = 1, linestyle="dashed")
#plt.legend(bbox_to_anchor=(1.475, 1.))
plt.legend([],[], frameon=False)
plt.show()

In [None]:
# Collect Grb10 / Igf2 pseudobulk CPM per cell type and environment.
# Records are gathered in a list and turned into one DataFrame at the end, which
# avoids repeated concatenation and gives the result a unique row index.
gene_labels = {"Grb10": r"$\it{Grb10}$", "Igf2": r"$\it{Igf2}$"}  # raw strings: "\i" is not a valid escape
environment_of = [("Normalized counts host", "Rat-like env."), ("Mean donor normalized counts", "Mouse-like env.")]

records = []
# dropna() removes cells without an annotation; unique() gives a deterministic order.
for i in v.obs["CellType"].dropna().unique():
    if str(i) == "nan" or i == "Other":
        continue

    c1_cm = pseudobulk2(c1, str(i))
    c2_cm = pseudobulk2(c2, str(i))
    r1_cm = pseudobulk2(r1, str(i))
    c1_cm.columns = ["Summed counts donor 1"]
    c2_cm.columns = ["Summed counts donor 2"]
    r1_cm.columns = ["Summed counts host"]
    cm = c1_cm.join(c2_cm).join(r1_cm).dropna()

    # Keep genes with at least 20 summed counts in at least one sample.
    cm = cm[(cm["Summed counts donor 1"] >= 20) | (cm["Summed counts donor 2"] >= 20) | (cm["Summed counts host"] >= 20)].copy()
    cm["Normalized counts donor 1"] = cpm(cm["Summed counts donor 1"])
    cm["Normalized counts donor 2"] = cpm(cm["Summed counts donor 2"])
    cm["Normalized counts host"] = cpm(cm["Summed counts host"])
    cm["Mean donor normalized counts"] = (cm["Normalized counts donor 1"] + cm["Normalized counts donor 2"])/2
    cm["l2fc"] = np.log2((cm["Mean donor normalized counts"] + 1)/(cm["Normalized counts host"] + 1))

    # Skip cell types where either gene did not pass the count filter. This
    # replaces a bare try/except that hid every kind of error.
    if not set(gene_labels).issubset(cm.index):
        continue
    for column, environment in environment_of:
        for gene, label in gene_labels.items():
            records.append([cm.loc[gene, column], label, environment, i])

# Columns stay positional (0..3); the plotting cell assigns the names.
df = pd.DataFrame(records)


In [None]:
# Bar plot of Igf2 pseudobulk CPM per cell type and environment.
to_plot = df.copy()
to_plot.columns = ["Counts per million", "Gene", "Environment", "Cell type"]
# Raw strings: "\i" is not a valid escape sequence, so the non-raw spelling
# triggers a warning on current Python versions. The compared text is unchanged.
to_plot = to_plot[to_plot["Gene"].isin([r"$\it{Igf2}$"])]
fig, ax = plt.subplots(figsize = (14, 5))
sns.set(font_scale=1.5)
sns.set_style("white")
# NOTE(review): both environments use the same colour and the legend is hidden
# below, so the bars are told apart only by position -- confirm intended.
sns.barplot(data = to_plot, y = "Counts per million", x = "Cell type", hue = "Environment", palette = {"Rat-like env.":"#65B0AC", "Mouse-like env.":"#65B0AC"}, alpha = 1, ax = ax)
plt.ylabel("Pseudobulked counts per million")
#plt.xlabel("Chimera (donor" + r'$\rightarrow$' + "host)")
plt.xlabel("Cell type")
plt.title(r"Expression of $\it{Igf2}$ in embryonic heart data")
xmin, xmax, ymin, ymax = plt.axis()
#plt.ylim(0, 1)
#plt.axvline(0.5, color = "black", linewidth = 2.5, alpha = 1, linestyle="dashed")
#plt.legend(bbox_to_anchor=(1.475, 1.))
plt.legend([],[], frameon=False)
plt.xticks(rotation = 90)
plt.show()

In [None]:
#Set up to plot things
term = 'Imprinted'
gene_set = list(df_imp_info[0])

# --- Pass 1: read the GSEApy reports and collect the rows for `term` ---
enrich = []
fdr = []
cell_types = []
genes = []
for file in os.listdir("Final/GSEAPY_New4_Imprinting_Heart_Rank_New_TheirCT"):
    if "PatMat" not in file and "Heart" not in file and "Mixed connective 2" not in file:
        # The report is read into its own name. It used to be bound to `v`,
        # which silently replaced the AnnData object that the pseudobulk
        # helpers read, so earlier cells could not be re-run afterwards.
        report = pd.read_csv("Final/GSEAPY_New4_Imprinting_Heart_Rank_New_TheirCT/" + file + "/gseapy.gene_set.prerank.report.csv", sep = ",")
        for index, row in report.iterrows():
            if row["Term"] == term:
                enrich.append(row["ES"])
                fdr.append(row["FDR q-val"])
                # Normalise the directory name into the key shared with pass 2.
                cell_types.append(file.replace(".csv", "").replace(" ", ".").replace(".1", "").replace("2", "1"))
                genes.append(row["Lead_genes"])
# Normalised cell-type key -> ";"-separated lead genes of the report.
d_en = {}
for i in range(len(cell_types)):
    d_en[cell_types[i]] = genes[i]

# --- Pass 2: per cell type, compare imprinted genes at both ends of the |l2fc| ranking ---
# NOTE(review): `fdr` follows the directory listing of pass 1 while `ct` and
# `enrichment` follow the listing below; they are combined positionally by the
# plotting cell, so the two listings must yield the same order -- confirm.
ct = []
enrichment = []
for file in os.listdir("Final/Heart_TheirCT"):
    if "Mixed_connective_2" not in file:
        vv = pd.read_csv("Final/Heart_TheirCT/" + file, sep = ",")
        vv["Abs l2fc"] = np.abs(vv["l2fc"])
        asc = False
        vv = vv.sort_values("Abs l2fc", ascending = asc)
        # Same normalisation as in pass 1, starting from the l2fc file name.
        key = file.replace(".csv", "").replace("_l2fc", "").replace("_", ".").replace(".1", "").replace("2", "1")
        lead_genes = d_en[key].split(";")
        # `cut` is the 1-based position of the last lead gene in the ranking,
        # or the full length when that gene is not present.
        ranked_names = list(vv["Gene name"])
        last_lead = lead_genes[-1]
        cut = ranked_names.index(last_lead) + 1 if last_lead in ranked_names else len(ranked_names)
        # Gene-set members among the first `cut` genes vs among the last `cut` genes.
        genes_agree = len(np.intersect1d(ranked_names[:cut], gene_set))
        genes_disagree = len(np.intersect1d(ranked_names[::-1][:cut], gene_set))
        ct.append(key)
        # Log2 ratio with a pseudocount of 1; the sign follows the sort direction.
        if asc:
            enrichment.append(-np.log2((genes_agree + 1)/(genes_disagree + 1)))
        else:
            enrichment.append(np.log2((genes_agree + 1)/(genes_disagree + 1)))

In [None]:
# Bar plot of the per-cell-type log2 fold-enrichment, largest first.
fig, ax = plt.subplots(figsize = (6, 4.5))
sns.set(font_scale = 1.5)
sns.set_style("white")

# Rows of the intermediate frame are (enrichment, fdr, ct); transposing turns
# them into columns 0, 1, 2, and column 0 (enrichment) drives the ordering.
summary = pd.DataFrame([enrichment, fdr, ct]).T
to_plot = summary.sort_values(0, ascending = False)
to_plot.columns = ["Log$_2$ fold-enrichment", "FDR", "Cell type"]

sns.barplot(data = to_plot, x = "Cell type", y = "Log$_2$ fold-enrichment", color = "red", ax = ax)
ax.set_title("Imprinted genes in heart")
#plt.legend(bbox_to_anchor=(1.5, 1.05))
plt.legend([],[], frameon=False)
plt.xticks(rotation=90)