In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import os
from matplotlib.pyplot import figure
from collections import Counter
from scipy.stats import spearmanr,pearsonr,fisher_exact,binom_test

In [None]:
# Build a table of imprinted genes from a tab-separated text file.
# A row is kept when its field 3 mentions "Imprinted" or "Predicted"
# and does not contain "Not".
imp_conf_list = []
# Context manager so the file handle is closed once reading is done
# (it was previously left open for the lifetime of the kernel).
with open("Human_imprinted_genes.txt") as o:
    for line in o:
        # Remove non-breaking spaces and the trailing newline before splitting on tabs.
        l = line.replace("\xa0", "").replace("\n", "").split("\t")
        if len(l) < 4:
            # Blank or truncated line: there is no field 3 to inspect, so skip it
            # instead of failing with IndexError.
            continue
        if ("Imprinted" in l[3] or "Predicted" in l[3]) and "Not" not in l[3]:
            imp_conf_list.append(l)

# Columns are positional (0, 1, 2, ...) because the rows are plain lists.
df_imp_info = pd.DataFrame(imp_conf_list)
df_imp_info

In [None]:
#This data is TPM normalized
# Index the expression table by "Gene.ID" and transpose, so the original
# columns become rows and the gene IDs become columns.
v = pd.read_csv(
    "Final/Div_AbsDif/GSE155381_CM_CH_gene_expression.txt", sep="\t"
).set_index("Gene.ID").T

# Annotation table (comma-separated) and the header-less cell key (tab-separated).
k = pd.read_csv("Final/Div_AbsDif/Annotations.txt", sep=",")
bleh = pd.read_csv("Final/Div_AbsDif/Cell_Key.txt", sep="\t", header=None)
bleh = bleh.set_index(0)

# Attach the annotations to the cell key: column 0 of the key is matched
# against the "Sample Name" column of the annotations.
annotations_by_sample = k.set_index("Sample Name")
kn = bleh.join(annotations_by_sample)

In [None]:
# Re-key the annotations by column 1 of the cell key and attach them to the
# expression rows (index-on-index join).
# NOTE: this cell rebinds `kn` and `v`, so it is meant to run once per kernel session.
kn = kn.set_index(1)
v = v.join(kn)

# Split the annotated table by the "Organism" annotation.
is_macaque = v["Organism"].isin(["Macaca fascicularis"])
is_human = v["Organism"].isin(["Homo sapiens"])
v_m = v[is_macaque]
v_h = v[is_human]

In [None]:
# Ortholog table keyed by the macaque gene ID; rows with any missing field are dropped.
orths = (
    pd.read_csv("Orthos_HS_MF.txt", sep="\t")
    .set_index("Crab-eating macaque gene stable ID")
    .dropna()
)
# Keep one-to-one orthologs only, then keep the first row for each "Gene name".
is_one2one = orths["Crab-eating macaque homology type"].isin(["ortholog_one2one"])
orths = orths[is_one2one].drop_duplicates("Gene name")


In [None]:
# Transpose the macaque subset and attach the ortholog columns (index-on-index join).
v_m = v_m.T.join(orths)
# Keep only the rows whose label does not contain "ENSG", then rebuild a frame
# from the collected row Series.
out = [row for index, row in v_m.iterrows() if "ENSG" not in index]
v_m_new = pd.DataFrame(out)


In [None]:
# Discard rows whose macaque gene name is NaN.
lacks_mac_name = v_m_new["Crab-eating macaque gene name"].isin([float("nan")])
v_m_new = v_m_new.loc[~lacks_mac_name]
# Re-key the macaque rows by the "Gene stable ID" column so they line up with
# the row labels of the transposed human table.
v_m_new.index = v_m_new["Gene stable ID"]

# Transpose the human subset and attach the macaque columns;
# rows that found no ortholog ("Gene name" missing) are dropped.
v_h = v_h.T
to_analyze = v_h.join(v_m_new)
to_analyze = to_analyze.dropna(subset=["Gene name"])
to_analyze

In [None]:
# Transpose the combined table and attach the `kn` annotation columns
# (index-on-index join).
to_analyze = to_analyze.T.join(kn)


In [None]:
# Lineage labels (not used further in this cell).
cts = ['EPI', 'PE', 'TE', 'EXMC']

# Split by organism first ...
organism = to_analyze["Organism"]
ta_m = to_analyze[organism.isin(["Macaca fascicularis"])]
ta_h = to_analyze[organism.isin(["Homo sapiens"])]

# ... then keep only rows annotated with the "EPI" lineage.
ta_m_epi = ta_m[ta_m["lineage"].isin(["EPI"])]
ta_h_epi = ta_h[ta_h["lineage"].isin(["EPI"])]
ta_h_epi

In [None]:
# Transpose the EPI subsets so that each row can be averaged on its own.
ta_h = ta_h_epi.T
ta_m = ta_m_epi.T

# For every row whose label contains "ENSG", record [label, mean of the row].
# Rows with other labels are skipped.
out_h = [[index, np.mean(row)] for index, row in ta_h.iterrows() if "ENSG" in index]
out_m = [[index, np.mean(row)] for index, row in ta_m.iterrows() if "ENSG" in index]

In [None]:
# Turn the [label, mean] pairs into single-column frames keyed by label
# (positional column 0 is the label, column 1 the mean).
df_h = pd.DataFrame(out_h).set_index(0).rename(columns={1: "Mean counts human"})
df_m = pd.DataFrame(out_m).set_index(0).rename(columns={1: "Mean counts mac"})

# Side-by-side human and macaque means, aligned on the label.
df = df_h.join(df_m)


In [None]:
# Ortholog annotations keyed by "Gene stable ID", to match the index of df.
orths_new = orths.set_index("Gene stable ID")

# log2 fold change human / macaque, with a pseudocount of 1 on both means.
df["l2fc"] = np.log2((1+df["Mean counts human"])/(1+df["Mean counts mac"]))
df = df.sort_values("l2fc")

# Attach gene annotations and keep the first row for each "Gene name".
df = df.join(orths_new).drop_duplicates(subset=["Gene name"])

# Keep genes whose mean exceeds 1 in at least one species.
# .copy() makes the result an independent frame: later cells add columns to df,
# and doing that on a boolean-mask slice raises SettingWithCopyWarning.
df = df[(df["Mean counts human"] > 1) | (df["Mean counts mac"] > 1)].copy()

In [None]:
#Renormalize so that they both sum to 1,000,000
# (macaque column is created first, then human, matching the column order used so far)
for species in ("mac", "human"):
    raw_means = df["Mean counts " + species]
    df["Mean normed " + species] = raw_means / np.sum(raw_means) * 1000000

# log2 fold change human / macaque on the renormalised values, pseudocount 1.
df["l2fc normed"] = np.log2((df["Mean normed human"] + 1) / (df["Mean normed mac"] + 1))


In [None]:
import gseapy as gs

# Gene sets taken from the imprinted-gene table: column 0 supplies the set members
# (presumably gene symbols, since the ranking below is keyed by "Gene name" - confirm),
# column 4 is matched against "Paternal" / "Maternal".
d_imp = {"Imprinted":list(df_imp_info[0])}
d_imp2 = {
    "Paternal":list(df_imp_info[df_imp_info[4].isin(["Paternal"])][0]),
    "Maternal":list(df_imp_info[df_imp_info[4].isin(["Maternal"])][0]),
}

# Rank genes by the magnitude of the renormalised human/macaque log2 fold change.
df["Abs l2fc normed"] = np.abs(df["l2fc normed"])
df = df.sort_values("Abs l2fc normed", ascending = False)
# Explicit copy: the score column is overwritten below, and assigning into a
# column selection of df raises SettingWithCopyWarning.
ranking = df[["Gene name", "Abs l2fc normed"]].copy()
ranking.index = list(range(df.shape[0]))
rank = True
if rank:
    # Replace the magnitudes by symmetric integer ranks in the current (descending) order:
    # n//2, ..., 1, then 0 only when n is odd, then -1, ..., -(n//2).
    half = ranking.shape[0] // 2
    front_half = list(range(half, 0, -1))
    back_half = [-x for x in range(1, half + 1)]
    middle = [0] if ranking.shape[0] % 2 else []
    ranking["Abs l2fc normed"] = front_half + middle + back_half
# NOTE(review): outdir is an absolute, machine-specific path; left unchanged so results
# keep landing in the same place.
gs.prerank(rnk=ranking, gene_sets=d_imp, threads=4, permutation_num=1000, outdir= 'C:/Users/astar/Chimerism/Final/GSEAPY_New4_Imprinting_HumMac/' + "EPI_Human_Macaque_Test", format='png', seed=6, min_size = 10, max_size = 30000)


In [None]:
# Signed analysis: order genes from the largest positive (higher in human) to the
# largest negative (higher in macaque) renormalised log2 fold change.
df = df.sort_values("l2fc normed", ascending=False)
ranking = df.loc[:, ["Gene name", "l2fc normed"]]
ranking.index = list(range(len(df)))

# Pre-ranked enrichment against the Paternal / Maternal gene sets.
gs.prerank(
    rnk=ranking,
    gene_sets=d_imp2,
    threads=4,
    permutation_num=1000,
    outdir='C:/Users/astar/Chimerism/Final/GSEAPY_New4_Imprinting_HumMac/' + "EPI_Human_Macaque_Signed",
    format='png',
    seed=6,
    min_size=10,
    max_size=30000,
)


In [None]:
human_only = pd.read_csv("Final/Div_AbsDif/GSE109555_All_Embryo_TPM.txt", sep="\t")

# Work on the transpose so that POU5F1 / NANOG / SOX2 are addressable as columns.
human_only_t = human_only.T
passes_marker = (
    (human_only_t["POU5F1"] > 1)
    | (human_only_t["NANOG"] > 1)
    | (human_only_t["SOX2"] > 1)
)
# Keep the entries where at least one of the three markers exceeds 1, then flip back.
human_only_epi = human_only_t[passes_marker].T

# Row-wise mean over the retained entries, stored as a one-column frame.
row_means = human_only_epi.apply(np.mean, axis=1)
human_only_mean = pd.DataFrame(row_means)
human_only_mean.columns = ["Human only measurement"]

In [None]:
# Keep the current index values in a column before re-keying df by "Gene name".
# NOTE: df is modified in place, so re-running this cell would store gene names
# in "Human ensembl".
df["Human ensembl"] = df.index
df.index = df["Gene name"]

# Attach the human-only means by gene name and drop rows with any missing value.
dff = df.join(human_only_mean).dropna()

In [None]:
# Rescale the human-only means so that they sum to 1,000,000 over the retained genes.
human_only_total = np.sum(dff["Human only measurement"])
dff["Human only normed"] = dff["Human only measurement"] / human_only_total * 1000000

In [None]:
# Rank correlation between the two human measurements over the shared genes.
human_vs_human_only = spearmanr(dff["Mean counts human"], dff["Human only measurement"])
print(human_vs_human_only)

In [None]:
# log2 fold change between the two renormalised human measurements, pseudocount 1.
dff["Human l2fc normed"] = np.log2((1 + dff["Mean normed human"]) / (1 + dff["Human only normed"]))

# Restrict to the genes that also appear in column 0 of the imprinted-gene table.
shared_imprinted = np.intersect1d(dff.index, df_imp_info[0])
dff_imp = dff.loc[shared_imprinted]
dff_imp.sort_values("Human l2fc normed")

In [None]:
import gseapy as gs

# Gene sets taken from the imprinted-gene table: column 0 supplies the set members,
# column 4 is matched against "Paternal" / "Maternal".
d_imp = {"Imprinted":list(df_imp_info[0])}
d_imp2 = {
    "Paternal":list(df_imp_info[df_imp_info[4].isin(["Paternal"])][0]),
    "Maternal":list(df_imp_info[df_imp_info[4].isin(["Maternal"])][0]),
}

rank = True

# Rank genes by the magnitude of the log2 fold change between the two human measurements.
dff["Abs l2fc normed"] = np.abs(dff["Human l2fc normed"])
dff = dff.sort_values("Abs l2fc normed", ascending = False)
# Explicit copy: the score column is overwritten below, and assigning into a
# column selection of dff raises SettingWithCopyWarning.
ranking = dff[["Gene name", "Abs l2fc normed"]].copy()
ranking.index = list(range(ranking.shape[0]))

if rank:
    # Replace the magnitudes by symmetric integer ranks in the current (descending) order:
    # n//2, ..., 1, then 0 only when n is odd, then -1, ..., -(n//2).
    half = ranking.shape[0] // 2
    front_half = list(range(half, 0, -1))
    back_half = [-x for x in range(1, half + 1)]
    middle = [0] if ranking.shape[0] % 2 else []
    ranking["Abs l2fc normed"] = front_half + middle + back_half

# NOTE(review): outdir is an absolute, machine-specific path; left unchanged so results
# keep landing in the same place.
gs.prerank(rnk=ranking, gene_sets=d_imp, threads=4, permutation_num=1000, outdir= 'C:/Users/astar/Chimerism/Final/GSEAPY_New4_Imprinting_HumMac/' + "EPI_Human_HumanOnly_Test", format='png', seed=6, min_size = 10, max_size = 30000)


In [None]:
#Making Figure 6I
sns.set(font_scale = 1.5)
sns.set_style("white")
# Hard-coded values for the three bars; how they were obtained is not shown in this cell.
tp = pd.DataFrame([[6.514151, "Macaque", "Macaque-like env."], [67.777017, "Human", "Macaque-like env."], [1.791428, "Human", "Human-like env."]])
tp.columns = ["Transcripts per million", "Cell species", "Extrinsic environment"]
sns.barplot(data = tp, x = "Extrinsic environment", y = "Transcripts per million", hue = "Cell species", palette = {"Macaque":"#B06C1B", "Human":"#149B53"})
# Raw string: "\i" is not a valid Python escape sequence, so the non-raw literal emitted a
# DeprecationWarning/SyntaxWarning. The text handed to matplotlib's mathtext is unchanged.
plt.title(r"$\it{PLAGL1}$ expression in epiblast")
plt.legend(frameon=False)