In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import os
from matplotlib.pyplot import figure
from collections import Counter
from scipy.stats import spearmanr,pearsonr,fisher_exact,binom_test
import gseapy as gs
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, normalize

In [None]:
#Get list of confident imprinted genes
#A row is kept when its status field (4th tab-separated column) mentions "Imprinted"
#or "Predicted" and does not mention "Not"
imp_conf_list = []
#Context manager so the file handle is closed as soon as the rows have been read
with open("mouse_imprinted_genes.txt") as o:
    for line in o:
        #Strip non-breaking spaces and the trailing newline before splitting into fields
        l = line.replace("\xa0", "").replace("\n", "").split("\t")
        #Skip blank/short lines that have no status field instead of raising IndexError
        if len(l) < 4:
            continue
        if ("Imprinted" in l[3] or "Predicted" in l[3]) and "Not" not in l[3]:
            imp_conf_list.append(l)

#Columns stay as integer positions; later cells read column 0 (matched against gene
#names) and column 4 ("Paternal"/"Maternal")
df_imp_info = pd.DataFrame(imp_conf_list)
df_imp_info

In [None]:
#Directory holding the per-sample quantification files of the parathyroid study
RAW_DIR = "Final/Div_AbsDif/GSE232600_RAW"
#Directory that receives tab-separated copies of the donor sample files
EXPORT_DIR = "Final/Parathyroid_DESeq2"


def load_tpm_samples(sample_tags, label, export = False):
    """Collect the TPM column of every file in RAW_DIR that matches a sample tag.

    Parameters
    ----------
    sample_tags : list of str
        A file is used when its name contains any of these substrings.
        NOTE(review): the match is a plain substring test on the whole file name,
        so a tag occurring elsewhere in a name would also match - confirm that
        the tags are unique within RAW_DIR.
    label : str
        Prefix of the output column names; each column is called
        "<label> <file name up to the first underscore>".
    export : bool, default False
        When True, each matching file is also written unchanged (all columns,
        tab-separated, no index) to EXPORT_DIR, with ".gz" removed from its name.

    Returns
    -------
    pandas.DataFrame
        One TPM column per matching file, indexed by the file's "Name" column.
        Files after the first are left-joined onto the first file's index.
        An empty DataFrame is returned when no file matches.
    """
    merged = None
    #os.listdir returns names in arbitrary order; sorting makes the column order
    #reproducible between runs and machines
    for fname in sorted(os.listdir(RAW_DIR)):
        if not any(tag in fname for tag in sample_tags):
            continue
        v = pd.read_csv(RAW_DIR + "/" + fname, sep = "\t")
        if export:
            v.to_csv(EXPORT_DIR + "/" + fname.replace(".gz", ""), sep = "\t", index = False)
        #Restrict to only TPM values, indexed by the "Name" column
        v = v.set_index("Name")[["TPM"]]
        v.columns = [label + " " + fname.split("_")[0]]
        merged = v if merged is None else merged.join(v)
    return pd.DataFrame() if merged is None else merged


#Host samples (tags 815-818); these files are not exported
df_cont = load_tpm_samples(["815", "816", "817", "818"], "Host")

#Donor samples, first group (tags 819-826); plotted later in the notebook as
#"Mouse in species-matched env."
df_donor_match = load_tpm_samples(["819", "820", "821", "822", "823", "824", "825", "826"], "Donor", export = True)

#Donor samples, second group (tags 827-830); plotted later in the notebook as
#"Mouse in species-mismatched env."
df_donor_mis = load_tpm_samples(["827", "828", "829", "830"], "Donor", export = True)

df_donor_match

In [None]:
#Convert from Ensembl transcript IDs to gene symbols
tg = pd.read_csv("Final/Div_AbsDif/Ensembl_tran_to_gene.txt", sep="\t").set_index("Transcript stable ID")


def collapse_to_gene_symbols(df_tpm, transcript_to_gene):
    """Sum transcript-level values into one row per gene symbol.

    Parameters
    ----------
    df_tpm : pandas.DataFrame
        Table indexed by transcript ID; anything after the first "." in an ID
        is discarded before the lookup.
    transcript_to_gene : pandas.DataFrame
        Lookup table indexed by transcript ID with a "Gene name" column.

    Returns
    -------
    pandas.DataFrame
        Numeric columns of df_tpm summed per "Gene name". Rows that contain any
        missing value after the lookup (e.g. transcripts absent from the lookup
        table) are dropped before summing.
    """
    df_gene = df_tpm.copy()
    df_gene.index = [x.split(".")[0] for x in df_gene.index]
    df_gene = df_gene.join(transcript_to_gene).dropna()
    df_gene.index = df_gene["Gene name"]
    #numeric_only is a boolean flag (the original passed the string "True")
    return df_gene.groupby(df_gene.index).sum(numeric_only = True)


#NOTE(review): the three tables are overwritten with their gene-level versions, so
#running this cell a second time without reloading them would look up gene symbols
#in a table keyed by transcript IDs
df_donor_mis = collapse_to_gene_symbols(df_donor_mis, tg)
df_donor_match = collapse_to_gene_symbols(df_donor_match, tg)
df_cont = collapse_to_gene_symbols(df_cont, tg)

In [None]:
#Genes shown in the figure
to_plot = ["Grb10", "Igf2", "Cdkn1c", "Dhcr7", "Slc38a4", "Gnai3", "Nap1l4", "Ppp1r9a", "Sgce"]

#Subset every condition to the plotted genes
df_donor_misp = df_donor_mis.loc[to_plot]
df_donor_matchp = df_donor_match.loc[to_plot]
df_contp = df_cont.loc[to_plot]

#Condition names as shown in the publication, paired with their tables
labelled_frames = [
    ("Mouse in species-mismatched env.", df_donor_misp),
    ("Mouse in species-matched env.", df_donor_matchp),
    ("Wildtype mouse", df_contp),
]

#Long format for seaborn: one row per (gene, sample value, condition)
out = [
    [gene, tpm, condition]
    for condition, frame in labelled_frames
    for gene in frame.index
    for tpm in frame.loc[gene]
]
df_plot = pd.DataFrame(out, columns = ["Gene", "Transcripts per million", "Condition"])

#Mean TPM of every gene in every condition, used for the bars
out_mean = []
for gene in to_plot:
    gene_rows = df_plot[df_plot["Gene"] == gene]
    for condition, _ in labelled_frames:
        values = gene_rows.loc[gene_rows["Condition"] == condition, "Transcripts per million"]
        out_mean.append([gene, np.mean(values), condition])

#Create data frame of the means
df_mean = pd.DataFrame(out_mean, columns = ["Gene", "Transcripts per million", "Condition"])
df_mean

In [None]:
#Define colors: solid colors for the points and bar outlines, the same colors with an
#alpha channel for the bar fills
palette = {"Mouse in species-mismatched env.":"#C43E96", "Mouse in species-matched env.":"#149B53", "Wildtype mouse":"#F0B618"}
palette_alpha = {"Mouse in species-mismatched env.":"#C43E9640", "Mouse in species-matched env.":"#149B5340", "Wildtype mouse":"#F0B61840"}
#Fixed condition order shared by both plots and by the recoloring below
conditions = list(palette)

#Set the theme before the figure exists: styling applied after plt.subplots() does not
#fully reach an already-created Axes, so the first run and a re-run could look different
sns.set(font_scale = 1.5)
sns.set_style("white")
fig, ax = plt.subplots(figsize = (14, 6))

#Create swarmplot of samples overlapping barplot of means
#legend = False on both calls keeps the legend off the figure
sns.swarmplot(data = df_plot, y = "Transcripts per million", x = "Gene", hue = "Condition", hue_order = conditions, palette = palette, dodge = True, size = 7, legend = False, ax = ax)
t_ax = sns.barplot(data = df_mean, x = "Gene", y = "Transcripts per million", hue = "Condition", hue_order = conditions, dodge = True, errorbar=None, linewidth=2.5, edgecolor=".5", gap = 0.1, legend = False, ax = ax)

#Recolor the bars to actually be correct
#This relies on the bar patches being ordered condition by condition, len(to_plot) bars
#per condition; any patch beyond that range gets the last condition's colors
for idx, patch in enumerate(t_ax.patches):
    condition = conditions[min(idx // len(to_plot), len(conditions) - 1)]
    patch.set_edgecolor(palette[condition])
    patch.set_facecolor(palette_alpha[condition])

In [None]:
#Put the three gene-level tables side by side; both joins are left joins, so the
#row set is the gene index of df_donor_mis
df_all = df_donor_mis.join(df_donor_match)
df_all = df_all.join(df_cont)

In [None]:
#A few extremely highly expressed genes skewed the results, so they are removed:
#every gene whose name contains "mt-", plus Pth and Chga
mt = [gene for gene in df_all.index if "mt-" in gene]
df_all_d = df_all.drop(["Pth", "Chga"] + mt)

#Rescale every GSM sample column so that its values sum to 1000000 again
out = [
    df_all_d[sample] / np.sum(df_all_d[sample]) * 1000000
    for sample in df_all_d.columns
    if "GSM" in sample
]
df_all_new = pd.DataFrame(out).T
df_all_new

In [None]:

#Rebind df_all to an independent (deep) copy of itself before columns are added to it
df_all = df_all.copy(deep = True)

In [None]:
#Per-gene median across the samples of each condition
#np.median does not skip missing values, so a gene with a NaN in a condition gets NaN
median_sources = {
    "Median mismatch": df_donor_mis.columns,
    "Median match": df_donor_match.columns,
    "Median control": df_cont.columns,
}
for median_name, sample_cols in median_sources.items():
    df_all[median_name] = np.median(df_all[sample_cols], axis = 1)

#Keep genes that are at least moderately expressed (median above 5) in one or more conditions
df_all = df_all[df_all[list(median_sources)].gt(5).any(axis = 1)]

In [None]:
#Make clustermap
sns.set(font_scale = 1.25)

#Restrict to the confident imprinted genes that passed the expression filter.
#The table is built in this cell so that the cell runs on a fresh kernel; in the
#original notebook df_all_imp was only defined further down
df_all_imp = df_all.loc[np.intersect1d(df_all.index, df_imp_info[0])]

#Sample columns and their condition labels, derived from the three per-condition
#tables instead of hard-coded positions and counts
sample_cols = list(df_donor_mis.columns) + list(df_donor_match.columns) + list(df_cont.columns)
sample_conditions = (
    ["Mouse in species-mismatched env."] * len(df_donor_mis.columns)
    + ["Mouse in species-matched env."] * len(df_donor_match.columns)
    + ["Wildtype mouse"] * len(df_cont.columns)
)

#Normalize as if we were doing pca: every gene (row) is scaled to unit L2 norm
df_all_imp_pca = pd.DataFrame(normalize(df_all_imp[sample_cols], axis=1), index = df_all_imp.index, columns = sample_conditions)

#Compute clustermap; the column color bar shows each sample's condition
sns.clustermap(df_all_imp_pca, xticklabels = False, yticklabels = False, col_colors=[palette[x] for x in list(df_all_imp_pca.columns)], tree_kws=dict(linewidths=1.5), z_score = None)

In [None]:
### Below was not used so is not commented ###

In [None]:

#Confident imprinted genes that passed the expression filter
df_all_imp = df_all.loc[np.intersect1d(df_all.index, df_imp_info[0])]
#Keep the first 16 columns (the sample columns) and scale every gene to unit L2 norm.
#Index and column names are carried over so the sample names stay available below
df_all_imp_samples = df_all_imp.drop(list(df_all_imp.columns)[16:], axis = 1)
df_all_imp_pca = pd.DataFrame(normalize(df_all_imp_samples, axis=1), index = df_all_imp_samples.index, columns = df_all_imp_samples.columns)

pca = PCA(n_components=2)
pca.fit(df_all_imp_pca)
#components_ holds one row per component and one column per sample; transposing gives
#one row per sample, which is what gets plotted
pcap = pd.DataFrame(pca.components_).T
pcap.columns = ["PC1", "PC2"]
#Real sample names (the original stored integer positions here)
pcap["Samples"] = list(df_all_imp_pca.columns)
pcap["Type"] = list(np.repeat("Mouse in species-mismatched env.", 4)) + list(np.repeat("Mouse in species-matched env.", 8)) + list(np.repeat("Wildtype mouse", 4))
fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=pcap, x = "PC1", y = "PC2", hue = "Type", palette=palette, s = 50, ax = ax)
#Axis labels report the variance explained by this fit rather than fixed numbers
pc1_var, pc2_var = pca.explained_variance_ratio_
ax.set_xlabel(f"PC1 {pc1_var:.1%} of variance explained")
ax.set_ylabel(f"PC2 {pc2_var:.1%} of variance explained")
ax.set_title("PCA on imprinted genes in adult parathyroid")
ax.legend(frameon = True, bbox_to_anchor=[1.01, 1], fontsize=15)
print(pca.explained_variance_ratio_)

In [None]:
#Gene sets: all confident imprinted genes, and the same table split by whether
#column 4 reads "Paternal" or "Maternal"
d_imp = {"Imprinted":list(df_imp_info[0])}
d_imp2 = {
    parent: list(df_imp_info.loc[df_imp_info[4] == parent, 0])
    for parent in ["Paternal", "Maternal"]
}

#Compute enrichments based on absolute log fold-change
#NOTE(review): df_test is not created in any cell visible above - confirm where it comes from
df_test["Gene name"] = df_test.index
df_test["Abs LFC"] = df_test["LFC"].abs()
df_test = df_test.sort_values("Abs LFC", ascending = False)

#Two-column ranking table (gene, score) with a fresh 0..n-1 index
ranking = df_test.copy()[["Gene name", "Abs LFC"]]
n_genes = ranking.shape[0]
ranking.index = list(range(n_genes))

#When enabled, replace the scores by symmetric integer ranks: n//2 ... 1 for the top
#half, -1 ... -n//2 for the bottom half, with a 0 in the middle for odd n
rank = True
if rank:
    half = n_genes // 2
    middle = [] if n_genes % 2 == 0 else [0]
    ranking["Abs LFC"] = list(range(half, 0, -1)) + middle + [-k for k in range(1, half + 1)]

#NOTE(review): outdir is an absolute path on one machine; the rest of the notebook uses
#paths relative to the working directory
gs.prerank(
    rnk=ranking,
    gene_sets=d_imp,
    threads=4,
    permutation_num=1000,
    outdir= 'C:/Users/astar/Chimerism/Final/GSEAPY_New4_Imprinting_Parathyroid_Rank_New/' + "Bleh_Median_SignG5",
    format='png',
    seed=6,
    min_size = 10,
    max_size = 30000,
)
