In [13]:
import pandas as pd
import numpy as np
import os

#Function to convert file name to cell type
def file_to_celltype(x):
    """Return the cell type label encoded in a file name or path.

    Only the part after the last "/" is used.  A fixed sequence of string
    substitutions is then applied to it, in order: ontology and run-specific
    tags and the ".csv" extension are removed, underscores become spaces, and
    a few labels are rewritten (for example "Glutamatergic" becomes
    "Forebrain glutamatergic").

    Parameters
    ----------
    x : str
        File name or "/"-separated path.

    Returns
    -------
    str
        The label left after all substitutions have been applied.
    """
    #Order matters: each substitution acts on the output of the previous one
    #(e.g. "Spinal Forebrain" only exists after "GABAergic"/"Glutamatergic" were expanded)
    substitutions = (
        ("_GO_Molecular_Function_2023", ""),
        ("_GO_Biological_Process_2023", ""),
        ("Mesechyme", "Mesenchyme"),
        ("Brain_Or_2010_Div_New4_NewNorm_", ""),
        ("Mesenchymal_Or_2010_Div_New4_NewNorm_", ""),
        ("Chondrocyte_Or_2010_Div_New4_NewNorm_", ""),
        (".csv", ""),
        ("_", " "),
        (" all", ""),
        ("Glutamatergic", "Forebrain glutamatergic"),
        ("GABAergic", "Forebrain GABAergic"),
        ("Spinal Forebrain", "Spinal"),
        ("Chondrocytes", "Chondrocyte"),
    )
    label = x.split("/")[-1]
    for old, new in substitutions:
        label = label.replace(old, new)
    return label

#Lookup from the cell type label produced by file_to_celltype to the short name used in the tables
d_ct_abrev = {
    #Progenitors
    "Forebrain glutamatergic progenitors": "brain.glut.prog",
    "Forebrain GABAergic progenitors": "brain.GABA.prog",
    "Intermediate progenitors": "inter.prog",
    #Neurons
    "Forebrain glutamatergic neurons": "brain.glut.neu",
    "Forebrain GABAergic neurons": "brain.GABA.neu",
    "Spinal GABAergic neurons": "spine.GABA.neu",
    "Spinal glutamatergic neurons": "spine.glut.neu",
    #Chondrocytes and mesenchyme
    "Chondrocytes": "chondrocyte",
    "Mesenchyme 0": "mesen.0",
    "Mesenchyme 2": "mesen.2",
    "Mesenchyme cycling": "mesen.cyc",
    #NOTE(review): "Toss" presumably marks a cluster that should be discarded - confirm
    "Forebrain GABAergic neurons 0": "Toss",
    #Singular spelling, which is what file_to_celltype returns for "Chondrocytes"
    "Chondrocyte": "chondrocyte",
}


In [14]:
#Supplemental Table 1
#Filter to only the columns we want to keep, update column names to be more readable with cols
#cols[i] is the readable name written out for the input column keep_cols[i]
cols = [
    "Gene",
    "MR1 DM Raw", "MR1 HR Raw", "RM1 HM Raw", "RM1 DR Raw", "RM2 HM Raw", "RM2 DR Raw",
    "MR1 DM Normed", "MR1 HR Normed", "RM1 HM Normed", "RM1 DR Normed", "RM2 HM Normed", "RM2 DR Normed",
    "log2 DR/HR", "log2 DM/HM", "log2 DM/DR", "log2 HM/HR", "log2 DM/HR", "log2 HM/DR",
    "Extrinsic component", "Intrinsic component", "Interaction component",
    "Proportion extrinsic", "Proportion intrinsic", "Proportion interaction",
]
keep_cols = [
    "Unnamed: 0",
    "MR1_mi Raw", "MR1_ri Raw", "RM1_mi Raw", "RM1_ri Raw", "RM2_mi Raw", "RM2_ri Raw",
    "MR1_mi Norm CPM", "MR1_ri Norm CPM", "RM1_mi Norm CPM", "RM1_ri Norm CPM", "RM2_mi Norm CPM", "RM2_ri Norm CPM",
    "DR/HR", "DM/HM", "DM/DR", "HM/HR", "DM/HR", "HM/DR",
    "Extrinsic", "Intrinsic", "Interaction",
    "Proportion extrinsic", "Proportion intrinsic", "Proportion interaction",
]
#NOTE(review): the "BLEH" suffix in the output name looks like a scratch name - confirm before publishing
#Open up excel file
with pd.ExcelWriter('Final/Supplemental_Tables/Supplemental_Table1BLEH.xlsx', engine='xlsxwriter') as writer:
    #For each cell type
    #os.listdir returns entries in arbitrary order, so sort to get a reproducible sheet order
    for file in sorted(os.listdir("Final/Div/")):

        #Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints), which are not cell type tables
        if file.startswith("."):
            continue

        #Filter to columns we want to keep and rename
        df = pd.read_csv(os.path.join("Final/Div/", file))
        df = df[keep_cols]
        df.columns = cols

        #Add sheet for cell type of interest
        df.to_excel(writer, sheet_name=d_ct_abrev[file_to_celltype(file)], index=False)

In [17]:
#Supplemental Table 1 no sheets, nearly identical to above but saves to CSV
#cols[i] is the readable name written out for the input column keep_cols[i]
cols = [
    "Gene",
    "MR1 DM Raw", "MR1 HR Raw", "RM1 HM Raw", "RM1 DR Raw", "RM2 HM Raw", "RM2 DR Raw",
    "MR1 DM Normed", "MR1 HR Normed", "RM1 HM Normed", "RM1 DR Normed", "RM2 HM Normed", "RM2 DR Normed",
    "log2 DR/HR", "log2 DM/HM", "log2 DM/DR", "log2 HM/HR", "log2 DM/HR", "log2 HM/DR",
    "Extrinsic component", "Intrinsic component", "Interaction component",
    "Proportion extrinsic", "Proportion intrinsic", "Proportion interaction",
]
keep_cols = [
    "Unnamed: 0",
    "MR1_mi Raw", "MR1_ri Raw", "RM1_mi Raw", "RM1_ri Raw", "RM2_mi Raw", "RM2_ri Raw",
    "MR1_mi Norm CPM", "MR1_ri Norm CPM", "RM1_mi Norm CPM", "RM1_ri Norm CPM", "RM2_mi Norm CPM", "RM2_ri Norm CPM",
    "DR/HR", "DM/HM", "DM/DR", "HM/HR", "DM/HR", "HM/DR",
    "Extrinsic", "Intrinsic", "Interaction",
    "Proportion extrinsic", "Proportion intrinsic", "Proportion interaction",
]

#Collect one table per cell type and concatenate once at the end (avoids re-copying df_all on every iteration)
celltype_tables = []
#os.listdir returns entries in arbitrary order, so sort to get a reproducible row order
for file in sorted(os.listdir("Final/Div/")):

    #Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints), which are not cell type tables
    if file.startswith("."):
        continue

    df = pd.read_csv(os.path.join("Final/Div/", file))

    #A scalar is broadcast to every row, so no explicit repeat is needed
    df["Cell type"] = d_ct_abrev[file_to_celltype(file)]
    df = df[["Cell type"] + keep_cols]
    df.columns = ["Cell type"] + cols
    celltype_tables.append(df)

#Fail with a clear message instead of an obscure error when nothing was read
if not celltype_tables:
    raise FileNotFoundError("No cell type tables found in Final/Div/")

df_all = pd.concat(celltype_tables)
df_all.to_csv("Final/Supplemental_Tables/Supplemental_Table1.csv", index=False)


Description of Supplemental Table 1:

This table lists the raw/normalized pseudobulked counts, computed log fold-changes, and estimates of extrinsic, intrinsic, and interaction divergence per gene.
For the raw and normalized counts, the first element denotes the sample.  As an example, MR1 indicates Mouse-Rat chimera 1 with Mouse cells being injected into Rat blastocysts.
The second element indicates whether the cells are donor (species-mismatched environment) or host (species-matched environment) and the species of the cells that were pseudobulked.
For example, DM indicates donor mouse cells and HR indicates host rat cells.
The log fold-changes indicate which normalized counts were used to compute it.  For example, log2 DM/HR indicates the log2 fold-change of donor mouse divided by host rat.
Important: Due to differences in the number of cells captured and their quality, RM2 was used for mesen.0, mesen.2, mesen.cyc, and chondrocytes.  RM1 was used for all other cell types.
The Extrinsic/Intrinsic/Interaction components and proportions were computed as described in the study associated with this supplemental table.
Each sheet corresponds to a cell type.  Only genes with absolute log2 fold-change greater than or equal to 0.5 in at least one comparison are included.


In [22]:
#Supplemental Table 2
#Columns written to the table, in output order
report_cols = ["Cell type or tissue", "Category", "Term", "ES", "NES", "NOM p-val", "FDR q-val", "Lead_genes"]

#Enrichment folders to merge, in order, and whether to print each report folder name as it is read
go_folders = [
    #Start with signed proportion extrinsic divergence
    ("GSEAPY_New4_Signed_Proportion_extrinsic", False),
    #Repeat, this time for signed extrinsic divergence
    ("GSEAPY_New4_Signed_Proportion_Mean_Signed_extrinsic", True),
]

#Collect one table per report and concatenate once at the end
go_reports = []
for folder1, show_name in go_folders:

    #For all files in the enrichment folder (sorted, since os.listdir order is arbitrary)
    for file in sorted(os.listdir("Final/" + folder1)):

        #For gene ontology enrichments
        if "GO" not in file or "Cellular" in file:
            continue

        #Figure out which ontology was used to add that information
        if "Biological" in file:
            category = "Biological process"
        elif "Molecular" in file:
            category = "Molecular function"
        else:
            #Neither ontology: skip, otherwise the table from the previous iteration would be appended again
            continue

        if show_name:
            print(file)

        #Get cell type
        ct = file.split("_GO")[0].replace("_", ".")

        df = pd.read_csv(os.path.join("Final", folder1, file, "gseapy.gene_set.prerank.report.csv"))
        df["Category"] = category
        df["Cell type or tissue"] = ct

        #Filter to columns
        go_reports.append(df[report_cols])

#Fail with a clear message instead of an obscure error when nothing was read
if not go_reports:
    raise FileNotFoundError("No GO enrichment reports found under Final/")

df_all = pd.concat(go_reports)

#NOTE(review): the "BLEH" suffix in the output name looks like a scratch name - confirm before publishing
df_all.to_csv("Final/Supplemental_Tables/Supplemental_Table2BLEH.csv", index=False)
 

All_GO_Biological_Process_2023
All_GO_Molecular_Function_2023
Brain_GO_Biological_Process_2023
Brain_GO_Molecular_Function_2023
Conn_GO_Biological_Process_2023
Conn_GO_Molecular_Function_2023


Description of Supplemental Table 2:

Cell type or tissue is the cell type or tissue and Category is the ontology from which the Term came
For cell type, all genes passing filtering criteria were used
For tissues, we averaged across all cell types in a tissue: mesen.0, mesen.2, mesen.cyc, and chondrocyte are in Conn; all other cell types are in CNS (labelled Brain in the table); All averages across all eleven cell types.
We excluded genes that had estimates of intrinsic/extrinsic/interaction divergence in 2 or fewer cell types for Conn and CNS, and in 4 or fewer cell types for All.
The Term is the ontology term
ES and NES are the enrichment scores and normalized enrichment scores from GSEAPY preranked
NOM p-val is the nominal p-value from GSEAPY preranked and FDR q-val is the false discovery rate q-value computed by GSEAPY preranked
Lead_genes are the lead genes driving the enrichment (also computed by GSEAPY preranked)


In [26]:
#Supplemental Table 3
#Same procedure as for the GO enrichment table, but uses the TF enrichments instead
#Columns written to the table, in output order
report_cols = ["Cell type or tissue", "Category", "Term", "ES", "NES", "NOM p-val", "FDR q-val", "Lead_genes"]

#Enrichment folders to merge, in order
tf_folders = [
    #NOTE(review): unlike the second folder, this name has no "New4" tag - confirm it is the intended run
    "GSEAPY_Signed_Proportion_Mean_Signed_TF_extrinsic",
    "GSEAPY_New4_Signed_Proportion_TF_extrinsic",
]

#Collect one table per report and concatenate once at the end
tf_reports = []
for folder1 in tf_folders:

    #Every entry in the folder is treated as a report folder (sorted, since os.listdir order is arbitrary)
    for file in sorted(os.listdir("Final/" + folder1)):

        #Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints), which are not report folders
        if file.startswith("."):
            continue

        #Get cell type
        ct = file.split("_TF")[0].replace("_", ".")

        df = pd.read_csv(os.path.join("Final", folder1, file, "gseapy.gene_set.prerank.report.csv"))
        df["Category"] = "TF Perturbations Followed by Expression"
        df["Cell type or tissue"] = ct

        #Filter to columns
        tf_reports.append(df[report_cols])

#Fail with a clear message instead of an obscure error when nothing was read
if not tf_reports:
    raise FileNotFoundError("No TF enrichment reports found under Final/")

df_all = pd.concat(tf_reports)
df_all.to_csv("Final/Supplemental_Tables/Supplemental_Table3.csv", index=False)

Description of Supplemental Table 3:

Cell type or tissue is the cell type or tissue and Category is the gene-set library from which the Term came (here always "TF Perturbations Followed by Expression")
For cell type, all genes passing filtering criteria were used
For tissues, we averaged across all cell types in a tissue (mesen.0, mesen.2, mesen.cyc, and chondrocyte are in Conn, all other cell types are in CNS, All is averaging across all eleven cell types)
We excluded genes that had estimates of intrinsic/extrinsic/interaction divergence in 2 or fewer cell types for Conn and CNS, and in 4 or fewer cell types for All.
The Term is the gene-set term from that library
ES and NES are the enrichment scores and normalized enrichment scores from GSEAPY preranked
NOM p-val is the nominal p-value from GSEAPY preranked and FDR q-val is the false discovery rate q-value computed by GSEAPY preranked
Lead_genes are the lead genes driving the enrichment (also computed by GSEAPY preranked)
