### Load the libraries

In [None]:
import os 
import numpy as np
import scipy as sci
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.collections as coll

import scanpy as sc
import anndata as ad

from scipy.sparse import csr_matrix
from matplotlib import rcParams
from matplotlib import colors

import custom_functions as cf

In [None]:
## set parameters for visualisation

sc.settings.set_figure_params(dpi=80)

# settings for the plots
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=500,
                     frameon=True, vector_friendly=True,
                     color_map="YlGnBu",
                     #color_map="BuPu",
                     format='pdf', transparent=False,
                     ipython_format='png2x')

### Load the pre-processed data

In [None]:
# Load the pre-processed AnnData object from disk.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; the explicit reader keeps
# this cell working on newer anndata releases.
adata = ad.read_h5ad("/home/hpc/martinez/00_projects/human/precision_toxicology/filtered_joint_scran_normalized_removed_clusters_scRNAseq_10X_precision_toxicology_Harmony_nov21.h5ad")

In [None]:
np.mean(adata.obs["n_genes"])

# 1) Subgroup identification and annotation

### Import the marker genes

In [None]:
# Import the marker table and organise it per annotation type.
# The file is ';'-separated with one header row. Column 3 holds the marker gene
# name; columns 1 and 0 are joined as "<col1>:<col0>" to form the annotation key.
marker_names = []
annot_type_list = []
with open("/home/mrichter/scripts/precision_toxicology/All_markers_no_cell_types_220124.csv") as f:
    head = f.readline()  # header row, not used further
    for line in f:
        line = line.split(";")
        # Strip so that a trailing newline/space (e.g. when column 3 is the
        # last column of the row) cannot prevent the gene lookup below.
        marker_names.append(line[3].strip())
        annot_type_list.append(line[1]+":"+line[0])

# annotation key -> marker names as listed in the file.
# Keys are inserted in file order, so iteration order is reproducible between
# runs (iterating a set of strings is not).
marker_dict = {}
for annot_type, marker in zip(annot_type_list, marker_names):
    marker_dict.setdefault(annot_type, []).append(marker)

# Identity lookup over the genes present in `adata`. The gene list is
# materialised once instead of twice per gene.
gene_name_dict = {gene: gene for gene in adata.var_names.tolist()}

# Keep, per annotation key, only the (upper-cased) markers found in `adata`.
marker_id_dict = {}
marker_name_dict = {}
for name, item in marker_dict.items():
    present = [elem.upper() for elem in item if elem.upper() in gene_name_dict]
    marker_id_dict[name] = [gene_name_dict[gene] for gene in present]
    marker_name_dict[name] = present

In [None]:
sc.tl.louvain(adata)

In [None]:
sc.pl.umap(adata, color=["louvain", "shared_clusters_across_donors"], wspace=0.4,
          save="_louvain_and_shared_clusters_Harmony_20211126.pdf")

In [None]:
for key in marker_name_dict.keys():
    sc.pl.umap(adata, color=marker_id_dict[key], 
               title=marker_name_dict[key],
               cmap="YlGnBu",
               save="_markers_"+key+"_20211126.pdf")

In [None]:
adata.var_names_make_unique()
adata.raw = adata

In [None]:
for key in marker_name_dict.keys():
    sc.pl.stacked_violin(adata, groupby="louvain", 
                         var_names=marker_name_dict[key], swap_axes=True,
                         save="markers_louvain_"+key+"_20211126.pdf")

In [None]:
sc.tl.rank_genes_groups(adata, groupby="louvain", n_genes=2000)

In [None]:
df = sc.tl.marker_gene_overlap(adata, reference_markers=marker_name_dict, adj_pval_threshold=0.05)

In [None]:
g = sns.clustermap(df, cmap="YlGnBu", yticklabels=1, linewidths=0.1,
                  row_cluster=True)
#plt.savefig("heatmap_overall_louvain_marker_overlap_20211126.pdf", bbox_inches="tight")

In [None]:
# Collapse louvain clusters into the four subgroups; any cluster that is not
# listed explicitly falls into "bile_sterols".
cluster_to_subgroup = {}
for cluster in ["13","3","6"]:
    cluster_to_subgroup[cluster] = "losing_expression"
cluster_to_subgroup["7"] = "lipids_phaseIII"
for cluster in ["2","11","0","8","10","15"]:
    cluster_to_subgroup[cluster] = "carbs_phaseII_stress"

annot = [cluster_to_subgroup.get(elem, "bile_sterols") for elem in adata.obs["louvain"]]
adata.obs["subgroups"] = annot

In [None]:
adata.uns["subgroups_colors"] = ['#ff7f0e', '#1f77b4', '#17b2b1', '#9b1c03']

In [None]:
sc.pl.umap(adata, color=["louvain","shared_clusters_across_donors","subgroups"],
          wspace=0.4)
          #save="_subgroup_annot_20211126.pdf")

In [None]:
sc.pl.umap(adata, color=["donor","louvain"])

In [None]:
for key in marker_name_dict.keys():
    sc.pl.stacked_violin(adata, groupby="subgroups",
                         var_names=marker_id_dict[key], 
                         standard_scale="var",
                         swap_axes=True,
                         #title=marker_name_dict[key],
                         cmap="YlGnBu",
                         save="_markers_"+key+"_20220124.pdf")

In [None]:
sc.pl.dotplot(adata, groupby="subgroups", var_names=["RPS19","PRDX1","BAX",
                                                     "ATF4","GCLM","GSTA1",
                                                     "LGALS1","MT1H","MT1M"], 
              #swap_axes=True,
              standard_scale="var")
              #save="some_stress_markers_20211215.pdf")

In [None]:
for key in marker_name_dict.keys():
    sc.pl.stacked_violin(adata, groupby="subgroups",  
                         var_names=marker_name_dict[key], swap_axes=True,
                         save="markers_subgroups_"+key+"_20211126.pdf")

In [None]:
# For every subgroup, compute per gene the fraction of cells with a non-zero
# value in the "norm_counts" layer. The result is stored both in `adata.var`
# and in `df2` (one column per subgroup).
df2 = pd.DataFrame()
for elem in sorted(set(adata.obs["subgroups"].tolist())):
    subset = adata[adata.obs["subgroups"] == elem, :]
    # Compute once and reuse. asarray/ravel yields a flat per-gene vector
    # whether the layer is a dense array or a sparse matrix (whose column sum
    # is a 1 x n_genes matrix).
    expressed = np.asarray(np.sum(subset.layers["norm_counts"] > 0, axis=0)).ravel()
    p_cells = expressed / subset.n_obs
    adata.var[elem+"_p_cells"] = p_cells
    df2[elem+"_p_cells"] = p_cells

In [None]:
df1 = df2.unstack().reset_index() 

In [None]:
cols = {"bile_sterols_p_cells":'#ff7f0e', "carbs_phaseII_stress_p_cells":'#1f77b4',
        "lipids_phaseIII_p_cells":"#17b2b1",
        "losing_expression_p_cells":'#9b1c03'}

ax = sns.boxplot(x="level_0", y=0, data=df1, palette=cols, showfliers=False)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

plt.savefig("boxplot_Fig1_C_percentage_cells_gene_expression_per_cluster_20211126.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Stacked bar chart: percentage of cells in each cell-cycle phase per subgroup.
df = pd.crosstab(adata.obs["subgroups"], adata.obs["cyclone_phases"], margins = False)
r = np.arange(4)

# Row totals over the three phases, then each phase as a percentage of them
totals = [g1 + g2m + s for g1, g2m, s in zip(df['G1'], df['G2M'], df['S'])]
G1, G2M, S = (
    [count / total * 100 for count, total in zip(df[phase], totals)]
    for phase in ('G1', 'G2M', 'S')
)

# Bar geometry and x tick labels
barWidth = 0.85
names = ('I', 'II', 'III', 'IV')

# Stack the bars: G1 at the bottom, then G2M, then S on top
g1_plus_g2m = [low + mid for low, mid in zip(G1, G2M)]
plt.bar(r, G1, width=barWidth, label="G1", color="#606060")
plt.bar(r, G2M, bottom=G1, width=barWidth, label="G2M", color="#cb1361")
plt.bar(r, S, bottom=g1_plus_g2m, width=barWidth, label="S", color="#fbdc12")

# Axis ticks and legend (legend placed outside the axes, top right)
plt.xticks(r, names)
plt.xticks(rotation=0)
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Save, then display
plt.savefig("barplot_Fig1_D_cell_cycle_analysis_20211126.pdf", bbox_inches="tight")
plt.show()

In [None]:
pd.crosstab(adata.obs["subgroups"], adata.obs["cyclone_phases"], margins = True)

In [None]:
sc.pl.dotplot(adata, groupby="subgroups", var_names=["CYP8B1","HSD17B4","HSD3B7","HMGCS2","CYP27A1",
                          "LDHA","GSTO1","SULT2A1","GAMT","GSTZ1",
                                                     "ABCC3","ABCC2","PLIN5","MLXIPL","LDLR"],
              cmap="YlGnBu",
              standard_scale="var",
              save="_figure1_E_marker_genes_20220121.pdf")

In [None]:
sc.pl.dotplot(adata, groupby="subgroups", var_names=["CYP8B1","CYP1B1","HSD17B4","CYP27A1",
                                                       "HSD3B7","CYP51A1","RXRA","AHR","HMGCS2","NR1H3",
                                                       'SULT2A1',"GSTO1","GSTA1","GSTA2","LDHA",
                                                       "UGP2","BAX","PCBD1","GAMT","SULT1A1",
                                                       "ABCC2","ABCC3","SLCO1B1","ABCB1",
                                                        "LBR","CCNL1","APOH","ACACB","PLIN5","LDLR"],
              standard_scale="var", cmap="YlGnBu", save="more_markers_ylgnbu_20220215.pdf")

In [None]:
sc.pl.umap(adata, color=["subgroups","SERPINA1","CYP27A1","LDHA","ABCC2",
                         "Treatment","ALB","CYP8B1","GSTZ1","ABCC3",
                         "donor","TTR","HMGCS2","GSTO1","MLXIPL"], wspace=0.4, ncols=5,
          save="_Fig1_F_subgroup_markers_20211126.pdf")

In [None]:
# Write one "<cell barcode>\t<subgroup>" line per cell.
# The context manager guarantees the file is closed even if a write fails.
# Iterating names and labels together avoids rebuilding the barcode list for
# every row and avoids positional lookups on the label-indexed obs column.
with open("subgroup_annotation_after_Harmony.txt", "w") as outfile:
    for cell_name, subgroup in zip(adata.obs_names, adata.obs["subgroups"]):
        outfile.write(f"{cell_name}\t{subgroup}\n")

## Plotting transcription factors predicted by ChEA3

In [None]:
sc.pl.stacked_violin(adata, groupby="subgroups", var_names=["BPTF","EEA1","FOXO3","ZBTB38","ZNF148",
                                                            "GTF3A","PA2G4","NME2","CENPX","HMGN3",
                                                            "MYSM1","KMT2A","NFAT5","ZNF655","ASH1L",
                                                            "IKZF3","ZNF428","STAT4","BATF","STAT1"],
                     #swap_axes=True,
                     standard_scale="var",
                     cmap="YlGnBu",
                    save="regulating_TFs_all_20220124.pdf")

In [None]:
# plotting TFs predicted by ChEA3 for only DMSO-treated cells
# Take an explicit copy: this subset is later passed to
# sc.tl.rank_genes_groups, which writes its results into `.uns`, and writing
# to a view would otherwise trigger an implicit copy.
adata_DMSO = adata[adata.obs["Treatment"] == "Vehicle"].copy()

In [None]:
sc.pl.stacked_violin(adata_DMSO, groupby="subgroups", var_names=["HNF4A","ZBTB38","BPTF","CLOCK","ZNF148",
                                                            "GTF3A","PA2G4","HMGN3","CENPX","SNAPC5",
                                                            "MLXIPL","MYSM1","NFAT5","ZNF518A","BAZ2B",
                                                            "IKZF3","IKZF1","DNMT1","E2F2","ZNF367"],
                     #swap_axes=True,
                     standard_scale="var",
                     cmap="YlGnBu",
                    save="regulating_TFs_DMSO_20220124.pdf")

## Comparison between in vitro and in vivo

In [None]:
sc.tl.rank_genes_groups(adata, groupby="subgroups", n_genes=10)

In [None]:
adata.uns["rank_genes_groups"]["names"]

In [None]:
# Marker genes used for the in vitro / in vivo comparison.
# The commented-out lines are alternative genes that are currently not used.
# The closing bracket must sit on its own line: when it was part of the last
# commented-out line the list was never closed (SyntaxError).
# NOTE(review): "DST" and "CYP3A5" occur twice in this list — confirm that the
# duplicated columns are intended in the subsets built from it.
markers = ["SPTBN1","DST","CLIP1","APOB","CYP3A5",
           #"DIAPH1","ABCC2","GOLGA4","EIF3A","GOLGB1",

           "RPL13","RPL32","RPS27A","RPS13","GAPDH",
           #"RPL14","RPL11","RPL26","RPS12","RPS24",

           "DST","ABCC2","CYP3A5","CES1","NKTR",
           #"N4BP2L2","MLXIPL","ABCC3","HSPA5","CCNL1"
           ]

In [None]:
adata1 = adata[:,markers].copy()

In [None]:
adata1

In [None]:
df1 = pd.DataFrame()
for elem in list(set(adata1.obs["subgroups"])):
    #if elem != "losing_expression":
    df1["mean_"+elem] = np.mean(adata1[adata1.obs["subgroups"] == elem].X, axis=0)

In [None]:
df1.index = adata1.var_names.tolist()

In [None]:
# Transpose, then divide every column by its own mean.
# The axis is given explicitly: np.mean(DataFrame) forwards axis=None, which
# means "per column" on older pandas but "one scalar for the whole frame" on
# newer versions.
df1 = df1.T
df1 = df1 / df1.mean(axis=0)

In [None]:
# Min-max scale every column to [0, 1].
# Explicit axis=0 keeps this column-wise regardless of how the installed pandas
# interprets axis=None (which np.min / np.max would forward).
col_min = df1.min(axis=0)
col_max = df1.max(axis=0)
df1 = (df1 - col_min) / (col_max - col_min)

In [None]:
df1

In [None]:
# load the integrated in vivo data

In [None]:
adata2 = sc.read("/home/mrichter/scripts/precision_toxicology/data_integration/scgen_integration_aizarani_macparland_feb22.h5ad")

In [None]:
adata2 = adata2[:,markers]

In [None]:
df2 = pd.DataFrame()
for elem in list(set(adata2.obs["subgroups"])):
    print(elem)
    df2["mean_in_vivo_"+elem] = np.mean(adata2[adata2.obs["subgroups"] == elem].X, axis=0)

In [None]:
df2.index = adata2.var_names.tolist()

In [None]:
# Transpose, then divide every column by its own mean.
# The axis is given explicitly: np.mean(DataFrame) forwards axis=None, which
# means "per column" on older pandas but "one scalar for the whole frame" on
# newer versions.
df2 = df2.T
df2 = df2 / df2.mean(axis=0)

In [None]:
# Min-max scale every column to [0, 1].
# Explicit axis=0 keeps this column-wise regardless of how the installed pandas
# interprets axis=None (which np.min / np.max would forward).
col_min = df2.min(axis=0)
col_max = df2.max(axis=0)
df2 = (df2 - col_min) / (col_max - col_min)

In [None]:
df2

In [None]:
df = pd.concat([df1, df2], axis=0)

In [None]:
df

In [None]:
dfx = df.T.corr()

In [None]:
dfx

In [None]:
sns.clustermap(dfx, cmap="YlGnBu", method="ward")
#plt.savefig("heatmap_correlation_in_vivo_in_vitro_0.pdf")

## Looking into zonation in vitro

In [None]:
sc.pl.umap(adata, color=["CYP27A1","HMGCS2","GSTO1","PCBD1","MLXIPL","APOH"], ncols=2,
          save="_zonation_markers_subgroups_20220125.pdf")

In [None]:
# One AnnData per subgroup, each tagged with its name in obs["group"].
# The subsets are copied explicitly because assigning to `.obs` of a view
# would otherwise force an implicit copy (with a warning).
adataL3 = adata[adata.obs["subgroups"] == "lipids_phaseIII"].copy()
adataL3.obs["group"] = "lipids_phaseIII"
adataC2 = adata[adata.obs["subgroups"] == "carbs_phaseII_stress"].copy()
adataC2.obs["group"] = "carbs_phaseII_stress"
adataB = adata[adata.obs["subgroups"] == "bile_sterols"].copy()
adataB.obs["group"] = "bile_sterols"
adata0 = adata[adata.obs["subgroups"] == "losing_expression"].copy()
adata0.obs["group"] = "losing_expression"

In [None]:
sc.pl.umap(adataB, color=["subgroups","CYP27A1","HSD11B1"], 
           save="_bile_sterols_markers2_zonation_20220201.pdf")

In [None]:
sc.pl.umap(adataC2, color=["subgroups","GSTO1","PCBD1"],
           save="_carbs_phaseII_markers_zonation_20220201.pdf")

In [None]:
sc.pl.umap(adataL3, color=["subgroups","MLXIPL","APOH"],
           save="_lipids_phaseIII_markers_zonation_20220201.pdf")

In [None]:
# Read the zonation marker table and flatten the HumanGeneID column into one
# list of gene IDs. An entry may hold several IDs joined by ';'. Splitting an
# entry without ';' simply yields that entry itself.
df = pd.read_csv('/home/mrichter/zonation_paper/zonation_markers.txt', sep='\t')
zonation_markers = [
    gene_id
    for marker in df.HumanGeneID.tolist()
    for gene_id in marker.split(';')
]

In [None]:
bins = 3
labels = [1,2,3]
df['zones'] = pd.cut(df['HumanModule'], bins=bins, labels=labels)

In [None]:
# zone label -> list of gene IDs in that zone.
# HumanGeneID entries can hold several IDs joined by ';'. They are split here
# so each ID is its own list element and can be matched against gene names.
zonation_dict = (
    df.groupby('zones')['HumanGeneID']
      .apply(lambda ids: [gene_id for entry in ids for gene_id in entry.split(';')])
      .to_dict()
)

In [None]:
# Gene lists for zone 1 (`pv`), zone 2 (`mid`) and zone 3 (`cv`).
# HumanGeneID entries can hold several IDs joined by ';'. They are split so
# that every list element is a single gene ID.
pv = [gene_id for entry in df.loc[df["zones"] == 1, "HumanGeneID"] for gene_id in entry.split(';')]
mid = [gene_id for entry in df.loc[df["zones"] == 2, "HumanGeneID"] for gene_id in entry.split(';')]
cv = [gene_id for entry in df.loc[df["zones"] == 3, "HumanGeneID"] for gene_id in entry.split(';')]

In [None]:
sc.tl.score_genes(adata, pv, score_name="pv_score")
sc.tl.score_genes(adata, cv, score_name="cv_score")
sc.tl.score_genes(adata, mid, score_name="mid_score")

In [None]:
sc.pl.umap(adata, color=["subgroups","cv_score","pv_score","mid_score"], ncols=4)

In [None]:
sc.tl.rank_genes_groups(adata_DMSO, groupby="subgroups", n_genes=500)

In [None]:
dfx = sc.tl.marker_gene_overlap(adata_DMSO, reference_markers=zonation_dict, adj_pval_threshold=0.05)

In [None]:
g = sns.clustermap(np.log1p(dfx), cmap="YlGnBu", yticklabels=1, linewidths=0.1,
                  row_cluster=False)
#plt.savefig("heatmap_overall_louvain_marker_overlap_20211126.pdf", bbox_inches="tight")

# 2) Comparing Cocktail vs DMSO

In [None]:
# Flag every cell by whether its Treatment label starts with "FFA"
annot = [
    "with_FFA" if elem[0:3] == "FFA" else "no_FFA"
    for elem in adata.obs["Treatment"]
]
adata.obs["FFA_annot"] = annot

In [None]:
# The two vehicle treatments count as "no_Cocktail"; everything else as "Cocktail"
vehicle_treatments = ["Vehicle", "FFA_Vehicle"]
annot = [
    "no_Cocktail" if elem in vehicle_treatments else "Cocktail"
    for elem in adata.obs["Treatment"]
]
adata.obs["cocktail_annot"] = annot

In [None]:
# Combined "<Treatment>_<subgroup>" label per cell.
# The two columns are iterated together rather than looked up by integer
# position, which is deprecated on a label-indexed Series.
annot = [
    treatment+"_"+subgroup
    for treatment, subgroup in zip(adata.obs["Treatment"], adata.obs["subgroups"])
]
adata.obs["group_Treatment"] = annot

In [None]:
# "<subgroup>_<n><treatment>" label per cell; the numeric prefix fixes the
# sort order of the treatments. Any treatment that is not listed gets the
# "_4FFA_Cocktail" suffix.
# The two columns are iterated together rather than looked up by integer
# position, which is deprecated on a label-indexed Series.
treatment_suffix = {
    "Vehicle": "_1DMSO",
    "FFA_Vehicle": "_2FFA",
    "Cocktail": "_3Cocktail",
}
annot = [
    subgroup + treatment_suffix.get(treatment, "_4FFA_Cocktail")
    for subgroup, treatment in zip(adata.obs["subgroups"], adata.obs["Treatment"])
]
adata.obs["subgroup_treatment"] = annot

In [None]:
# All cells without FFA, tagged as the "pseudobulk" group.
# Copied explicitly because `.obs` is modified right away (assigning to a view
# would trigger an implicit copy with a warning).
adata1 = adata[adata.obs["FFA_annot"] == "no_FFA"].copy()
adata1.obs["group"] = "pseudobulk"

In [None]:
# One AnnData per subgroup (taken from `adata1`), each tagged with its name in
# obs["group"]. Copied explicitly because `.obs` of each subset is modified.
adataL3 = adata1[adata1.obs["subgroups"] == "lipids_phaseIII"].copy()
adataL3.obs["group"] = "lipids_phaseIII"
adataC2 = adata1[adata1.obs["subgroups"] == "carbs_phaseII_stress"].copy()
adataC2.obs["group"] = "carbs_phaseII_stress"
adataB = adata1[adata1.obs["subgroups"] == "bile_sterols"].copy()
adataB.obs["group"] = "bile_sterols"
adata0 = adata1[adata1.obs["subgroups"] == "losing_expression"].copy()
adata0.obs["group"] = "losing_expression"

In [None]:
sc.pl.stacked_violin(adata1, groupby="subgroup_treatment",
                     var_names=["CYP2D6", "CYP2C19", "CYP2C9", "CYP3A4", "CYP1A2"],
                     swap_axes=True,
                     cmap="YlGnBu",
                     save="5cytochromes_Cocktail_vs_DMSO_unscaled_horizontal_20211201.pdf")

In [None]:
sc.pl.stacked_violin(adata1, groupby="Treatment",
                     var_names=["CYP2D6", "CYP2C19", "CYP2C9", "CYP3A4", "CYP1A2"], 
                     cmap="YlGnBu",
                     swap_axes=True,
                     save="5cytochromes_Cocktail_vs_DMSO_pseudobulk_unscaled_horizontal_20211201.pdf")

In [None]:
# `adata1` without the "losing_expression" subgroup.
# Copied explicitly because `.obs` is modified right away.
adata2 = adata1[adata1.obs["subgroups"] != "losing_expression"].copy()
adata2.obs["group"] = "without_losing_exp"

In [None]:
# Differential expression between treatments, run separately on every subset.
# For each subset and treatment group, four per-gene columns are added to
# `adata.var`, named "<subset group>_<treatment>_{mean,n_cells,log2FC,pvals_adj}".
for i in [adataB,adataC2,adataL3,adata0, adata1, adata2]:
    sc.tl.rank_genes_groups(i, groupby="Treatment", #groups=["Cocktail"], reference="Vehicle",
                       n_genes=len(adata.var), use_raw=True)

    dict_genes = i.uns["rank_genes_groups"].copy()

    # Every cell of a subset carries the same "group" tag; read it by position
    # explicitly (plain [0] on a label-indexed Series is deprecated).
    group_label = str(i.obs["group"].iloc[0])

    for idx, elem in enumerate(list(dict_genes["names"].dtype.names)):
        print(idx, elem)
        prefix = group_label+"_"+elem
        treated = i[i.obs["Treatment"] == elem,:]

        # asarray/ravel gives a flat per-gene vector for dense and sparse X alike
        adata.var[prefix+"_mean"] = np.asarray(np.mean(treated.X, axis=0)).ravel()
        adata.var[prefix+"_n_cells"] = np.asarray(np.sum(treated.X > 0, axis=0)).ravel()

        # The results are ordered by rank, not by adata.var order. Index them
        # by the ranked gene names so the assignment aligns on the gene.
        ranked_genes = dict_genes["names"][elem]
        adata.var[prefix+"_log2FC"] = pd.Series(dict_genes["logfoldchanges"][elem], index=ranked_genes)
        adata.var[prefix+"_pvals_adj"] = pd.Series(dict_genes["pvals_adj"][elem], index=ranked_genes)

In [None]:
adata.var

In [None]:
# Classify every gene per group: "up_Cocktail" (log2FC > 1), "up_DMSO"
# (log2FC < -1), both only if adjusted p < 0.05, otherwise "none".
# Vectorised: avoids positional lookups on the gene-indexed columns.
# Missing values compare as False and therefore end up as "none".
for elem in ["bile_sterols","carbs_phaseII_stress","lipids_phaseIII",
             "losing_expression","pseudobulk","without_losing_exp"]:
    log2fc = adata.var[elem+"_Cocktail_log2FC"]
    significant = adata.var[elem+"_Cocktail_pvals_adj"] < 0.05
    changes = np.select(
        [((log2fc > 1) & significant).to_numpy(), ((log2fc < -1) & significant).to_numpy()],
        ["up_Cocktail", "up_DMSO"],
        default="none",
    )

    adata.var["changes_"+elem] = changes

In [None]:
adata.var.to_csv("differential_expression_Cocktail_vs_DMSO_nov21.csv")

In [None]:
# Genes up-regulated by the Cocktail when the "losing_expression" cells are excluded.
# A boolean mask replaces the per-gene positional lookup.
is_up = (adata.var["changes_without_losing_exp"] == "up_Cocktail").to_numpy()
genes_no0 = adata.var_names[is_up].tolist()

In [None]:
# Genes up-regulated by the Cocktail in the pseudobulk comparison.
# A boolean mask replaces the per-gene positional lookup.
is_up = (adata.var["changes_pseudobulk"] == "up_Cocktail").to_numpy()
genes_all = adata.var_names[is_up].tolist()

In [None]:
genes = {"DEGs_all":genes_all,
         "DEGs_no_losing_expression":genes_no0}

In [None]:
df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in genes.items() ]))

In [None]:
df.to_csv("DEGs_Cocktail_vs_DMSO_with_and_without_losing_expression.csv")

In [None]:
from matplotlib_venn import venn2, venn3

In [None]:
c = venn2([set(genes_all), set(genes_no0)])
c.get_patch_by_id('10').set_color('#9b1c03')
c.get_patch_by_id('10').set_edgecolor('none')
c.get_patch_by_id('01').set_color('#768d82')
c.get_patch_by_id('01').set_edgecolor('none')
c.get_patch_by_id('11').set_color('#ba7ca0')
c.get_patch_by_id('11').set_edgecolor('none')
c.get_patch_by_id('11').set_alpha(0.4)
plt.savefig("venn_pseudobulk_DEGs_with_and_without_losing_exp_proportional_20211129.pdf")

In [None]:
df = pd.read_csv("differential_expression_Cocktail_vs_DMSO_nov21.csv", index_col=0)

In [None]:
# Collect the genes classified as "up_Cocktail".
#   up_all                   : up in all four subgroups AND in the pseudobulk
#   bile/carbs/lipids/le     : up in the respective subgroup
#   lfc_* / p_*              : matching log2 fold changes / adjusted p-values
# Boolean masks replace the per-row positional lookups, which are deprecated on
# a label-indexed frame and slow.
up_mask = {
    key: df["changes_"+key] == "up_Cocktail"
    for key in ["bile_sterols", "carbs_phaseII_stress", "lipids_phaseIII",
                "losing_expression", "pseudobulk"]
}

up_everywhere = up_mask["pseudobulk"]
for key in ["bile_sterols", "carbs_phaseII_stress", "lipids_phaseIII", "losing_expression"]:
    up_everywhere = up_everywhere & up_mask[key]
up_all = df.loc[up_everywhere, "gene_name"].tolist()

bile = df.loc[up_mask["bile_sterols"], "gene_name"].tolist()
lfc_b = df.loc[up_mask["bile_sterols"], "bile_sterols_Cocktail_log2FC"].tolist()
p_b = df.loc[up_mask["bile_sterols"], "bile_sterols_Cocktail_pvals_adj"].tolist()

carbs = df.loc[up_mask["carbs_phaseII_stress"], "gene_name"].tolist()
lfc_c2 = df.loc[up_mask["carbs_phaseII_stress"], "carbs_phaseII_stress_Cocktail_log2FC"].tolist()
p_c2 = df.loc[up_mask["carbs_phaseII_stress"], "carbs_phaseII_stress_Cocktail_pvals_adj"].tolist()

lipids = df.loc[up_mask["lipids_phaseIII"], "gene_name"].tolist()
lfc_l3 = df.loc[up_mask["lipids_phaseIII"], "lipids_phaseIII_Cocktail_log2FC"].tolist()
p_l3 = df.loc[up_mask["lipids_phaseIII"], "lipids_phaseIII_Cocktail_pvals_adj"].tolist()

le = df.loc[up_mask["losing_expression"], "gene_name"].tolist()
lfc_0 = df.loc[up_mask["losing_expression"], "losing_expression_Cocktail_log2FC"].tolist()
p_0 = df.loc[up_mask["losing_expression"], "losing_expression_Cocktail_pvals_adj"].tolist()

In [None]:
# Short subgroup codes ("b", "c2", "l3"; anything else -> "0") ...
abbreviation = {
    "bile_sterols": "b",
    "carbs_phaseII_stress": "c2",
    "lipids_phaseIII": "l3",
}
annot = [abbreviation.get(elem, "0") for elem in adata1.obs["subgroups"]]
adata1.obs["abbreviations"] = annot

# ... combined with the treatment as "<Treatment>_<code>".
# The freshly built code list is paired with the treatments directly instead
# of being read back from obs by integer position (deprecated on a
# label-indexed Series).
annot = [treatment+"_"+code for treatment, code in zip(adata1.obs["Treatment"], annot)]
adata1.obs["abbr_treatment"] = annot

In [None]:
# Genes classified as "up_DMSO" in all four subgroups and in the pseudobulk.
# A combined boolean mask replaces the per-row positional lookups.
up_in_DMSO_everywhere = (
    (df["changes_bile_sterols"] == "up_DMSO")
    & (df["changes_carbs_phaseII_stress"] == "up_DMSO")
    & (df["changes_lipids_phaseIII"] == "up_DMSO")
    & (df["changes_losing_expression"] == "up_DMSO")
    & (df["changes_pseudobulk"] == "up_DMSO")
)
up_all_DMSO = df.loc[up_in_DMSO_everywhere, "gene_name"].tolist()

In [None]:
# Genes to show in the dot plot: everything in `up_all` plus a few hand-picked genes.
# Start from a copy: `to_plot = up_all` would bind both names to the same list,
# so every later append to one would silently change the other as well.
to_plot = list(up_all)
to_plot.extend(["ATF3","SRD5A2","CYP2U1","SLC4A7","PLIN2","OSGIN1","MT1E","FADS2"])

In [None]:
# Add the genes that are up in DMSO everywhere to both lists.
# If `to_plot` and `up_all` are the same list object, a single append already
# reaches both names; appending through each name would add every gene twice.
for elem in up_all_DMSO:
    up_all.append(elem)
    if to_plot is not up_all:
        to_plot.append(elem)

In [None]:
# Long-format table for the dot plot: one row per (subgroup, gene) with the
# Cocktail log2 fold change, -log10(adjusted p-value) and the gene's position
# in `to_plot` (used for ordering). Rows are only produced if "Cocktail" occurs
# among the treatments of `adata1`.
df2 = pd.DataFrame()
lfc = []
pval = []
groups = []
genes = []
order = []

if "Cocktail" in set(adata1.obs["Treatment"]):
    for group in ["bile_sterols","carbs_phaseII_stress","lipids_phaseIII","losing_expression"]:
        for idx, gene in enumerate(to_plot):
            gene_row = df[df["gene_name"] == gene]
            # The builtin float replaces the alias np.float (removed in NumPy
            # 1.24); .iloc[0] extracts the scalar explicitly.
            lfc.append(float(gene_row[group+"_Cocktail_log2FC"].iloc[0]))
            pval.append(-np.log10(float(gene_row[group+"_Cocktail_pvals_adj"].iloc[0])))
            genes.append(gene)
            groups.append(group)
            order.append(idx)

df2["lfc_Cocktail"] = lfc
df2["pval_Cocktail"] = pval
df2["gene"] = genes
df2["group"] = groups
df2["order"] = order

In [None]:
# Cap infinite -log10(p) values (adjusted p == 0) at the largest finite value.
# The column is reassigned: an in-place replace on a selected column is not
# guaranteed to write back into the frame.
m = df2.loc[df2['pval_Cocktail'] != np.inf, 'pval_Cocktail'].max()
df2['pval_Cocktail'] = df2['pval_Cocktail'].replace(np.inf, m)

In [None]:
import matplotlib

In [None]:
plt.rcParams['figure.figsize']=(4,9)

colormap = sns.diverging_palette(265, 140, s=100, as_cmap=True) #or any other colormap
normalize = matplotlib.colors.Normalize(vmin=-4.8, vmax=4.8)

ordered_df = df2.sort_values(by='order', ascending=False)
p = plt.scatter(x=ordered_df["group"], y=ordered_df['gene'], c=ordered_df["lfc_Cocktail"], 
                s=abs(ordered_df["lfc_Cocktail"])*100,
                cmap=colormap,
                norm=normalize, 
                label='Cocktail')
plt.colorbar()
plt.legend(*p.legend_elements("sizes", num=5),loc='lower left', bbox_to_anchor=(1.75, 0.8))
plt.margins(x=0.1, y=0.03)
plt.xticks(rotation=90)
plt.savefig("dotplot_Figure2_C_logfoldchanges_more_genes_20220120.pdf", bbox_inches="tight")

In [None]:
set(adata.obs["subgroups"])

In [None]:
# Keep genes detected in at least one "losing_expression" cell in both the
# Cocktail and the Vehicle group. The result is copied so that columns can be
# added to it without writing into a slice of adata.var.
detected_in_both = (
    (adata.var["losing_expression_Cocktail_n_cells"] > 0)
    & (adata.var["losing_expression_Vehicle_n_cells"] > 0)
)
df = adata.var[detected_in_both].copy()

In [None]:
# One colour per gene, chosen by its change class; unclassified genes are grey ("0.75")
change_color = {"up_Cocktail": "#279e68", "up_DMSO": "#888de4"}
cols = [change_color.get(elem, "0.75") for elem in df["changes_losing_expression"]]

In [None]:
df["pval_r"] = -np.log10(df["losing_expression_Cocktail_pvals_adj"])

In [None]:
# Cap infinite -log10(p) values (adjusted p == 0) at the largest finite value.
# The column is reassigned: an in-place replace on a selected column is not
# guaranteed to write back into the frame.
m = df.loc[df['pval_r'] != np.inf, 'pval_r'].max()
df['pval_r'] = df['pval_r'].replace(np.inf, m)

In [None]:
import matplotlib

In [None]:
# Volcano plot (log2 fold change vs. -log10 adjusted p) for the
# "losing_expression" subgroup, with a few genes labelled.
plt.rcParams['figure.figsize']=(7,7) #rescale figures
sns.set_theme()

# The y axis uses the `pval_r` column, the same column the labels below are
# positioned with. Recomputing -log10(p) here would put genes with p == 0 at
# infinity, away from their labels.
plt.scatter(y=df["pval_r"], x=df["losing_expression_Cocktail_log2FC"],
                  c=cols,alpha=0.7, marker="x")

# Alternative label sets used for the other comparisons:
#pb for i in up_all:
#b for i in ["SPINK1","FGA","ORM2","GSTA2","FGL1","CYP1A1","POR","CYP1A2","GNA13","CYP2B6"]:
#c2 for i in ["SPINK1","ANGPTL3","EEF1G","GSTA2","SERPINA3","ALAS1","POR","MT1G","MT1H","GDF15"]:
#l3 for i in ["FGA","FGG","CFB","SERPING1","SERPINA3","PLIN2","POR","AKR1C1","CYP1A1","CYP1A2"]:
for i in ["ANGPTL3","ALB","ADH1B","SPINK1","ASNS","ALAS1","FADS1","MT2A","MT1G","CYP1A2"]:
    gene_row = df.loc[df["gene_name"] == i]
    # Offset the label slightly so it does not sit on top of the marker
    plt.text(gene_row["losing_expression_Cocktail_log2FC"].iloc[0]+0.1,
             gene_row['pval_r'].iloc[0]+0.1, i,
             fontdict=dict(color='black', alpha=0.7, size=10, weight="normal"))

plt.savefig("volcano_Cocktail_vs_DMSO_losing_expression_genes.pdf",bbox_inches="tight")
plt.show()

In [None]:
# Genes up in Cocktail within "losing_expression" whose -log10(adjusted p)
# exceeds 36. The filtered frame is copied before a column is written to it;
# the -log10 values are computed once and reused for the threshold.
df1 = df[df["changes_losing_expression"] == "up_Cocktail"].copy()
df1["pval_r"] = -np.log10(df1["losing_expression_Cocktail_pvals_adj"])
df2 = df1[df1["pval_r"] > 36]
#df2 = df2[df2["carbs_phaseII_stress_Cocktail_log2FC"] < -1.5]
df2

In [None]:
genes = {}
genes["bile"] = bile
genes["carbs"] = carbs
genes["lipids"] = lipids
genes["losing_exp"] = le

In [None]:
df1 = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in genes.items() ]))
df1.to_csv("all_up_genes_per_subgroup_nov21.csv")

In [None]:
df = pd.read_csv("differential_expression_Cocktail_vs_DMSO_nov21.csv", index_col=0)

In [None]:
# Genes that are "up_Cocktail" in exactly one subgroup (b/c/l/o_unique) and the
# genes that are "up_Cocktail" in the pseudobulk comparison (pb).
# Boolean masks replace the per-row positional lookups, which are deprecated on
# a label-indexed frame and slow.
subgroup_cols = ["changes_bile_sterols", "changes_carbs_phaseII_stress",
                 "changes_lipids_phaseIII", "changes_losing_expression"]
is_up = df[subgroup_cols].eq("up_Cocktail")
# "up in this subgroup and in none of the other three" == "up here, and exactly one up overall"
up_in_exactly_one = is_up.sum(axis=1) == 1

b_unique = df.loc[up_in_exactly_one & is_up["changes_bile_sterols"], "gene_name"].tolist()
c_unique = df.loc[up_in_exactly_one & is_up["changes_carbs_phaseII_stress"], "gene_name"].tolist()
l_unique = df.loc[up_in_exactly_one & is_up["changes_lipids_phaseIII"], "gene_name"].tolist()
o_unique = df.loc[up_in_exactly_one & is_up["changes_losing_expression"], "gene_name"].tolist()

pb = df.loc[df["changes_pseudobulk"] == "up_Cocktail", "gene_name"].tolist()

In [None]:
unique_genes = {}
unique_genes["bile"] = b_unique
unique_genes["carbs"] = c_unique
unique_genes["lipids"] = l_unique
unique_genes["losing_exp"] = o_unique

In [None]:
df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in unique_genes.items() ]))
df.to_csv("unique_genes_Cocktail_per_subgroup_nov21.csv")

In [None]:
genes = {"bile":set(bile),
         "carbs":set(carbs),
         "lipids":set(lipids),
         "losing_exp":set(le)}

In [None]:
from venn import venn

In [None]:
venn(genes, legend_loc="upper left", fontsize=8.5, #fmt="{percentage:.1f}%", 
     cmap=["#ff7f0e","#1f77b4","#17b2b1","#9b1c03"])
plt.savefig("venn_overlap_significant_genes_Cocktail_vs_DMSO_20211129.pdf", bbox_inches="tight")

In [None]:
# Cell counts per subgroup (rows) and treatment (columns) in `adata1`,
# with each treatment column then divided by a fixed constant.
df = pd.crosstab(adata1.obs["subgroups"], adata1.obs["Treatment"], margins = False)
# NOTE(review): 9820 and 6122 are presumably the total numbers of Vehicle and
# Cocktail cells — confirm; if so, df['Vehicle'].sum() / df['Cocktail'].sum()
# would keep them in sync with the data.
df['Vehicle'] = df['Vehicle'].div(9820)
df["Cocktail"] = df["Cocktail"].div(6122)

In [None]:
df

In [None]:
# Data
r = np.arange(4)
 
# From raw value to percentage
totals = [i+j for i,j in zip(df['Vehicle'], df['Cocktail'])]

DMSO = [i / j * 100 for i,j in zip(df['Vehicle'], totals)]
Cocktail = [i / j * 100 for i,j in zip(df['Cocktail'], totals)]

# plot
barWidth = 0.85
names = ('bile_sterols','carbohydrates_phaseII',"lipids_phaseIII","losing_exp")

# Create bars
plt.bar(r, DMSO, width=barWidth, label="DMSO", color="#888de4")
plt.bar(r, Cocktail, bottom=DMSO, width=barWidth, label="Cocktail", color="#279e68")

    
# Custom x axis
plt.xticks(r, names)
#plt.xlabel("group")
plt.xticks(rotation=90)
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Show graphic
plt.savefig("barplot_percentage_DMSO_and_Cocktail_per_subgroup_20211130.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Load the per-subgroup enrichment tables and tag each with its subgroup name.
df0 = pd.read_csv("enrichment_losing_expression.csv")
df0["group"] = "losing_expression"
df1 = pd.read_csv("enrichment_bile_sterols.csv")
df1["group"] = "bile_sterols"
df2 = pd.read_csv("enrichment_carbs_phaseII.csv")
df2["group"] = "carbs_phaseII"
df3 = pd.read_csv("enrichment_lipids_phaseIII.csv")
df3["group"] = "lipids_phaseIII"

# Stack the three active subgroups into one table. pd.concat replaces
# DataFrame.append, which no longer exists in pandas >= 2.0.
# Add df0 to the list to include the "losing_expression" terms as well.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
df

In [None]:
def scale_data_5_75(data):
    """Linearly rescale `data` so its minimum maps to 5 and its maximum to 75.

    If all values are equal, the range is widened by one on each side, so
    every value ends up at the midpoint (40) instead of dividing by zero.
    """
    lower, upper = np.min(data), np.max(data)

    if upper == lower:
        # Constant input: widen the range artificially
        upper, lower = upper + 1, lower - 1

    span = upper - lower
    unit = (data - lower) / span
    return (unit * 0.70 + 0.05) * 100


In [None]:
df['scaled.overlap'] = scale_data_5_75(df['nGenes'])

In [None]:
df["log_p"] = -np.log10(df["Enrichment FDR"])

In [None]:
ordered_df = df.sort_values(by="log_p", ascending=True)

In [None]:
# Dot plot of enriched pathways: x = pathway, y = -log10(enrichment FDR),
# colour = subgroup, marker size derived from the 'scaled.overlap' column.
plt.rcParams['figure.figsize']=(25,6)

D_id_color = {'bile_sterols': '#ff7f0e', 'carbs_phaseII': '#1f77b4',
              'lipids_phaseIII': '#17b2b1', 'losing_expression': '#9b1c03'}

# The marker sizes must come from the same (sorted) frame as the plotted rows.
# Taking them from the unsorted `df` pairs each point with another row's size.
ax = sns.scatterplot(data=ordered_df, y=-np.log10(ordered_df['Enrichment FDR']),
                     x='Pathway', hue='group', linewidth=.01, palette=D_id_color,
                     edgecolor="lightgrey", alpha=0.99,
                     s=[(i+10)**1.15 for i in ordered_df['scaled.overlap']])


plt.xticks(rotation=75, horizontalalignment='right', fontweight='light', fontsize='medium')
plt.yticks(rotation=0, fontweight="light", fontsize="medium")

# Four reference values spanning the 'scaled.overlap' range (rounded to
# multiples of 5) for the size legend.
min_olap = ordered_df['scaled.overlap'].min()
max_olap = ordered_df['scaled.overlap'].max()
olap_range = max_olap - min_olap

size_leg_vals = [np.round(i/5)*5 for i in [min_olap, min_olap+(20/70)*olap_range, min_olap+(45/70)*olap_range, max_olap]]
# NOTE(review): the legend values are taken from the already scaled column and
# scaled once more here, so the labels are not raw 'nGenes' counts — confirm
# this is intended for a legend titled 'Gene overlap'.
size_leg_scaled_vals = scale_data_5_75(size_leg_vals)


# Empty scatters that only serve as legend handles of the right size
l1 = ax.scatter([],[], s=(size_leg_scaled_vals[0]+10)**1.15, edgecolors='none', color='black')
l2 = ax.scatter([],[], s=(size_leg_scaled_vals[1]+10)**1.15, edgecolors='none', color='black')
l3 = ax.scatter([],[], s=(size_leg_scaled_vals[2]+10)**1.15, edgecolors='none', color='black')
l4 = ax.scatter([],[], s=(size_leg_scaled_vals[3]+10)**1.15, edgecolors='none', color='black')

labels = [str(int(i)) for i in size_leg_vals]

# This call replaces the automatic hue legend with the size legend
leg = plt.legend([l1, l2, l3, l4], labels, ncol=1, frameon=False, fontsize=12,
                 handlelength=1, loc = 'lower right', borderpad = 1, labelspacing = 1,
                 handletextpad=2, title='Gene overlap', scatterpoints = 1,  bbox_to_anchor=(1, 1),
                 facecolor='black')

plt.margins(x=0.02, y=0.02)

plt.savefig("drugCTD_terms_active_groups_20211130.pdf", bbox_inches="tight")
plt.show()

# 3) Comparing FFA vs DMSO

In [None]:
adata1 = adata[adata.obs["cocktail_annot"] == "no_Cocktail"]
adata1.obs["group"] = "pseudobulk"

In [None]:
adataL3 = adata1[adata1.obs["subgroups"] == "lipids_phaseIII"]
adataL3.obs["group"] = "lipids_phaseIII"
adataC2 = adata1[adata1.obs["subgroups"] == "carbs_phaseII_stress"]
adataC2.obs["group"] = "carbs_phaseII_stress"
adataB = adata1[adata1.obs["subgroups"] == "bile_sterols"]
adataB.obs["group"] = "bile_sterols"
adata0 = adata1[adata1.obs["subgroups"] == "losing_expression"]
adata0.obs["group"] = "losing_expression"

In [None]:
# Cell counts per subgroup (rows) and treatment (columns); margins=True appends an
# "All" row and an "All" column holding the totals.
df = pd.crosstab(adata1.obs["subgroups"], adata1.obs["Treatment"], margins = True)
#df['Vehicle'] = df['Vehicle'].div(9820)
#df["FFA_Vehicle"] = df["FFA_Vehicle"].div(11736)

In [None]:
df

In [None]:
#### Data
# df comes from a crosstab built with margins=True, so besides the subgroups it holds an
# "All" totals row; drop it so there is exactly one bar per subgroup name below.
counts = df.drop(index="All", errors="ignore")
r = np.arange(len(counts))
 
# From raw value to percentage
totals = [i+j for i,j in zip(counts['Vehicle'], counts['FFA_Vehicle'])]

DMSO = [i / j * 100 for i,j in zip(counts['Vehicle'], totals)]
# FFA_Vehicle share; the variable name is kept from the cocktail bar plot earlier in the notebook
Cocktail = [i / j * 100 for i,j in zip(counts['FFA_Vehicle'], totals)]

# plot
barWidth = 0.85
names = ('bile_sterols','carbohydrates_phaseII',"lipids_phaseIII","losing_exp")

# Create bars
plt.bar(r, DMSO, width=barWidth, label="DMSO", color="#888de4")
plt.bar(r, Cocktail, bottom=DMSO, width=barWidth, label="FFA", color="#deb82c")

    
# Custom x axis
plt.xticks(r, names)
#plt.xlabel("group")
plt.xticks(rotation=90)
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Show graphic
#plt.savefig("barplot_percentage_DMSO_and_FFA_per_subgroup_20211130.pdf", bbox_inches="tight")
plt.show()

In [None]:
df = pd.crosstab(adata1.obs["Treatment"], adata1.obs["subgroups"], margins = False)
df['bile_sterols'] = df['bile_sterols'].div(7177)
df["carbs_phaseII_stress"] = df["carbs_phaseII_stress"].div(8659)
df["lipids_phaseIII"] = df["lipids_phaseIII"].div(1055)
df["losing_expression"] = df["losing_expression"].div(4665)

In [None]:
adata.uns["Treatment_colors"]

In [None]:
#### Data
r = np.arange(2)
 
# From raw value to percentage
totals = [i+j+k+l for i,j,k,l in zip(df['bile_sterols'], df['carbs_phaseII_stress'],
                                    df['lipids_phaseIII'], df['losing_expression'])]

bile = [i / j * 100 for i,j in zip(df['bile_sterols'], totals)]
carbs = [i / j * 100 for i,j in zip(df['carbs_phaseII_stress'], totals)]
lipids = [i / j * 100 for i,j in zip(df['lipids_phaseIII'], totals)]
losing_exp = [i / j * 100 for i,j in zip(df['losing_expression'], totals)]

# plot
barWidth = 0.85
names = ('DMSO','FFA')

# Create bars
plt.bar(r, bile, width=barWidth, label="I", color="#ff7f0e")
plt.bar(r, carbs, bottom=bile, width=barWidth, label="II", color="#1f77b4")
plt.bar(r, lipids, bottom=[i+j for i,j in zip(bile, carbs)], width=barWidth, label="III",color='#17b2b1')
plt.bar(r, losing_exp, bottom=[i+j+k for i,j,k in zip(bile, carbs, lipids)], width=barWidth,
        label="IV", color="#9b1c03")

    
# Custom x axis
plt.xticks(r, names)
#plt.xlabel("group")
plt.xticks(rotation=90)
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Show graphic
plt.savefig("barplot_percentage_subgroup_per_DMSO_and_FFA_20220301.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Differential expression between treatments, run once per subgroup object and once on
# all cells ("pseudobulk"). Results are written to adata.var as <group>_<treatment>_* columns.
for i in [adataB,adataC2,adataL3,adata0, adata1]:
    sc.tl.rank_genes_groups(i, groupby="Treatment", 
                       n_genes=len(adata.var), use_raw=True)

    dict_genes = i.uns["rank_genes_groups"].copy()
    # all cells of `i` carry the same label; .iloc is an explicit positional lookup
    # (plain [0] on a label-indexed Series is deprecated in pandas)
    group = str(i.obs["group"].iloc[0])

    for idx, elem in enumerate(list(dict_genes["names"].dtype.names)):
        print(idx, elem)
        expr = i[i.obs["Treatment"] == elem,:].X
        # np.asarray(...).ravel() yields one value per gene for dense X and also flattens
        # the 1 x n_genes matrix that a sparse X would return
        adata.var[group+"_"+elem+"_mean"] = np.asarray(np.mean(expr, axis=0)).ravel()
        adata.var[group+"_"+elem+"_n_cells"] = np.asarray(np.sum(expr > 0, axis=0)).ravel()
        # results come back in ranked order per treatment; indexing them by gene name lets
        # pandas align each value to the matching row of adata.var
        ranked_genes = dict_genes["names"][elem]
        adata.var[group+"_"+elem+"_log2FC"] = pd.Series(dict_genes["logfoldchanges"][elem], index=ranked_genes)
        adata.var[group+"_"+elem+"_pvals_adj"] = pd.Series(dict_genes["pvals_adj"][elem], index=ranked_genes)

In [None]:
# Call each gene up_FFA / up_DMSO / none per subgroup: |log2FC| > 0.75 and adjusted p < 0.05.
# Vectorised over genes; missing values fail both comparisons and therefore end up as "none".
for elem in ["bile_sterols","carbs_phaseII_stress","lipids_phaseIII",
             "losing_expression","pseudobulk"]:
    lfc = adata.var[elem+"_FFA_Vehicle_log2FC"].to_numpy()
    significant = adata.var[elem+"_FFA_Vehicle_pvals_adj"].to_numpy() < 0.05
    changes = np.select([(lfc > 0.75) & significant, (lfc < -0.75) & significant],
                        ["up_FFA", "up_DMSO"], default="none")

    adata.var["changes_"+elem] = changes  

In [None]:
adata.var.to_csv("differential_expression_FFA_vs_DMSO_075_nov21.csv")

In [None]:
genes = []
for idx, elem in enumerate(adata.var["changes_bile_sterols"].tolist()):
    if elem == "up_FFA":
        genes.append(adata.var["gene_name"][idx])

In [None]:
from gprofiler import gprofiler
import custom_functions as cf

In [None]:
plt.rcParams['figure.figsize']=(10,8) #rescale figures

# g:Profiler enrichment of the current `genes` list, restricted to the "BP" domain
gp = gprofiler(genes, organism='hsapiens')
gp = gp.loc[gp["domain"] == "BP"]

# keep five columns by position, then expose three of them under the names used for plotting
selected = gp.sort_values('p.value').iloc[:,[2,3,5,6,11]]
gp_enrichment = (
    selected
    .assign(**{'name': selected['term.name'],
               'p_value': selected['p.value'],
               'intersection_size': selected['overlap.size']})
    .drop(columns=['term.name', 'overlap.size', 'p.value'])
)

cf.plot_enrich(gp_enrichment, save='GO_bile_sterols_FFA_vs_DMSO_075_20211130.pdf')

In [None]:
df = adata.var[adata.var["pseudobulk_FFA_Vehicle_n_cells"] > 0]
df = df[df["pseudobulk_Vehicle_n_cells"] > 0]

In [None]:
cols = []
for elem in df["changes_pseudobulk"]:
    if elem == "up_FFA":
        cols.append("#deb82c")
    elif elem == "up_DMSO":
        cols.append("#888de4")
    else:
        cols.append("0.75")

In [None]:
df["pval_r"] = -np.log10(df["pseudobulk_FFA_Vehicle_pvals_adj"])

In [None]:
m = df.loc[df['pval_r'] != np.inf, 'pval_r'].max()
df['pval_r'].replace(np.inf,m,inplace=True)

In [None]:
plt.rcParams['figure.figsize']=(7,7) #rescale figures
sns.set_theme()
# Volcano plot. y is read from pval_r, where infinite -log10(p) (p == 0) is capped at the
# largest finite value, so those genes are drawn and sit next to their text labels.
plt.scatter(y=df["pval_r"], 
            x=df["pseudobulk_FFA_Vehicle_log2FC"],
            c=cols,alpha=0.7, marker="x")

# alternative label sets for the per-subgroup versions of this plot:
#0 for i in ["RPS14","RPS18","RPLP1","RPL13","RPL28","SPTBN1","ACSL1","METTL7A","ADH1B","SPINK1"]:
#l3 for i in ["SERPINA3","CFB","RPL17","ONECUT1","DDT","PLIN2","ACADVL","AC124319.1","VNN1","TAF1D"]:
#c2 for i in ["ANGPTL3","FGG","SPINK1","GSTA2","FGL1","PLIN2","FABP1","APOC3","TPM1","LGALS1"]:
#b for i in ["AZGP1","AMBP","SPINK1","GSTA2","ORM2","PLIN2","VNN1","TNFAIP3","TPM1","CYP2A7"]:
for i in ["FGA","FGG","SPINK1","SERPING1","FGL1","PLIN2","HSPB1","AKR1C1","TPM1","ANKRD1"]:
    # label sits slightly up and to the right of the gene's point
    plt.text(df.loc[df["gene_name"] == i, "pseudobulk_FFA_Vehicle_log2FC"].iloc[0]+0.1, 
             df.loc[df["gene_name"] == i, 'pval_r'].iloc[0]+0.1, i,
             fontdict=dict(color='black', alpha=0.7, size=10, weight="normal"))

plt.savefig("volcano_FFA_vs_DMSO_pseudobulk.pdf",bbox_inches="tight")
plt.show()

In [None]:
# up_FFA genes with -log10(adjusted p) > 150 and log2FC > 1. .copy() so that pval_r is
# added to an independent frame rather than to a slice of df; pval_r is recomputed
# uncapped here, so p == 0 (inf) passes the threshold.
df1 = df[df["changes_pseudobulk"] == "up_FFA"].copy()
df1["pval_r"] = -np.log10(df1["pseudobulk_FFA_Vehicle_pvals_adj"])
df2 = df1[df1["pval_r"] > 150]
df2 = df2[df2["pseudobulk_FFA_Vehicle_log2FC"] > 1]
df2

In [None]:
sc.pl.stacked_violin(adataB, groupby="subgroup_treatment", 
                     var_names=["CYP8B1","HSD17B4","HSD3B7","HMGCS2","CYP27A1", 
                                "PLIN2","VNN1","TNFAIP3","TPM1","CYP2A7"],
                     swap_axes=True,
                     figsize=(3.5,5),
                     cmap="YlGnBu",
                     standard_scale="var", save="bile_sterols_FFA_vs_DMSO_markers_and_DEGs_20211130.pdf")

In [None]:
sc.pl.stacked_violin(adataC2, groupby="subgroup_treatment", 
                     var_names=["LDHA","GSTO1","SULT2A1","GSTZ1","GAMT",
                               "PLIN2","FABP1","APOC3","TPM1","LGALS1"],
                     swap_axes=True,
                     figsize=(3.5,5),
                     cmap="YlGnBu",
                     standard_scale="var", save="carbs_phaseII_stress_FFA_vs_DMSO_markers_and_DEGs_20211213.pdf")

In [None]:
sc.pl.stacked_violin(adataL3, groupby="subgroup_treatment", 
                     var_names=["ABCC3","ABCC2","PLIN5","MLXIPL","LDLR", #APOB
                               "PLIN2","ACADVL","AC124319.1","VNN1","TAF1D"],
                     swap_axes=True,
                     figsize=(3.5,5),
                     cmap="YlGnBu",
                     standard_scale="var", save="lipids_phaseIII_FFA_vs_DMSO_markers_and_DEGs_20211213.pdf")

In [None]:
genes = ["APOB","APOE","APOA1","APOH","APOA2",
         "PLIN2","PLIN5","ELOVL6","PLIN4",
         "CYP2A6","CYP2J2","CYP7A1","CYP4A11",
         "CIDEC","CPEB4","GNAI3","SCD",
         "DUSP1","NFE2L2","SERPINE1","TNFAIP3",
         "ATF4","DDIT3","LGALS1","MT1H"]

In [None]:
sc.pl.matrixplot(adata1, groupby="subgroup_treatment", var_names=genes, swap_axes=True,
                 #standard_scale="var",
                 log=True,
                 cmap="YlGnBu",
                 save="_individual_genes_unscaled_log_fig3_20211213.pdf")

In [None]:
# independent copy of adata1, so the gene filtering below leaves adata1 untouched
adata2 = adata1.copy()

In [None]:
# per-gene mean of X over all cells
# assumes X is a dense array here (a sparse X would return a 1 x n_genes matrix) — TODO confirm
adata2.var["mean_norm_counts"] = np.mean(adata2.X, axis=0)

In [None]:
# keep only genes whose mean exceeds 0.25
adata2 = adata2[:,adata2.var["mean_norm_counts"] > 0.25].copy()

In [None]:
# Per subgroup_treatment label: gene-wise mean and coefficient of variation,
# with CV = sqrt(exp(sd(X)^2) - 1) computed from X.
df = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
for elem in list(set(adata2.obs["subgroup_treatment"])):
    print(elem)
    cells = adata2[adata2.obs["subgroup_treatment"] == elem,:]
    # compute everything from the subset first, then store the results
    mean_norm = np.mean(cells.layers["norm_counts"], axis=0)
    cv = np.sqrt(np.exp(np.std(cells.X, axis=0)**2)-1)
    mean_x = np.mean(cells.X, axis=0)

    adata2.var["mean_"+elem] = mean_norm
    adata2.var["CV_"+elem] = cv
    df["CV_"+elem] = cv
    print(np.median(df["CV_"+elem]))
    df2["mean_"+elem] = mean_x

In [None]:
# long format: one row per (CV column name, row position); the CV value ends up in column 0
df1 = df.unstack().reset_index() 

In [None]:
# order rows by the CV column name (level_0) so the boxes appear in a fixed order
ordered_df = df1.sort_values(by='level_0')


In [None]:
# use same colors as in the tSNE
# one palette entry per CV column: colour depends only on the treatment suffix
treatment_colors = {"1DMSO": '#888de4', "2FFA": '#deb82c'}
cv_subgroups = ["carbs_phaseII_stress", "losing_expression", "lipids_phaseIII", "bile_sterols"]
cols = {"CV_"+subgroup+"_"+treatment: color
        for subgroup in cv_subgroups
        for treatment, color in treatment_colors.items()}

p = sns.boxplot(data=ordered_df, x="level_0", y=0, palette=cols)
p.set_xticklabels(p.get_xticklabels(), rotation=45, horizontalalignment='right')

# halve the axes height, keep position and width
box = p.get_position()
p.set_position([box.x0, box.y0, box.width * 1, box.height * 0.5])
plt.yscale("log")
plt.savefig("coefficient_of_variation_per_group_and_treatment_FFA_vs_DMSO_colored_by_treatment_20211130.pdf", bbox_inches="tight")

In [None]:
from scipy import stats

In [None]:
# Mann-Whitney U p-value comparing the DMSO and FFA CV distributions of bile_sterols,
# multiplied by a fixed factor.
# NOTE(review): 3130 looks like a Bonferroni-style multiplier (number of tests?) — confirm
# its origin; the product is not capped at 1.
stats.mannwhitneyu(df["CV_bile_sterols_1DMSO"].dropna(), df["CV_bile_sterols_2FFA"].dropna())[1]*3130

# 4) Comparison between FFA_Cocktail, Cocktail and DMSO

In [None]:
adata1 = adata[adata.obs["Treatment"] != "FFA_Vehicle"]
adata2 = adata1[adata1.obs["Treatment"] != "Cocktail"]
adata2.obs["group"] = "pseudobulk"

In [None]:
adataL3 = adata2[adata2.obs["subgroups"] == "lipids_phaseIII"]
adataL3.obs["group"] = "lipids_phaseIII"
adataC2 = adata2[adata2.obs["subgroups"] == "carbs_phaseII_stress"]
adataC2.obs["group"] = "carbs_phaseII_stress"
adataB = adata2[adata2.obs["subgroups"] == "bile_sterols"]
adataB.obs["group"] = "bile_sterols"
adata0 = adata2[adata2.obs["subgroups"] == "losing_expression"]
adata0.obs["group"] = "losing_expression"

In [None]:
# Differential expression between treatments, run once per subgroup object and once on
# all cells ("pseudobulk"). Results are written to adata.var as <group>_<treatment>_* columns.
for i in [adataB,adataC2,adataL3,adata0, adata2]:
    sc.tl.rank_genes_groups(i, groupby="Treatment", #groups=["Cocktail"], reference="Vehicle",
                       n_genes=len(adata.var), use_raw=True)

    dict_genes = i.uns["rank_genes_groups"].copy()
    # all cells of `i` carry the same label; .iloc is an explicit positional lookup
    # (plain [0] on a label-indexed Series is deprecated in pandas)
    group = str(i.obs["group"].iloc[0])

    for idx, elem in enumerate(list(dict_genes["names"].dtype.names)):
        print(idx, elem)
        expr = i[i.obs["Treatment"] == elem,:].X
        # np.asarray(...).ravel() yields one value per gene for dense X and also flattens
        # the 1 x n_genes matrix that a sparse X would return
        adata.var[group+"_"+elem+"_mean"] = np.asarray(np.mean(expr, axis=0)).ravel()
        adata.var[group+"_"+elem+"_n_cells"] = np.asarray(np.sum(expr > 0, axis=0)).ravel()
        # results come back in ranked order per treatment; indexing them by gene name lets
        # pandas align each value to the matching row of adata.var
        ranked_genes = dict_genes["names"][elem]
        adata.var[group+"_"+elem+"_log2FC"] = pd.Series(dict_genes["logfoldchanges"][elem], index=ranked_genes)
        adata.var[group+"_"+elem+"_pvals_adj"] = pd.Series(dict_genes["pvals_adj"][elem], index=ranked_genes)

In [None]:
# Call each gene up_FFA_Cocktail / up_DMSO / none per subgroup: |log2FC| > 1 and adjusted
# p < 0.05. Vectorised over genes; missing values fail both comparisons and end up as "none".
# The changes_<subgroup> columns written here replace any existing columns of the same name.
for elem in ["bile_sterols","carbs_phaseII_stress","lipids_phaseIII",
             "losing_expression","pseudobulk"]:
    lfc = adata.var[elem+"_FFA_Cocktail_log2FC"].to_numpy()
    significant = adata.var[elem+"_FFA_Cocktail_pvals_adj"].to_numpy() < 0.05
    changes = np.select([(lfc > 1) & significant, (lfc < -1) & significant],
                        ["up_FFA_Cocktail", "up_DMSO"], default="none")

    adata.var["changes_"+elem] = changes 

In [None]:
adata.var.to_csv("differential_expression_FFA_Cocktail_vs_DMSO_nov21.csv")

In [None]:
set(adata.obs["subgroups"])

In [None]:
df = adata.var[adata.var["pseudobulk_FFA_Cocktail_n_cells"] > 0]
df = df[df["pseudobulk_Vehicle_n_cells"] > 0]

In [None]:
cols = []
for elem in df["changes_pseudobulk"]:
    if elem == "up_FFA_Cocktail":
        cols.append("#d62728")
    elif elem == "up_DMSO":
        cols.append("#888de4")
    else:
        cols.append("0.75")

In [None]:
df["pval_r"] = -np.log10(df["pseudobulk_FFA_Cocktail_pvals_adj"])

In [None]:
m = df.loc[df['pval_r'] != np.inf, 'pval_r'].max()
df['pval_r'].replace(np.inf,m,inplace=True)

In [None]:
plt.rcParams['figure.figsize']=(7,7) #rescale figures
sns.set_theme()
# Volcano plot. y is read from pval_r, where infinite -log10(p) (p == 0) is capped at the
# largest finite value, so those genes are drawn and sit next to their text labels.
plt.scatter(y=df["pval_r"], 
            x=df["pseudobulk_FFA_Cocktail_log2FC"],
            c=cols,alpha=0.7, marker="x")

for i in ["ANGPTL3","ADH1B","TDO2","BHMT","TAT","CCL20","TNFAIP3","CYP1A1","DUSP5","GDF15"]:
# alternative label sets for the per-subgroup versions of this plot:
#b for i in ["ANGPTL3","TDO2","ACSM2A","ACSM2B","TAT","CCL20","TNFAIP3","CYP1A1","RGS9","GDF15"]:
#c2 for i in ["TDO2","GSTA1","ALDH1A1","SCP2","TAT","CCL20","MT2A","POR","PLIN2","GDF15"]:
#l3 for i in ["ACSM2B","APOB","FN1","ALB","ACSM2A","CCL20","PLIN2","ATF3","GNA13","GDF15"]:
#0 for i in ["BHMT","TDO2","ADH1B","FGB","TAT","GDF15","POR","CYP3A5","PLIN2","CYP1A2"]:
    # label sits slightly up and to the right of the gene's point
    plt.text(df.loc[df["gene_name"] == i, "pseudobulk_FFA_Cocktail_log2FC"].iloc[0]+0.1, 
             df.loc[df["gene_name"] == i, 'pval_r'].iloc[0]+0.1, i,
             fontdict=dict(color='black', alpha=0.7, size=10, weight="normal"))

plt.savefig("volcano_FFACocktail_vs_Vehicle_pseudobulk.pdf",bbox_inches="tight")
plt.show()

In [None]:
# up_FFA_Cocktail genes with -log10(adjusted p) > 275 and log2FC > 1.85. .copy() so that
# pval_r is added to an independent frame rather than to a slice of df; pval_r is
# recomputed uncapped here, so p == 0 (inf) passes the threshold.
df1 = df[df["changes_pseudobulk"] == "up_FFA_Cocktail"].copy()
df1["pval_r"] = -np.log10(df1["pseudobulk_FFA_Cocktail_pvals_adj"])
df2 = df1[df1["pval_r"] > 275]
df2 = df2[df2["pseudobulk_FFA_Cocktail_log2FC"] > 1.85]
df2

In [None]:
cd = pd.read_csv("differential_expression_Cocktail_vs_DMSO_nov21.csv", index_col=0)
fcd = pd.read_csv("differential_expression_FFA_Cocktail_vs_DMSO_nov21.csv", index_col=0)

In [None]:
adata1 = adata[adata.obs["cocktail_annot"] == "Cocktail"]

In [None]:
annot = []
for elem in adata1.obs["subgroups"]:
    if elem == "bile_sterols":
        annot.append("b")
    elif elem == "carbs_phaseII_stress":
        annot.append("c2")
    elif elem == "lipids_phaseIII":
        annot.append("l3")
    else:
        annot.append("0")
adata1.obs["abbreviations"] = annot

annot = []
for idx, elem in enumerate(adata1.obs["Treatment"]):
    annot.append(elem+"_"+adata1.obs["abbreviations"][idx])
adata1.obs["abbr_treatment"] = annot

In [None]:
# Per subgroup and CYP gene: log2FC from the two DE tables (cd, fcd) and the fraction of
# cells with norm_counts > 0 in the Cocktail and FFA_Cocktail groups.
df = pd.DataFrame()
lfc_c = []
lfc_fc = []
groups = []
genes = []
pc = []
pfc = []

order = []

cyp_genes = ["CYP2D6","CYP2C19","CYP2C9","CYP3A4","CYP1A2"]

for group in ["bile_sterols","carbs_phaseII_stress","lipids_phaseIII","losing_expression"]:
    for elem in list(set(adata1.obs["Treatment"])):
        if elem == "Cocktail":
            cells = adata1[adata1.obs["subgroup_treatment"] == group+"_3"+elem]
            for idx, gene in enumerate(cyp_genes):
                # np.float was removed in NumPy 1.24; .item() still requires exactly one matching row
                lfc_c.append(float(cd.loc[cd["gene_name"] == gene, group+"_Cocktail_log2FC"].item()))
                pc.append(float(np.sum(cells[:, gene].layers["norm_counts"] > 0)/cells.n_obs))
        elif elem == "FFA_Cocktail":
            cells = adata1[adata1.obs["subgroup_treatment"] == group+"_4"+elem]
            for idx, gene in enumerate(cyp_genes):
                lfc_fc.append(float(fcd.loc[fcd["gene_name"] == gene, group+"_FFA_Cocktail_log2FC"].item()))
                pfc.append(float(np.sum(cells[:, gene].layers["norm_counts"] > 0)/cells.n_obs))
                # gene / group / order are recorded once per (group, gene), in this branch only
                genes.append(gene)
                groups.append(group)
                order.append(idx)
                

df["lfc_Cocktail"] = lfc_c
df["lfc_FFA_Cocktail"] = lfc_fc
df["gene"] = genes
df["group"] = groups

df["percentage_Cocktail"] = pc
df["percentage_FFA_Cocktail"] = pfc

df["order"] = order

In [None]:
np.float(cd[cd["gene_name"] == "CYP1A2"]["lipids_phaseIII_Cocktail_log2FC"])

In [None]:
df

In [None]:
# One figure per subgroup: for each gene a vertical line joins its Cocktail and its
# FFA_Cocktail log2FC; marker area is the fraction of expressing cells times 200.
for elem in list(set(df["group"])):
    # axes rectangle in figure coordinates
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    spacing = 0.02

    rect_scatter = [left, bottom, width, height]
    
    plt.figure(figsize=(2.5, 5))
    ax_scatter = plt.axes(rect_scatter)
    
    
    df1 = df[df["group"] == elem]
    # Reorder it following the values of the first value:
    ordered_df = df1.sort_values(by='order')
    # x positions 1..n, one per gene
    my_range=range(1,len(df1.index)+1)
 
    # The horizontal plot is made using the hline function
    ax_scatter.vlines(x=my_range, ymin=ordered_df['lfc_Cocktail'], ymax=ordered_df['lfc_FFA_Cocktail'], color='grey', alpha=0.4)
    #ax_scatter.scatter(my_range, ordered_df['value_DMSO'], color='#888de4', s=ordered_df["percentage_DMSO"]*200, alpha=0.8, label='DMSO')
    p = ax_scatter.scatter(my_range, ordered_df['lfc_Cocktail'], color='#6fb48e', 
                       s=ordered_df["percentage_Cocktail"]*200,
                       alpha=0.8 , label='Cocktail')
    ax_scatter.scatter(my_range, ordered_df["lfc_FFA_Cocktail"], color="#d62728",
                       s=ordered_df["percentage_FFA_Cocktail"]*200, 
                       alpha=0.8, label='FFA_Cocktail')
    #ax_scatter.legend()
     
    # NOTE(review): this runs before plt.xticks below sets the gene names, so it acts on
    # the automatic tick labels — confirm the rotation is still applied in the saved figure
    ax_scatter.set_xticklabels(ax_scatter.get_xticklabels(), rotation=45, horizontalalignment='right')
    ax_scatter.set_ylim(-2.6,2.6)
    

    # size legend built from the Cocktail scatter only; the entries are in marker-size
    # units (fraction * 200), not fractions
    plt.legend(*p.legend_elements("sizes", num=4))


    # Add title and axis names
    plt.xticks(my_range, ordered_df['gene'])
    #plt.title("Expression in Cocktail and DMSO", loc='left')
    plt.ylabel(elem+'_log2foldchange')
    #plt.ylabel(elem)
    plt.margins(x=0.1, y=0.1)

    # Show the graph
    plt.savefig("lfc_cocktail_ffa_"+elem+"_20211201.pdf", bbox_inches="tight")
    plt.show()

In [None]:
adata1 = adata[adata.obs["Treatment"] != "FFA_Vehicle"]

In [None]:
df = pd.DataFrame()
values1 = []
values2 = []
values3 = []
values4 = []
groups = []
genes = []
p1 = []
p2 = []
p3 = []
p4 = []
for group in list(set(adata.obs["subgroups"])):
    for elem in list(set(adata.obs["Treatment"])):
        if elem == "Cocktail":
            for gene in ["CYP3A4","CYP1A2","CYP2C9","CYP2C19","CYP2D6"]:
                values1.append(np.mean(adata[adata.obs["subgroup_treatment"] == group+"_3"+elem, gene].X))
                p1.append(np.float(np.sum(adata[adata.obs["subgroup_treatment"] == group+"_3"+elem,gene].layers["norm_counts"] > 0)/len(adata[adata.obs["subgroup_treatment"] == group+"_3"+elem])))
        elif elem == "FFA_Cocktail":
            for gene in ["CYP3A4","CYP1A2","CYP2C9","CYP2C19","CYP2D6"]:
                values3.append(np.mean(adata[adata.obs["subgroup_treatment"] == group+"_4"+elem, gene].X))
                p3.append(np.float(np.sum(adata[adata.obs["subgroup_treatment"] == group+"_4"+elem,gene].layers["norm_counts"] > 0)/len(adata[adata.obs["subgroup_treatment"] == group+"_4"+elem])))
        elif elem == "FFA_Vehicle":
            for gene in ["CYP3A4","CYP1A2","CYP2C9","CYP2C19","CYP2D6"]:
                values4.append(np.mean(adata[adata.obs["subgroup_treatment"] == group+"_2FFA", gene].X))
                p4.append(np.float(np.sum(adata[adata.obs["subgroup_treatment"] == group+"_2FFA",gene].layers["norm_counts"] > 0)/len(adata[adata.obs["subgroup_treatment"] == group+"_2FFA"])))
        elif elem == "Vehicle":
            for gene in ["CYP3A4","CYP1A2","CYP2C9","CYP2C19","CYP2D6"]:
                values2.append(np.mean(adata[adata.obs["subgroup_treatment"] == group+"_1DMSO", gene].X))
                p2.append(np.float(np.sum(adata[adata.obs["subgroup_treatment"] == group+"_1DMSO",gene].layers["norm_counts"] > 0)/len(adata[adata.obs["subgroup_treatment"] == group+"_1DMSO"])))
                genes.append(gene)
                groups.append(group)
                
df["value_DMSO"] = values2
df["value_Cocktail"] = values1
df["value_FFA_Cocktail"] = values3
df["value_FFA"] = values4
df["gene"] = genes
df["group"] = groups
df["percentage_DMSO"] = p2
df["percentage_Cocktail"] = p1
df["percentage_FFA_Cocktail"] = p3
df["percentage_FFA"] = p4

In [None]:
df

In [None]:
# Differential-expression tables: df1 Cocktail vs DMSO, df2 FFA_Cocktail vs DMSO, df3 FFA vs DMSO.
# NOTE(review): the FFA table written earlier in this notebook is named "..._075_nov21.csv";
# the "..._1_nov21.csv" file read here presumably comes from a run with a threshold of 1 — confirm.
df1 = pd.read_csv("differential_expression_Cocktail_vs_DMSO_nov21.csv", index_col=0)
df2 = pd.read_csv("differential_expression_FFA_Cocktail_vs_DMSO_nov21.csv", index_col=0)
df3 = pd.read_csv("differential_expression_FFA_vs_DMSO_1_nov21.csv", index_col=0)

In [None]:
# Per subgroup, split genes into three lists:
#   *_unaffected: up_Cocktail in df1 and up_FFA_Cocktail in df2
#   *_cocktail:   up_Cocktail in df1 only
#   *_ffa:        up_FFA_Cocktail in df2 only
# Rows of df1 and df2 are matched by position (both tables are expected to list the genes
# in the same order); boolean masks replace the positional lookups into label-indexed Series.
gene_names = df1["gene_name"].to_numpy()
response_split = {}
for subgroup in ["bile_sterols", "carbs_phaseII_stress", "lipids_phaseIII", "losing_expression"]:
    up_cocktail = (df1["changes_"+subgroup] == "up_Cocktail").to_numpy()
    up_ffa_cocktail = (df2["changes_"+subgroup] == "up_FFA_Cocktail").to_numpy()
    response_split[subgroup] = (
        gene_names[up_cocktail & up_ffa_cocktail].tolist(),
        gene_names[up_cocktail & ~up_ffa_cocktail].tolist(),
        gene_names[~up_cocktail & up_ffa_cocktail].tolist(),
    )

bile_unaffected, bile_cocktail, bile_ffa = response_split["bile_sterols"]
carbs_unaffected, carbs_cocktail, carbs_ffa = response_split["carbs_phaseII_stress"]
lipids_unaffected, lipids_cocktail, lipids_ffa = response_split["lipids_phaseIII"]
l0_unaffected, l0_cocktail, l0_ffa = response_split["losing_expression"]

In [None]:
def _genes_with_label(table, column, label):
    """Return the gene_name entries of ``table`` whose ``column`` equals ``label``."""
    return table.loc[table[column] == label, "gene_name"].tolist()


# Per subgroup: all genes called up by the cocktail (df1), by FFA + cocktail (df2) and
# by FFA alone (df3).
# NOTE(review): this cell rebinds *_cocktail and *_ffa, which the cell splitting genes into
# unaffected / cocktail / ffa also assigns with a different meaning; later cells see
# whichever of the two ran last.
bile_cocktail = _genes_with_label(df1, "changes_bile_sterols", "up_Cocktail")
bile_ffac = _genes_with_label(df2, "changes_bile_sterols", "up_FFA_Cocktail")
bile_ffa = _genes_with_label(df3, "changes_bile_sterols", "up_FFA")

carbs_cocktail = _genes_with_label(df1, "changes_carbs_phaseII_stress", "up_Cocktail")
carbs_ffac = _genes_with_label(df2, "changes_carbs_phaseII_stress", "up_FFA_Cocktail")
carbs_ffa = _genes_with_label(df3, "changes_carbs_phaseII_stress", "up_FFA")

lipids_cocktail = _genes_with_label(df1, "changes_lipids_phaseIII", "up_Cocktail")
lipids_ffac = _genes_with_label(df2, "changes_lipids_phaseIII", "up_FFA_Cocktail")
lipids_ffa = _genes_with_label(df3, "changes_lipids_phaseIII", "up_FFA")

l0_cocktail = _genes_with_label(df1, "changes_losing_expression", "up_Cocktail")
# the FFA_Cocktail genes belong in l0_ffac (they used to be appended to l0_ffa, which
# left l0_ffac empty and mixed two gene sets in l0_ffa)
l0_ffac = _genes_with_label(df2, "changes_losing_expression", "up_FFA_Cocktail")
l0_ffa = _genes_with_label(df3, "changes_losing_expression", "up_FFA")

In [None]:
cocktail = []
for elem in bile_cocktail:
    cocktail.append(elem)
for elem in carbs_cocktail:
    cocktail.append(elem)
for elem in lipids_cocktail:
    cocktail.append(elem)
for elem in l0_cocktail:
    cocktail.append(elem)
    
all_cocktail = list(set(cocktail))

In [None]:
ffac = []
for elem in bile_ffac:
    ffac.append(elem)
for elem in carbs_ffac:
    ffac.append(elem)
for elem in lipids_ffac:
    ffac.append(elem)
for elem in l0_ffac:
    ffac.append(elem)
    
all_ffac = list(set(ffac))

In [None]:
ffa = []
for elem in bile_ffa:
    ffa.append(elem)
for elem in carbs_ffa:
    ffa.append(elem)
for elem in lipids_ffa:
    ffa.append(elem)
for elem in l0_ffa:
    ffa.append(elem)
    
all_ffa = list(set(ffa))

In [None]:
# two-set Venn diagram; each region gets its own fill colour and transparency, no outline
c = venn2([set(cocktail), set(ffac)]) #["#5ca490","#ed1d64"]
region_styles = [('10', '#5ca490', 0.7),
                 ('01', '#ed1d64', 0.7),
                 ('11', '#dbc8bb', 0.4)]
for region_id, fill, opacity in region_styles:
    region = c.get_patch_by_id(region_id)
    region.set_color(fill)
    region.set_edgecolor('none')
    region.set_alpha(opacity)
plt.savefig("venn_overlap_genes_up_in_Cocktail_and_FFA_Cocktail_proportional_20211201.pdf")

In [None]:
cocktail_only = []
for elem in bile_cocktail:
    cocktail_only.append(elem)
for elem in carbs_cocktail:
    cocktail_only.append(elem)
for elem in lipids_cocktail:
    cocktail_only.append(elem)
for elem in l0_cocktail:
    cocktail_only.append(elem)
    
all_cocktail_only = list(set(cocktail_only))

In [None]:
unaffected = []
for elem in bile_unaffected:
    unaffected.append(elem)
for elem in carbs_unaffected:
    unaffected.append(elem)
for elem in lipids_unaffected:
    unaffected.append(elem)
for elem in l0_unaffected:
    unaffected.append(elem)
    
all_unaffected = list(set(unaffected))

In [None]:
ffac_only = []
for elem in bile_ffa:
    ffac_only.append(elem)
for elem in carbs_ffa:
    ffac_only.append(elem)
for elem in lipids_ffa:
    ffac_only.append(elem)
for elem in l0_ffa:
    ffac_only.append(elem)
    
all_ffac_only = list(set(ffac_only))

In [None]:
genes_dict = {}
genes_dict["cocktail_spec_b"] = bile_cocktail
genes_dict["unaffected_b"] = bile_unaffected
genes_dict["ffa_spec_b"] = bile_ffa
genes_dict["cocktail_spec_c2"] = carbs_cocktail
genes_dict["unaffected_c2"] = carbs_unaffected
genes_dict["ffa_spec_c2"] = carbs_ffa
genes_dict["cocktail_spec_l3"] = lipids_cocktail
genes_dict["unaffected_l3"] = lipids_unaffected
genes_dict["ffa_spec_l3"] = lipids_ffa
genes_dict["cocktail_spec_0"] = l0_cocktail
genes_dict["unaffected_0"] = l0_unaffected
genes_dict["ffa_spec_0"] = l0_ffa
genes_dict["cocktail_spec_all"] = all_cocktail_only
genes_dict["unaffected_all"] = all_unaffected
genes_dict["ffa_spec_all"] = all_ffac_only

In [None]:
df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in genes_dict.items() ]))

In [None]:
df.to_csv("Suppl_Tab7_unaffected_and_affected_genes_20211201.csv")

In [None]:
dfx = pd.DataFrame()
dfx["group"] = ["bile","carbs","lipids","losing_exp"]
dfx["counts_coktail"] = [len(bile_cocktail), len(carbs_cocktail), len(lipids_cocktail), len(l0_cocktail)]
dfx["counts_unaffected"] = [len(bile_unaffected), len(carbs_unaffected), len(lipids_unaffected), len(l0_unaffected)]
dfx["counts_ffa"] = [len(bile_ffa), len(carbs_ffa), len(lipids_ffa), len(l0_ffa)]

In [None]:
dfx.to_csv("counts_cocktail_unaffected_ffa_20211201.csv")

In [None]:
df = pd.crosstab(adata.obs["subgroups"], adata.obs["Treatment"], margins = False)
r = np.arange(4)
 
# From raw value to percentage: share of each response class within a subgroup
count_columns = ['counts_coktail', 'counts_unaffected', 'counts_ffa']
totals = [i+j+k for i,j,k in zip(*(dfx[column] for column in count_columns))]

cocktail, unaff, ffa = (
    [count / total * 100 for count, total in zip(dfx[column], totals)]
    for column in count_columns
)

# plot
barWidth = 0.85
names = ('bile_sterols', 'carbs_phaseII', 'lipids_phaseIII', 'losing_expression')

# Create bars: each layer is stacked on the running sum of the layers below it
layers = [(cocktail, "cocktail_specific", '#5ca490'),
          (unaff, "unaffected", '#dbc8bb'),
          (ffa, "ffa_specific", '#ed1d64')]
stack_top = None
for heights, layer_label, layer_color in layers:
    plt.bar(r, heights, bottom=stack_top, width=barWidth, label=layer_label, color=layer_color)
    if stack_top is None:
        stack_top = heights
    else:
        stack_top = [below + current for below, current in zip(stack_top, heights)]

    
# Custom x axis
plt.xticks(r, names)
#plt.xlabel("group")
plt.xticks(rotation=90)
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Show graphic
plt.savefig("barplot_genes_ffa_cocktail_both_20211201.pdf")
plt.show()

In [None]:
# Gene sets pooled over all subgroups, keyed by response category.
genes_dict = {
    "cocktail_specific": all_cocktail_only,
    "unaffected": all_unaffected,
    "ffa_specific": all_ffac_only,
}

In [None]:
from gprofiler import gprofiler
import custom_functions as cf

In [None]:
genes = all_cocktail_only
    
# g:Profiler enrichment for the cocktail-specific gene set (organism: human);
# only the rows whose "domain" is "BP" are kept.
gp = gprofiler(genes, organism='hsapiens')
gp = gp[gp["domain"] == "BP"]

In [None]:
# Save the enrichment table of the cocktail-specific genes.
# NOTE(review): a later cell runs `import gseapy as gp`, which rebinds `gp`;
# re-running this cell after that import fails because `gp` is then a module.
gp.to_csv("GO_terms_cocktail_specific_20211214.csv")

In [None]:
plt.rcParams['figure.figsize']=(6,8) #rescale figures
genes = all_ffac_only

# g:Profiler enrichment for the FFA-specific gene set, restricted to the "BP" domain
gp = gprofiler(genes, organism='hsapiens')
gp = gp[gp["domain"] == "BP"]

# Keep five columns by position, most significant terms first, and expose three
# of them under the names name / p_value / intersection_size.
gp_enrichment = (
    gp.sort_values('p.value')
      .iloc[:, [2, 3, 5, 6, 11]]
      .assign(
          name=lambda table: table['term.name'],
          p_value=lambda table: table['p.value'],
          intersection_size=lambda table: table['overlap.size'],
      )
      .drop(columns=['term.name', 'overlap.size', 'p.value'])
)

cf.plot_enrich(gp_enrichment, save='GO_shared_FFAcocktail_specific_20211214.pdf')

In [None]:
import gseapy as gp

In [None]:
# Expression values (cells x genes) of the cells whose cocktail_annot is
# "Cocktail", restricted to the all_unaffected genes.
# NOTE(review): assumes adata.X is a dense array here — confirm; a sparse
# matrix would not unpack into one column per gene.
df = pd.DataFrame(adata[adata.obs["cocktail_annot"] == "Cocktail", all_unaffected].X)

In [None]:
# Transpose so that rows are genes and columns are cells.
# NOTE(review): not idempotent — running this cell a second time flips the
# table back; run it exactly once after the cell that builds df.
df = df.T

In [None]:
# Label the rows with the gene names, in the order of adata[:, all_unaffected].
df.index = adata[:,all_unaffected].var_names.tolist()

In [None]:
# Treatment label of every cell whose cocktail_annot is "Cocktail"
# (same cell selection and order as the expression table passed to GSEA).
sample_names = adata.obs.loc[adata.obs["cocktail_annot"] == "Cocktail", "Treatment"].tolist()

In [None]:
# GSEA on the unaffected genes, using the Treatment label of each selected
# cell as the class vector.
gs_res = gp.gsea(data=df, # genes x cells expression table built in the cells above
                 gene_sets='KEGG_2016', # Enrichr library name
                 cls=sample_names, # class label (Treatment) of each column of `data`
                 # permutation over the class labels; recommended when there are >= 15 samples
                 permutation_type='phenotype',
                 permutation_num=100, # low value chosen for speed; NOTE(review): consider more permutations for final results
                 min_size=5,
                 max_size=200,
                 outdir=None,  # do not write output to disk
                 no_plot=False, # plotting is not disabled here (no_plot=True would skip it)
                 method='signal_to_noise',
                 processes=4, seed= 7,
                 format='png')

In [None]:
# Show the result rows whose "pval" is below 0.05.
gs_res.res2d[gs_res.res2d["pval"] < 0.05]

In [None]:
# Save the complete GSEA result table (not only the rows with pval < 0.05).
gs_res.res2d.to_csv("gsea_results_ffa_cocktail_vs_cocktail_on_unaffected_genes.csv")

In [None]:
from gseapy.plot import gseaplot, heatmap
terms = gs_res.res2d.index
# Draw the enrichment plot of the 4th term of the result table and save the
# current figure.
# NOTE(review): terms[3] is a positional pick; the file name implies it is the
# xenobiotic-metabolism term — confirm against gs_res.res2d, since the row
# order can change between runs or gseapy versions.
gseaplot(gs_res.ranking, term=terms[3], **gs_res.results[terms[3]])
plt.savefig("GSEA_ffa_cocktail_vs_cocktail_xenobiotic_metabolism_unaffected_20211201.pdf", bbox_inches="tight")

In [None]:
# Curated gene panels; the order of xeno_met is recorded as the "order" column
# of the log2FC table built in the next cell.
ins_res = [
    "NFKBIA",
    "TRIB3",
    "CREB3L3",
    "IRS2",
    "SLC2A1",
    "SOCS3",
    "PIK3CD",
    "CREB5",
    "RELA",
    "CPT1B",
]
xeno_met = [
    'UGT1A1',
    'CYP1B1',
    'CYP3A5',
    'CYP1A2',
    'CYP2B6',
    'CYP1A1',
    'AKR1C1',
    'CYP3A4',
    'CYP2C8',
    "CYP2C19",
    "CYP2C9",
    "CYP2D6",
]

In [None]:
# final unaffected, shared and specific
# Long-format table of log2 fold changes for the xeno_met genes: one row per
# (subgroup, treatment, gene).
# Treatment value in adata.obs -> (differential-expression table, plot label).
# NOTE(review): df1 / df2 / df3 must be the Cocktail, FFA_Cocktail and
# FFA_Vehicle tables loaded earlier in the notebook; later cells reuse these
# names for other tables, so this cell depends on execution order — confirm.
lfc_sources = {
    "FFA_Vehicle": (df3, "FFA"),
    "FFA_Cocktail": (df2, "FFA_Cocktail"),
    "Cocktail": (df1, "Cocktail"),
}
# Only treatments that actually occur in the data contribute rows.
present_treatments = set(adata.obs["Treatment"])

records = []
for group in ["bile_sterols", "carbs_phaseII_stress", "lipids_phaseIII"]:
    for elem, (de_table, treatment_label) in lfc_sources.items():
        if elem not in present_treatments:
            continue
        lfc_column = group + "_" + elem + "_log2FC"
        for idx, gene in enumerate(xeno_met):
            matches = de_table.loc[de_table["gene_name"] == gene, lfc_column]
            if len(matches) != 1:
                raise ValueError(
                    "expected exactly one row for gene %r in column %r, found %d"
                    % (gene, lfc_column, len(matches))
                )
            records.append({
                # builtin float: the np.float alias was removed in NumPy 1.24
                "lfc": float(matches.iloc[0]),
                "gene": gene,
                "group": group,
                "treatment": treatment_label,
                "order": idx,  # position of the gene within xeno_met
            })

df = pd.DataFrame(records, columns=["lfc", "gene", "group", "treatment", "order"])

In [None]:
# Column key of the heatmap: "<treatment>_<subgroup>".
df["group_treatment"] = df["treatment"] + "_" + df["group"]
# errors="ignore" keeps the cell re-runnable once "order" has been dropped.
df = df.drop(columns=["order"], errors="ignore")

# Wide table: genes as rows, treatment/subgroup combinations as columns.
# DataFrame.pivot takes keyword arguments only in pandas >= 2.0.
dfx = df.pivot(index="gene", columns="group_treatment", values="lfc")


plt.rcParams['figure.figsize']=(5,5)
# Diverging colour map centred on 0, so the sign of the log2FC is visible.
sns.heatmap(dfx, cmap=sns.diverging_palette(265, 35, s=100, as_cmap=True),
               yticklabels=1, center=0, linewidths=0.1)
plt.savefig("lfc_xenobio_metab_FFA_Cocktail_and_FFA_Cocktail_active_groups_20211201.pdf", bbox_inches="tight")

In [None]:
# Marker panel for the stacked violin plot, grouped by functional category
# (insertion order is preserved).
markers = dict(
    upstream=["MLX", "MLXIPL", "RXRA", "RORA", "NR1H4", "PPARA", "HNF4A", "CEBPA"],
    phaseI=["CYP2A7", "CYP3A5", "CYP2B6", "CYP2A6", "CYP1A1"],
    phaseII=["GSTA1", "GSTO1", "GSTA2", "SULT2A1", "UGT1A1"],
    phaseIII=["ABCC2", "ABCC3", "ABCB1", "SLCO1B1", "ABCG5"],
    fat_metabolism=["APOB", "APOA1", "NFE2L2", "PLIN2", "LDLR"],
    stress=["CPEB4", "DDIT3", "TNFRSF10B", "PPP1R15A", "GOT1"],
)

In [None]:
# Drop the cells of the FFA_Vehicle treatment, then the cells of the
# "losing_expression" subgroup. Neither subset is copied explicitly.
adata1 = adata[adata.obs["Treatment"] != "FFA_Vehicle"]
adata2 = adata1[adata1.obs["subgroups"] != "losing_expression"]

In [None]:
# Stacked violin plot of the marker panel, one row of violins per
# subgroup_treatment value; standard_scale="var" rescales each gene to 0-1.
sc.pl.stacked_violin(adata2, groupby="subgroup_treatment",
                    var_names=markers,
                    #swap_axes=True,
                    standard_scale="var",
                    save="subgroup_treatment_upstream_downstream_markers_no_losing_exp_20211201.pdf")

In [None]:
# Cell-composition table from the project helper; the following cells read its
# 'All' column and drop its last row (presumably a totals row — TODO confirm).
df = cf.cell_compo_table(adata, "subgroups","subgroups")

In [None]:
# Convert the 'All' counts to percentages of 38232.
# NOTE(review): 38232 is hard-coded; it equals the sum of the per-treatment
# totals used later in the notebook, i.e. presumably the number of cells.
df1 = (df['All'] / 38232 * 100).tolist()

In [None]:
# Single-column frame of subgroup percentages; the last entry of both the
# values and the index is left out.
df2 = pd.DataFrame({"values": df1[:-1]}, index=df.index[:-1])

In [None]:
# One stacked bar showing the percentage of cells in each subgroup.
color = ['#ff7f0e', '#1f77b4', '#17b2b1', '#9b1c03']
ax = df2.T.plot(kind='bar', stacked=True, color=color)
# Single tick with an empty label: the bar itself needs no category name.
ax.set_xticks(np.arange(1))
ax.set_xticklabels("")
ax.set_xlabel("Percentage subgroups")

ax.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
ax.figure.savefig("barplot_percentages_subgroups_20211201.pdf", bbox_inches="tight")

# 5) Additional analysis and plots

In [None]:
# Subgroup x Treatment table in which every Treatment column is divided by its
# own total, i.e. the fraction of each treatment's cells found in each subgroup.
# normalize="columns" derives the totals from the data instead of relying on
# hard-coded per-treatment cell counts that go stale when the data change.
df = pd.crosstab(adata.obs["subgroups"], adata.obs["Treatment"],
                 margins=False, normalize="columns")

In [None]:
# Display the per-treatment subgroup fractions computed in the previous cell.
df

In [None]:
# Data
# Stacked bar chart: for each subgroup, the relative contribution of the four
# treatments, based on the per-treatment fractions in df.
barWidth = 0.85
# Display names; must follow the row order of df.
names = ('bile_sterols', 'carbs_phaseII', 'lipids_phaseIII', 'losing_expression')
r = np.arange(len(names))  # one x position per subgroup

# From raw value to percentage
totals = [i+j+k+l for i,j,k,l in zip(df['Vehicle'], df['FFA_Vehicle'], df['Cocktail'], df["FFA_Cocktail"])]

DMSO = [i / j * 100 for i,j in zip(df['Vehicle'], totals)]
FFA = [i / j * 100 for i,j in zip(df['FFA_Vehicle'], totals)]
Cocktail = [i / j * 100 for i,j in zip(df['Cocktail'], totals)]
FFA_Cocktail = [i / j * 100 for i,j in zip(df['FFA_Cocktail'], totals)]

# Create bars: each series is stacked on top of the ones drawn before it.
bottom = np.zeros(len(r))
for heights, label, bar_color in (
    (DMSO, "DMSO", '#888de4'),
    (FFA, "FFA", '#deb82c'),
    (Cocktail, "Cocktail", '#279e68'),
    (FFA_Cocktail, "FFA_Cocktail", "#d62728"),
):
    plt.bar(r, heights, bottom=bottom, width=barWidth, label=label, color=bar_color)
    bottom = bottom + np.asarray(heights)

# Custom x axis
plt.xticks(r, names, rotation=90)
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# The legend is anchored outside the axes; bbox_inches="tight" keeps it (and
# the rotated tick labels) inside the saved PDF instead of being clipped.
plt.savefig("barplot_treatment_conditions_per_subgroup_dec21.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Work on an independent copy, so the gene filtering and the per-group columns
# added to .var in the next cells do not modify adata.
adata2 = adata.copy()

In [None]:
# Per-gene mean of adata2.X over all cells, stored in .var; only genes with a
# mean above 0.25 are kept.
# NOTE(review): assumes adata2.X is a dense array, so that np.mean(..., axis=0)
# returns a 1-D array with one value per gene — confirm.
adata2.var["mean_norm_counts"] = np.mean(adata2.X, axis=0)
adata2 = adata2[:,adata2.var["mean_norm_counts"] > 0.25].copy()

In [None]:
# Per-gene coefficient of variation (CV) and mean expression for every
# subgroup/treatment combination.
df = pd.DataFrame()   # one "CV_<group>" column per combination
df2 = pd.DataFrame()  # one "mean_<group>" column per combination
# sorted() makes the column order reproducible (set order is arbitrary).
for elem in sorted(set(adata2.obs["subgroup_treatment"])):
    print(elem)
    # Select the cells of this combination once and reuse the view.
    subset = adata2[adata2.obs["subgroup_treatment"] == elem, :]
    # CV = sqrt(exp(sd^2) - 1): the CV of log-normally distributed data whose
    # natural log has standard deviation sd.
    # NOTE(review): assumes .X holds natural-log-transformed values — confirm.
    cv = np.sqrt(np.exp(np.std(subset.X, axis=0) ** 2) - 1)
    mean_norm_counts = np.mean(subset.layers["norm_counts"], axis=0)
    mean_x = np.mean(subset.X, axis=0)

    adata2.var["mean_" + elem] = mean_norm_counts
    adata2.var["CV_" + elem] = cv
    df["CV_" + elem] = cv
    print(np.median(df["CV_" + elem]))
    df2["mean_" + elem] = mean_x

In [None]:
# Long format: one row per (CV column, gene) pair. After reset_index the CV
# column name is in "level_0", the row position in "level_1" and the value in
# the column named 0.
df1 = df.unstack().reset_index() 
# Sort by CV column name so the groups are in alphabetical order.
ordered_df = df1.sort_values(by='level_0')

In [None]:
# use same colors as in the tSNE
cols = {"CV_carbs_phaseII_stress_2FFA":'#deb82c', "CV_lipids_phaseIII_3Cocktail":'#279e68',
        "CV_lipids_phaseIII_4FFA_Cocktail":'#d62728', 
        "CV_carbs_phaseII_stress_1DMSO":'#888de4', "CV_losing_expression_2FFA":'#deb82c',
        "CV_bile_sterols_3Cocktail":'#279e68', "CV_losing_expression_1DMSO":'#888de4', 
        "CV_lipids_phaseIII_1DMSO":'#888de4', "CV_lipids_phaseIII_1DMSO":'#888de4',
        "CV_bile_sterols_1DMSO":'#888de4', "CV_carbs_phaseII_stress_4FFA_Cocktail":'#d62728',
        "CV_losing_expression_3Cocktail":'#279e68', "CV_carbs_phaseII_stress_3Cocktail":'#279e68',
        "CV_bile_sterols_2FFA":'#deb82c', "CV_bile_sterols_4FFA_Cocktail":'#d62728',
        "CV_losing_expression_4FFA_Cocktail":'#d62728', "CV_lipids_phaseIII_2FFA":'#deb82c'}
p = sns.boxplot(x="level_0", y=0, data=ordered_df, palette=cols)
p.set_xticklabels(p.get_xticklabels(), rotation=45, horizontalalignment='right')
box = p.get_position()
p.set_position([box.x0, box.y0, box.width * 2, box.height * 1])
plt.yscale("log")
plt.savefig("coefficient_of_variation_per_group_and_treatment_colored_by_treatment_20211202.pdf", bbox_inches="tight")

In [None]:
# Label for the full dataset; the differential-expression loop below uses the
# "group" value to name its output files.
adata.obs["group"] = "pseudobulk"

In [None]:
# One independent AnnData object per subgroup, each tagged with its name in
# obs["group"]. The explicit .copy() avoids writing to a view of adata, which
# would otherwise trigger an implicit copy and a warning.
adataL3 = adata[adata.obs["subgroups"] == "lipids_phaseIII"].copy()
adataL3.obs["group"] = "lipids_phaseIII"
adataC2 = adata[adata.obs["subgroups"] == "carbs_phaseII_stress"].copy()
adataC2.obs["group"] = "carbs_phaseII_stress"
adataB = adata[adata.obs["subgroups"] == "bile_sterols"].copy()
adataB.obs["group"] = "bile_sterols"
adata0 = adata[adata.obs["subgroups"] == "losing_expression"].copy()
adata0.obs["group"] = "losing_expression"

In [None]:
# Differential expression between treatments, for the full dataset and for
# every subgroup; all genes are ranked (n_genes = number of genes).
for subset in [adata, adataB, adataC2, adataL3, adata0]:
    # Every cell of a subset carries the same "group" value; take the first one
    # by position (.iloc), since the obs index holds cell names, not integers.
    group_label = str(subset.obs["group"].iloc[0])
    sc.tl.rank_genes_groups(subset, groupby="Treatment", n_genes=len(subset.var))
    sc.pl.rank_genes_groups_matrixplot(subset, standard_scale="var", dendrogram=False,
                                       save=group_label+"_treatment_comparison.pdf")
    cf.top_marker_as_xlsx(subset, "rank_genes_groups", "Treatment", "top_genes_Treatment_"+group_label+".csv")

In [None]:
# Reload the per-group result tables written by the previous cell.
# Note: this rebinds df1, df2, df3 and dfx, which hold other tables earlier in
# the notebook.
df0, df1, df2, df3, dfx = (
    pd.read_csv(table_path, index_col=0)
    for table_path in (
        "top_genes_Treatment_losing_expression.csv",
        "top_genes_Treatment_bile_sterols.csv",
        "top_genes_Treatment_carbs_phaseII_stress.csv",
        "top_genes_Treatment_lipids_phaseIII.csv",
        "top_genes_Treatment_pseudobulk.csv",
    )
)

In [None]:
# NOTE(review): when the notebook is run top to bottom, `df` is still the CV
# table built earlier, not one of the tables loaded in the previous cell
# (df0-df3, dfx) — confirm which table was meant to be inspected.
df

In [None]:
# Radar ("spider") chart of five cytochrome genes for the four treatments.
categories = ['CYP1A2', 'CYP2C9', 'CYP2C19', 'CYP2D6', 'CYP3A4']
# Repeat the first entry so that every outline closes on itself.
categories = [*categories, categories[0]]

# NOTE(review): hard-coded values, one per gene in `categories` order;
# presumably pseudobulk mean expression per treatment — document their origin.
dmso = [0.32, 0.81, 0.1, 0.09, 0.34]
cocktail = [0.99, 1.14, 0.09, 0.09, 0.86]
ffa_cocktail = [0.68, 0.79, 0.07, 0.08, 0.44]
ffa = [0.38, 0.66, 0.05, 0.08, 0.31]

dmso = [*dmso, dmso[0]]
cocktail = [*cocktail, cocktail[0]]
ffa_cocktail = [*ffa_cocktail, ffa_cocktail[0]]
ffa = [*ffa, ffa[0]]

# Angle of each axis; the last angle (2*pi) coincides with the first (0).
label_loc = np.linspace(start=0, stop=2 * np.pi, num=len(dmso))

plt.figure(figsize=(5, 5))
plt.subplot(polar=True)
for values, label, line_color in (
    (dmso, 'Vehicle', '#888de4'),
    (cocktail, 'Cocktail', '#279e68'),
    (ffa_cocktail, 'FFA_Cocktail', '#d62728'),
    (ffa, 'FFA', '#deb82c'),
):
    plt.plot(label_loc, values, label=label, c=line_color)
plt.title('Pseudobulk', size=20)
lines, labels = plt.thetagrids(np.degrees(label_loc), labels=categories)
plt.legend(loc='center left', bbox_to_anchor=(1, 1))
# The legend is anchored outside the axes; bbox_inches="tight" keeps it inside
# the saved PDF instead of being clipped.
plt.savefig("spider_chart_cytochromes_pseudobulk_20211203.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Load the Cocktail-vs-DMSO and FFA-vs-DMSO differential-expression tables.
# Note: this rebinds df1 and df3, which the earlier cell loaded from the
# top_genes_Treatment_* files.
df1 = pd.read_csv("differential_expression_Cocktail_vs_DMSO_nov21.csv", index_col=0)
df3 = pd.read_csv("differential_expression_FFA_vs_DMSO_075_nov21.csv", index_col=0)

In [None]:
#dfx = df1[df1["pseudobulk_Cocktail_pvals_adj"] < 0.05]
len(df1[df1["pseudobulk_Cocktail_log2FC"] > 0])

In [None]:
#dfx = df3[df3["pseudobulk_FFA_Vehicle_pvals_adj"] < 0.05]
len(df3[df3["pseudobulk_FFA_Vehicle_log2FC"] >0])

In [None]:
# Ratio of the two counts; presumably the outputs of the two cells above,
# typed in by hand (TODO confirm and compute from the tables instead).
8108/2326

In [None]:
# Bar chart comparing the number of genes with a positive log2 fold change in
# the two contrasts.
# NOTE(review): the counts are typed in by hand; presumably they are the
# outputs of the two counting cells above — confirm.
x = np.arange(2)
variable = [8108, 2326]

fig, ax = plt.subplots()
ax.bar(x, variable)
ax.set_title("Number of genes with positive log2 fold change")
ax.set_xticks(x)
ax.set_xticklabels(('Cocktail vs DMSO', 'FFA vs DMSO'), rotation=45, horizontalalignment='right')
fig.savefig("Suppl_Fig5_number_genes_positive_log2FC.pdf", bbox_inches="tight")
plt.show()