In [1]:
import pandas as pd, numpy as np
import imp, re

import z2_save_jaspar, z1_save_oligos, z3_transformations
from scipy.stats import ks_2samp
import scipy.stats as stats
idx = pd.IndexSlice

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:

# Load the JASPAR data and the oligo tables from the project helper modules.
jaspar = z2_save_jaspar.load_jaspar()
oligos, oligos_by_exp = z1_save_oligos.load_oligos_plus()

# The analysis grouping key starts out as a plain copy of the exp column.
oligos_by_exp["analysis_group_key"] = oligos_by_exp.exp.copy()

# Mean norm_mu per (starts, mutant_num, analysis_group_key), keeping only
# rows with mutant_num < 5.
all_obe = (
    oligos_by_exp
    .reset_index()
    .loc[lambda frame: frame["mutant_num"] < 5]
    .groupby(["starts", "mutant_num", "analysis_group_key"])["norm_mu"]
    .mean()
)


In [None]:
# Build the filters from the grouped norm_mu means, with the hq option disabled.
filters = z3_transformations.compute_filters(all_obe, hq=False)


  keepdims=keepdims)


In [None]:
# Summary of the counts in oligos_by_exp, printed as three lines:
#   1. the summed n_bcs column and the summed n_transcripts column,
#   2. their ratio (n_transcripts total / n_bcs total),
#   3. the mean, over (oligo, analysis_group_key) groups, of the summed
#      n_transcripts -- this last value is printed without a label.
print(f"""{oligos_by_exp.n_bcs.sum()} barcodes,  {oligos_by_exp.n_transcripts.sum()} total UMIs. 
{oligos_by_exp.n_transcripts.sum() / oligos_by_exp.n_bcs.sum()} umis per barcode
{oligos_by_exp.groupby(["oligo","analysis_group_key"]).n_transcripts.sum().mean()}
""")



In [None]:
# Mean norm_mu per tile start, over every row and over wild-type rows
# (mutant_num == 0) only.
#
# norm_mu is selected *before* aggregating: calling .mean() on the whole
# grouped frame also tries to average the string columns (exp,
# analysis_group_key), which raises TypeError on pandas >= 2.0.
tile_sets_nmu = oligos_by_exp.groupby("starts").norm_mu.mean()
wt_tiles_nmu = (
    oligos_by_exp.loc[lambda x: x.mutant_num == 0]
    .groupby("starts")
    .norm_mu.mean()
)

# Tile starts whose wild-type mean reaches the 95th percentile (e5, inclusive
# comparison) or exceeds the 99th percentile (e1, strict comparison).  Both are
# derived from wt_tiles_nmu instead of repeating the same groupby.
e5_tile_starts = wt_tiles_nmu.loc[lambda x: x >= x.quantile(.95)].index
e1_tile_starts = wt_tiles_nmu.loc[lambda x: x > x.quantile(.99)].index

print(f"""average normalized mu over all tile sets in all experiments {tile_sets_nmu.mean()}
average normalized my over all wildtype tiles {wt_tiles_nmu.mean()}

the 95th percentile value of enmu over all wildtype tiles is {wt_tiles_nmu.quantile(.95)}
the 99th percentile value of enmu over all wiltype tiles is {wt_tiles_nmu.quantile(.99)}

the 95th percentile value of enmu over all tiles is {tile_sets_nmu.quantile(.95)}
the 99th percentile value of enmu over all tiles is {tile_sets_nmu.quantile(.99)}

We will define the e5 and e1 groups using the average expression levels of wild type oligos
being over the 95th and 99th percentiles respectively.

{len(e5_tile_starts)} and {len(e1_tile_starts)} of {oligos_by_exp.starts.nunique()} were identified in 
the e5 and e1 sets respectively.
""")

In [None]:
# Re-key the per-experiment oligo table by tile start and sort it, so that the
# label slices taken on "starts" further down operate on an ordered index.
obe_by_start = (
    oligos_by_exp
    .reset_index()
    .set_index("starts")
    .sort_index()
)

In [None]:

def _classify_tiles(mutant_start_position):
    """Collect and label the rows of obe_by_start around one mutant start.

    Rows are picked by label-slicing the "starts" index of obe_by_start and
    tagged with a tile_class:

    * "ablation"   - start in [pos-150, pos], mutant_start == pos
    * "other"      - start in [pos-150, pos], 0 < mutant_num < 5,
                     mutant_start != pos
    * "wildtype"   - start in [pos-150, max(0, pos-30)], mutant_num == 0
    * "ablation60" - start in [pos-150, pos+30], mutant_start == pos or
                     mutant_start + 30 == pos
    * "other60"    - start in [pos-150, pos+30], 0 < mutant_num < 5,
                     mutant_start != pos and mutant_start + 30 != pos
    * "wildtype60" - start in [pos-150, max(0, pos)], mutant_num == 0

    (the lower bound pos-150 is clamped at 0).  The labelled pieces are
    concatenated with a fresh integer index and a mutant_tile_start column
    holding the position.
    """
    window_lo = max(mutant_start_position-150, 0)

    # Window ending at the mutant start itself.
    near = obe_by_start.loc[window_lo:mutant_start_position]
    near_mutated = (near.mutant_num > 0) & (near.mutant_num < 5)

    # Same window extended 30 past the mutant start.
    wide = obe_by_start.loc[window_lo:mutant_start_position+30]
    wide_mutated = (wide.mutant_num > 0) & (wide.mutant_num < 5)

    # Wild-type candidates use their own upper bounds.
    wt_short = obe_by_start.loc[window_lo:max(0, mutant_start_position-30)]
    wt_long = obe_by_start.loc[window_lo:max(0, mutant_start_position)]

    labelled = [
        near.loc[near.mutant_start == mutant_start_position]
            .assign(tile_class="ablation"),
        near.loc[near_mutated & (near.mutant_start != mutant_start_position)]
            .assign(tile_class="other"),
        wt_short.loc[wt_short.mutant_num == 0]
            .assign(tile_class="wildtype"),
        wide.loc[(wide.mutant_start == mutant_start_position)
                 | ((wide.mutant_start+30) == mutant_start_position)]
            .assign(tile_class="ablation60"),
        wide.loc[wide_mutated
                 & ((wide.mutant_start != mutant_start_position)
                    & ((wide.mutant_start+30) != mutant_start_position))]
            .assign(tile_class="other60"),
        wt_long.loc[wt_long.mutant_num == 0]
            .assign(tile_class="wildtype60"),
    ]
    return pd.concat(labelled, ignore_index=True).assign(
        mutant_tile_start=mutant_start_position
    )


# One labelled frame per distinct mutant_start value, stacked and indexed by
# (mutant_tile_start, tile_class).
oligo_types = pd.concat(
    [_classify_tiles(position) for position in oligos_by_exp.mutant_start.unique()]
).set_index(["mutant_tile_start", "tile_class"])


In [None]:
# Attach each row's tile start, looked up through its "oligo" value (one
# "starts" entry per oligo after drop_duplicates).
# Series.map is used rather than DataFrame.join so the cell can be re-run:
# once oligo_types already carries a "starts" column, join would raise because
# of the overlapping column name.
oligo_types["starts"] = oligo_types["oligo"].map(
    oligos_by_exp.reset_index().drop_duplicates("oligo").set_index("oligo")["starts"]
)

In [None]:
# Sort on both index levels (mutant_tile_start first, then tile_class).
oligo_types = oligo_types.sort_index(level=["mutant_tile_start", "tile_class"])

In [None]:

#means = pd.Series(mutant_starts).apply(lambda s: oligo_types.loc[idx[s,:],:].groupby("tile_class").norm_mu.mean())

def get_stats(s):
    """Summarise norm_mu across the tile classes recorded for mutant start ``s``.

    Reads the notebook-level ``oligo_types`` frame (indexed by
    mutant_tile_start, tile_class) and compares the "ablation" rows against
    the "wildtype" rows, and against "wildtype" + "other" rows pooled, using
    ``scipy.stats.ttest_ind``.  The same comparisons are repeated within the
    rows whose ``exp`` is "DLD1_WT_BR1" (r1) and "DLD1_WT_BR2" (r2).

    Parameters
    ----------
    s : value of the first index level of ``oligo_types`` to summarise.

    Returns
    -------
    pandas.Series named ``s`` with
      * mu_wt / mu_ot / mu_ab and mu_wt60 / mu_ot60 / mu_ab60: class means,
      * tt_*_ttstat / tt_*_aottstat: t statistics,
      * tt_*_pval: negative natural log of the corresponding p-value.
    If any of the six tile classes is missing for ``s``, a message is printed
    and an empty Series named ``s`` is returned instead.
    """
    tile_groups = oligo_types.loc[idx[s,:],:].groupby("tile_class")
    required_classes = {"ablation", "other", "wildtype",
                        "ablation60", "other60", "wildtype60"}
    if not required_classes.issubset(tile_groups.groups.keys()):
        print (f"skipping position {s}")
        # Explicit dtype: a bare pd.Series() triggers pandas' empty-Series
        # default-dtype warning and yields an object-dtype row.
        return pd.Series(dtype=float).rename(s)

    ablations = tile_groups.get_group("ablation")
    others = tile_groups.get_group("other")
    wildtypes = tile_groups.get_group("wildtype")

    ablations60 = tile_groups.get_group("ablation60")
    others60 = tile_groups.get_group("other60")
    wildtypes60 = tile_groups.get_group("wildtype60")

    def _only_exp(frame, exp_name):
        # Rows of ``frame`` that belong to a single experiment/replicate.
        return frame.loc[lambda x: x.exp == exp_name]

    ab1_dld1 = _only_exp(ablations, "DLD1_WT_BR1")
    ot1_dld1 = _only_exp(others, "DLD1_WT_BR1")
    wt1_dld1 = _only_exp(wildtypes, "DLD1_WT_BR1")

    ab2_dld1 = _only_exp(ablations, "DLD1_WT_BR2")
    ot2_dld1 = _only_exp(others, "DLD1_WT_BR2")
    wt2_dld1 = _only_exp(wildtypes, "DLD1_WT_BR2")

    # Ablation vs wild type, and ablation vs (wild type + other), all rows.
    wt_tt = stats.ttest_ind( ablations.norm_mu, wildtypes.norm_mu)
    all_tt = stats.ttest_ind( ablations.norm_mu, pd.concat([wildtypes.norm_mu,others.norm_mu]))

    # The same two comparisons within each replicate.
    r1_tt = stats.ttest_ind( ab1_dld1.norm_mu, wt1_dld1.norm_mu)
    r2_tt = stats.ttest_ind( ab2_dld1.norm_mu, wt2_dld1.norm_mu)

    r1_aott = stats.ttest_ind( ab1_dld1.norm_mu, pd.concat([wt1_dld1.norm_mu,ot1_dld1.norm_mu]))
    r2_aott = stats.ttest_ind( ab2_dld1.norm_mu, pd.concat([wt2_dld1.norm_mu,ot2_dld1.norm_mu]))

    return pd.Series({

        "mu_wt":wildtypes.norm_mu.mean(),
        "mu_ot":others.norm_mu.mean(),
        "mu_ab":ablations.norm_mu.mean(),

        "mu_wt60":wildtypes60.norm_mu.mean(),
        "mu_ot60":others60.norm_mu.mean(),
        "mu_ab60":ablations60.norm_mu.mean(),

        "tt_wt_pval":-1 * np.log(wt_tt[1]),
        "tt_all_pval":-1 * np.log(all_tt[1]),

        "tt_wt_ttstat":wt_tt[0],
        "tt_all_ttstat":all_tt[0],

        "tt_r1_ttstat":r1_tt[0],
        "tt_r2_ttstat":r2_tt[0],

        "tt_r1_aottstat":r1_aott[0],
        "tt_r2_aottstat":r2_aott[0],

        "tt_r1_aott_pval":-1 * np.log(r1_aott[1]),
        "tt_r2_aott_pval":-1 * np.log(r2_aott[1]),

        # Per-replicate ablation-vs-wildtype p-values.  They were computed
        # (r1_tt / r2_tt) but never returned, although the plotting cells
        # read a "tt_r1_pval" column.  Appended last so existing column
        # positions are unchanged.
        "tt_r1_pval":-1 * np.log(r1_tt[1]),
        "tt_r2_pval":-1 * np.log(r2_tt[1]),

    }).rename(s)





In [None]:
# Statistics for every mutant start found on tiles in the e1 set.
enriched_starts = e1_tile_starts
mutant_starts = (
    oligos_by_exp
    .loc[oligos_by_exp.starts.isin(enriched_starts)]
    .mutant_start
    .unique()
)
e1_stats = pd.Series(mutant_starts, index=mutant_starts).apply(get_stats)

# Same computation for the e5 set.
enriched_starts = e5_tile_starts
mutant_starts = (
    oligos_by_exp
    .loc[oligos_by_exp.starts.isin(enriched_starts)]
    .mutant_start
    .unique()
)
e5_stats = pd.Series(mutant_starts, index=mutant_starts).apply(get_stats)


In [None]:
import seaborn as sns
%matplotlib inline

In [None]:
# Replicate-1 p-value columns plotted against each other over the stacked
# e5 and e1 tables (enrichment = 0 for e5 rows, 1 for e1 rows).
# Both named columns must exist in e5_stats / e1_stats.
sns.jointplot(
    data=pd.concat([
        e5_stats.assign(enrichment=0),
        e1_stats.assign(enrichment=1),
    ]),
    x="tt_r1_aott_pval",
    y="tt_r1_pval",
)

In [None]:
# Replicate-1 vs replicate-2 t statistics over the stacked e5 and e1 tables
# (enrichment = 0 for e5 rows, 1 for e1 rows).
sns.jointplot(
    data=pd.concat([
        e5_stats.assign(enrichment=0),
        e1_stats.assign(enrichment=1),
    ]),
    x="tt_r1_ttstat",
    y="tt_r2_ttstat",
)

In [None]:
# Ablation-vs-wildtype t statistic against ablation-vs-all t statistic over
# the stacked e5 and e1 tables (enrichment = 0 for e5 rows, 1 for e1 rows).
sns.jointplot(
    data=pd.concat([
        e5_stats.assign(enrichment=0),
        e1_stats.assign(enrichment=1),
    ]),
    x="tt_wt_ttstat",
    y="tt_all_ttstat",
)

In [None]:

# Ablation-vs-wildtype p-value column against the ablation-vs-all p-value
# column over the stacked e5 and e1 tables (enrichment = 0 / 1).
# sns.jointplot builds its own figure, so the earlier `f = plt.gca()` only
# produced an extra, empty axes and has been dropped.
sns.jointplot( x = "tt_wt_pval", y = "tt_all_pval", data = pd.concat([
    e5_stats.assign(enrichment=0),
    e1_stats.assign(enrichment=1)]))

In [None]:

# Ablation-vs-wildtype p-value column against the ablation-vs-all p-value
# column over the stacked e5 and e1 tables (enrichment = 0 / 1).
# sns.jointplot builds its own figure, so the earlier `f = plt.gca()` only
# produced an extra, empty axes and has been dropped.
# NOTE(review): this cell repeats the plot of the cell directly above it.
sns.jointplot( x = "tt_wt_pval", y = "tt_all_pval", data = pd.concat([
    e5_stats.assign(enrichment=0),
    e1_stats.assign(enrichment=1)]))

In [None]:

# NOTE: this cell rebinds the notebook-level name get_stats to a reduced
# variant (class means plus two raw p-values); an earlier cell defines a
# fuller get_stats under the same name.
def get_stats(s):
    """Class means and raw t-test p-values for mutant start ``s``.

    Reads the notebook-level ``oligo_types`` frame.  Returns a Series named
    ``s`` holding the mean norm_mu of each tile class plus
      * tt_wt_pval  - p-value of ablation vs wildtype,
      * tt_all_pval - p-value of ablation vs (wildtype + other),
    both untransformed.  If any of the six tile classes is missing, a message
    is printed and an empty Series named ``s`` is returned.
    """
    tile_groups = oligo_types.loc[idx[s,:],:].groupby("tile_class")
    if not set(["ablation","other","wildtype",
                "ablation60","other60","wildtype60"]).issubset(tile_groups.groups.keys()):
        print (f"skipping position {s}")
        # Explicit dtype avoids pandas' empty-Series default-dtype warning.
        return pd.Series(dtype=float).rename(s)

    ablations = tile_groups.get_group("ablation")
    others = tile_groups.get_group("other")
    wildtypes = tile_groups.get_group("wildtype")

    ablations60 = tile_groups.get_group("ablation60")
    others60 = tile_groups.get_group("other60")
    wildtypes60 = tile_groups.get_group("wildtype60")

    wt_tt = stats.ttest_ind( ablations.norm_mu, wildtypes.norm_mu)
    all_tt = stats.ttest_ind( ablations.norm_mu, pd.concat([wildtypes.norm_mu,others.norm_mu]))

    return pd.Series({

        "mu_wt":wildtypes.norm_mu.mean(),
        "mu_ot":others.norm_mu.mean(),
        "mu_ab":ablations.norm_mu.mean(),

        "mu_wt60":wildtypes60.norm_mu.mean(),
        "mu_ot60":others60.norm_mu.mean(),
        "mu_ab60":ablations60.norm_mu.mean(),

        "tt_wt_pval":wt_tt[1],
        # Fixed: this entry previously repeated wt_tt[1], so the
        # ablation-vs-all result (all_tt) was computed but never reported.
        "tt_all_pval":all_tt[1],

        }).rename(s)


# Run the statistics over every mutant start that occurs on any tile start.
for enriched_starts in [ oligos_by_exp.starts.unique() ]:
    mutant_starts = oligos_by_exp.loc[lambda x: x.starts.isin(enriched_starts)].mutant_start.unique()
    stats2 = pd.Series(mutant_starts, index = mutant_starts).apply(get_stats)

        
  

In [None]:
# Inspect the "wildtype60" rows stored for mutant tile start 1526.
# The lookup goes straight to oligo_types: the previous form first selected
# oligo_types.loc[idx[s,:],:] with a name `s` that no visible cell assigns at
# notebook level (it is only a function parameter), so it failed on a fresh
# kernel, and it could only succeed when that selection contained 1526.
oligo_types.loc[idx[1526,"wildtype60"],:]

In [None]:
# Per-tile-class means of the numeric columns for mutant tile start `s`.
# numeric_only=True keeps the string columns (e.g. exp) out of the mean,
# which otherwise raises TypeError on pandas >= 2.0.
# NOTE(review): `s` is not assigned at notebook level in any visible cell
# (it is only a function parameter); set it to the position to inspect
# before running this cell.
oligo_types.loc[idx[s,:],:].groupby("tile_class").mean(numeric_only=True)

In [None]:
# Number of rows carrying each tile_class label.
oligo_types.reset_index()["tile_class"].value_counts()

In [None]:
# Distinct tile_class labels present in oligo_types.
oligo_types.reset_index()["tile_class"].unique()