In [8]:
import os
import pandas as pd
import numpy as np
from numpy import inf
import sys

In [43]:
def filter_BFs(st_spec):
    
    """
    A function that filters the BF files from cloud
    # 
    # Inputs:
    #    st_spec             - BF data frame
    #    ST_top_gene_dict   - A pd.DataFrame with fields: age_1, age_2, region_1, region_2, AAR1, AAR2, logsBFs (list), Delta (list), genes (list)  


    """
    #print(st_spec.head())
    # do some renaming
    st_spec = st_spec[st_spec['BF'] != inf]

    # Log10 BF
    st_spec['BF'] = np.float64(st_spec['BF'])
    x = np.log(st_spec['BF'])
    x[x == -inf] = sys.float_info.min # makes sure no inf
    x[x == inf] = sys.float_info.max # makes sure no inf
    st_spec['logBF'] = x
    #print(st_spec['condition_1'].unique())
    #print(st_spec['condition_2'].unique())
    # rename gene names
    st_spec['gene_new'] = [i.split("_")[0] for i in st_spec['gene']]
    st_spec['age_1'] = [i.split(" ")[0] for i in st_spec['condition_1']]
    st_spec['region_1'] = [i.split(" ")[1] for i in st_spec['condition_1']]
    st_spec['age_2'] = [i.split(" ")[0] for i in st_spec['condition_2']]
    st_spec['region_2'] = [i.split(" ")[1] for i in st_spec['condition_2']]
    #print('done clean up')
    ## Top 100 ST genes per condition and per region
    ST_top_gene_dict = pd.DataFrame(columns = ['age_1', 'age_2', 'region_1', 'region_2', 'AAR1', 'AAR2', 'genes', 'logBFs', 'Delta'])
    counter = 0
    df_group = st_spec.groupby(['age_1', 'age_2', 'region_1', 'region_2', 'AAR1', 'AAR2'])
    for label, dfs in df_group: # this is for splotch_one_level

        # this gets genes super specific against the whole rest of the datset

        #if (label[5] == 'Rest'):

        #print(counter)

        #dfs = df[(df['logBF'] > 2) & (df['Delta'] > 0)]
        #dfs = df[(df['Delta'] > 0)]
        #dfs = df
        #print(df.sort_values(by='logBF', ascending=False)['gene_new'].head(5).tolist())
        if (len(dfs.sort_values(by='logBF', ascending=False)['gene_new'].head(5).tolist()) == 0):
            continue

        ST_top_gene_dict.at[counter, 'age_1'] = label[0]
        ST_top_gene_dict.at[counter, 'age_2'] = label[1]
        ST_top_gene_dict.at[counter, 'region_1'] = label[2]
        ST_top_gene_dict.at[counter, 'region_2'] = label[3]
        ST_top_gene_dict.at[counter, 'AAR1'] = label[4]
        ST_top_gene_dict.at[counter, 'AAR2'] = label[5]
        ST_top_gene_dict.at[counter, 'genes'] = dfs.sort_values(by=['logBF', 'Delta'], ascending=[False, False])['gene_new'].head(250).tolist()
        ST_top_gene_dict.at[counter, 'logBFs'] = dfs.sort_values(by=['logBF', 'Delta'], ascending=[False, False])['logBF'].head(250).tolist()
        ST_top_gene_dict.at[counter, 'Delta'] = dfs.sort_values(by=['logBF', 'Delta'], ascending=[False, False])['Delta'].head(250).tolist()
        counter += 1
    
    return ST_top_gene_dict

In [13]:
%%capture cap_out --no-stderr
#!gsutil ls gs://fc-2e7e9da8-0b98-4a74-a1ed-61c5ba5a13a4/3c779645-2301-4389-a0d3-da5f554e148a/splotch_diff_exp_workflow/a3bf8b06-c212-49b7-883d-8a96072c8a02/call-diff_exp/ 
!gsutil ls gs://fc-2e7e9da8-0b98-4a74-a1ed-61c5ba5a13a4/432aaa8c-bc18-48c3-ab68-6359ee309769/splotch_diff_exp_workflow/9665d171-c25e-4bab-9232-4efd111c371d/call-diff_exp/*/input_dir/analysis_output/BF-beta_df_output.tsv

In [14]:
shards = [i for i in cap_out.stdout.split("\r\n")]

In [20]:
%%capture cap_out --no-stderr
!gsutil ls gs://fc-2e7e9da8-0b98-4a74-a1ed-61c5ba5a13a4/432aaa8c-bc18-48c3-ab68-6359ee309769/splotch_diff_exp_workflow/9665d171-c25e-4bab-9232-4efd111c371d/call-diff_exp/*/attempt*/input_dir/analysis_output/BF-beta_df_output.tsv
    
    


In [21]:
shards_attempts = [i for i in cap_out.stdout.split("\r\n")]

In [24]:
shards_final = shards + shards_attempts

In [25]:
path = '/home/sanjavickovic/python_runs'

df_bf = pd.DataFrame([])
for shard in shards_final:
    
    if shard.startswith('gs://fc'):
        
    #check file names
        filename = shard
        
        print(filename)
        output_filename = os.path.join(path, 'bf.tsv')

        # copy file due to access permissions
        !gsutil -m cp $filename $output_filename
        !gzip $output_filename

        # read in as pandas and filter
        bf_tmp = pd.read_csv(output_filename+".gz", compression='gzip', header=0, sep='\t', quotechar='"', error_bad_lines=False, index_col=0)
        bf_tmp['BF'] = bf_tmp['BF'].astype(float)
        bf_tmp['Delta'] = bf_tmp['Delta'].astype(float)
        bf_tmp = bf_tmp[(bf_tmp['BF']>2) & (bf_tmp['Delta']>0)]

        # append to larger df
        df_bf = df_bf.append(bf_tmp,ignore_index=True)

        # delete file from memory 
        output_filename = output_filename+".gz"
        !rm $output_filename


gs://fc-2e7e9da8-0b98-4a74-a1ed-61c5ba5a13a4/432aaa8c-bc18-48c3-ab68-6359ee309769/splotch_diff_exp_workflow/9665d171-c25e-4bab-9232-4efd111c371d/call-diff_exp/shard-0/input_dir/analysis_output/BF-beta_df_output.tsv
Copying gs://fc-2e7e9da8-0b98-4a74-a1ed-61c5ba5a13a4/432aaa8c-bc18-48c3-ab68-6359ee309769/splotch_diff_exp_workflow/9665d171-c25e-4bab-9232-4efd111c371d/call-diff_exp/shard-0/input_dir/analysis_output/BF-beta_df_output.tsv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1/1 files][700.8 MiB/700.8 MiB] 100% Done                                    
Operation completed over 1 objects/700.8 MiB.                                    


In [36]:
ST_top_gene_dict = filter_BFs(df_bf)

                 gene           condition_1         condition_2        AAR1  \
0  0610007P14Rik_1442  12w BL6WT.Proximal M  2yr BL6WT.Middle F  crypt apex   
1  0610007P14Rik_1442    12w BL6WT.Distal M  2yr BL6WT.Middle F  crypt apex   
2  0610007P14Rik_1442   3w BL6WT.Proximal F  2yr BL6WT.Middle F  crypt apex   
3  0610007P14Rik_1442     3w BL6WT.Middle F  2yr BL6WT.Middle F  crypt apex   
4  0610007P14Rik_1442     6w BL6WT.Middle F  2yr BL6WT.Middle F  crypt apex   

         AAR2         BF     Delta  \
0  crypt apex   3.157051  0.634437   
1  crypt apex   4.419036  0.683694   
2  crypt apex   5.964412  0.552041   
3  crypt apex   4.662538  0.459751   
4  crypt apex  34.111819  0.474198   

                                           Samples_1  \
0  [-0.5319189999999999, -0.505155, -0.347624, -0...   
1  [-0.476041, -0.358795, -0.41092799999999996, -...   
2  [-0.358349, -0.36698200000000003, -0.750993999...   
3  [-0.718244, -0.593104, -0.6721699999999999, -0...   
4  [-0.634664999

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [27]:
expand_ages_dict = {
    '0d' : '0w',
    '1w' : '1w',
    '2w' : '2w',
    '3w' : '3w',
    '4w' : '4w',
    '6w' : '6w',
    '8w' : '8w',
    '12w' : '12w',
    '6m' : '26w',
    '1yr' : '52w',
    '2yr' : '104w'
}

In [41]:
ST_top_gene_dict['region_1'] = [i.replace("BL6WT.", "") for i in ST_top_gene_dict['region_1']]
ST_top_gene_dict['region_2'] = [i.replace("BL6WT.", "") for i in ST_top_gene_dict['region_2']]

In [42]:
ST_top_gene_dict['age_1'] = ST_top_gene_dict['age_1'].map(expand_ages_dict)
ST_top_gene_dict['age_2'] = ST_top_gene_dict['age_2'].map(expand_ages_dict)

In [21]:
# Saves formated DE genes df
#ST_top_gene_dict.to_csv(os.path.join(path, 'ST_top_gene_dict.csv'))