### Init

In [3]:
#import  RNAseqQueryingInit
## Static parameters
# Directory that is listed below to discover the per-species data files;
# change it when running the notebook outside the original environment.
baseDir='/home/jovyan/efs/all_seq/rnaseq_merged/' #Base directory

import warnings
warnings.filterwarnings('ignore')
import ipywidgets as widgets
import os
import pandas as pd
from IPython.display import display,Javascript
from ipywidgets import  Layout


def run_all(ev):
    """Button callback: ask the notebook front end to execute every cell below this one.

    ``ev`` is the argument supplied by the click handler; it is not used.
    """
    run_below_js=Javascript('IPython.notebook.execute_cells_below()')
    display(run_below_js)

# Example of a valid query string: two comma-separated conditions.
exampleQuery='B-Cell,T-Cell'
# Shared style so that widget descriptions are sized from their text.
style = {'description_width': 'initial'}
# Free-text box in which the user types the comma-separated search conditions.
query_box_layout=widgets.Layout(width='80%', height='50px')
widget_query=widgets.Text(
    description='',
    placeholder='Enter conditions seperated by comma to search and compare: eg. B-Cell,T-Cell',
    layout=query_box_layout,
    style=style,
    disabled=False
)

# File names found in baseDir. dtype=object keeps the .str accessor usable even when the
# directory is empty (an empty Series would otherwise not be string-typed).
baseDir_FnameS=pd.Series(os.listdir(baseDir),dtype=object)
# Species that have a reprocessed matrix: for every "<species>.<...>.npy" file keep the
# text before the first dot. The dot in the pattern is escaped so that only a literal
# ".npy" suffix matches; an unescaped "." is a wildcard and would also accept e.g. "x_npy".
speciesWithReprocessedData=baseDir_FnameS[baseDir_FnameS.str.contains(r'\.npy$')].str.split('.').str[0].unique()

# Species picker, populated with the species discovered in baseDir.
widget_specie=widgets.Select(
    description='Select your species:',
    options=speciesWithReprocessedData,
    value='Homo_sapiens',  # entry selected when the widget is first shown
    style=style,
    disabled=False
)

# "Search" button: a click calls run_all, which executes the cells below.
search_button_layout=Layout(width='20%', height='10%')
button_query = widgets.Button(description="Search", layout=search_button_layout)
button_query.style.button_color='lightblue'
button_query.on_click(run_all)
# Query box and search button shown side by side.
accordion = widgets.HBox(children=[widget_query,button_query])

# Opt-in switch; the export cell reads it to decide whether the matrix is also written as CSV.
checkbox_exportCSV=widgets.Checkbox(description='Export output matrix as CSV', value=False, disabled=False)


# Search and compare RNA-seq profiles based on experimental conditions

| Example comparison query | Return expression matrix|
|---|---|
|T-Cell|Profiles with annotation "T-Cell"|
| T-Cell, B-Cell |  Profiles with annotation "T-Cell" and "B-Cell"|
| single.\*cell.\*neuron, single.\*cell.\*glioblastoma | Profiles with annotation "single cell neuron" and "single cell glioblastoma"|

Query format: Each query is a list of regular expressions delimited by commas, where each regular expression defines one group in the comparison.

[Click here for more info on SkyMap](./README.ipynb)

In [4]:
# Render the query row, the species picker and the CSV-export checkbox, in that order.
display(accordion, widget_specie, checkbox_exportCSV)

HBox(children=(Text(value='', layout=Layout(height='50px', width='80%'), placeholder='Enter conditions seperat…

Select(description='Select your species:', index=4, options=('Canis_familiaris', 'Drosophila_melanogaster', 'M…

Checkbox(value=False, description='Export output matrix as CSV')

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>



# Execution

### More parameters: 

expression_metric: the Kallisto expression metric to load, either "tpm" or "est_counts".
baseDir: if you run the notebook locally, change it to a local directory that mirrors our path.

In [None]:
#from  RNAseqQueryingInit import *
import re

In [None]:
# Expression metric to load; it becomes part of the matrix file name built further down.
expression_metric='tpm' #

In [None]:
# Species currently selected in the species picker widget.
querySpecie=widget_specie.get_interact_value()

In [None]:
# Raw text typed into the query box.
queryStr=widget_query.get_interact_value()

# Validate before parsing. Queries of 3 or more (non-blank) characters are accepted,
# and the message states that same threshold.
if len(queryStr.strip())<3:
    raise ValueError('Please provide a query with at least 3 characters')

# Each comma-separated item is one regular expression, i.e. one comparison group.
# Outer whitespace is trimmed and empty items (e.g. produced by a trailing comma) are
# dropped: an empty pattern would match every annotation in the search below.
listOfQueries=[query for query in re.split(" *, *", queryStr.strip()) if query]

if not listOfQueries:
    raise ValueError('Please provide at least one non-empty condition')


In [None]:
# Map each query label to its regular expression; here the label is the expression itself.
queryLabelToRegexDict={regexLabel: regexLabel for regexLabel in listOfQueries}

## Data loading

### Load the SRS biospecimen annotations

In [None]:
# Select the "notebook" matplotlib backend for this session.
%matplotlib notebook

import pandas as pd
import numpy as np

# Pickled annotations, later searched as text with .str.contains.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
allSRS_pickle_dir='/home/jovyan/efs/all_seq/meta_data/allSRS.with_processed_data.flat.pickle.gz'
%time allSRS=pd.read_pickle(allSRS_pickle_dir)
# Name the index 'SRS' so that later cells can refer to that level by name.
allSRS.index.names=['SRS']

### load in technical metadata

In [None]:
# Pickled table of technical metadata with two-level column labels such as ('SRAmeta','Sample').
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
sra_dump_pickle_dir='/home/jovyan/efs/all_seq/meta_data/sra_dump.fastqc.bowtie_algn.pickle'
%time technical_meta_data_df=pd.read_pickle(sra_dump_pickle_dir)
# Expose the index as a regular ('SRAmeta','Run') column so it can be selected like the others.
# presumably the index holds the run (SRR) accessions — TODO confirm
technical_meta_data_df[('SRAmeta','Run')]=technical_meta_data_df.index

### load the expression matrix

Check files in baseDir directory for more species

In [None]:
def loadDf(fname,mmap_mode='r'):
    """Assemble a DataFrame from three sibling files that share the prefix ``fname``.

    ``fname + '.index.txt'`` and ``fname + '.columns.txt'`` provide the row and column
    labels (one label per line) and ``fname + '.npy'`` provides the values.

    Parameters
    ----------
    fname : str
        Common path prefix of the three files, without any extension.
    mmap_mode : str or None
        Forwarded unchanged to ``np.load``; defaults to ``'r'``.

    Returns
    -------
    pandas.DataFrame
        The labelled matrix; its column axis is named ``'Run'``.
    """
    def _readLabels(path):
        # One label per line; only the newline characters are removed.
        with open(path) as handle:
            return [line.replace("\n","") for line in handle.readlines()]

    rowLabels=_readLabels(fname+'.index.txt')
    columnLabels=_readLabels(fname+'.columns.txt')
    matrix=np.load(fname+".npy",mmap_mode=mmap_mode)
    frame=pd.DataFrame(matrix,index=rowLabels,columns=columnLabels)
    frame.columns.name='Run'
    return frame
data_matrix_dir=baseDir+'/{specie}.gene_symbol.{expression_metric}'.format(specie=querySpecie,
                                            expression_metric=expression_metric)

%time rnaseqDf=loadDf(data_matrix_dir)

### Find the relevant SRS (sample IDs)


In [None]:
myL=[]
for  queryRegex in queryLabelToRegexDict.values():
    %time hitSrsS=allSRS[allSRS.str.contains(queryRegex,case=False)]
    myL.append(hitSrsS)

queryLabel='queryLabel'
mergeS=pd.concat(myL,keys=queryLabelToRegexDict.keys(),names=[queryLabel])
mergeS_noDup=mergeS.groupby(['SRS','queryLabel']).first()
unqiueHitMask=mergeS_noDup.groupby('SRS').size()==1
unqiueHitSrs=unqiueHitMask.index[unqiueHitMask]
mergeS_noDup_unique=mergeS_noDup[mergeS_noDup.index.get_level_values('SRS').isin(unqiueHitSrs)]

Number of SRS per query class

In [None]:
# Count the uniquely matched SRS in each query class (shown as the cell output).
mergeS_noDup_unique.groupby(queryLabel).size()

In [None]:
# Flatten the (SRS, queryLabel) index into columns, then build an SRS -> query label lookup.
uniqueHitsFlatDf=mergeS_noDup_unique.reset_index()
srsToClasses_all=uniqueHitsFlatDf.set_index(['SRS'])['queryLabel']

# Every uniquely matched SRS is used; no further subsetting is applied here.
srsToClasses=srsToClasses_all

### map SRS Ids to SRR Ids

In [None]:
# Rows of the technical metadata whose sample (SRS) is among the classified SRS.
m_SRAMeta=technical_meta_data_df[('SRAmeta','Sample')].isin(srsToClasses.index)
# Explicit copy: a column is added just below, and assigning into a filtered selection
# is the chained-assignment pattern that pandas warns about (SettingWithCopyWarning).
technical_meta_data_df_hit=technical_meta_data_df[m_SRAMeta].copy()

# Look up the query class of every selected row through its sample ID.
SRAMetasrsCorrespondingQuery=srsToClasses.loc[technical_meta_data_df_hit[('SRAmeta','Sample')]].values
technical_meta_data_df_hit[('SRAmeta',queryLabel)]=SRAMetasrsCorrespondingQuery
relevantMetaColsL=[('SRAmeta',queryLabel),('SRAmeta','Study'),('SRAmeta','Sample'),('SRAmeta','Run'),('SRAmeta','ScientificName')]
technical_meta_data_df_sub=technical_meta_data_df_hit[relevantMetaColsL]
# Drop the top-level 'SRAmeta' column label. Copied because a later cell adds a
# 'Description' column to designDf.
designDf=technical_meta_data_df_sub['SRAmeta'].copy()

Top species with # of reprocessed profiles

In [None]:
# Number of runs for every (query class, species) combination in the design table.
profileCountsPerClassAndSpecies=designDf.groupby(['queryLabel','ScientificName']).size()
print ('# expression profiles per query class and species available in SRA: ',profileCountsPerClassAndSpecies)

In [None]:
# Annotation rows of every SRS that matched at least one query (mergeS is taken before
# the "matched exactly one query" filter).
hitSrsIds=mergeS.index.get_level_values('SRS')
isHitSrs=allSRS.index.get_level_values('SRS').isin(hitSrsIds)
hitSrsAllAnnotS=allSRS[isHitSrs]

In [None]:
# Start the SRS -> description text lookup from the annotations of the hit SRS.
srsToTextS=hitSrsAllAnnotS

In [None]:
# Description text for every hit SRS: its accession followed by its annotation text.
# Built from hitSrsAllAnnotS rather than from srsToTextS itself, so that re-running
# this cell does not prepend the "NCBI SRA SRS:" prefix a second time.
srsToTextS=pd.Series(data="NCBI SRA SRS:"+hitSrsAllAnnotS.index+' <br> '+hitSrsAllAnnotS.values,index=hitSrsAllAnnotS.index)

In [None]:
# Attach the description text of each row's sample, looked up by SRS label.
sampleDescriptions=srsToTextS.loc[designDf['Sample']].values
designDf['Description']=sampleDescriptions

### Subset the set of reprocessed data

In [None]:
%time designDf_specie=designDf[(designDf['ScientificName']==querySpecie)&(designDf.Run.isin(rnaseqDf.columns))]
queryDesignDf=designDf_specie

In [None]:
# Number of runs per query class after restricting to the selected species and loaded matrix.
reprocessedCountsPerClass=designDf_specie.groupby(queryLabel).size()
print ('Number of samples per query class that have data reprocessed in SkyMap: ',reprocessedCountsPerClass)


In [None]:
%time hitDf=pd.DataFrame( list(map( lambda srrId: rnaseqDf[srrId],queryDesignDf.Run))).T
hitDf.columns=queryDesignDf.set_index(queryDesignDf.columns.tolist()).index

# Export and download the expression matrix


### Output: fully annotated matrix

Example layout is listed in the cell below

In [None]:
# Show the annotated expression matrix as the cell output.
hitDf

Export the hit dataframe for analysis

In [None]:
# Write the matrix to a pickle file in the current working directory
# (overwritten each time this cell runs).
hitDf.to_pickle('./tmp.profile.pickle')


Hit the following button to download the expression profile matrix. 
If you see a dialogue box when you hit the following download button, click "Leave page"

In [None]:
# Optional CSV export, enabled by the "Export output matrix as CSV" checkbox.
if checkbox_exportCSV.get_interact_value():
    # Import from the public IPython.display module; importing display from
    # IPython.core.display is deprecated.
    from IPython.display import display, HTML
    hitDf.to_csv('./tmp.csv')
    # Offer the file written above through a plain GET form that acts as a download button.
    display(HTML('<form method="get" action="tmp.csv">\
       <button type="submit">Click here to download the profile matrix as an CSV file</button>\
    </form>'))

# Click here to view the analysis associated with the returned matrix
* [Interactive PCA 2D/ 3D](./RNAseqPCA.ipynb)
* [Interactive TSNE](./RNAseqTSNE.ipynb)
* [Volcano plot with t-test (Works only when there are more than 2 conditions)](VolcanoPlot.ipynb)
* [Boxplot showing the expression level of a gene ](QueryGenesInConditions.ipynb)
* [Study level condition correlation heatmap](RNAseqStudyClustermap.ipynb)