### init code

In [1]:
#import  RNAseqQueryingInit
##static params
baseDir='/home/jovyan/efs/all_seq/rnaseq_merged/' #Base directory

import warnings
warnings.filterwarnings('ignore')
import ipywidgets as widgets
import os
import pandas as pd
from IPython.display import display,Javascript
from ipywidgets import  Layout


def run_all(ev):
    """Click callback: ask the notebook front end to execute every cell below this one.

    ``ev`` is whatever the click handler passes in; it is not used.
    """
    run_below_js = Javascript('IPython.notebook.execute_cells_below()')
    display(run_below_js)

exampleQuery='B-Cell,T-Cell'
# Description-width setting, reused by the species selector further down.
style = {'description_width': 'initial'}
# Free-text box for the comma-separated query; no initial value is set, only a placeholder hint.
queryBoxLayout=widgets.Layout(width='50%', height='50px')
widget_query=widgets.Text(
    description='',
    placeholder='Enter conditions seperated by comma to search and compare: eg. B-Cell,T-Cell',
    layout=queryBoxLayout,
    style=style,
    disabled=False,
)

# File names found in the data directory. dtype=object keeps the .str accessor usable even
# when the directory is empty.
baseDir_FnameS=pd.Series(os.listdir(baseDir),dtype=object)
# A species is offered when at least one "<species>.<...>.npy" file exists. The dot is escaped
# so that only a literal ".npy" suffix matches (an unescaped "." accepts any character there).
npyFnameS=baseDir_FnameS[baseDir_FnameS.str.contains(r'\.npy$')]
speciesWithReprocessedData=npyFnameS.str.split('.').str[0].unique()

# Preselect human when it is available; otherwise fall back to the first species found (or to
# no selection at all) instead of passing Select a value that is not among its options.
if 'Homo_sapiens' in speciesWithReprocessedData:
    defaultSpecie='Homo_sapiens'
elif len(speciesWithReprocessedData)>0:
    defaultSpecie=speciesWithReprocessedData[0]
else:
    defaultSpecie=None

widget_specie=widgets.Select(
    options=speciesWithReprocessedData,
    value=defaultSpecie,
    description='Select your species:',
    disabled=False,
    style=style
)

# "Search" button; a click invokes run_all.
searchButtonLayout=Layout(width='20%', height='10%')
button_query = widgets.Button(layout=searchButtonLayout, description="Search")
button_query.style.button_color='lightblue'
button_query.on_click(run_all)
# Horizontal box holding the query text box followed by the search button.
accordion = widgets.HBox(children=[widget_query,button_query])

# Opt-in switch (unticked by default) read by the CSV export cell at the end of the notebook.
checkbox_exportCSV=widgets.Checkbox(description='Export output matrix as CSV',
                                    disabled=False,
                                    value=False)


# Search and compare RNA-seq profiles based on experimental conditions

| Example comparison query | description|
|---|---|
|T-Cell|Just extracting T-Cells|
| T-Cell, B-Cell | Differential expression analysis between profiles with annotation "T-Cell" and "B-Cell"|
| single.\*cell.\*neuron, single.\*cell.\*glioblastoma | Differential expression analysis between profiles with annotation "single cell neuron" and "single cell glioblastoma"|

Query format: each query is a list of regular expressions delimited by commas, where each regular expression defines one group in the comparison.

[Click here for more info on SkyMap](./README.ipynb)

In [3]:
# Render the query row, the species selector and the CSV checkbox, in that order.
display(accordion, widget_specie, checkbox_exportCSV)

<IPython.core.display.Javascript object>

HBox(children=(Text(value='T-Cell', layout=Layout(height='50px', width='50%'), placeholder='Enter conditions s…

Select(description='Select your species:', index=4, options=('Canis_familiaris', 'Drosophila_melanogaster', 'M…

Checkbox(value=False, description='Export output matrix as CSV')

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>



# Execution

### More parameters: 

expression_metric: the Kallisto expression metric to load, either "tpm" or "est_counts".
baseDir: if you run this notebook locally, change it to the local directory that mirrors our data path.

In [4]:
#from  RNAseqQueryingInit import *
import re

In [5]:
expression_metric='tpm' # Kallisto expression metric to load: "tpm" or "est_counts"

In [6]:
# Species currently chosen in the widget_specie selector.
querySpecie=widget_specie.get_interact_value()

In [7]:
# Text currently in the query box, without surrounding blanks.
queryStr=widget_query.get_interact_value().strip()

# Validate before parsing so that an empty or too-short box fails with a clear message.
if len(queryStr)<3:
    raise ValueError('Please provide a query with at least 3 characters')

# One regular expression per comparison group, separated by commas (blanks around a comma are
# ignored). Empty entries, e.g. from a trailing comma, are skipped: an empty pattern would
# match every annotation.
listOfQueries=[query for query in re.split(" *, *", queryStr) if query]
if not listOfQueries:
    raise ValueError('Please provide a query with at least 3 characters')

# Fail early, naming the offending group, when a pattern is not a valid regular expression.
for queryRegex in listOfQueries:
    try:
        re.compile(queryRegex)
    except re.error as err:
        raise ValueError('Invalid regular expression {!r}: {}'.format(queryRegex,err))

# A single group is allowed (plain extraction), so no minimum number of groups is enforced.


In [8]:
# Each comparison group is labelled with the text of its own regular expression (label -> regex).
queryLabelToRegexDict={query: query for query in listOfQueries}

## Data loading

### load in SRS biospecimen annotations

In [9]:
%matplotlib notebook

import pandas as pd
import numpy as np

# Path of the gzipped pickle holding the per-sample (SRS) annotation text.
# NOTE(review): unpickling can execute code embedded in the file - only load pickles from a trusted source.
allSRS_pickle_dir='/home/jovyan/efs/all_seq/meta_data/allSRS.with_processed_data.flat.pickle.gz'
%time allSRS=pd.read_pickle(allSRS_pickle_dir)
# Name the index level so that later cells can address it as 'SRS'.
allSRS.index.names=['SRS']

CPU times: user 824 ms, sys: 112 ms, total: 936 ms
Wall time: 946 ms


### load in technical metadata

In [10]:
# Path of the pickled technical metadata table.
# NOTE(review): unpickling can execute code embedded in the file - only load pickles from a trusted source.
sra_dump_pickle_dir='/home/jovyan/efs/all_seq/meta_data/sra_dump.fastqc.bowtie_algn.pickle'
%time technical_meta_data_df=pd.read_pickle(sra_dump_pickle_dir)
# Expose the index as a regular ('SRAmeta','Run') column so it can be selected together with the
# other metadata columns (presumably the index holds the run/SRR accessions - confirm).
technical_meta_data_df[('SRAmeta','Run')]=technical_meta_data_df.index

CPU times: user 3.4 s, sys: 1.24 s, total: 4.64 s
Wall time: 4.67 s


### load the expression matrix

Check files in baseDir directory for more species

In [11]:
def loadDf(fname,mmap_mode='r'):
    """Load a matrix stored as three sibling files into a labelled DataFrame.

    Parameters
    ----------
    fname : str
        Common path prefix of the three files: ``fname + '.npy'`` holds the
        values, ``fname + '.index.txt'`` the row labels and
        ``fname + '.columns.txt'`` the column labels (one label per line).
    mmap_mode : str or None, default 'r'
        Forwarded unchanged to ``numpy.load``.

    Returns
    -------
    pandas.DataFrame
        The matrix with the row and column labels read from the text files;
        the columns axis is named ``'Run'``.
    """
    def readLabels(path):
        # Return a concrete list (not a lazy, single-use map object) of the
        # lines in `path`, without their line terminators.
        with open(path) as f:
            return [line.rstrip("\n") for line in f]

    myIndex=readLabels(fname+'.index.txt')
    myColumns=readLabels(fname+'.columns.txt')
    tmpMatrix=np.load(fname+".npy",mmap_mode=mmap_mode)
    tmpDf=pd.DataFrame(tmpMatrix,index=myIndex,columns=myColumns)
    tmpDf.columns.name='Run'
    return tmpDf
data_matrix_dir=baseDir+'/{specie}.gene_symbol.{expression_metric}'.format(specie=querySpecie,
                                            expression_metric=expression_metric)

%time rnaseqDf=loadDf(data_matrix_dir)

CPU times: user 92 ms, sys: 16 ms, total: 108 ms
Wall time: 126 ms


### Find the relevant SRS (Sample IDs)


In [12]:
# Collect, for every query group, the annotation entries whose text matches that group's
# regular expression (case-insensitive).
# NOTE(review): assumes allSRS holds no missing values; str.contains yields NaN for those - confirm.
myL=[]
for  queryRegex in queryLabelToRegexDict.values():
    %time hitSrsS=allSRS[allSRS.str.contains(queryRegex,case=False)]
    myL.append(hitSrsS)

# Name of the index level / column that records which query group an entry belongs to.
queryLabel='queryLabel'
# Stack the per-group hits; the group label becomes an additional outer index level.
mergeS=pd.concat(myL,keys=queryLabelToRegexDict.keys(),names=[queryLabel])
# Keep the first entry of every (SRS, group) pair.
mergeS_noDup=mergeS.groupby(['SRS','queryLabel']).first()
# Keep only the samples that matched exactly one group, so no sample is assigned to two groups.
unqiueHitMask=mergeS_noDup.groupby('SRS').size()==1
unqiueHitSrs=unqiueHitMask.index[unqiueHitMask]
mergeS_noDup_unique=mergeS_noDup[mergeS_noDup.index.get_level_values('SRS').isin(unqiueHitSrs)]

CPU times: user 2.22 s, sys: 4 ms, total: 2.23 s
Wall time: 2.23 s


Number of SRS per query class

In [13]:
# Number of uniquely assigned samples (SRS) in each query group.
mergeS_noDup_unique.groupby(queryLabel).size()

queryLabel
T-Cell    1498
dtype: int64

In [14]:
# Turn the (SRS, queryLabel)-indexed hits into a lookup: SRS id -> query group label.
uniqueHitTable=mergeS_noDup_unique.reset_index()
srsToClasses_all=uniqueHitTable.set_index(['SRS'])['queryLabel']

# No further filtering is applied; both names refer to the same Series.
srsToClasses=srsToClasses_all

### map SRS Ids to SRR Ids

In [15]:
# Runs whose sample (SRS) was assigned to a query group.
m_SRAMeta=technical_meta_data_df[('SRAmeta','Sample')].isin(srsToClasses.index)
# Explicit copy: a column is added just below, and it must neither touch nor warn about the
# full metadata table (the notebook silences warnings globally, which would hide the problem).
technical_meta_data_df_hit=technical_meta_data_df[m_SRAMeta].copy()

# Look up each run's query group through its sample id; only the values are kept, so the
# result lines up with technical_meta_data_df_hit row by row.
SRAMetasrsCorrespondingQuery=srsToClasses.loc[technical_meta_data_df_hit[('SRAmeta','Sample')]].values
technical_meta_data_df_hit[('SRAmeta',queryLabel)]=SRAMetasrsCorrespondingQuery
relevantMetaColsL=[('SRAmeta',queryLabel),('SRAmeta','Study'),('SRAmeta','Sample'),('SRAmeta','Run'),('SRAmeta','ScientificName')]
technical_meta_data_df_sub=technical_meta_data_df_hit[relevantMetaColsL]
# Drop the 'SRAmeta' column level. Copied so that later cells can add columns to designDf
# without writing into a slice of another frame.
designDf=technical_meta_data_df_sub['SRAmeta'].copy()

Top species with # of reprocessed profiles

In [16]:
# Run counts for every (query group, species) combination in the design table.
profilesPerClassAndSpecies=designDf.groupby(['queryLabel','ScientificName']).size()
print ('# expression profiles per query class and species available in SRA: ',profilesPerClassAndSpecies)

# expression profiles per query class and species available in SRA:  queryLabel  ScientificName
T-Cell      Homo_sapiens      1675
            Mus_musculus        64
dtype: int64


In [17]:
# Annotation text of every sample hit by any query group (samples hit by several groups included).
hitSrsIds=mergeS.index.get_level_values('SRS')
isHitSrs=allSRS.index.get_level_values('SRS').isin(hitSrsIds)
hitSrsAllAnnotS=allSRS[isHitSrs]

In [18]:
# Working alias of the hit annotations; the next cell rebinds this name to the formatted description text.
srsToTextS=hitSrsAllAnnotS

In [19]:
# Description text per sample: "NCBI SRA SRS:<id> <br> <annotation>".
# Built from hitSrsAllAnnotS rather than from srsToTextS itself, so that re-running this cell
# does not prepend the "NCBI SRA SRS:" prefix a second time.
srsToTextS=pd.Series(data="NCBI SRA SRS:"+hitSrsAllAnnotS.index+' <br> '+hitSrsAllAnnotS.values,index=hitSrsAllAnnotS.index)

In [20]:
# Attach each run's sample description, looked up through its SRS id; only the values are
# assigned, so they are matched to designDf by row position.
sampleIds=designDf['Sample']
designDf['Description']=srsToTextS[sampleIds].values

### Subset the set of reprocessed data

In [21]:
%time designDf_specie=designDf[(designDf['ScientificName']==querySpecie)&(designDf.Run.isin(rnaseqDf.columns))]
queryDesignDf=designDf_specie

CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 35 ms


In [22]:
# Rows of the species-restricted design table per query group.
samplesPerClass=designDf_specie.groupby(queryLabel).size()
print ('Number of samples per query class that have data reprocessed in SkyMap: ',samplesPerClass)


Number of samples per query class that have data reprocessed in SkyMap:  queryLabel
T-Cell    1191
dtype: int64


In [23]:
%time hitDf=pd.DataFrame( list(map( lambda srrId: rnaseqDf[srrId],queryDesignDf.Run))).T
hitDf.columns=queryDesignDf.set_index(queryDesignDf.columns.tolist()).index

CPU times: user 672 ms, sys: 188 ms, total: 860 ms
Wall time: 861 ms


# Export and download the expression matrix


### Output: fully annotated matrix

Example layout is listed in the cell below

In [24]:
# Preview of the annotated expression matrix (the notebook shows a truncated view).
hitDf

queryLabel,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell
Study,SRP075608,SRP075608,SRP075608,SRP075608,SRP075608,SRP064809,SRP064809,SRP064809,SRP064809,SRP064809,...,SRP108237,SRP108237,SRP111077,SRP019939,SRP019939,SRP019939,SRP019939,SRP019939,SRP019939,SRP019939
Sample,SRS1464366,SRS1464362,SRS1464357,SRS1464355,SRS1464353,SRS1114238,SRS1114237,SRS1114244,SRS1114243,SRS1114242,...,SRS2237773,SRS2237774,SRS2335484,SRS403486,SRS403486,SRS403486,SRS403487,SRS403488,SRS403488,SRS403488
Run,SRR3579131,SRR3579127,SRR3579122,SRR3579118,SRR3579114,SRR2648303,SRR2648305,SRR2648293,SRR2648294,SRR2648296,...,SRR5626833,SRR5626834,SRR5803144,SRR791578,SRR791579,SRR791580,SRR791583,SRR791584,SRR791585,SRR791586
ScientificName,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,...,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens
Description,NCBI SRA SRS:SRS1464366 <br> TITLE: AB14 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4 T-cell RNA extract <br>cell type: CD4 T cell <br>cd4 t cell subtype: Central Memory T Cell (TCM) <br>age: 19 days post extraction from host <br>treatment state: Activated Uninfected <br>,NCBI SRA SRS:SRS1464362 <br> TITLE: AB10 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4 T-cell RNA extract <br>cell type: CD4 T cell <br>cd4 t cell subtype: Central Memory T Cell (TCM) <br>age: 19 days post extraction from host <br>treatment state: Activated Uninfected <br>,NCBI SRA SRS:SRS1464357 <br> TITLE: AB05 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4 T-cell RNA extract <br>cell type: CD4 T cell <br>cd4 t cell subtype: Central Memory T Cell (TCM) <br>age: 17 days post extraction from host <br>treatment state: Resting Uninfected <br>,NCBI SRA SRS:SRS1464355 <br> TITLE: AB03 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4 T-cell RNA extract <br>cell type: CD4 T cell <br>cd4 t cell subtype: Central Memory T Cell (TCM) <br>age: 17 days post extraction from host <br>treatment state: Resting Latently Infected <br>,NCBI SRA SRS:SRS1464353 <br> TITLE: AB01 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4 T-cell RNA extract <br>cell type: CD4 T cell <br>cd4 t cell subtype: Central Memory T Cell (TCM) <br>age: 17 days post extraction from host <br>treatment state: Resting Uninfected <br>,NCBI SRA SRS:SRS1114238 <br> TITLE: MT4 HIV 2 RNA-Seq <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: MT4 T-cells <br>treatment: HIV-1 infected <br>cell line: MT4 <br>,NCBI SRA SRS:SRS1114237 <br> TITLE: MT4 HIV 2 MeRIP-Seq <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: MT4 T-cells <br>treatment: HIV-1 infected <br>cell line: MT4 <br>,NCBI SRA SRS:SRS1114244 <br> TITLE: MT4 Control 1 RNA-Seq <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: MT4 T-cells <br>treatment: control <br>cell line: MT4 <br>,NCBI SRA SRS:SRS1114243 <br> TITLE: MT4 Control 1 MeRIP-Seq 
<br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: MT4 T-cells <br>treatment: control <br>cell line: MT4 <br>,NCBI SRA SRS:SRS1114242 <br> TITLE: MT4 Control 2 RNA-Seq <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: MT4 T-cells <br>treatment: control <br>cell line: MT4 <br>,...,NCBI SRA SRS:SRS2237773 <br> TITLE: CD4+_Mock_day1_donor4 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4+ T-cells <br>infection: Mock <br>day: 1 <br>donor: 4 <br>,NCBI SRA SRS:SRS2237774 <br> TITLE: CD4+_Mock_day4_donor4 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4+ T-cells <br>infection: Mock <br>day: 4 <br>donor: 4 <br>,"NCBI SRA SRS:SRS2335484 <br> TITLE: Pooled 8 weeks sample <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Human CD3+,CD45 + T-cells in humanized mice <br>cell type: CD3+,CD45 + T-cells <br>sample age: 8 weeks <br>gender: male <br>",NCBI SRA SRS:SRS403486 <br> TITLE: GSM1104129: Jurkat Cells 216 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403486 <br> TITLE: GSM1104129: Jurkat Cells 216 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403486 <br> TITLE: GSM1104129: Jurkat Cells 216 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403487 <br> TITLE: GSM1104130: Jurkat Cells 245 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403488 <br> TITLE: GSM1104131: Jurkat Cells 236 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403488 <br> TITLE: GSM1104131: Jurkat Cells 236 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403488 <br> TITLE: GSM1104131: Jurkat Cells 236 <br>DESCRIPTION: 
source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>
TRDD2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TRDD1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TRDD3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD4-17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD4-4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD2-15,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,16.605499,28.545300,16.635300,0.000000,0.000000,0.000000,0.000000
IGHD3-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,19.456200,0.000000,0.000000,0.000000
IGHD3-16,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD5-18,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD6-25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [25]:
# Save the annotated matrix as a pickle in the current working directory.
hitDf.to_pickle('./tmp.profile.pickle')


Hit the following button to download the expression profile matrix. 
If you see a dialogue box when you hit the following download button, click "Leave page"

In [26]:
# Only when the "Export output matrix as CSV" box is ticked: write the matrix to ./tmp.csv and
# show a button whose form requests that file.
if checkbox_exportCSV.get_interact_value():
    # Public import location; importing display from IPython.core.display is deprecated.
    from IPython.display import display, HTML
    hitDf.to_csv('./tmp.csv')
    display(HTML('<form method="get" action="tmp.csv">\
       <button type="submit">Download profile matrix as CSV!</button>\
    </form>'))