In [1]:
import  RNAseqQueryingInit

# Search and compare RNA-seq profiles based on experimental conditions

| Example comparison query | description|
|---|---|
|T-Cell|Just extracting T-Cells|
| T-Cell, B-Cell | Differential expression analysis between profiles with annotation "T-Cell" and "B-Cell"|
| single.\*cell.\*neuron, single.\*cell.\*glioblastoma | Differential expression analysis between profiles with annotation "single cell neuron" and "single cell glioblastoma"|

Query format: Each query is a list of regular expressions delimited by commas, where each regular expression defines one group in the comparison. 

[Click here for more info on SkyMap](./README.ipynb)

In [38]:
# Render the query accordion, the species selector and the CSV-export checkbox, in that order.
display(
    RNAseqQueryingInit.accordion,
    RNAseqQueryingInit.widget_specie,
    RNAseqQueryingInit.checkbox_exportCSV,
)

<IPython.core.display.Javascript object>

HBox(children=(Text(value='T-Cell,B-Cell', layout=Layout(height='50px', width='50%'), placeholder='Enter condi…

Select(description='Select your species:', index=4, options=('Canis_familiaris', 'Drosophila_melanogaster', 'M…

Checkbox(value=False, description='Export CSV')

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>


### Example analysis: Simple differential expression analysis between the thousands of human B-Cells and T-Lymphocytes expression profiles  



This notebook is a template which consists of the following steps:
1. Setting input free-text regex queries to define the classes of experimental conditions (queryLabelToRegexDict)
2. Querying the reprocessed data, which consists of >400,000 expression profiles
3. Generating a fully annotated expression matrix
4. DE analysis such as volcano plot, correlation heatmap and PCA. (Most plots are interactive; you can download them with a simple click)

### More parameters: 

expression_metric: the Kallisto expression metric to load, either "tpm" or "est_counts".
baseDir: if running locally, change it to mirror our path.

In [39]:
from  RNAseqQueryingInit import *
import re

In [40]:
# Kallisto expression metric to load: "tpm" or "est_counts".
expression_metric = "tpm"

In [41]:
# Species currently selected in the widget; used below to pick the expression
# matrix file and to filter the runs.
querySpecie = RNAseqQueryingInit.widget_specie.get_interact_value()

In [42]:
# Free-text query typed into the widget: a comma-separated list of regular
# expressions, one per comparison group (surrounding whitespace is ignored).
queryStr=RNAseqQueryingInit.widget_query.get_interact_value().strip()

# Validate before parsing; queries of 3 or more characters are accepted.
if len(queryStr)<3:
    raise ValueError('Please provide a query with at least 3 characters')

# Drop empty entries (e.g. from a trailing or doubled comma): an empty regex
# matches every annotation and would turn that "group" into the whole database.
listOfQueries=[query for query in re.split(" *, *", queryStr) if query]
if not listOfQueries:
    raise ValueError('Please provide at least one non-empty condition')

# A single condition is allowed on purpose (plain extraction without a comparison).


In [43]:
# Each query string serves both as the class label and as the regex defining that class.
queryLabelToRegexDict={query: query for query in listOfQueries}

# Data loading

### Load the SRS biospecimen annotations

In [44]:
%matplotlib notebook

import pandas as pd
import numpy as np

# Location of the pickled annotation table that is loaded below as allSRS.
# NOTE(review): absolute path of the hosted environment; adjust when running
# elsewhere, and only unpickle files from a trusted source.
allSRS_pickle_dir='/home/jovyan/efs/all_seq/meta_data/allSRS.with_processed_data.flat.pickle.gz'
%time allSRS=pd.read_pickle(allSRS_pickle_dir)
# Name the index 'SRS' so that later cells can group and select by that name.
allSRS.index.names=['SRS']

CPU times: user 836 ms, sys: 68 ms, total: 904 ms
Wall time: 911 ms


### load in technical metadata

In [45]:
# Location of the pickled technical metadata table.
# NOTE(review): absolute path of the hosted environment; only unpickle files
# from a trusted source.
sra_dump_pickle_dir='/home/jovyan/efs/all_seq/meta_data/sra_dump.fastqc.bowtie_algn.pickle'
%time technical_meta_data_df=pd.read_pickle(sra_dump_pickle_dir)
# Expose the index as a regular ('SRAmeta','Run') column so it can be selected
# together with the other metadata columns later on.
technical_meta_data_df[('SRAmeta','Run')]=technical_meta_data_df.index

CPU times: user 3.89 s, sys: 1.24 s, total: 5.13 s
Wall time: 5.13 s


### load the expression matrix

Check files in baseDir directory for more species

In [46]:
def loadDf(fname,mmap_mode='r'):
    """Load a matrix stored as a ``.npy`` file plus two text files of labels.

    Three files sharing the prefix ``fname`` are read:
    ``fname + '.index.txt'`` (one row label per line),
    ``fname + '.columns.txt'`` (one column label per line) and
    ``fname + '.npy'`` (the values).

    Parameters
    ----------
    fname : str
        Common path prefix of the three files.
    mmap_mode : str or None, optional
        Forwarded to ``numpy.load``. The default ``'r'`` memory-maps the
        array read-only instead of reading it fully into memory.

    Returns
    -------
    pandas.DataFrame
        The matrix labelled with the row/column labels; the column axis is
        named ``'Run'``.
    """
    def _readLabels(path):
        # Return a concrete list (not a lazy one-shot ``map`` object) so the
        # labels can be inspected or reused safely.
        with open(path) as f:
            return [line.rstrip("\n") for line in f]

    myIndex=_readLabels(fname+'.index.txt')
    myColumns=_readLabels(fname+'.columns.txt')
    tmpMatrix=np.load(fname+".npy",mmap_mode=mmap_mode)
    tmpDf=pd.DataFrame(tmpMatrix,index=myIndex,columns=myColumns)
    tmpDf.columns.name='Run'
    return tmpDf
data_matrix_dir=baseDir+'/{specie}.gene_symbol.{expression_metric}'.format(specie=querySpecie,
                                            expression_metric=expression_metric)

%time rnaseqDf=loadDf(data_matrix_dir)

CPU times: user 108 ms, sys: 4 ms, total: 112 ms
Wall time: 121 ms


# Find the relevant SRS (Sample IDs)


In [47]:
myL=[]
for  queryRegex in queryLabelToRegexDict.values():
    %time hitSrsS=allSRS[allSRS.str.contains(queryRegex,case=False)]
    myL.append(hitSrsS)

queryLabel='queryLabel'
mergeS=pd.concat(myL,keys=queryLabelToRegexDict.keys(),names=[queryLabel])
mergeS_noDup=mergeS.groupby(['SRS','queryLabel']).first()
unqiueHitMask=mergeS_noDup.groupby('SRS').size()==1
unqiueHitSrs=unqiueHitMask.index[unqiueHitMask]
mergeS_noDup_unique=mergeS_noDup[mergeS_noDup.index.get_level_values('SRS').isin(unqiueHitSrs)]

CPU times: user 2.27 s, sys: 0 ns, total: 2.27 s
Wall time: 2.27 s
CPU times: user 2.05 s, sys: 0 ns, total: 2.05 s
Wall time: 2.06 s


Number of SRS per query class

In [48]:
# Count the uniquely-assigned SRS in each query class.
mergeS_noDup_unique.groupby(queryLabel).size()

queryLabel
T-Cell    1498
B-Cell     881
dtype: int64

In [49]:
# Map every uniquely-assigned SRS id to its query class: move the 'queryLabel'
# index level into a column and keep just that column, indexed by SRS.
srsToClasses_all=mergeS_noDup_unique.reset_index(level='queryLabel')['queryLabel']

srsToClasses=srsToClasses_all

### map SRS Ids to SRR Ids

In [50]:
# Keep only the rows whose sample (SRS) was assigned to exactly one query class.
m_SRAMeta=technical_meta_data_df[('SRAmeta','Sample')].isin(srsToClasses.index)
# Explicit copy: a column is added just below, and assigning into a filtered
# slice of the full table triggers pandas' SettingWithCopyWarning.
technical_meta_data_df_hit=technical_meta_data_df[m_SRAMeta].copy()

# Look up the query class of each row through its SRS id and attach it as a column.
SRAMetasrsCorrespondingQuery=srsToClasses.loc[technical_meta_data_df_hit[('SRAmeta','Sample')]].values
technical_meta_data_df_hit[('SRAmeta',queryLabel)]=SRAMetasrsCorrespondingQuery
# Columns that make up the design table.
relevantMetaColsL=[('SRAmeta',queryLabel),('SRAmeta','Study'),('SRAmeta','Sample'),('SRAmeta','Run'),('SRAmeta','ScientificName')]
technical_meta_data_df_sub=technical_meta_data_df_hit[relevantMetaColsL]
# Drop the 'SRAmeta' column level; copy so the design table is an independent
# frame that can safely receive further columns.
designDf=technical_meta_data_df_sub['SRAmeta'].copy()

Top species with # of reprocessed profiles

In [51]:
# Number of runs per (query class, species) pair in the design table.
print(
    '# expression profiles per query class and species available in SRA: ',
    designDf.groupby(['queryLabel','ScientificName']).size()
)

# expression profiles per query class and species available in SRA:  queryLabel  ScientificName
B-Cell      Homo_sapiens       959
            Mus_musculus       246
T-Cell      Homo_sapiens      1675
            Mus_musculus        64
dtype: int64


In [52]:
# Annotations of every SRS that was hit by at least one query (this includes
# SRS that matched several classes, since mergeS is not yet de-duplicated).
hitSrsAllAnnotS=allSRS.loc[
    allSRS.index.get_level_values('SRS').isin(mergeS.index.get_level_values('SRS'))
]

In [53]:
# Start from the annotation text of the hit samples.
srsToTextS = hitSrsAllAnnotS

In [54]:
# Prefix each annotation with its SRS id. Built from hitSrsAllAnnotS rather
# than from srsToTextS itself, so re-running this cell does not stack the
# "NCBI SRA SRS:" prefix a second time.
srsToTextS=pd.Series(
    data="NCBI SRA SRS:"+hitSrsAllAnnotS.index+' <br> '+hitSrsAllAnnotS.values,
    index=hitSrsAllAnnotS.index,
)

In [55]:
# Attach the annotation text of each run's sample as a 'Description' column.
# assign() returns a new frame instead of writing into a frame derived by
# slicing, which avoids pandas' SettingWithCopyWarning.
designDf=designDf.assign(Description=srsToTextS.loc[designDf['Sample']].values)

### Subset the set of reprocessed data

In [56]:
# Keep the runs of the selected species that also have a column in the loaded
# expression matrix.
%time designDf_specie=designDf[(designDf['ScientificName']==querySpecie)&(designDf.Run.isin(rnaseqDf.columns))]
queryDesignDf=designDf_specie

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 38.4 ms


In [57]:
# Number of runs per query class that remain after the species / availability filter.
print(
    'Number of samples per query class that have data reprocessed in SkyMap: ',
    designDf_specie.groupby(queryLabel).size()
)


Number of samples per query class that have data reprocessed in SkyMap:  queryLabel
B-Cell     485
T-Cell    1191
dtype: int64


In [58]:
# Pull the expression column of every selected run out of the matrix, stack
# them as rows and transpose back so that each run is a column again.
%time hitDf=pd.DataFrame( list(map( lambda srrId: rnaseqDf[srrId],queryDesignDf.Run))).T
# Replace the column labels with a MultiIndex built from all design columns,
# so every expression column carries its full annotation.
hitDf.columns=queryDesignDf.set_index(queryDesignDf.columns.tolist()).index

CPU times: user 1.06 s, sys: 296 ms, total: 1.36 s
Wall time: 6.18 s


### Output: fully annotated expression matrix

Example layout is listed in the cell below

In [59]:
# Show the annotated matrix: the column labels carry the design table fields.
hitDf

queryLabel,B-Cell,B-Cell,B-Cell,B-Cell,B-Cell,B-Cell,B-Cell,B-Cell,B-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell,T-Cell
Study,SRP015715,SRP015715,SRP045500,SRP045500,SRP077016,SRP077016,SRP077016,SRP077016,SRP072506,SRP075608,...,SRP108237,SRP108237,SRP111077,SRP019939,SRP019939,SRP019939,SRP019939,SRP019939,SRP019939,SRP019939
Sample,SRS362251,SRS362252,SRS684342,SRS684323,SRS1523416,SRS1523415,SRS1523414,SRS1523413,SRS1366402,SRS1464366,...,SRS2237773,SRS2237774,SRS2335484,SRS403486,SRS403486,SRS403486,SRS403487,SRS403488,SRS403488,SRS403488
Run,SRR567561,SRR567562,SRR1551103,SRR1551084,SRR3714038,SRR3714037,SRR3714036,SRR3714035,SRR3309374,SRR3579131,...,SRR5626833,SRR5626834,SRR5803144,SRR791578,SRR791579,SRR791580,SRR791583,SRR791584,SRR791585,SRR791586
ScientificName,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,...,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens,Homo_sapiens
Description,NCBI SRA SRS:SRS362251 <br> TITLE: low c-Myc. P493-6 T=0HR - RNA-Seq <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: P493-6 cells with low c-Myc expression <br>cell line: P493-6 <br>cell type: B-cell lymphoma <br>,NCBI SRA SRS:SRS362252 <br> TITLE: high c-Myc. P493-6 T=24HR - RNA-Seq <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: P493-6 cells with high c-Myc expression <br>cell line: P493-6 <br>cell type: B-cell lymphoma <br>,"NCBI SRA SRS:SRS684342 <br> TITLE: lib344 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Whole Blood <br>age: 35 <br>cellcount: 640619 <br>celltype: B-cells <br>collectiondate: August 24 2012 <br>diseasestatus: MS posttreatment <br>donorid: 56 <br>gender: F <br>index: 10 <br>race: White <br>samplename: 56_Bcells <br>smoker: N <br>time since steroid dose: 1 month, IV <br>years since diagnosis: -- <br>",NCBI SRA SRS:SRS684323 <br> TITLE: lib325 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Whole Blood <br>age: 42 <br>cellcount: 1275892 <br>celltype: B-cells <br>collectiondate: June 19 2012 <br>diseasestatus: Type 1 Diabetes <br>donorid: 40 <br>gender: M <br>index: 11 <br>race: White <br>samplename: 40_Bcells <br>smoker: -- <br>time since steroid dose: -- <br>years since diagnosis: 6 <br>,NCBI SRA SRS:SRS1523416 <br> TITLE: SEM_NascentRNASeq_Rep4 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Leukemia cell line (SEM) <br>cell line: Leukemia cell line; SEM <br>cell type: Paediatric pro B-cell line derived from ALL with t(4;11)(q21;q23) translocation <br>molecule subtype: nascent RNA <br>,NCBI SRA SRS:SRS1523415 <br> TITLE: SEM_NascentRNASeq_Rep3 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Leukemia cell line (SEM) <br>cell line: Leukemia cell line; SEM <br>cell type: Paediatric pro B-cell line derived from ALL with t(4;11)(q21;q23) translocation <br>molecule subtype: nascent RNA <br>,NCBI SRA SRS:SRS1523414 <br> TITLE: SEM_NascentRNASeq_Rep2 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Leukemia cell 
line (SEM) <br>cell line: Leukemia cell line; SEM <br>cell type: Paediatric pro B-cell line derived from ALL with t(4;11)(q21;q23) translocation <br>molecule subtype: nascent RNA <br>,NCBI SRA SRS:SRS1523413 <br> TITLE: SEM_NascentRNASeq_Rep1 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Leukemia cell line (SEM) <br>cell line: Leukemia cell line; SEM <br>cell type: Paediatric pro B-cell line derived from ALL with t(4;11)(q21;q23) translocation <br>molecule subtype: nascent RNA <br>,"NCBI SRA SRS:SRS1366402 <br> TITLE: MD901 cells with scramble shRNA control, replicate 3 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: MD901 cells with scramble shRNA control <br>cell line: MD901 <br>cell type: diffuse large B-cell lymphoma cell line <br>",NCBI SRA SRS:SRS1464366 <br> TITLE: AB14 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4 T-cell RNA extract <br>cell type: CD4 T cell <br>cd4 t cell subtype: Central Memory T Cell (TCM) <br>age: 19 days post extraction from host <br>treatment state: Activated Uninfected <br>,...,NCBI SRA SRS:SRS2237773 <br> TITLE: CD4+_Mock_day1_donor4 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4+ T-cells <br>infection: Mock <br>day: 1 <br>donor: 4 <br>,NCBI SRA SRS:SRS2237774 <br> TITLE: CD4+_Mock_day4_donor4 <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: CD4+ T-cells <br>infection: Mock <br>day: 4 <br>donor: 4 <br>,"NCBI SRA SRS:SRS2335484 <br> TITLE: Pooled 8 weeks sample <br>SCIENTIFIC_NAME: Homo sapiens <br>source_name: Human CD3+,CD45 + T-cells in humanized mice <br>cell type: CD3+,CD45 + T-cells <br>sample age: 8 weeks <br>gender: male <br>",NCBI SRA SRS:SRS403486 <br> TITLE: GSM1104129: Jurkat Cells 216 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403486 <br> TITLE: GSM1104129: Jurkat Cells 216 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403486 <br> 
TITLE: GSM1104129: Jurkat Cells 216 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403487 <br> TITLE: GSM1104130: Jurkat Cells 245 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403488 <br> TITLE: GSM1104131: Jurkat Cells 236 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403488 <br> TITLE: GSM1104131: Jurkat Cells 236 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>,NCBI SRA SRS:SRS403488 <br> TITLE: GSM1104131: Jurkat Cells 236 <br>DESCRIPTION: source: immortalized T-cell line <br>SCIENTIFIC_NAME: Homo sapiens <br>cell line: TIB-152 <br>
TRDD2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TRDD1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TRDD3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD4-17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD4-4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD2-15,0.000000,0.000000,38.765598,3.861290,20.200600,16.686501,4.846030,53.599300,0.000000,0.000000,...,0.000000,0.000000,0.000000,16.605499,28.545300,16.635300,0.000000,0.000000,0.000000,0.000000
IGHD3-22,0.000000,0.000000,0.000000,1.930640,20.200600,5.562180,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,19.456200,0.000000,0.000000,0.000000
IGHD3-16,0.000000,0.000000,3.730820,26.841000,24.240700,4.776560,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD5-18,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
IGHD6-25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [60]:
# Persist the annotated matrix as a pickle in the working directory.
hitDf.to_pickle("./tmp.profile.pickle")

### Export and download the expression matrix

Hit the following button to download the expression profile matrix. 
If you see a dialogue box when you hit the following download button, click "Leave page"

In [61]:
# Only export when the "Export CSV" checkbox is ticked.
if RNAseqQueryingInit.checkbox_exportCSV.get_interact_value():
    # IPython.display is the public import location; importing `display` from
    # IPython.core.display is deprecated.
    from IPython.display import display, HTML

    # NOTE(review): head() writes only the first 5 rows of the matrix, while
    # the button below advertises the profile matrix - confirm whether this
    # truncation is intended before relying on the download.
    hitDf.head().to_csv('./tmp.csv')

    # Plain HTML form whose submit button makes the browser fetch tmp.csv.
    display(HTML(
        '<form method="get" action="tmp.csv">'
        '       <button type="submit">Download profile matrix as CSV!</button>'
        '    </form>'
    ))