### Init

This notebook shows how to slice out the expression level of a single gene across all sequencing runs in under 10 ms.

#### code for data loading

In [None]:
import pandas as pd
import numpy as np

In [None]:
def _readLabels(path):
    """Return the lines of the text file at ``path`` as a list, newlines removed."""
    with open(path) as f:
        return [line.rstrip("\n") for line in f]


def loadDf(fname,mmap_mode='r'):
    """Load a labelled matrix stored as three sibling files into a DataFrame.

    Parameters
    ----------
    fname : str
        Common path prefix. ``fname + '.npy'`` holds the matrix,
        ``fname + '.index.txt'`` holds one row label per line and
        ``fname + '.columns.txt'`` holds one column label per line.
    mmap_mode : str or None, default 'r'
        Passed through unchanged to ``numpy.load``.

    Returns
    -------
    pandas.DataFrame
        The loaded matrix with the row and column labels attached; the
        column axis is named ``'Run'``.
    """
    # Labels are materialised as lists rather than handed to pandas as lazy,
    # one-shot ``map`` iterators.
    myIndex=_readLabels(fname+'.index.txt')
    myColumns=_readLabels(fname+'.columns.txt')
    tmpMatrix=np.load(fname+".npy",mmap_mode=mmap_mode)
    tmpDf=pd.DataFrame(tmpMatrix,index=myIndex,columns=myColumns)
    tmpDf.columns.name='Run'
    return tmpDf

#### make ipywidgets

In [None]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display,Javascript
from ipywidgets import  Layout
import re
import os

def run_all(ev):
    """Click handler: emit the JavaScript call that executes the cells below.

    ``ev`` is whatever the widget passes to its click callbacks; it is unused.
    """
    display(Javascript('IPython.notebook.execute_cells_below()'))


# Search button; clicking it triggers run_all.
button_query = widgets.Button(
    description="Search",
    layout=Layout(width='20%', height='100%'),
)
button_query.on_click(run_all)
button_query.style.button_color = 'lightblue'


# Shared widget style, also used by the species selector.
style = {'description_width': 'initial'}

# Free-text box in which the user types the gene to query.
widget_query = widgets.Text(
    description='',
    placeholder='Enter a single Ensembl gene name: eg. TP53',
    layout=widgets.Layout(width='60%', height='50px'),
    style=style,
    disabled=False,
)


Species selection widget

In [None]:
baseDir='/home/jovyan/efs/all_seq/rnaseq_merged/' #Base directory
# Every file name in the base directory, as a Series so the .str accessors apply.
baseDir_FnameS=pd.Series(os.listdir(baseDir))
# A species name is the text before the first '.' of each '*.npy' file name.
# A literal suffix test is used here: the former pattern '.npy$' was a regular
# expression whose unescaped '.' matched any character (e.g. a name ending
# in '_npy').
speciesWithReprocessedData=baseDir_FnameS[baseDir_FnameS.str.endswith('.npy')].str.split('.').str[0].unique()
# List box offering the species found above; 'Homo_sapiens' is preselected.
widget_specie=widgets.Select(
    options=speciesWithReprocessedData,
    value='Homo_sapiens',
    # rows=10,
    description='Select your species:',
    disabled=False,
    style=style
)

In [None]:
# Stack the query box, the species selector and the search button vertically.
accordion = widgets.VBox(
    children=[widget_query, widget_specie, button_query],
)

### Query a single gene

In [None]:
# Render the query form (gene text box, species selector, search button).
display(accordion)

### Execute

In [None]:
# Gene name typed into the search box. Surrounding whitespace is trimmed so a
# stray leading/trailing space does not make the later label lookup fail.
queryGene=widget_query.get_interact_value().strip()

In [None]:

expression_metric='tpm' #offer Kallisto expression metric: ["tpm","est_counts"]
specie=widget_specie.get_interact_value() #Check files in baseDir directory for more species
# Path prefix shared by the '.npy', '.index.txt' and '.columns.txt' files that
# loadDf reads. os.path.join avoids the doubled '/' that plain concatenation
# produced, since baseDir already ends with a separator.
data_matrix_dir=os.path.join(baseDir,
                             '{specie}.gene_symbol.{expression_metric}'.format(
                                 specie=specie,
                                 expression_metric=expression_metric))

#a memory mapped dataframe
rnaseqDf=loadDf(data_matrix_dir)
print ('Dimension of expression matrix: (# of genes, # of sequencing run) ',rnaseqDf.shape)

In [None]:
# Select the row labelled with the queried gene from the expression matrix,
# timing the lookup with the %time magic.
%time hitGeneS=rnaseqDf.loc[queryGene]

In [None]:
# Report how many values (one per column of the matrix) were extracted.
print ('# sequencing runs extracted: ',len(hitGeneS))

Distribution of expression over the 1000 runs with the highest expression

In [None]:
# Replace the slice with an independent copy (timed) before further processing.
%time hitGeneS=hitGeneS.copy() #unless it is copied, it is still memory mapped

In [None]:
# Keep the 1000 entries with the largest values for the queried gene (timed).
%time nLargestSrr=hitGeneS.nlargest(1000)

In [None]:
# Path of the pickled metadata table (despite the name, this is a file path).
sra_dump_pickle_dir='/home/jovyan/efs/all_seq/meta_data/sra_dump.pickle'
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted location.
%time technical_meta_data_df=pd.read_pickle(sra_dump_pickle_dir)

In [None]:
# Expose the table's index as a regular 'Run' column.
technical_meta_data_df['Run'] = technical_meta_data_df.index

In [None]:
# Turn the top-expression Series into a one-column DataFrame so that
# annotation columns can be added next to it.
tmpDf=nLargestSrr.to_frame()

In [None]:
# Attach each run's BioProject accession. reindex() leaves NaN for runs that
# are absent from the metadata table, whereas indexing with a list of labels
# raises KeyError on missing labels in current pandas.
tmpDf['BioProj']=technical_meta_data_df['proj_accession_BioProject'].reindex(tmpDf.index)

In [None]:
# BioProject annotation table, keyed by its 'accession' column.
bioProjectAnnotDf = (
    pd.read_pickle('/home/jovyan/efs/all_seq/meta_data/bioproject.pickle')
    .set_index('accession')
)

In [None]:
# Look up each run's project title by its BioProject accession. Series.map()
# yields NaN for runs without an accession or with an accession missing from
# the annotation table, instead of raising KeyError on the unknown label.
tmpDf['Title']=tmpDf['BioProj'].map(bioProjectAnnotDf['Title'])


In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode()

# Layout for the per-study box plot: grouped boxes, no legend, a bare x axis
# (no grid, lines, ticks or tick labels) and a y-axis title naming the gene.
layout = go.Layout(
    boxmode='group',
    showlegend=False,
    xaxis={
        'autorange': True,
        'showgrid': False,
        'zeroline': False,
        'showline': False,
        'ticks': '',
        'showticklabels': False,
    },
    yaxis=dict(
        title="Expression level of {} measured in log2 scale".format(queryGene),
    ),
)


In [None]:
import textwrap

# Reusable wrapper that breaks text into lines of at most 30 characters.
wrapper = textwrap.TextWrapper(width=30)

In [None]:
# Wrap every title (cast to str first) and join the resulting lines with HTML
# line breaks, so that long titles span several lines where they are displayed.
tmpDf['Title_wrapped'] = tmpDf['Title'].astype(str).map(
    lambda title: "<br>".join(wrapper.wrap(title))
)

In [None]:
# Minimum number of rows a BioProject must contribute to be plotted.
sampleSizeMinThreshold = 5
# Boolean Series indexed by BioProject: True where the project has enough rows.
m = tmpDf.groupby('BioProj').size().ge(sampleSizeMinThreshold)

In [None]:
# Accessions of the BioProjects that met the sample-size threshold.
selectedStudies = m.loc[m].index

In [None]:
myL=[]
# Median expression per study, restricted to the studies that passed the
# sample-size filter. Only the queried gene's column is aggregated: the frame
# also holds text columns (accession titles), and median() over those raises
# TypeError on pandas 2.x.
medExpressionS=tmpDf[tmpDf.BioProj.isin(selectedStudies)
                    ].groupby('BioProj')[[queryGene]].median()

# Studies ordered from highest to lowest median expression; traces are added
# to the figure in this order.
plotStudyOrderS=medExpressionS[queryGene].sort_values(ascending=False)

for myQueryLabel in plotStudyOrderS.index:
    # Rows belonging to the current study.
    tmpDf2=tmpDf[tmpDf['BioProj']==myQueryLabel]
    myL.append(go.Box(
        # log2(x+1) keeps zero expression values finite.
        y=np.log2(tmpDf2[queryGene]+1),
        x="Accession: "+tmpDf2['BioProj']+"<br>Title: "+tmpDf2['Title_wrapped'],
    ))

fig=go.Figure(data=myL,layout=layout)


### Figures

In [None]:
# Render the per-study box plot inline.
iplot(fig )

SRR to expression level series

In [None]:
# Report the length of the full per-run expression series for the queried gene.
print ('# of expression profiles analyzed: ',len(hitGeneS))