In [1]:
#import  RNAseqQueryingInit
##static params
baseDir='/home/ec2-user/efs/all_seq/rnaseq_merged/' #Base directory

import warnings
warnings.filterwarnings('ignore')
import ipywidgets as widgets
import os
import pandas as pd
import re

In [2]:
exampleQuery='B-Cell,T-Cell' 

In [3]:
baseDir_FnameS=pd.Series(os.listdir(baseDir))
speciesWithReprocessedData=baseDir_FnameS[baseDir_FnameS.str.contains('.npy$')].str.split('.').str[0].unique()


## Data loading

### load in SRS biospecieman annotations

In [4]:
import pandas as pd
import numpy as np

allSRS_pickle_dir='~/efs/all_seq/meta_data/allSRS.with_processed_data.flat.pickle.gz'
allSRS=pd.read_pickle(allSRS_pickle_dir)
allSRS.index.names=['SRS']

### load in technical metadata

In [5]:
sra_dump_pickle_dir='~/efs/all_seq/meta_data/sra_dump.fastqc.bowtie_algn.pickle'
%time technical_meta_data_df=pd.read_pickle(sra_dump_pickle_dir)
technical_meta_data_df[('SRAmeta','Run')]=technical_meta_data_df.index

CPU times: user 4.06 s, sys: 906 ms, total: 4.97 s
Wall time: 7.38 s


### load the expression matrix

Check files in baseDir directory for more species

In [104]:
expression_metric='tpm' #
queryLabel='queryLabel'

def loadDf(fname,mmap_mode='r'):
    with open(fname+'.index.txt') as f:
        myIndex=map(lambda s:s.replace("\n",""), f.readlines())
    with open(fname+'.columns.txt') as f:
        myColumns=map(lambda s:s.replace("\n",""), f.readlines())
    tmpMatrix=np.load(fname+".npy",mmap_mode=mmap_mode)
    tmpDf=pd.DataFrame(tmpMatrix,index=myIndex,columns=myColumns)
    tmpDf.columns.name='Run'
    return tmpDf


### define layout

In [105]:
def returnDesignDf(queryLabelToRegexDict):
    myL=[]
    for  queryRegex in queryLabelToRegexDict.values():
        hitSrsS=allSRS[allSRS.str.contains(queryRegex,case=False)]
        myL.append(hitSrsS)

    queryLabel='queryLabel'
    mergeS=pd.concat(myL,keys=queryLabelToRegexDict.keys(),names=[queryLabel])
    mergeS_noDup=mergeS.groupby(['SRS','queryLabel']).first()
    unqiueHitMask=mergeS_noDup.groupby('SRS').size()==1
    unqiueHitSrs=unqiueHitMask.index[unqiueHitMask]
    mergeS_noDup_unique=mergeS_noDup[mergeS_noDup.index.get_level_values('SRS').isin(unqiueHitSrs)]

    #Number of SRS per query class
    mergeS_noDup_unique.groupby(queryLabel).size()

    srsToClasses_all=mergeS_noDup_unique.reset_index().set_index(['SRS'])['queryLabel']

    srsToClasses=srsToClasses_all

    ### map SRS Ids to SRR Ids

    m_SRAMeta=technical_meta_data_df[('SRAmeta','Sample')].isin(srsToClasses.index)
    technical_meta_data_df_hit=technical_meta_data_df[m_SRAMeta]

    SRAMetasrsCorrespondingQuery=srsToClasses.loc[technical_meta_data_df_hit[('SRAmeta','Sample')]].values
    technical_meta_data_df_hit[('SRAmeta',queryLabel)]=SRAMetasrsCorrespondingQuery
    relevantMetaColsL=[('SRAmeta',queryLabel),('SRAmeta','Study'),('SRAmeta','Sample'),('SRAmeta','Run'),('SRAmeta','ScientificName')]
    technical_meta_data_df_sub=technical_meta_data_df_hit[relevantMetaColsL]
    designDf=technical_meta_data_df_sub['SRAmeta']
    
    hitSrsAllAnnotS=allSRS[allSRS.index.get_level_values('SRS').isin(mergeS.index.get_level_values('SRS'))]
    srsToTextS=hitSrsAllAnnotS
    srsToTextS=pd.Series(data="NCBI SRA SRS:"+srsToTextS.index+' <br> '+srsToTextS.values,index=srsToTextS.index)
    designDf['Description']=srsToTextS[designDf.Sample].values
    return designDf
#Top species with # of reprocessed profiles

### define call back functions

In [168]:
import dash
from dash.dependencies import Input, Output
import dash_core_components as dcc
import dash_html_components as html

import flask
import pandas as pd
import time
import os
from flask_caching import Cache

server = flask.Flask('app')
server.secret_key = os.environ.get('secret_key', 'secret')

df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/hello-world-stock.csv')

app = dash.Dash('app', server=server)
cache = Cache(app.server, config={
    'CACHE_TYPE': 'redis',
    'CACHE_TYPE': 'filesystem',
    'CACHE_DIR': 'cache-directory',
    'CACHE_THRESHOLD': 200
})




app.scripts.config.serve_locally = False
dcc._js_dist[0]['external_url'] = 'https://cdn.plot.ly/plotly-basic-latest.min.js'


In [169]:

"""
input: search text
output:
"""



@cache.memoize()
def query_and_serialize_data(interact_value):
    querySpecie='Homo_sapiens'#widget_specie.get_interact_value()
    queryStr=interact_value
    listOfQueries=re.split(" *, *", queryStr)
    if len(queryStr)<3:
        raise ValueError('Please provide a query with more than 3 characters')
    queryLabelToRegexDict=dict(zip(listOfQueries,listOfQueries))
    designDf=returnDesignDf(queryLabelToRegexDict)

    #Subset the set of reprocessed data
    data_matrix_dir=baseDir+'/{specie}.gene_symbol.{expression_metric}'.format(specie=querySpecie,
                                        expression_metric=expression_metric)

    rnaseqDf=loadDf(data_matrix_dir)
    designDf_specie=designDf[(designDf['ScientificName']==querySpecie)&(designDf.Run.isin(rnaseqDf.columns))]
    queryDesignDf=designDf_specie
    #print ('Number of samples per query class that have data reprocessed in SkyMap: ',designDf_specie.groupby(queryLabel).size())
    hitDf=pd.DataFrame( list(map( lambda srrId: rnaseqDf[srrId],queryDesignDf.Run))).T
    hitDf.columns=queryDesignDf.set_index(queryDesignDf.columns.tolist()).index
    return hitDf


#session_id = str(uuid.uuid4())
app.layout = html.Div([
    html.Div(dcc.Input(id='input-box', type='text')),
    html.Button('Search', id='button'),
    dcc.Graph(id='my-graph') ,
    html.Div(id='output-container-button',
             children='Enter a value and press submit'),
    
    html.Div(id='designDf', style={'display': 'none'}),
    #html.Div(id='designDf', style={'display': 'none'})
    #dcc.Graph(id='my-graph2') ,
], style={'columnCount': 2})
        #return (str(designDf.shape[0]))
"""
,
    [dash.dependencies.State('input-box', 'value')]
"""

@app.callback(
    dash.dependencies.Output('output-container-button', 'children'),
    [dash.dependencies.Input('button', 'n_clicks'),
    # Input('session-id', 'children')
    ],
    [dash.dependencies.State('input-box', 'value')])
def searchFunction(n_clicks,interact_value):
    if n_clicks: #if n_clicks not null
        #gnerate and savee expression matrix
        hitDf=query_and_serialize_data(interact_value)
        return "Number of sequencing experiment returned: {}".format(hitDf.shape[1])

#    [dash.dependencies.State('input-box', 'value')]
@app.callback(
    dash.dependencies.Output('my-graph','figure'),
    [dash.dependencies.Input('output-container-button','children'),
     #dash.dependencies.Input('input-box', 'value'),
    ],[dash.dependencies.State('input-box', 'value')])
def plotPCA(container,interact_value):
        print ('called plot PCA: ',container,interact_value)
        #if str( "Number of sequencing experiment returned") in str(container):
        #    print ('in loop')
        if len(str(interact_value))>0:
            interact_value=interact_value

            import numpy as np
            hitDf=query_and_serialize_data(interact_value)
            inputAnalyzeDf=np.log2(hitDf+1)
            inPcaDf=inputAnalyzeDf.T

            from sklearn import decomposition
            import plotly.graph_objs as go
            #from plotly.offline import iplot, init_notebook_mode

            PCA=decomposition.PCA(n_components=3)

            pcaM=PCA.fit_transform((inPcaDf))
            pcaDf=pd.DataFrame( data=pcaM,index=inPcaDf.index)

            layout_3d = go.Layout(
                        scene = dict(
                        xaxis = dict(
                            title='PC0'),
                        yaxis = dict(
                            title='PC1',),
                        zaxis = dict(
                            title='PC2',),),
                      )

            #fig = go.Figure(layout=layout_3d)
            dataL=[]
            for label, sub_pca_df in pcaDf.groupby('queryLabel'):
                dataL.append(go.Scatter3d(x=sub_pca_df[0], y=sub_pca_df[1],z=sub_pca_df[2],
                                  name=label,
                                hovertext=sub_pca_df.index.get_level_values('Description'),
                                mode = 'markers')
                            )

            return {'data':dataL,'layout':layout_3d}



In [170]:
app.run_server()

 * Serving Flask app "app" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [15/Feb/2019 02:36:29] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:36:30] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:36:30] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:36:30] "POST /_dash-update-component HTTP/1.1" 200 -
[2019-02-15 02:36:30,717] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "/home/ec2-user/miniconda3/lib/python3.7/site-packages/flask/app.py", line 2292, in wsgi_app
    response = self.full_dispatch_request()
  File "/home/ec2-user/miniconda3/lib/python3.7/site-packages/flask/app.py", line 1815, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/home/ec2-user/miniconda3/lib/python3.7/site-packages/flask/app.py", line 1718, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/home/ec2-user/miniconda3/lib/python3.7/site-packages/flask/_compat.py", l

called plot PCA:  None None


127.0.0.1 - - [15/Feb/2019 02:36:33] "POST /_dash-update-component HTTP/1.1" 200 -


called plot PCA:  Number of sequencing experiment returned: 1191 T-Cell


127.0.0.1 - - [15/Feb/2019 02:36:37] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:37:32] "POST /_dash-update-component HTTP/1.1" 200 -


called plot PCA:  Number of sequencing experiment returned: 485 B-Cell


127.0.0.1 - - [15/Feb/2019 02:37:34] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:38:19] "POST /_dash-update-component HTTP/1.1" 200 -


called plot PCA:  Number of sequencing experiment returned: 1676 T-Cell, B-Cell


127.0.0.1 - - [15/Feb/2019 02:38:26] "POST /_dash-update-component HTTP/1.1" 200 -


In [145]:
#import plotly.graph_objs as go

In [146]:
#go.Scatter3d

 * Serving Flask app "app" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [15/Feb/2019 02:22:38] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:38] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:39] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:39] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:40] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:40] "GET /_favicon.ico HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:41] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:41] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:41] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:41] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:41] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [15/Feb/2019 02:22:50] "POST /_dash-update-component HTTP/1.1" 

### scratch

B-Cell,T-Cell

In [94]:
df=pd.DataFrame()
df.to_json()

'{}'

In [9]:
expression_metric='tpm' #

querySpecie=widget_specie.get_interact_value()

queryStr=widget_query.get_interact_value()

listOfQueries=re.split(" *, *", queryStr)


NameError: name 'widget_specie' is not defined

In [17]:
import uuid

In [None]:
uuid.uuid4()

In [151]:
!echo "# skymap_web_server" >> README.md

In [152]:
!git init

/usr/bin/sh: git: command not found


In [154]:
#!conda install -y git

Collecting package metadata: done
Solving environment: done

## Package Plan ##

  environment location: /home/ec2-user/miniconda3

  added / updated specs:
    - git


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    git-2.20.1                 |  pl526hacde149_0         7.9 MB
    krb5-1.16.1                |       h173b8e3_7         1.4 MB
    libcurl-7.63.0             |    h20c2e04_1000         550 KB
    libssh2-1.8.0              |       h1ba5d50_4         233 KB
    perl-5.26.2                |       h14c3975_0        15.9 MB
    ------------------------------------------------------------
                                           Total:        26.0 MB

The following NEW packages will be INSTALLED:

  git                pkgs/main/linux-64::git-2.20.1-pl526hacde149_0
  krb5               pkgs/main/linux-64::krb5-1.16.1-h173b8e3_7
  libcurl            pkgs/main/linux-64::libcurl-7.6

In [156]:
!git init

Initialized empty Git repository in /home/ec2-user/code/.git/


In [157]:
!git add queryToData.ipynb 

In [159]:
!git commit -m "first commit"


[master (root-commit) 11a6359] first commit
 Committer: EC2 Default User <ec2-user@ip-172-20-119-1.us-west-2.compute.internal>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 1 file changed, 528 insertions(+)
 create mode 100644 queryToData.ipynb


In [160]:
!git remote add origin https://github.com/brianyiktaktsui/skymap_web_server.git

In [161]:
!git push -u origin master

Username for 'https://github.com': ^C
