# 1. Import Python Packages

In [None]:
import sys, os
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )

from metatlas import metatlas_objects as metob
from metatlas.helpers import dill2plots as dp
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import rt_corrector as rt_corrector
from metatlas.helpers import chromatograms_mp_plots as cp

import qgrid

from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from IPython.display import display

import time
import dill
import numpy as np
import multiprocessing as mp
import pandas as pd

%matplotlib notebook

# 2. Register LCMS Runs into categorical groups.

* ### Select MetAtlas LCMS Runs by experiment and filename.  

In [None]:
# Launch the interactive LCMS-run selector.  The selection is read back from
# `f.files` when the file-info sheet is created in the next step.
f = dp.interact_get_metatlas_files()

* ### Create a "File-Info" sheet from the selected files.  
This sheet needs to be downloaded and filled in.  The "File-Info" sheet is the exchange format we use to define the grouping membership for LCMS runs.


In [None]:
# Write an empty file-info sheet for the runs selected above.
# Change the output path to a location you can write to.
dp.make_empty_fileinfo_sheet('/global/homes/b/bpb/Downloads/empty_finfo.tab',f.files)

* ### Create metatlas groups from filled in file-info sheet
Defining groups of files allows for the selection of sets of LCMS runs by specifying the group names.  In addition,  the group membership is preserved in the exported metatlas datasets; so the application of statistical methods based on grouped data is straightforward.

Your filled in sheet will look something like this:
<style type="text/css">
	table.tableizer-table {
		font-size: 12px;
		border: 1px solid #CCC; 
		font-family: Arial, Helvetica, sans-serif;
	} 
	.tableizer-table td {
		padding: 4px;
		margin: 3px;
		border: 1px solid #CCC;
	}
	.tableizer-table th {
		background-color: #104E8B; 
		color: #FFF;
		font-weight: bold;
	}
</style>
<table class="tableizer-table">
<thead><tr class="tableizer-firstrow"><th>mzml_file</th><th>group</th><th>description</th></tr></thead><tbody>
 <tr><td>.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_10_Run413.mzML</td><td>20160531_KBL_C18_Vio_cells_384_Quad_1</td><td>&nbsp;</td></tr>
 <tr><td>.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_11_Run415.mzML</td><td>20160531_KBL_C18_Vio_cells_384_Quad_1</td><td>&nbsp;</td></tr>
 <tr><td>.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_12_Run417.mzML</td><td>20160531_KBL_C18_Vio_cells_384_Quad_1</td><td>&nbsp;</td></tr>
 <tr><td>.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_1_Run395.mzML</td><td>20160531_KBL_C18_Vio_cells_384_Quad_2</td><td>&nbsp;</td></tr>
 <tr><td>.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_2_Run397.mzML</td><td>20160531_KBL_C18_Vio_cells_384_Quad_2</td><td>&nbsp;</td></tr>
 <tr><td>.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_3_Run399.mzML</td><td>20160531_KBL_C18_Vio_cells_384_Quad_2</td><td></td></tr>
</tbody></table>

A text description of each group is an optional field.  It can be a few short sentences that describe the group.

In [None]:
# Build the groups from the filled-in file-info sheet (a csv file here).
# NOTE(review): store=False presumably keeps the new groups out of the
# database so the result can be inspected first — confirm in dill2plots.
g = dp.make_groups_from_fileinfo_sheet('/global/homes/b/bpb/Downloads/20160517_RL_HM_6550_HILIC_CA-MF-groups.csv',
                                       filetype='csv',
                                       store=False)

View the list of metatlas objects using "to_dataframe"

In [None]:
# Show the first rows of the groups created above as a dataframe.
metob.to_dataframe(g).head()

# 3. Create a new Atlas
* ### From Google Doc Reference Data

In [None]:
# Re-import the helper module (Python 2 builtin `reload`) so that any edits
# to dill2plots made since the first import are picked up.
dp = reload(dp)
# Fetch the reference sheet for the given polarity and method; the result is
# previewed with .head() and passed on as `dataframe=` in the cells below.
atlas_sheet = dp.get_formatted_atlas_from_google_sheet(polarity='POS',
                                          method='QE119_ZIC-HILIC',
                                          mz_tolerance=10)

In [None]:
# Preview the first rows of the sheet fetched above.
atlas_sheet.head()

* ### From a pandas dataframe

In [None]:
# Pick up any edits to the helper module before using it.
dp = reload(dp)

# Keyword options for building the atlas straight from the in-memory
# dataframe; no file name or sheet name is needed in this mode.
atlas_options = {'filetype': 'dataframe',
                 'sheetname': '',
                 'polarity': 'positive',
                 'store': True,
                 'dataframe': atlas_sheet,
                 'mz_tolerance': 10}
names = dp.make_atlas_from_spreadsheet('',
                                       'QE_HILIC_POS_ISTD_AND_COMMON_METABOLITES',
                                       **atlas_options)

* ### From a spreadsheet
This is by far the most common way to create a new Atlas in Metabolite Atlas.  The columns of the sheet must exactly match what is shown here.  In cases where there isn't a compound in the database, the "label" field below is used.  Here is an example of what a sheet could look like.

<style type="text/css">
	table.tableizer-table {
		font-size: 12px;
		border: 1px solid #CCC; 
		font-family: Arial, Helvetica, sans-serif;
	} 
	.tableizer-table td {
		padding: 4px;
		margin: 3px;
		border: 1px solid #CCC;
	}
	.tableizer-table th {
		background-color: #104E8B; 
		color: #FFF;
		font-weight: bold;
	}
</style>
<table class="tableizer-table">
<thead><tr class="tableizer-firstrow"><th>label</th><th>rt_min</th><th>rt_max</th><th>rt_peak</th><th>mz</th><th>mz_tolerance</th><th>inchi_key</th></tr></thead><tbody>
 <tr><td>violacein </td><td>4.2</td><td>4.4</td><td>4.3</td><td>344.1036913</td><td>5</td><td>XAPNKXIRQFHCHN-QGOAFFKASA-N</td></tr>
 <tr><td>deoxyviolacein (iso1 - main)</td><td>4.75</td><td>4.9</td><td>4.8</td><td>328.1087767</td><td>5</td><td>OJUJNNKCVPCATE-QGOAFFKASA-N</td></tr>
 <tr><td>tryptophan</td><td>2.3</td><td>2.45</td><td>2.36</td><td>205.0978776</td><td>5</td><td>QIVBCDIJIAJPQS-VIFPVBQESA-N</td></tr>
 <tr><td>deoxychromoviridans</td><td>5.4</td><td>6</td><td>5.75</td><td>605.244821</td><td>5</td><td>&nbsp;</td></tr>
 <tr><td>chromoviridans</td><td>5.15</td><td>5.5</td><td>5.3</td><td>621.239736</td><td>5</td><td>&nbsp;</td></tr>
 <tr><td>ABMBA</td><td>4.72</td><td>4.88</td><td>4.8</td><td>229.9811</td><td>5</td><td>LCMZECCEEOQWLQ-UHFFFAOYSA-N</td></tr>
</tbody></table>

These tables can be csv or tab delimited text or excel spreadsheets.

There is a <a href = 'https://drive.google.com/open?id=0BweRoDuGcsLBNkYtQUpjQ0ktZDg'> lookup table here </a> of all compounds to get the inchi_key.

For old MetAtlas atlases, you can use Excel's "vlookup" function along with <a href = 'https://drive.google.com/open?id=0BweRoDuGcsLBQUxrRjgtbjhnSDg'> this lookup table </a> to map the old names to valid inchi keys.

```
=VLOOKUP(H2,$A:$B,2,0) where $A:$B are columns containing name and inchi-key
```

This is a <a href = 'https://drive.google.com/open?id=0BweRoDuGcsLBaThjcEZuSjh2dXM'> link </a> to all the old compound identifications that were in the database prior to the refactoring in mid-June 2016.


In [None]:
# Pick up any edits to the helper module before using it.
dp = reload(dp)

# Tab-delimited atlas sheet to read, and the name the new atlas will get.
atlas_file = '/global/homes/b/bpb/Downloads/msmsl_neg_atlas_istd.txt'
atlas_name = '20161007_MP3umZHILIC_V12_NEG_MetIDJamboree'
names = dp.make_atlas_from_spreadsheet(atlas_file,
                                       atlas_name,
                                       filetype='tab',
                                       sheetname='',
                                       polarity='negative',
                                       store=False,
                                       mz_tolerance=100)

# 0 73 2016019_metalas_KZ_Ave_uptake_qxqct_hilic_updated_pos_V2 2016-07-26 17:31:26
# 1 69 201500826_KZ_Ave_library_qxqct_hilic_pos_V2 2016-07-26 18:17:32
# 2 93 201500826_KZ_Ave_library_qxqct_hilic_neg_V2 2016-07-26 18:14:40
# 3 106 2016019_metatlas_KZ_Ave_uptake_qxqct_hilic_updated_neg_V2 2016-07-26 18:02:34
        

* ### Use interactive interface to build atlas from spreadsheet

In [None]:
###### CURRENTLY UNDER DEVELOPMENT/TESTING #######
# Experimental: drive make_atlas_from_spreadsheet from an ipywidgets form
# instead of typing its arguments by hand.
# %system cat /project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages/IPython/html.py
# /html/static/style/ipython.min.css
# .widget-hlabel{min-width:10ex;padding-right:8px;padding-top:3px;text-align:right;vertical-align:text-top}
from IPython.display import HTML, display
# Inject a CSS override that widens the widget labels.
display(HTML('''<style>
    .widget-label { min-width: 20ex !important; }
</style>'''))
dp = reload(dp)
# NOTE(review): `__manual=True` appears to be ipywidgets' flag for a manual
# "Run" button rather than running on every change — confirm against the
# installed ipywidgets version.  `names` holds whatever interact() returns.
names = interact(dp.make_atlas_from_spreadsheet,__manual=True,)
##################################################

# 4. Select groups of files to operate on

In [None]:
# Pick up any edits to the helper module before using it.
dp = reload(dp)

# Search criteria for the groups to analyse.  Edit `name` to match your
# experiment; a previously used exclude_list was ['QC','Blank'].
group_query = {'name': '2016%KZ%pos',
               'most_recent': True,
               'remove_empty': True,
               'include_list': [],
               'exclude_list': []}
groups = dp.select_groups_for_analysis(**group_query)


# 5. Select Atlas to use
* ### Select by Atlas name

In [None]:
# Look up atlases by name.  The result is a list; the next code cell picks
# the entry to work with (atlas[0]).
atlas = dp.get_metatlas_atlas(name='test_save_20161104',do_print = True,most_recent=True)
# Other atlas names that have been used with this notebook:
#201500826_KZ_Ave_library_qxqct_hilic_pos_V2
#20161007_MP3umZHILIC_V12_NEG_MetIDJamboree

* ### A list of atlases is returned by the cell above.  
You must run the following cell to specify which atlas you want to continue your analysis with (even if only a single atlas is returned).

In [None]:
# Pick up any edits to the data helper module before using it.
ma_data = reload(ma_data)

# Work with the first atlas returned by the search above and flatten it into
# a dataframe for display and for the data-extraction step.
myAtlas = atlas[0]
atlas_df = ma_data.make_atlas_df(myAtlas)

# Echo which atlas was picked and who owns it.
for attribute in ('name', 'username'):
    print(getattr(myAtlas, attribute))

### Take a look at the selected atlas

### View only the first few lines
* Remove the ".head()" to show the whole thing
* Put a number in the ".head()" to show number of rows
* Edit set_option to configure display:
* More options available here:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html

In [None]:
# Loosen the pandas display limits so wide atlas rows are not truncated.
display_limits = [('display.max_rows', 500),
                  ('display.max_columns', 500),
                  ('display.max_colwidth', 1000)]
for option_name, option_value in display_limits:
    pd.set_option(option_name, option_value)

# First five rows only; drop the argument or change it to see more.
atlas_df.head(5)

### View the atlas as a qgrid widget (good for widescreen displays)

In [None]:
# Wrap the atlas dataframe in a qgrid widget.
compound_grid = qgrid.QGridWidget(df=atlas_df)#,set_grid_option={'show_toolbar',True})
# NOTE(review): export() presumably renders the grid into the notebook output —
# confirm against the installed qgrid version.
compound_grid.export()

### View the atlas as a pandas dataframe (tweak the pandas display options to fit)
* More options available here:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html

In [None]:
# Loosen the pandas display limits, then show the whole atlas (up to the
# row/column limits set here).
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 500),
                                  ('display.max_colwidth', 1000)):
    pd.set_option(option_name, option_value)

atlas_df

# 6. Get Data

### Much faster data getting script

In [None]:
# Build one task per LCMS run: (run, its group, the atlas dataframe, the atlas).
all_files = []
for my_group in groups:
    for my_file in my_group.items:
        all_files.append((my_file,my_group,atlas_df,myAtlas))

# mp.Pool raises ValueError when asked for fewer than one process, so keep at
# least one worker even if no files were selected.
pool = mp.Pool(processes=max(1, min(10, len(all_files))))
# from metatlas.helpers.metatlas_get_data_helper_fun import get_data_for_atlas_df_and_file
t0 = time.time()
try:
    # One result per entry of all_files, in the same order.
    metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
finally:
    # Always shut the workers down, even when pool.map raises; otherwise the
    # worker processes stay alive and have to be cleaned up by hand
    # (see the "Clean up Zombie Processes" section).
    pool.close()
    pool.terminate()
    pool.join()
# Elapsed wall-clock time in seconds.
print(time.time() - t0)


# 7. Adjust Retention Times.  

In [None]:
# Pick up any edits to the helper module, and (re)select the interactive
# notebook backend before the plot is drawn.
dp = reload(dp)
%matplotlib notebook
# The returned object is kept in `a` so the interactive plot stays referenced.
# NOTE(review): an empty include_lcmsruns presumably means "no filtering" — confirm.
a = dp.adjust_rt_for_selected_compound(metatlas_dataset,include_lcmsruns = [])

In [None]:
# dp = reload(dp)
# Show the dataset as a grid.  Note that this rebinds `compound_grid`, which
# the atlas qgrid cell above also assigns.
data,compound_grid = dp.show_compound_grid(input_dataset = metatlas_dataset)#,input_fname = '/global/homes/b/bpb/Downloads/20160531_KBL_C18_Vio_cells_384_Q_1_to_4.pkl')

# 8. Make Supplementary Tables


### Specify a directory to put all the figures into

In [14]:
# Root directory for everything exported below; change this to a location
# you can write to.
output_dir = '/global/homes/b/bpb/Downloads/mp_plots_test/'
# Create the directory (including missing parents) on first use.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

* ### Export Atlas to a Spreadsheet

In [None]:
# Export the selected atlas to <output_dir>/atlas_export.csv and keep the
# returned value for inspection.
atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,os.path.join(output_dir,'atlas_export.csv'))

* ### Dataframes and spreadsheets

In [None]:
# Pick up any edits to the helper module before using it.
dp = reload(dp)
# atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,'%s/sheets/%s.csv'%(plot_location_label,myAtlas.name))

# Build one output dataframe per measurement field, all written under
# <output_dir>/sheets.
# input_fname is left empty for every field because the in-memory
# metatlas_dataset is passed directly.  Five of the six calls used to pass
# `my_file`, a leftover loop variable from an earlier cell (an LCMS run object
# or a file name, depending on which cell ran last), not a dataset path.
sheets_dir = os.path.join(output_dir,'sheets')
output_dataframes = {}
for fieldname in ['peak_height', 'peak_area', 'mz_peak', 'rt_peak', 'mz_centroid', 'rt_centroid']:
    output_dataframes[fieldname] = dp.make_output_dataframe(input_fname = '',
                                                            input_dataset = metatlas_dataset,
                                                            include_lcmsruns = [],
                                                            exclude_lcmsruns = [],
                                                            fieldname = fieldname,
                                                            output_loc = sheets_dir)

# Keep the original per-field variable names (peak_height is used by the
# error-bar cell below).
peak_height = output_dataframes['peak_height']
peak_area = output_dataframes['peak_area']
mz_peak = output_dataframes['mz_peak']
rt_peak = output_dataframes['rt_peak']
mz_centroid = output_dataframes['mz_centroid']
rt_centroid = output_dataframes['rt_centroid']

* ### Error bar

In [None]:
# Pick up any edits to the helper module before using it.
dp = reload(dp)
# Error-bar plots of the peak-height dataframe, written under
# <output_dir>/error_bar_peak_height.
dp.plot_errorbar_plots(peak_height, output_loc=os.path.join(output_dir,'error_bar_peak_height'))

* ### Chromatograms

### Make a plot for each compound:
* Each lcmsrun will be a subplot

In [None]:
## THINGS YOU MIGHT WANT TO CHANGE
nCols = 8
share_y = False

## THINGS YOU PROBABLY DON'T WANT TO CHANGE
file_names = ma_data.get_file_names(metatlas_dataset)
compound_names = ma_data.get_compound_names(metatlas_dataset)[0]
# Enough rows to fit one subplot per LCMS run at nCols per row.
nRows = int(np.ceil(len(file_names)/float(nCols)))

# One plotting task (one PDF) per compound; each task carries that compound's
# data from every file, in file order.
args_list = []
for compound_idx, my_compound in enumerate(compound_names):
    my_data = list()
    for file_idx, my_file in enumerate(file_names):
        my_data.append(metatlas_dataset[file_idx][compound_idx])

    kwargs = {'data': my_data,
              'file_name': os.path.join(output_dir, my_compound+'.pdf'),
              'rowscols': (nRows, nCols),
              'share_y': share_y,
              'names': file_names}
    args_list.append(kwargs)

max_processes = 20
# Size the pool from the number of tasks, but never below one: mp.Pool raises
# ValueError when asked for fewer than one process.
pool = mp.Pool(processes=max(1, min(max_processes, len(args_list))))
try:
    pool.map(cp.plot_compounds_and_files_mp, args_list)
finally:
    # Always shut the workers down, even when a plot fails, so that no worker
    # processes are left behind.
    pool.close()
    pool.terminate()
    pool.join()

### Make a plot for each lcmsrun
* Each compound will be a subplot

In [None]:
## THINGS YOU MIGHT WANT TO CHANGE
nCols = 8
share_y = False

## THINGS YOU PROBABLY DON'T WANT TO CHANGE
file_names = ma_data.get_file_names(metatlas_dataset)
compound_names = ma_data.get_compound_names(metatlas_dataset)[0]
# Enough rows to fit one subplot per compound at nCols per row.
nRows = int(np.ceil(len(compound_names)/float(nCols)))

# One plotting task (one PDF) per LCMS run; each task carries all of that
# run's compound data.
args_list = []
for file_idx, my_file in enumerate(file_names):
    kwargs = {'data': metatlas_dataset[file_idx],
              'file_name': os.path.join(output_dir, my_file +'.pdf'),
              'rowscols': (nRows, nCols),
              'share_y': share_y,
              'names': compound_names}
    args_list.append(kwargs)

max_processes = 20
# Size the pool from the number of tasks, but never below one: mp.Pool raises
# ValueError when asked for fewer than one process.
pool = mp.Pool(processes=max(1, min(max_processes, len(args_list))))
try:
    pool.map(cp.plot_compounds_and_files_mp, args_list)
finally:
    # Always shut the workers down, even when a plot fails, so that no worker
    # processes are left behind.
    pool.close()
    pool.terminate()
    pool.join()


* ### Identification Figures

In [None]:
# Pick up any edits to the helper module before using it.
dp = reload(dp)
# Identification figures are written under <output_dir>/identification.
# input_fname is left empty because the in-memory metatlas_dataset is passed
# directly; the `my_file` value used here before was a leftover loop variable
# from an earlier cell, not a dataset path.
dp.make_identification_figure(input_dataset = metatlas_dataset, input_fname = '', include_lcmsruns = [],exclude_lcmsruns = ['RootCass','QC','Blank','blank'], output_loc=os.path.join(output_dir,'identification'))

### Make a single tar file of your output directory

In [23]:
import tarfile
import time

# Archive name: <timestamp>_<name of the output directory>.tar.gz, written to
# the current working directory.
timestr = time.strftime("%Y%m%d-%H%M%S")
tarball_name = timestr + '_' + os.path.basename(os.path.normpath(output_dir)) + '.tar.gz'

# Use the tarfile module instead of a shell command so that paths containing
# spaces or shell metacharacters are handled safely and failures raise an
# exception instead of passing silently.  arcname='.' stores the members
# relative to output_dir, as `tar -C output_dir .` did.
with tarfile.open(tarball_name, 'w:gz') as tarball:
    tarball.add(output_dir, arcname='.')
print('done')

[]

### The tarball will be stored in your current directory.  Run this to see current directory

In [21]:
# Print the current working directory (where the tarball is written).
%system pwd

['/global/u2/b/bpb/metatlas/docs/example_notebooks']

# 9. Clean up Zombie Processes

* ### These are not meant to be used as part of normal work

* ### If code crashes, we will have to use these tools to clean things up

### Try simply closing the pool

In [None]:
# Shut down whichever multiprocessing pool `pool` currently refers to (the
# most recently created one): stop accepting work, then stop the workers.
pool.close()
pool.terminate()

### Make a DataFrame of user's processes

In [11]:
import os
import psutil
import getpass
import pandas as pd
from datetime import datetime

# Look the current user up once instead of once per process.
current_user = getpass.getuser()

# Every purely numeric entry under /proc is a process id.
pids = [int(pid) for pid in os.listdir('/proc') if pid.isdigit()]
proc_df = []
for pid in pids:
    try:
        process = psutil.Process(pid)
        if process.username() != current_user:
            continue
        # Read the creation time once so both columns describe the same value.
        created = process.create_time()
        proc_df.append({'pid': process.pid,
                        'name': process.name(),
                        'user': process.username(),
                        'created_timestamp': int(created),
                        'created_datestr': str(datetime.fromtimestamp(created))})
    except psutil.Error:
        # The process exited or is not accessible to us; skip it.  Only
        # psutil's own errors are ignored so that real bugs still surface.
        continue

pd.DataFrame(proc_df)

Unnamed: 0,created_datestr,created_timestamp,name,pid,user
0,2016-11-04 10:18:48.720000,1478279928,jupyterhub-sing,9144,bpb
1,2016-11-04 12:11:18.470000,1478286678,python,27959,bpb
2,2016-11-02 06:15:09.930000,1478092509,jupyterhub-sing,28411,bpb
3,2016-11-02 06:17:36.180000,1478092656,bash,31782,bpb
4,2016-11-02 06:26:54.710000,1478093214,python,54877,bpb
5,2016-11-02 07:43:47.190000,1478097827,jupyterhub-sing,60980,bpb


### Kill process by process id (pid)

In [10]:
# Set this to a pid taken from the process table above.
# psutil.Process() with no argument refers to the *current* process (the
# notebook kernel itself), so terminate() must never be called without an
# explicit pid.
pid_to_kill = None

if pid_to_kill is None:
    print('Set pid_to_kill to the pid of the process you want to terminate.')
else:
    p = psutil.Process(pid_to_kill)
    p.terminate()

# Ancient Codes and Partially Developed Tools

In [None]:
### Store data to a pickle file
# saved_filename = '/global/homes/b/bpb/Downloads/20160818_POS_MO_HEfungusonly_V1.pkl'
# with open(saved_filename,'wb') as f:
#     dill.dump(metatlas_dataset,f)

### Load a pre-existing metatlas dataset  
# metatlas_dataset = ma_data.get_dill_data(saved_filename)

In [None]:
### copy files to $SCRATCH
### You will likely never have to do this, but just in case, here is the code.
# from shutil import copyfile
# scratch = os.environ['SCRATCH']
# for my_group in groups:
#     for my_file in my_group.items:
#         new_path = os.path.join(scratch,'temp_metatlas')
#         if not os.path.isdir(new_path):
#             os.mkdir(new_path)
#         new_file = os.path.join(new_path,os.path.basename(my_file.hdf5_file))
#         copyfile(my_file.hdf5_file, new_file)
#         my_file.hdf5_file = new_file
#         print my_file.hdf5_file

In [None]:
# %matplotlib inline
# dp = reload(dp)
# pickles = ['/global/homes/b/bpb/Downloads/KZ_Avena_Exudate_atlases_and_groups_1/neg_data.pkl',
#           '/global/homes/b/bpb/Downloads/KZ_Avena_Exudate_atlases_and_groups_1/pos_data.pkl',
# '/global/homes/b/bpb/Downloads/KZ_Avena_Uptake_atlases_and_group_2/pos_data.pkl',
# '/global/homes/b/bpb/Downloads/KZ_Avena_Uptake_atlases_and_group_2/neg_data.pkl']
# for p in pickles:
#     plot_location_label = p.split('.')[0]+'/'
#     print plot_location_label
#     if not os.path.exists(plot_location_label):
#         os.makedirs(plot_location_label)
#     metatlas_dataset = ma_data.get_dill_data(p)
#     dp.make_identification_figure(input_dataset = metatlas_dataset, input_fname = p, include_lcmsruns = [],exclude_lcmsruns = ['RootCass','QC','Blank','blank'], output_loc=plot_location_label+'/identification')

In [None]:
######### DO NOT USE #######
# output_filename = '/global/homes/b/bpb/Downloads/20160531_KBL_C18_Vio_cells_384_Q_1_to_4.pkl'
# data = dp.get_data_for_groups_and_atlas(groups,myAtlas,output_filename)
############################

In [None]:
######### DO NOT USE #######
### THIS STILL NEEDS SOME REPAIRS ###
# rt_corrector.display_atlases()
### USE AT YOUR OWN RISK ###
############################

In [None]:
# msmls_files = metob.retrieve('Lcmsruns',experiment = '20161007_KBL_MPZHILIC3um_MSMLS_stds',name = '%pos%',username = '*')
# print len(msmls_files)
# kate_files = metob.retrieve('Lcmsruns',experiment = '20161007_KBL_MPZHILIC3um_KateStandards',name = '%pos%', username = '*')
# print len(kate_files)
# g = metob.Group()
# g.name = '20161007_MP3umZHILIC_V12_POS_MetIDJamboree'
# for f in msmls_files:
#     g.items.append(f)
# for f in kate_files:
#     g.items.append(f)
# metob.store([g])