# Demonstrates how to load input CSV files and run them through PALS

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.feature_extraction import DataSource
from pals.PLAGE import PLAGE
from pals.ORA import ORA
from pals.GSEA import GSEA
from pals.common import *



# Beer Analysis

### Load data

In [23]:
osp = '/Volumes/Transcend2/17_20_PhD/19_20_PhD_Metabolomics/'
intensity_csv = '/Users/anamaria/Desktop/samples_filtered.csv'
annotation_csv ='/Users/anamaria/Desktop/annotation.csv'
sd = pd.read_csv(osp+'pymz/mzmine/peak_picked_files/sample_description.csv')

experimental_design =  {
    'comparisons': [{'case': 'infected', 'control': 'control', 'name':'infected/control'}],
    'groups': {'infected': [], 'control': []}
}    

for case in experimental_design['groups']:
    if case == 'infected':
        experimental_design['groups'][case] = [s for s in sd[sd['Condition'] == 'infected']['Sample Name']]
    if case == 'control':
        experimental_design['groups'][case] = [s for s in sd[sd['Condition'] == 'control']['Sample Name']]

In [16]:
int_df, annotation_df, groups = load_data(intensity_csv, annotation_csv)

2020-10-13 16:09:39.698 | DEBUG    | pals.common:load_data:165 - Loaded 603 x 76 peak intensities from /Users/anamaria/Desktop/samples_filtered.csv
2020-10-13 16:09:39.700 | DEBUG    | pals.common:load_data:166 - Loaded groups: {'1': ['row ID'], '28621253.99112652': ['ZIK6.mzXML'], '24640484.38107913': ['ZIK7.mzXML'], '33008265.78013173': ['ZIK10.mzXML'], '42847335.83581253': ['C7_2.mzXML'], '17186613.05137111': ['KM_10.mzXML'], '20854256.241177734': ['C1.mzXML'], '48234259.52069932': ['C10_2.mzXML'], '28440550.373382807': ['VL7.mzXML'], '13797797.795971667': ['KM_9.mzXML'], '14035107.721769508': ['KM_11.mzXML'], '18056019.41098433': ['KM_8.mzXML'], '32745048.04885156': ['KM_3.mzXML'], '45397271.78259371': ['C18_2.mzXML'], '48322070.51710834': ['KM_2.mzXML'], '43252999.04044532': ['C2_2.mzXML'], '32269382.197101586': ['C5_2.mzXML'], '36125496.58238283': ['C12_2.mzXML'], '83811408.14835934': ['VL17.mzXML'], '59913060.78692186': ['VL16.mzXML'], '25800199.16220113': ['KM_4.mzXML'], '21102

In [17]:
int_df.head()

Unnamed: 0_level_0,ZIK6.mzXML,ZIK7.mzXML,ZIK10.mzXML,C7_2.mzXML,KM_10.mzXML,C1.mzXML,C10_2.mzXML,VL7.mzXML,KM_9.mzXML,KM_11.mzXML,...,KM_7.mzXML,KM_6.mzXML,C4_2.mzXML,C13_2.mzXML,C10.mzXML,C9_2.mzXML,VL13.mzXML,VL12.mzXML,VL19.mzXML,VL18.mzXML
row ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2051,60852130.0,68725250.0,70301290.0,174686900.0,21638870.0,78887000.0,312527000.0,53535620.0,25374240.0,23251190.0,...,29391560.0,22483750.0,100312500.0,206951100.0,61998230.0,119237100.0,25467970.0,131102500.0,82619070.0,0.0
6,25176290.0,26581060.0,37339720.0,64628680.0,16077540.0,30582820.0,48196490.0,38864150.0,16679600.0,16530380.0,...,13229150.0,18621370.0,49266710.0,42555500.0,36915080.0,57262920.0,40691650.0,60840400.0,59823810.0,30358070.0
2057,3929179.0,4057528.0,6033983.0,17750270.0,1254025.0,10369190.0,5960417.0,8272370.0,0.0,2273807.0,...,1007962.0,1179840.0,8422987.0,6063877.0,17091640.0,16397480.0,954559.7,51136690.0,27356100.0,0.0
10,4342657.0,2640288.0,3978803.0,6722572.0,2854626.0,3275266.0,5625803.0,4703867.0,0.0,2818107.0,...,2552461.0,2267662.0,8397378.0,7129759.0,5844209.0,6561646.0,4430804.0,9189017.0,7417835.0,3923343.0
2058,7744800.0,7566545.0,9511789.0,22459730.0,1386831.0,9665703.0,36170530.0,6607266.0,2555483.0,997069.0,...,2304320.0,2400551.0,12562840.0,23606070.0,7232901.0,13789340.0,2386602.0,16052260.0,11720110.0,0.0


In [18]:
annotation_df.head()

Unnamed: 0_level_0,entity_id
peak_id,Unnamed: 1_level_1
633,C00418
613,C00064
3109,
2989,
324,C00245


Define some comparisons. This should be specified by users from the interface.

For simplicity, we can just let user specify one comparison at a time (what is currently the case in PALS Viewer), although the codes allow us to specify multiple comparisons.

In [16]:
comparisons = [
    ('beer1', 'beer2'), 
    # ('beer3', 'beer4')
]

Create experimental design dictionary

In [19]:
experimental_design = {
    'groups': groups,
    'comparisons': []
}
for case, control in comparisons:
    experimental_design['comparisons'].append({
        'case': case,
        'control': control,
        'name': '%s/%s' % (case, control)
    })
experimental_design

{'groups': {'beer1': ['Beer_1_full1.mzXML',
   'Beer_1_full2.mzXML',
   'Beer_1_full3.mzXML'],
  'beer2': ['Beer_2_full1.mzXML', 'Beer_2_full2.mzXML', 'Beer_2_full3.mzXML'],
  'beer3': ['Beer_3_full1.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full3.mzXML'],
  'beer4': ['Beer_4_full1.mzXML', 'Beer_4_full2.mzXML', 'Beer_4_full3.mzXML']},
 'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}]}

In [24]:
experimental_design

{'comparisons': [{'case': 'infected',
   'control': 'control',
   'name': 'infected/control'}],
 'groups': {'infected': ['ZIK6.mzXML',
   'ZIK7.mzXML',
   'ZIK10.mzXML',
   'VL7.mzXML',
   'KM_9.mzXML',
   'KM_11.mzXML',
   'KM_8.mzXML',
   'KM_3.mzXML',
   'KM_2.mzXML',
   'VL17.mzXML',
   'VL16.mzXML',
   'KM_5.mzXML',
   'VL10.mzXML',
   'VL1.mzXML',
   'VL11.mzXML',
   'ZIK1.mzXML',
   'KM_17.mzXML',
   'VL15.mzXML',
   'VL14.mzXML',
   'KM_12.mzXML',
   'KM_18.mzXML',
   'ZIK4.mzXML',
   'ZIK5.mzXML',
   'VL4.mzXML',
   'VL5.mzXML',
   'ZIK8.mzXML',
   'ZIK9.mzXML',
   'KM_14.mzXML',
   'KM_15.mzXML',
   'VL8.mzXML',
   'VL9.mzXML',
   'KM_20.mzXML',
   'KM_21.mzXML',
   'ZIK3.mzXML',
   'VL3.mzXML',
   'ZIK2.mzXML',
   'KM_6.mzXML',
   'VL13.mzXML',
   'VL12.mzXML',
   'VL19.mzXML',
   'VL18.mzXML'],
  'control': ['C7_2.mzXML',
   'KM_10.mzXML',
   'C1.mzXML',
   'C10_2.mzXML',
   'C18_2.mzXML',
   'C2_2.mzXML',
   'C5_2.mzXML',
   'C12_2.mzXML',
   'KM_4.mzXML',
   'C8_2.mzXML',

### PALS analysis using KEGG database exported from PiMP

In [25]:
ds = DataSource(int_df, annotation_df, experimental_design, DATABASE_PIMP_KEGG)

2020-10-13 16:13:53.777 | DEBUG    | pals.feature_extraction:__init__:43 - Using PiMP_KEGG as database
2020-10-13 16:13:53.779 | DEBUG    | pals.loader:load_data:42 - Loading /Users/anamaria/git/PALS/pals/data/PiMP_KEGG.json.zip
2020-10-13 16:13:53.980 | DEBUG    | pals.feature_extraction:__init__:56 - Mapping pathway to unique ids
2020-10-13 16:13:53.989 | DEBUG    | pals.feature_extraction:__init__:70 - Creating dataset to pathway mapping
2020-10-13 16:13:54.131 | DEBUG    | pals.feature_extraction:__init__:98 - Computing unique id counts


In [26]:
plage = PLAGE(ds)
pathway_df = plage.get_pathway_df()

2020-10-13 16:13:56.616 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:308 - Setting the zero intensity values in the dataframe
2020-10-13 16:13:56.625 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:310 - 0
2020-10-13 16:13:56.691 | DEBUG    | pals.feature_extraction:standardize_intensity_df:277 - Scaling the data across the sample: zero mean and unit variance
2020-10-13 16:13:56.723 | DEBUG    | pals.PLAGE:get_plage_activity_df:90 - Mean values of the rows in the DF is [ 0.  0. -0. -0. -0. -0. -0.  0.  0. -0. -0.  0. -0.  0. -0. -0. -0.  0.
  0.  0.  0.  0. -0.  0.  0. -0.  0.  0. -0.  0. -0. -0. -0. -0.  0.  0.
 -0. -0. -0.  0.  0.  0.  0. -0. -0. -0. -0.  0. -0.  0. -0. -0.  0.  0.
 -0.  0.  0. -0.  0.  0.  0.  0.  0. -0. -0. -0. -0.  0. -0. -0.  0.  0.
  0. -0. -0. -0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0. -0.  0.
 -0.  0. -0. -0.  0.  0. -0.  0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.
  0.  0.  0.  0.  0.  0. -0. -0. -0. -0.  0. -0.  0.  0.  0.  0

In [30]:
pd.set_option('display.max_rows', 500)
pathway_df.sort_values('PiMP_KEGG infected/control comb_p', ascending=True, inplace=True)
pathway_df

Unnamed: 0,pw_name,infected/control p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG infected/control comb_p
map00750,Vitamin B6 metabolism,0.0,29,5,17.24,0.0002176285,0.58,2.0,0.0
map00240,Pyrimidine metabolism,4.342559e-09,56,5,8.93,0.004643343,1.12,2.0,4.342559e-09
map00471,D-Glutamine and D-glutamate metabolism,2.582631e-08,7,2,28.57,0.00769712,0.14,2.0,2.582631e-08
map04964,Proximal tubule bicarbonate reclamation,2.582631e-08,16,2,12.5,0.03921214,0.32,2.0,2.582631e-08
map04724,Glutamatergic synapse,2.582631e-08,7,2,28.57,0.00769712,0.14,2.0,2.582631e-08
map04727,GABAergic synapse,2.582631e-08,9,2,22.22,0.01286096,0.18,2.0,2.582631e-08
map00910,Nitrogen metabolism,2.582631e-08,17,2,11.76,0.04388109,0.34,2.0,2.582631e-08
map04020,Calcium signaling pathway,0.01261071,9,1,11.11,0.1660377,0.18,2.0,0.01261071
map05152,Tuberculosis,0.01261071,6,1,16.67,0.1139487,0.12,2.0,0.01261071
map04666,Fc gamma R-mediated phagocytosis,0.01261071,4,1,25.0,0.0774605,0.08,2.0,0.01261071
