In [4]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import time
import uproot
import numpy as np
import pandas as pd
from glob import glob
from pprint import pprint
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
#need ROOT
import ROOT
# Suppress pandas' PerformanceWarning for the rest of the session.
# NOTE(review): presumably aimed at the warning HDFStore emits when it has to
# pickle object-dtype columns (such as the string 'sample_names' column that
# is written to the store further down) -- confirm.
warnings.filterwarnings(
    'ignore', category=pd.io.pytables.PerformanceWarning)

# Object labels, in the order in which their columns appear in ``input_vars``.
_PARTICLES = ('B1', 'B2', 'J1', 'J2', 'L1', 'N1', 'H',
              't11', 't12', 't21', 't22', 'W1', 'W2')

# Every ordered pair of two *different* labels, grouped by the first label
# (B1-B2, B1-J1, ..., B2-B1, B2-J1, ...). Both (a, b) and (b, a) are kept.
_ORDERED_PAIRS = [(first, second)
                  for first in _PARTICLES
                  for second in _PARTICLES
                  if first != second]

# Names of the columns selected from every input CSV, built in four sections:
#   1. px<obj>, py<obj>, pz<obj>, e<obj>          for each object
#   2. m_<obj>, pt_<obj>, eta_<obj>, phi_<obj>    for each object
#   3. dr_<a>_<b>                                 for each ordered pair
#   4. d_eta_<a>_<b>, d_phi_<a>_<b> (interleaved) for each ordered pair
# The resulting order is significant: it fixes the column order of the
# training frame.
input_vars = (
    [component + obj
     for obj in _PARTICLES
     for component in ('px', 'py', 'pz', 'e')]
    + ['{}_{}'.format(quantity, obj)
       for obj in _PARTICLES
       for quantity in ('m', 'pt', 'eta', 'phi')]
    + ['dr_{}_{}'.format(first, second)
       for first, second in _ORDERED_PAIRS]
    + ['{}_{}_{}'.format(separation, first, second)
       for first, second in _ORDERED_PAIRS
       for separation in ('d_eta', 'd_phi')]
)
def get_columns(fname):
    """Return the ``(columns, todrop)`` pair.

    NOTE(review): the body consists of the return statement only. Neither
    ``columns`` nor ``todrop`` is assigned inside this function and ``fname``
    is never read, so unless both names exist as globals outside the visible
    code, calling this raises ``NameError``. It looks like the implementation
    was removed -- restore it or delete the function.
    """
    
    return columns, todrop


def build_filelist(input_dir):
    """Group the ``*.csv`` files directly inside ``input_dir`` by process.

    A file goes to ``nominal['hh']`` when its *file name* contains ``'hh'``,
    otherwise to ``nominal['ttbar']`` when its file name contains ``'tt'``.
    Files matching neither token are printed but not returned.

    Parameters
    ----------
    input_dir : str
        Directory to scan (not recursive).

    Returns
    -------
    nominal : dict
        ``{'hh': [...], 'ttbar': [...]}`` with the matching paths, each path
        in the ``'<input_dir>/<name>.csv'`` form produced by ``glob``.
    systematics : dict
        Always empty; nothing in this function fills it.
    """
    # Function-scope import: ``os`` is not among the module-level imports.
    import os

    nominal = {
        'hh': [],
        'ttbar': []
    }
    systematics = {}
    # glob returns matches in arbitrary order; sorting keeps the row order of
    # the resulting dataset reproducible from run to run.
    for fname in sorted(glob('{}/*.csv'.format(input_dir))):
        print(fname)
        # Classify on the file name only. Testing the full path would let a
        # directory component that happens to contain 'hh' or 'tt' decide the
        # category of every file underneath it.
        basename = os.path.basename(fname)
        if 'hh' in basename:
            nominal['hh'].append(fname)
        elif 'tt' in basename:
            nominal['ttbar'].append(fname)
    return nominal, systematics

def process_files(all_data, files, is_signal):
    """Load CSV files and append their rows to ``all_data``.

    Each file is read through ROOT's CSV RDataFrame, converted to a pandas
    frame and reduced to the ``input_vars`` columns. Two frames are produced
    per file and appended to the accumulators:

    * ``all_data['meta']``  -- columns ``index`` (row index within the file),
      ``names`` (the file path) and ``isSignal`` (1.0 when ``is_signal == 1``,
      else 0.0);
    * ``all_data['train']`` -- the ``input_vars`` columns cast to float64 plus
      ``isSM``, which is 0.0 only when ``is_signal == -1.0`` and 1.0 otherwise.
      NOTE(review): the calls visible in this file pass ``is_signal`` 1 and 0,
      so every row currently gets ``isSM == 1`` -- confirm that is intended.

    Parameters
    ----------
    all_data : dict
        Accumulator with DataFrames under ``'meta'`` and ``'train'``. It is
        updated in place.
    files : iterable of str
        Paths of the CSV files to load.
    is_signal : int or float
        Label applied to every row of every file in ``files``.

    Returns
    -------
    dict
        The same ``all_data`` object that was passed in.

    Raises
    ------
    KeyError
        If a file lacks one of the ``input_vars`` columns.
    """
    make_csv_frame = ROOT.RDF.MakeCsvDataFrame  # loop-invariant lookup
    meta_frames = []
    train_frames = []
    for ifile in files:
        print (ifile)
        input_file = make_csv_frame(ifile)
        print('This is the file ' + ifile) 
        input_df = pd.DataFrame.from_dict(input_file.AsNumpy())
        print("Nevents = ", input_df.shape[0])
        slim_df = input_df[input_vars]
        n_events = len(slim_df)

        single_meta_df = pd.DataFrame(slim_df.index, columns=['index'])
        single_meta_df['names'] = np.full(n_events, ifile)
        single_meta_df['isSignal'] = np.ones(n_events) if is_signal == 1 else np.zeros(n_events)

        single_training_df = slim_df.astype('float64')
        single_training_df['isSM'] = np.zeros(n_events) if is_signal == -1.0 else np.ones(n_events)

        meta_frames.append(single_meta_df)
        train_frames.append(single_training_df)

    # Concatenate once after the loop: concatenating onto the accumulator for
    # every file copies all previously loaded rows again each time.
    if meta_frames:
        all_data['meta'] = pd.concat([all_data['meta']] + meta_frames)
        all_data['train'] = pd.concat([all_data['train']] + train_frames)

    return all_data

def build_scaler(sm_only):
    """Fit a ``StandardScaler`` on ``sm_only`` and tabulate what it learned.

    Parameters
    ----------
    sm_only : pandas.DataFrame
        Rows used for the fit; every column is fitted.

    Returns
    -------
    scaler : StandardScaler
        The fitted scaler.
    scaler_info : pandas.DataFrame
        One row per column of ``sm_only`` (indexed by column name) holding the
        fitted ``mean``, ``scale`` and ``variance`` together with ``nsamples``,
        the scaler's ``n_samples_seen_``.
    """
    fitted = StandardScaler()
    fitted.fit(sm_only.values)

    column_names = sm_only.columns.values
    summary = pd.DataFrame(
        {
            'mean': fitted.mean_,
            'scale': fitted.scale_,
            'variance': fitted.var_,
            'nsamples': fitted.n_samples_seen_,
        },
        index=column_names,
    )
    return fitted, summary

def format_for_store(all_data, scaler):
    """Scale the training columns and attach the per-row bookkeeping columns.

    Parameters
    ----------
    all_data : dict
        Holds the DataFrames ``'train'`` (columns to scale) and ``'meta'``
        (columns ``index``, ``names``, ``isSignal``), row-aligned by position.
    scaler : object
        Anything with a ``transform(ndarray) -> ndarray`` method.

    Returns
    -------
    pandas.DataFrame
        The transformed ``'train'`` columns as float64 on a fresh positional
        index, followed by ``idx``, ``sample_names`` and ``signal`` copied
        positionally from ``'meta'``.
    """
    train = all_data['train']
    scaled_values = scaler.transform(train.values)
    scaled = pd.DataFrame(scaled_values, columns=train.columns.values, dtype='float64')
    print(scaled)

    meta = all_data['meta']
    # ``.values`` drops the meta index so the columns are attached by position.
    return scaled.assign(
        idx=meta['index'].values,
        sample_names=meta['names'].values,
        signal=meta['isSignal'].values,
    )

# Driver: load the hh and ttbar CSVs, fit the scaler, and write the scaled
# dataset plus the scaler parameters to an HDF5 store.
start = time.time()

all_data = {
    'meta': pd.DataFrame(),
    'train': pd.DataFrame()
}
filelist, _ = build_filelist('/localdata/Athar/CNN/trans_mom/feature3')  # list of files to process
all_data = process_files(all_data, filelist['hh'], is_signal = 1)
all_data = process_files(all_data, filelist['ttbar'], is_signal = 0)
if all_data['train'].empty:
    # Without this check the 'isSM' selection below fails with a bare KeyError.
    raise RuntimeError('No input rows were loaded; check the input directory')

# Rows flagged isSM == 1 are the ones the scaler is fitted on.
sm_only = all_data['train'][(all_data['train']['isSM'] == 1)]
scaler, scaler_info = build_scaler(sm_only)
nominal = format_for_store(all_data, scaler)

# Open the store only once everything is computed, and inside a ``with`` so
# the file is closed even if a write fails.
with pd.HDFStore('/localdata/Athar/CNN/datasets/{}.h5'.format('feat_9'),
                 complevel=9, complib='bzip2') as store:
    store['scaler'] = scaler_info
    store['nominal'] = nominal
    print(store['nominal'])
print ('Complete! Preprocessing completed in {} seconds'.format(time.time() - start))

/localdata/Athar/CNN/trans_mom/feature3/tt.csv
/localdata/Athar/CNN/trans_mom/feature3/hh.csv
/localdata/Athar/CNN/trans_mom/feature3/hh.csv
This is the file /localdata/Athar/CNN/trans_mom/feature3/hh.csv
Nevents =  74618
/localdata/Athar/CNN/trans_mom/feature3/tt.csv
This is the file /localdata/Athar/CNN/trans_mom/feature3/tt.csv
Nevents =  244917
            pxB1      pyB1      pzB1       eB1      pxB2      pyB2      pzB2  \
0       0.748264 -0.323712  0.358747 -0.599583 -0.748264  0.323712 -0.358747   
1       0.622026 -0.289351  0.677837 -0.441962 -0.622026  0.289351 -0.677837   
2       0.390392  0.669193  0.030322 -0.752662 -0.390392 -0.669193 -0.030322   
3       0.370628 -1.224012 -0.483162 -0.127369 -0.370628  1.224012  0.483162   
4      -0.328544 -0.055025 -0.998560 -0.293521  0.328544  0.055025  0.998560   
...          ...       ...       ...       ...       ...       ...       ...   
319530  0.250885 -1.129724  0.081653 -0.379318 -0.250885  1.129724 -0.081653   
319531  0