## Train and save the SVDTime Neural Network

The SVDTimeNN is a MultilayerPerceptron estimator of 
The truth data in this case are bin numbers in a series of time shift bins. The result of such a training is a distribution function for a time shift value. From these, it is easy to calculate mean value and standard deviation, but also do a range of approximate probabilistic calculations.

##### Required Python packages

The following python packages are used:
- math (basic python math functions)
- numpy (Vectors and matrices for numerics)
- pandas (Python analogue of Excel tables)
- matplotlib (Plotting library)
- seaborn (Advanced plotting)
- scipy (Scientific computing package)
- scikit-learn (machine learning)

Only sklear2pmml is missing in the current basf2 distribution. Install it with

pip3 install --user git+https://github.com/jpmml/sklearn2pmml.git

##### Other pre-requisites:

A sample of training data, plus binning and bounds information in pickle (*.pkl) files.

In [1]:
import math
import datetime
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import pylab
import seaborn as sns
from scipy import stats as stats
from scipy.optimize import minimize_scalar
from sklearn.neural_network import MLPClassifier
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from svd.SVDSimBase import *
from lxml import etree as ET

Welcome to JupyROOT 6.08/06


### Retrieve training sample

In [2]:
n_samples = 1000000
pkl_name = 'SVDTime_Training3_{0}_{1}.pkl'

stockdata = pd.read_pickle(pkl_name.format('Sample', n_samples))
n_samples = len(stockdata)
bounds = pd.read_pickle(pkl_name.format('Bounds', n_samples))
bins = pd.read_pickle(pkl_name.format('Bins', n_samples))

timearray = bins['midpoint']
timebins = np.unique(bins[['lower','upper']])

In [3]:
stockdata.head()

Unnamed: 0,test,amplitude,t0,tau,sigma,s1,s2,s3,s4,s5,s6,normed_tau,t0_bin,abin
0,0.267836,14.456133,-7.404476,225.897068,4.139629,0.0,0.0,14.010917,12.561512,10.387404,8.454864,13.321875,13,12
1,0.76131,90.094752,-0.203702,244.035236,2.352088,0.0,0.0,65.898884,90.557821,75.252274,55.695186,25.833429,15,88
2,0.389157,98.08342,-39.552291,328.020822,4.495098,8.231188,68.519079,95.882218,93.212643,79.419842,63.624859,83.765971,3,96
3,0.646701,94.73681,29.742417,284.786097,4.981928,0.0,0.0,0.0,62.62635,93.7388,88.519938,53.943026,25,92
4,0.361906,3.346757,-38.118085,239.676184,1.654655,0.0,0.0,4.230489,4.834844,4.230489,0.0,22.826592,3,1


### Split the data into training and test samples

In [4]:
test_fraction = 0.2
X = stockdata[['s'+str(i) for i in range(2,5)]+['normed_tau']]
Y = stockdata['t0_bin']
X_train = X[stockdata.test>test_fraction]
X_test = X[stockdata.test<test_fraction]
Y_train = Y[stockdata.test>test_fraction]
Y_test = Y[stockdata.test<test_fraction]

In [5]:
X.head()

Unnamed: 0,s2,s3,s4,normed_tau
0,0.0,14.010917,12.561512,13.321875
1,0.0,65.898884,90.557821,25.833429
2,68.519079,95.882218,93.212643,83.765971
3,0.0,0.0,62.62635,53.943026
4,0.0,4.230489,4.834844,22.826592


In [6]:
classifier = MLPClassifier(
    hidden_layer_sizes = (len(timearray)-1,len(timearray)+1),
    activation = 'relu',
    solver = 'adam',
    tol = 1.0e-6, 
    alpha = 0.005, 
    verbose = False
)
nntime_fitter = PMMLPipeline([('claasifier', classifier)])

In [7]:
nntime_fitter.fit(X_train,Y_train)

PMMLPipeline(steps=[('claasifier', MLPClassifier(activation='relu', alpha=0.005, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(24, 26), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=1e-06, validation_fraction=0.1,
       verbose=False, warm_start=False))])

In [8]:
test_score = nntime_fitter.score(X_test, Y_test)
train_score = nntime_fitter.score(X_train, Y_train)
print('Test: {}'.format(test_score))
print('Train: {}'.format(train_score))

Test: 0.6368647975077881
Train: 0.6368750586395622


In [9]:
pmml_name = 'SVDTimeNet3.pmml'
xml_name = pmml_name.replace('pmml','xml')
sklearn2pmml(nntime_fitter, pmml_name, with_repr = True)

Oct 29, 2017 11:51:49 PM org.jpmml.sklearn.Main run
INFO: Parsing PKL..
Oct 29, 2017 11:51:49 PM org.jpmml.sklearn.Main run
INFO: Parsed PKL in 70 ms.
Oct 29, 2017 11:51:49 PM org.jpmml.sklearn.Main run
INFO: Converting..
Oct 29, 2017 11:51:49 PM org.jpmml.sklearn.Main run
INFO: Converted in 99 ms.
Oct 29, 2017 11:51:49 PM org.jpmml.sklearn.Main run
INFO: Marshalling PMML..
Oct 29, 2017 11:51:50 PM org.jpmml.sklearn.Main run
INFO: Marshalled PMML in 1002 ms.


In [10]:
parser = ET.XMLParser(remove_blank_text=True)
net = ET.parse(pmml_name, parser)
root = net.getroot()
# namespace hassle
namespace = root.nsmap[None]
nsprefix = '{'+namespace+'}'
procinfo = root.find(nsprefix + 'MiningBuildTask')

In [11]:
# Save some metadata
name = ET.SubElement(procinfo, nsprefix + 'Title')
name.text = 'Neural network for time shift estimation'

# Information on use of the classifier
target = ET.SubElement(procinfo, nsprefix + 'IntendedUse')
basf2proc = ET.SubElement(target, nsprefix + 'basf2process')
basf2simulation = ET.SubElement(basf2proc, nsprefix + 'Simulation')
basf2simulation.text = 'yes'
basf2reconstruction = ET.SubElement(basf2proc, nsprefix + 'Reconstruction')
basf2reconstruction.text = 'yes'
sensorType = ET.SubElement(target, nsprefix + 'SensorType')
sensorType.text = 'all'
sensorSide = ET.SubElement(target, nsprefix + 'SensorSide')
sensorSide.text = 'all'

#information on training
training = ET.SubElement(procinfo, nsprefix + 'Training')
source = ET.SubElement(training, nsprefix + 'SampleSource')
source.text = 'Toy simulation'
genfunc = ET.SubElement(training, nsprefix + 'Waveform')
genfunc.text = 'beta-prime'
num_samples = ET.SubElement(training, nsprefix + 'SampleSize')
train_samples = ET.SubElement(num_samples, nsprefix + 'Training', {'n': str(int((1-test_fraction)*n_samples))})
test_samples = ET.SubElement(num_samples, nsprefix + 'Test', {'n': str(int(test_fraction*n_samples))})
bounds.apply(
    lambda row: ET.SubElement(training, nsprefix + 'Parameter', **{u:str(v) for u, v in row.items()}), axis = 1
)

netparams = ET.SubElement(procinfo, nsprefix + 'NetworkParameters')
inputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
inputLayer.attrib['number'] = str(0)
inputLayer.attrib['kind'] = 'input'
inputLayer.attrib['size'] = str(4) # 4 as in 3 APV samples + tau
n_hidden = len(classifier.hidden_layer_sizes)
for (iLayer, sz) in zip(range(1,1+n_hidden), classifier.hidden_layer_sizes) :
    layer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
    layer.attrib['number'] = str(iLayer)
    layer.attrib['kind'] = 'hidden'
    layer.attrib['size'] = str(sz)
outputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
outputLayer.attrib['number'] = str(n_hidden+1)
outputLayer.attrib['kind'] = 'output'
outputLayer.attrib['size'] = str(len(timearray)) 

for field in root.find(nsprefix + 'DataDictionary'):
    if field.attrib['name'] == 't0_bin':
        for child in field:
            i = int(child.attrib['value'])
            child.attrib['lower'] = '{0:.3f}'.format(bins.loc[i,'lower'])
            child.attrib['upper'] = '{0:.3f}'.format(bins.loc[i,'upper'])
            child.attrib['midpoint'] = '{0:.3f}'.format(bins.loc[i,'midpoint'])


In [12]:
net.write(xml_name, xml_declaration = True, pretty_print = True, encoding = 'utf-8')

#### Set up tau en/decoder

In [13]:
amp_index = bounds[bounds.value == 'amplitude'].index[0]
amp_range = (bounds.ix[amp_index,'low'], bounds.ix[amp_index, 'high'])
tau_index = bounds[bounds.value == 'tau'].index[0]
tau_range = (bounds.ix[tau_index,'low'], bounds.ix[tau_index, 'high'])
coder = tau_encoder(amp_range, tau_range)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


#### True values

In [14]:
Trues_test = stockdata[stockdata.test < test_fraction][['t0', 'amplitude','tau','t0_bin', 'abin']]

In [15]:
Trues_test.head()

Unnamed: 0,t0,amplitude,tau,t0_bin,abin
9,22.750252,15.804565,210.951435,23,13
10,-6.626674,3.33262,317.705473,13,1
12,-43.348413,28.392209,259.240924,2,26
17,14.525145,62.533486,344.844077,20,60
20,11.062595,58.220374,279.920782,19,56


#### Predicted probabilities.

In [16]:
probs = nntime_fitter.predict_proba(X_test)
probs

array([[  3.22943076e-15,   2.31026181e-12,   1.25848460e-10, ...,
          3.18291917e-01,   3.00966684e-01,   3.31849573e-01],
       [  1.68856471e-02,   3.93159120e-02,   7.32354696e-02, ...,
          5.23160455e-04,   2.07493618e-04,   1.74210185e-04],
       [  5.81510786e-01,   3.71006820e-01,   4.64274593e-02, ...,
          1.61197557e-15,   1.33350840e-15,   5.67819582e-15],
       ..., 
       [  1.64289774e-19,   2.34224262e-19,   7.30353028e-19, ...,
          5.26618400e-36,   1.63373292e-37,   2.31713424e-39],
       [  2.78004730e-05,   1.60519801e-02,   4.42976286e-01, ...,
          4.81153145e-12,   7.79398725e-12,   3.68363044e-11],
       [  8.79026891e-38,   7.46937458e-37,   1.50923845e-34, ...,
          1.63200696e-46,   8.71202351e-48,   4.08424222e-50]])

### Calculate time shifts and amplitudes from probabilities

In [17]:
def fitFromProb(fw, signals, p, tau, timearray):
    t_fit = np.average(timearray, weights = p)
    t_sigma = np.sqrt(np.average((timearray - t_fit)**2, weights = p))
    weights = fw(-t_fit + np.linspace(0, 2*dt, 3, endpoint = True), tau = tau)
    weights[signals.values == 0.0] = 0.0
    norm = 1.0 / np.inner(weights, weights)
    a_fit = np.inner(signals, weights) * norm
    a_sigma = np.sqrt(norm)
    residuals = signals - a_fit * weights
    ndf = np.sum(np.ones_like(signals[signals>0])) - 2 # Can't be less than 1
    chi2_ndf = np.inner(residuals, residuals)/ndf
    return pd.Series({
        't_fit':t_fit, 
        't_sigma':t_sigma, 
        'a_fit':a_fit, 
        'a_sigma':a_sigma,
        'chi2_ndf':chi2_ndf
            })

In [18]:
probdf = pd.DataFrame(probs)
probdf.index = X_test.index
probdf.to_pickle('SVDTime_Training3_Probs_{0}.pkl'.format(n_samples))

In [19]:
fits = X_test.apply(
    lambda row: fitFromProb(
        betaprime_wave, 
        row[['s'+str(i) for i in range(2,5)]], 
        probdf.ix[row.name],
        coder.decode(row['normed_tau']), 
        timearray), 
    axis = 1
)
fits['t_true'] = Trues_test['t0']
fits['tau'] = Trues_test['tau']
fits['a_true'] = Trues_test['amplitude']
fits['t_bin'] = Trues_test['t0_bin']
fits['a_bin'] = Trues_test['abin']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [20]:
fits.head()

Unnamed: 0,a_fit,a_sigma,chi2_ndf,t_fit,t_sigma,t_true,tau,a_true,t_bin,a_bin
9,16.017756,1.106572,-0.0,26.435361,2.914273,22.750252,210.951435,15.804565,23,13
10,3.736473,0.741103,inf,-24.73616,13.014162,-6.626674,317.705473,3.33262,13,1
12,27.669342,0.64441,2.695265,-44.140297,1.843487,-43.348413,259.240924,28.392209,2,26
17,64.061465,1.253124,inf,15.747233,1.634458,14.525145,344.844077,62.533486,20,60
20,58.30701,0.992306,inf,11.362067,1.239587,11.062595,279.920782,58.220374,19,56


In [21]:
fits.to_pickle('SVDTime_Training3_Fits_{0}.pkl'.format(n_samples))

In [22]:
import pickle

In [23]:
with open('classifier3.pkl', 'wb') as f:
    pickle.dump(classifier, f)


In [24]:
with open('classifier3.txt', 'w') as cdump:
    cdump.write("Classifier coefficients:\n")
    for iLayer in range(len(classifier.coefs_)):
        cdump.write('Layer: {0}\n'.format(iLayer))
        nrows = classifier.coefs_[iLayer].shape[0]
        ncols = classifier.coefs_[iLayer].shape[1]
        cdump.write('Weights:\n')
        for col in range(ncols):
            s = " ".join([str(classifier.coefs_[iLayer][row, col]) for row in range(nrows)])
            s+="\n"
            cdump.write(s)
        # intercepts should have nrows dimension
        cdump.write('Intercepts:\n')
        s = " ".join([str(classifier.intercepts_[iLayer][col]) for col in range(ncols)])
        s += "\n"
        cdump.write(s)

print("Data written.")

Data written.
