## Approximation of the KS test statistic

This notebook tests an analytic, differentiable approximation to the Kolmogorov-Smirnov test statistic. The approximation replaces the step functions of the empirical CDFs with sigmoids and replaces the hard maximum with a soft maximum (log-sum-exp).

In [5]:
def ks_approx(a, b, pdf_scale=50, max_scale=200):
    """Return a smooth approximation to the two-sample KS statistic of a and b.

    The empirical CDF of each sample is replaced by an average of sigmoids
    evaluated at every point of the pooled sample, and the maximum absolute
    CDF difference is replaced by a log-sum-exp soft maximum.

    Parameters
    ----------
    a, b : array-like of numbers
        The two samples. Lists, tuples and numpy arrays are accepted; the
        inputs are flattened to 1-D float arrays.
    pdf_scale : number, optional
        Sharpness of the sigmoid that stands in for each CDF step.
    max_scale : number, optional
        Sharpness of the soft maximum. The result exceeds the largest CDF
        gap by at most log(len(a) + len(b)) / max_scale.

    Returns
    -------
    numpy float
        The approximate KS statistic.

    Raises
    ------
    ValueError
        If either sample is empty (its CDF would be undefined).

    Notes
    -----
    Builds a (len(a) + len(b)) x len(sample) matrix per sample, so memory
    grows quadratically with the sample sizes.
    """
    a = np.asarray(a, dtype=float).ravel()
    b = np.asarray(b, dtype=float).ravel()
    if a.size == 0 or b.size == 0:
        raise ValueError("ks_approx requires two non-empty samples")

    # Both soft CDFs are evaluated at every observed point of either sample.
    c = np.concatenate([a, b])

    def soft_cdf(sample):
        # Row i holds pdf_scale * (sample - c[i]).
        z = pdf_scale * (sample[np.newaxis, :] - c[:, np.newaxis])
        # 1 / (1 + exp(z)) written as exp(-log(1 + exp(z))): logaddexp never
        # overflows, so large gaps between points no longer raise warnings.
        return np.exp(-np.logaddexp(0.0, z)).mean(axis=1)

    cdf_diff = np.abs(soft_cdf(a) - soft_cdf(b))

    # Stable log-sum-exp: subtract the peak before exponentiating.
    scaled = max_scale * cdf_diff
    peak = scaled.max()
    return (peak + np.log(np.sum(np.exp(scaled - peak)))) / max_scale

In [6]:
def ks_approx_slow(a,b):
    """Loop-based smooth approximation to the two-sample KS statistic.

    a and b must be 1-D numpy arrays: the body relies on array arithmetic
    and on ``.shape``. For every point of the pooled sample, a sigmoid-
    smoothed empirical CDF of each input is evaluated; the absolute
    difference of the two CDFs is then reduced with a log-sum-exp soft
    maximum.
    """
    sigmoid_sharpness = 50
    softmax_sharpness = 200

    eval_points = np.append(a, b)

    # Smoothed count of sample values lying below each evaluation point.
    soft_counts_a = []
    soft_counts_b = []
    for point in eval_points:
        soft_counts_a.append(np.sum(1/(1+np.exp(sigmoid_sharpness*(a-point)))))
        soft_counts_b.append(np.sum(1/(1+np.exp(sigmoid_sharpness*(b-point)))))

    cdf_a = np.array(soft_counts_a)/a.shape[0]
    cdf_b = np.array(soft_counts_b)/b.shape[0]

    # Absolute CDF gap, written as sqrt(d*d).
    gap = cdf_a - cdf_b
    gap = np.sqrt(gap*gap)

    # Soft maximum of the gap over all evaluation points.
    return np.log(np.sum(np.exp(softmax_sharpness*gap)))/softmax_sharpness

In [7]:
import numpy as np
from scipy.stats import ks_2samp

# Sanity check on two uniform samples of different sizes: print scipy's
# exact KS statistic, then the smooth approximation for the same samples.
a = np.random.rand(512)
b = np.random.rand(2000)
print(ks_2samp(a, b).statistic)
print(ks_approx_slow(a, b))

0.03749999999999998
0.056018057216997895


In [8]:
# Small hand-written integer samples with repeated values and large outliers:
# print the smooth approximation, then scipy's full two-sample KS result.
a = np.array([1, 3, 6, 7, 8, 3, 23, 434, 235, 4, 8, 5])
b = np.array([1, 12, 43, 24, 5, 23, 244, 23, 11, 2])
print(ks_approx(a, b))
print(ks_2samp(a, b))

0.4000127101797548
Ks_2sampResult(statistic=0.45, pvalue=0.15820583934355137)


  
  if __name__ == '__main__':


## Tensorflow Implementation

Now we need to get this working with TF tensors and the like...

In [9]:
import os
# Both variables are set before TensorFlow / Keras are imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow as tf
tf.set_random_seed(42)
# Let the session allocate GPU memory on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

import keras.backend.tensorflow_backend as K
# Register the configured session with Keras; otherwise Keras creates its own
# default session and the allow_growth setting above is not applied to it.
K.set_session(session)

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D, LeakyReLU, Lambda
from keras.layers import Input, merge, Concatenate, concatenate, Add
from keras.losses import binary_crossentropy
from keras.utils import plot_model

In [10]:
# Toy tensors for prototyping the smoothed CDF with Keras backend ops.
a = K.variable([1, 2, 3, 4, 5])
b = K.variable([6, 7, 8, 9, 10])

pdf_scale, max_scale = 50, 200
a_len, b_len = int(a.shape[0]), int(b.shape[0])

# Pooled evaluation points: every element of a followed by every element of b.
c = K.concatenate([a, b])
print("a =", K.eval(a))
print("c =", K.eval(c))

# One row per element x of a: the sigmoid step 1/(1+exp(pdf_scale*(x-c)))
# evaluated at every point of c and pre-divided by len(a).
a_pdf = K.map_fn(lambda x: 1/((a_len)*(1+K.exp(pdf_scale*(x-c)))), a)
print("a_pdf =", K.eval(a_pdf))

# Same steps without the per-row scaling, summed over the elements of a and
# then divided by len(a): one smoothed CDF value per point of c.
a_pdf = K.sum(K.map_fn(lambda x: 1/(1+K.exp(pdf_scale*(x-c))), a), axis=0)/a_len
print("summed a_pdf =", K.eval(a_pdf))

# Still to port from the NumPy version: the matching b_pdf, the absolute
# difference of the two, and the log-sum-exp soft maximum using max_scale.

('a =', array([1., 2., 3., 4., 5.], dtype=float32))
('c =', array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.], dtype=float32))
('a_pdf =', array([[1.0000000e-01, 2.0000000e-01, 2.0000000e-01, 2.0000000e-01,
        2.0000000e-01, 2.0000000e-01, 2.0000000e-01, 2.0000000e-01,
        2.0000000e-01, 2.0000000e-01],
       [3.8574995e-23, 1.0000000e-01, 2.0000000e-01, 2.0000000e-01,
        2.0000000e-01, 2.0000000e-01, 2.0000000e-01, 2.0000000e-01,
        2.0000000e-01, 2.0000000e-01],
       [0.0000000e+00, 3.8574995e-23, 1.0000000e-01, 2.0000000e-01,
        2.0000000e-01, 2.0000000e-01, 2.0000000e-01, 2.0000000e-01,
        2.0000000e-01, 2.0000000e-01],
       [0.0000000e+00, 0.0000000e+00, 3.8574995e-23, 1.0000000e-01,
        2.0000000e-01, 2.0000000e-01, 2.0000000e-01, 2.0000000e-01,
        2.0000000e-01, 2.0000000e-01],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 3.8574995e-23,
        1.0000000e-01, 2.0000000e-01, 2.0000000e-01, 2.0000000e-01,
        2.0000000e

## Cost Function

Now, with the real data loaded in, we use the KS approximation as part of a cost function.

In [37]:
# Load the record array, keep rows whose "genmll" field exceeds 50, then draw
# 2000 row indices uniformly at random (repeats possible) and take their "mll".
data = np.load("/home/users/bhashemi/Projects/GIT/DY-GAN/delphes/total_Zmumu_13TeV_PU20_v2.npa")
data = data[data["genmll"] > 50.]
data = data[np.random.randint(data.shape[0], size=2000)]
mll_true = data["mll"]

def ks_approx(a, b, pdf_scale=50, max_scale=200):
    """Differentiable approximation to the two-sample KS statistic (Keras backend).

    Note: this reuses the name of the NumPy ``ks_approx`` defined earlier in
    the notebook and replaces it once this cell has run.

    Parameters
    ----------
    a, b : tensors or numpy arrays
        The two samples. Each is cast to the backend float type and flattened
        to 1-D, so a float64 numpy array can be mixed with a float32 tensor.
    pdf_scale : number, optional
        Sharpness of the sigmoid standing in for each empirical-CDF step.
    max_scale : number, optional
        Sharpness of the log-sum-exp soft maximum.

    Returns
    -------
    Scalar tensor holding the approximate KS statistic.
    """
    a = K.flatten(K.cast(a, K.floatx()))
    b = K.flatten(K.cast(b, K.floatx()))

    # Both soft CDFs are evaluated at every point of the pooled sample.
    c = K.concatenate([a, b])

    def soft_cdf(sample):
        # gaps[i, j] = c[j] - sample[i]; sigmoid(s * gap) equals
        # 1 / (1 + exp(s * (sample[i] - c[j]))) but does not overflow, so the
        # gradient stays finite for widely separated values.
        gaps = K.expand_dims(c, 0) - K.expand_dims(sample, 1)
        # The mean over the sample axis replaces "sum / length", which removes
        # the need to derive the sample length from sum(a) / mean(a).
        return K.mean(K.sigmoid(pdf_scale * gaps), axis=0)

    # abs() rather than sqrt(d * d): the latter has a NaN gradient at d == 0.
    cdf_diff = K.abs(soft_cdf(a) - soft_cdf(b))

    # Stable log-sum-exp: exp(max_scale * d) alone overflows float32 once
    # max_scale * d exceeds roughly 88.
    scaled = max_scale * cdf_diff
    peak = K.max(scaled)
    return (peak + K.log(K.sum(K.exp(scaled - peak)))) / max_scale

def loss_func(y_true, y_pred_mll, mll_weight=1.0):
    """Binary cross-entropy plus a weighted KS-approximation penalty.

    Parameters
    ----------
    y_true : tensor
        Target tensor; only column 0 is used.
    y_pred_mll : tensor
        Prediction tensor; only column 0 is used.
    mll_weight : number, optional
        Scalar multiplier for the KS term. The original expression multiplied
        by a name ``c`` that is not defined in this function; the only ``c``
        visible in the notebook is the 10-element tensor left over from the
        prototyping cell. The default of 1.0 is a placeholder - TODO confirm
        the intended weight.

    Returns
    -------
    Tensor: binary_crossentropy(y_true, y_pred) + mll_weight * KS term.
    """
    y_true = y_true[:,0]
    y_pred = y_pred_mll[:,0]
    # NOTE(review): the same column feeds both the cross-entropy and the KS
    # term; if the mass is meant to live in another column this index needs
    # changing - confirm against the model output layout.
    mll_pred = y_pred_mll[:,0]

    # Compares the predicted column with the module-level mll_true sample.
    mll_loss = ks_approx(mll_pred, mll_true)

    return binary_crossentropy(y_true, y_pred) + mll_weight*mll_loss



# Shapes: the generator maps an 8-vector of noise to an 8-vector, and the
# discriminator takes an 8-vector as input.
input_shape=(8,)
noise_shape=input_shape
output_shape=(8,)
# Generator: Dense 64 -> 128 -> 256 -> 512 -> 256 -> 128, each followed by
# LeakyReLU(0.2), then a final Dense of width output_shape[0] with no activation.
inputs = Input(shape=input_shape)
x = Dense(64)(inputs)
x = LeakyReLU(alpha=0.2)(x)

for size in [128,256,512,256,128]:
    x = Dense(size)(x)
    x = LeakyReLU(alpha=0.2)(x)
    
x = Dense(output_shape[0])(x)
gen = Model(inputs=inputs, outputs=[x])

# Discriminator: Dense(128), then Dense + LeakyReLU(0.2) for each listed width,
# then a single sigmoid output unit.
# NOTE(review): the first Dense(128) has no activation before the next Dense,
# unlike the generator's first layer - confirm this is intended.
inputs = Input(input_shape)
x = Dense(128)(inputs)
for size in [128,256,256,128,64,32,16,8]:
    x = Dense(size)(x)
    x = LeakyReLU(alpha=0.2)(x)
out = Dense(1,activation='sigmoid')(x)
disc = Model(inputs=inputs, outputs=out)

# All three models are compiled with loss_func and Adam(lr=0.0002, beta_1=0.5).
# NOTE(review): loss_func includes the KS term on column 0 of the prediction;
# for disc and combined that column is the sigmoid output - confirm intent.
gen.compile(loss=loss_func, optimizer=keras.optimizers.Adam(lr=0.0002, beta_1=0.5))
disc.compile(loss=loss_func, optimizer=keras.optimizers.Adam(lr=0.0002, beta_1=0.5))
# Set after disc.compile and before combined.compile; presumably so that disc
# still trains on its own while being frozen inside combined - TODO confirm.
disc.trainable = False #only train the generator
# Stacked model: noise -> generator -> discriminator score.
z = Input(shape=noise_shape)
img = gen(z)
valid = disc(img)
combined = Model(z, valid)
combined.compile(loss=loss_func, optimizer=keras.optimizers.Adam(lr=0.0002, beta_1=0.5))

In [50]:
import numpy as np
from scipy.stats import ks_2samp

# Repeat of the uniform-sample check: print scipy's exact KS statistic, then
# the loop-based smooth approximation for the same freshly drawn samples.
a = np.random.rand(512)
b = np.random.rand(2000)
print(ks_2samp(a, b).statistic)
print(ks_approx_slow(a, b))

0.02770312500000005
0.04897199190736596
