In [1]:
from validphys.api import API
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from validphys.fkparser import load_fktable
from collections import defaultdict

from typing import List, Dict
from collections import namedtuple

from n3fit.layers import DIS

Using Keras backend


In [2]:
from utils import XGRID
from model_utils import *
seed = 124143

In [3]:
# List of DIS dataset
dataset_inputs = [
  #{'dataset': 'NMC_NC_NOTFIXED_DW_EM-F2', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'NMC_NC_NOTFIXED_P_EM-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'SLAC_NC_NOTFIXED_P_DW_EM-F2', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'SLAC_NC_NOTFIXED_D_DW_EM-F2', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'BCDMS_NC_NOTFIXED_P_DW_EM-F2', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'BCDMS_NC_NOTFIXED_D_DW_EM-F2', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'CHORUS_CC_NOTFIXED_PB_DW_NU-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'CHORUS_CC_NOTFIXED_PB_DW_NB-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'NUTEV_CC_NOTFIXED_FE_DW_NU-SIGMARED', 'cfac': ['MAS'], 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'NUTEV_CC_NOTFIXED_FE_DW_NB-SIGMARED', 'cfac': ['MAS'], 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_NC_318GEV_EM-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_NC_225GEV_EP-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_NC_251GEV_EP-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_NC_300GEV_EP-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_NC_318GEV_EP-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_CC_318GEV_EM-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_CC_318GEV_EP-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_NC_318GEV_EAVG_CHARM-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
  {'dataset': 'HERA_NC_318GEV_EAVG_BOTTOM-SIGMARED', 'frac': 0.75, 'variant': 'legacy'},
]

# Dictionary for validphys API
common_dict = dict(
    dataset_inputs=dataset_inputs,
    metadata_group="nnpdf31_process",
    use_cuts='internal',
    datacuts={'q2min': 3.49, 'w2min': 12.5},
    theoryid=40000000,
    t0pdfset='NNPDF40_nnlo_as_01180',
    use_t0=True
)

In [4]:
groups_data = API.procs_data(**common_dict)
groups_index = API.groups_index(**common_dict)

In [5]:
from validphys.pineparser import pineappl_reader
from n3fit.layers.observable import compute_float_mask
from n3fit.backends import operations as op

# Initialise the dictionaries
fk_table_dict = defaultdict(list)
central_data_dict = {}
padded_fk_dict = defaultdict(list)
xgrid_masks_dict = defaultdict(list)

total_ndata_wc = 0
data_size_dict = defaultdict(list)

for idx_proc, group_proc in enumerate(groups_data):
  for idx_exp, exp_set in enumerate(group_proc.datasets):
    
    dataset_name = exp_set.name
    dataset_size = exp_set.load_commondata().ndata
    total_ndata_wc += dataset_size

    # Collect FKSpecs and cuts
    fkspecs = exp_set.fkspecs
    cuts = exp_set.cuts

    # Read FKData and FK table in numpy version
    fk_data = pineappl_reader(fkspecs[0]).with_cuts(cuts)
    fk_table = fk_data.get_np_fktable()

    # xgrid for this dataset
    xgrid = fk_data.xgrid

    # Check that XGRID is just a small-x extension
    # of xgrid
    res = True
    for i, x in enumerate(xgrid):
      offset = 50 - xgrid.size
      try:
        assert(np.isclose(x, XGRID[offset+i]))
      except AssertionError:
        print(f"XGRID is not an extension for {dataset_name}.")

    # Load DIS object for padding the FK table
    dis = DIS(
      [fk_data],
      [fk_table],
      dataset_name,
      None,
      exp_set.op,
      n_replicas=1,
      name=f"dat_{dataset_name}"
    )
    
    # OLD
    # Pad the fk table so that (N, x, 9) -> (N, x, 14)
    mask = dis.masks[0]
    padded_fk_table = dis.fktables[0]#dis.pad_fk(dis.fktables[0], mask)
    padded_fk_dict[dataset_name] = dis.pad_fk(dis.fktables[0], mask)

    # Extend xgrid to low-x (N, x, 14) -> (N, 50, 14)
    xgrid_mask = np.zeros(XGRID.size, dtype=bool)
    offset = XGRID.size - xgrid.size
    for i in range(xgrid.size):
      xgrid_mask[offset + i] = True
    xgrid_mask = compute_float_mask(xgrid_mask)
    paddedx_fk_table = op.einsum('Xx, nFx -> nXF', xgrid_mask, padded_fk_table)
    xgrid_masks_dict[dataset_name] = xgrid_mask
    # Check the mask in x is applied correctly
    #for i in range(XGRID.size):
    if i >= offset:
      try:
        assert(np.allclose(paddedx_fk_table[:,i,:], padded_fk_table[:,:,i - offset]))
      except AssertionError:
        print(f'Problem in the unchanged values for {dataset_name}')
    else:
      try:
        assert(np.allclose(paddedx_fk_table[:,i,:], np.zeros_like(paddedx_fk_table[:,i,:])))
      except AssertionError:
        print(f'Problem in the extension for {dataset_name}')

    # Save to dict
    fk_table_dict[dataset_name] = paddedx_fk_table
    central_data_dict[dataset_name] = exp_set.load_commondata().with_cuts(cuts).central_values.to_numpy()


### Compute NTK

In [122]:
nnpdf = generate_sequential_model(outputs=9, nlayers=2, units=[28, 20],seed=151316, name='NNPDF', kernel_initializer=tf.keras.initializers.GlorotNormal, predictions=False)
NTK = compute_ntk(nnpdf, XGRID, round_to_zero=True)

# Check NTK
for f in range(NTK.shape[1]):
  ntk = NTK[:,f,:,f]
  try:
    assert(np.allclose(ntk, ntk.T))
  except AssertionError:
    print(f'NTK for flavor {f} is not symmetric')
  
  try:
    np.linalg.cholesky(ntk)
  except np.linalg.LinAlgError as e:
    print(f'{e} for {f}')

Matrix is not positive definite for 0
Matrix is not positive definite for 1
Matrix is not positive definite for 2
Matrix is not positive definite for 3
Matrix is not positive definite for 4
Matrix is not positive definite for 5
Matrix is not positive definite for 6
Matrix is not positive definite for 7
Matrix is not positive definite for 8


In [123]:
C_sys = API.dataset_inputs_t0_covmat_from_systematics(**common_dict)
C = API.groups_covmat_no_table(**common_dict)
C_index = C.index
C_col = C.columns
#C = pd.DataFrame(C_sys, index=C_index, columns=C_col)
#C = pd.DataFrame(np.identity(C.shape[0]), index=C_index, columns=C_col)
Cinv = np.linalg.inv(C)
Cinv = pd.DataFrame(Cinv, index=C_index, columns=C_col)

L = np.linalg.cholesky(Cinv)

### Compute and regularise matrix $ M = (FK)^{T} C_{Y}^{-1} (FK)$.

In [124]:
M = np.zeros((50, 9, 50, 9))
for exp, fk in fk_table_dict.items():
  Cinv_red = Cinv.xs(level='dataset', key=exp).T.xs(level='dataset', key=exp)
  #R_fk = np.einsum('fF, nxF -> nxf', R, fk) 
  aux = np.einsum('Ixf, IJ -> xfJ', fk, Cinv_red)
  M += np.einsum('xfI, IXF -> xfXF', aux, fk)

oldshape = M.shape
prod = 1
invshape = oldshape[2:] + oldshape[:2]
for k in oldshape[2:]:
    prod *= k
M_old_shape = M
M = M.reshape(prod, -1)

In [139]:
eta_id = 0.001
Mr = M + eta_id * np.identity(M.shape[0])
Mr_old_shape = Mr.reshape(prod, -1)
M_inv = np.linalg.inv(Mr)
M_inv = M_inv.reshape(*oldshape)

In [126]:
# Construct dataframe for predictions
Y = pd.DataFrame(np.zeros(Cinv.shape[0]), index=Cinv.index)
for exp_name, data in central_data_dict.items():
  if data.size == Y.loc[(slice(None), [exp_name], slice(None)), :].size:
    Y.loc[(slice(None), [exp_name], slice(None)), :] = data
  else:
    raise ValueError

Cinv_Y = Cinv @ Y

FK_Cinv_y = np.zeros((50,9))
for exp, fk in fk_table_dict.items():
  Cinv_Y_red = Cinv_Y.xs(level='dataset', key=exp).to_numpy()
  #R_fk = np.einsum('fF, nxF -> nxf', R, fk)
  FK_Cinv_y += np.einsum('Ixf, I -> xf', fk, Cinv_Y_red[:,0])

K = np.einsum('xfXF , XF -> xf', M_inv, FK_Cinv_y)

### Compute the matrix that governs the evolution $H = \Theta M$

In [209]:
# Compute the matrix that controls the evolution
#H = NTK @ Mr
H = np.einsum('iajb, jbkc -> iakc', NTK, M_old_shape)
#H = np.einsum('iajb, jbkc -> iakc', M_old_shape, tmp)
print(f'The shape of H is {H.shape}')

H = H.reshape(prod, -1)
try:
  assert(np.allclose(H, H.T))
except AssertionError as e:
  print("Matrix H is not symmetric")

try:
  assert(np.allclose(H, 0.5 * (H + H.T)))
except AssertionError as e:
  print("Matrix H is not symmetric")

eigvals, eigvecs = np.linalg.eig(H)
eigvals = eigvals.real
eigvecs = eigvecs.real

The shape of H is (50, 9, 50, 9)
Matrix H is not symmetric
Matrix H is not symmetric


Check rank of matrix $H$

In [211]:
rank_H = np.linalg.matrix_rank(H)
rank_R = np.linalg.matrix_rank(eigvecs)
print(f'Rank(H) = {rank_H}')
print(f'Rank(R) = {rank_R}')

Rank(H) = 32
Rank(R) = 119


In [207]:
H_reconstructed = np.zeros_like(H)
for k in range(eigvals.size):
  vi_vj = np.outer(eigvecs[:,k], eigvecs[:,k])
  H_reconstructed = eigvals[k] * vi_vj

try:
  assert(np.allclose(H, H_reconstructed))
except AssertionError as e:
  print(e)




It seems from the cell below that the eigenvectors are not orthogonal with each other.

In [208]:
for k1 in range(eigvals.size):
  for k2 in range(eigvals.size):
    res = np.dot(eigvecs[:,k1], eigvecs[:,k2])
    if k1 == k2:
      try:
        assert(np.allclose(res, 1.))
      except AssertionError:
        print(f"Vector {k1} not normalised to 1 | res = {res}")

    else:
      try:
        assert(np.allclose(res, 0.))
      except AssertionError:
        print(f"Error for {k1} and {k2} res = {res}")

Print eigenvalues of the matrix H

In [169]:
for i, val in enumerate(eigvals.real):
  print(f'{i}: {val}')

0: -0.0031811389404780336
1: -0.0011778046373825694
2: -0.0005953086193544881
3: -0.0004661710393776418
4: -0.0003216681780940574
5: -0.0001793756169771962
6: -0.00016551466256927306
7: -0.00015589086433919532
8: -0.00013270100469467834
9: -8.489724091639179e-05
10: -6.172001630674645e-05
11: -5.37929254523777e-05
12: -4.614848207863867e-05
13: -4.072767789680325e-05
14: -2.5696508715173003e-05
15: -2.5103411412283937e-05
16: -2.2844535186817132e-05
17: -2.0852333352630377e-05
18: -2.0696930501812476e-05
19: -2.0254697526810786e-05
20: -1.420429876392285e-05
21: -8.830232902370948e-06
22: -8.106375556590646e-06
23: -7.114621826340943e-06
24: -7.006409080688369e-06
25: -6.958924079240497e-06
26: -6.574418547045391e-06
27: -5.004756372775453e-06
28: -4.8387868419971045e-06
29: -4.0563457154455795e-06
30: -3.6920069817564222e-06
31: -3.3853520551291535e-06
32: -2.902373356746652e-06
33: -2.7844791290478355e-06
34: -2.5575574031410946e-06
35: -2.366521612898839e-06
36: -2.0022734517114647e

Print maximum and minimum component of each eigenvector

In [170]:
i = 1
for k in range(eigvecs.shape[1]):
  print(f'{k}: max = {eigvecs[:,k].max()} |  min = {eigvecs[:,k].min()}')

0: max = 0.5839991622603118 |  min = -0.3338639588335049
1: max = 0.5123685925541503 |  min = -0.18762394865365484
2: max = 0.45376564112307227 |  min = -0.357296011807748
3: max = 0.34799032166353033 |  min = -0.20618051252850772
4: max = 0.339441888219115 |  min = -0.5531649420939484
5: max = 0.3845649644091113 |  min = -0.2347352181144389
6: max = 0.5221319211922941 |  min = -0.24813427680365097
7: max = 0.332572364271651 |  min = -0.34980614020504447
8: max = 0.34303518878026146 |  min = -0.2960442729976378
9: max = 0.38621323515987366 |  min = -0.50839913071937
10: max = 0.3921354346855571 |  min = -0.21632535547401313
11: max = 0.39266355073475023 |  min = -0.4128603981748013
12: max = 0.3583898880663653 |  min = -0.4299769785508997
13: max = 0.22909836348778415 |  min = -0.2802582636616097
14: max = 0.37364361738984947 |  min = -0.3268349326246597
15: max = 0.35413270950819725 |  min = -0.39036209648831727
16: max = 0.21241214941077924 |  min = -0.41261091576377
17: max = 0.3403

Print the components of the $k$-th eigenvector

In [171]:
k = 0
for i, val in enumerate(eigvecs[:,k]):
  print(f'{i} : {val}')

0 : 0.0
1 : 0.0
2 : 0.0
3 : 0.0
4 : 0.0
5 : 0.0
6 : 0.0
7 : 0.0
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.0
13 : 0.0
14 : 0.0
15 : 0.0
16 : 0.0
17 : 0.0
18 : 0.0
19 : 0.0
20 : 0.0
21 : 0.0
22 : 0.0
23 : 0.0
24 : 0.0
25 : 0.0
26 : 0.0
27 : 0.0
28 : 0.0
29 : 0.0
30 : 0.0
31 : 0.0
32 : 0.0
33 : 0.0
34 : 0.0
35 : 0.0
36 : 0.0
37 : 0.0
38 : 0.0
39 : 0.0
40 : 0.0
41 : 0.0
42 : 0.0
43 : 0.0
44 : 0.0
45 : 0.0
46 : -0.0027994418416097196
47 : 0.06215812867791044
48 : -0.016726104903662664
49 : -0.0022032502600571723
50 : -0.004326454909784609
51 : -0.011815832355197585
52 : -0.0047217528529360266
53 : -0.00488663462447557
54 : -0.010582927342759113
55 : -0.029540730153719254
56 : -0.02877730602401214
57 : 0.001031991913214121
58 : -0.026066519323038367
59 : -0.047712815275725205
60 : -0.02370123853748422
61 : 0.010833287315693215
62 : 0.002746310627958478
63 : -0.0017904704592523813
64 : -0.00629436974662707
65 : -4.1823951062619137e-10
66 : -2.2371067016485648e-10
67 : 3.9036226903605284e-10
68

### Integrated PDF as function of the training time t

Regularised the eigenvalues of H

In [108]:
regularised_eigvals = np.zeros_like(eigvals)
for i in range(regularised_eigvals.size):
  regularised_eigvals[i] = round_float32(eigvals[i], np.sort(eigvals)[::-1][0])

...and print them after regularisation

Trying to project a toy vector in the eigenspace

In [212]:
vector =  np.random.rand(eigvecs.shape[0]).astype(np.float32)
vector_tilde = [np.dot(vector, eigvecs[:, k]) for k in range(eigvals.size)]

aux = np.zeros_like(eigvecs[:, 0])
for k in range(eigvecs.shape[0]):
  aux += vector_tilde[k] * eigvecs[:,k]

try:
  assert(np.allclose(aux, vector))
except AssertionError:
  print("The projected vector is not the same")

The projected vector is not the same


In [213]:
def extract_independent_columns(matrix):
    """
    Extract a subset of linearly independent columns from the matrix, 
    preserving the original order.
    
    Args:
        matrix (numpy.ndarray): The input square matrix.
        
    Returns:
        numpy.ndarray: A matrix containing only the independent columns.
    """
    independent_columns = []
    current_matrix = np.empty((matrix.shape[0], 0), dtype=matrix.dtype)

    for col_idx in range(matrix.shape[1]):
        candidate_matrix = np.hstack([current_matrix, matrix[:, col_idx:col_idx + 1]])
        if np.linalg.matrix_rank(candidate_matrix) > np.linalg.matrix_rank(current_matrix):
            independent_columns.append(col_idx)
            current_matrix = candidate_matrix

    return matrix[:, independent_columns]

# Example usage
matrix = np.array([[1, 2, 3], 
                   [2, 4, 6], 
                   [3, 6, 9]], dtype=float)

independent_columns = extract_independent_columns(matrix)
print("Matrix with independent columns:\n", independent_columns)


ValueError: zero-size array to reduction operation maximum which has no identity

In [214]:
matrix = H
independent_columns = []
current_matrix = np.empty((matrix.shape[0], 0), dtype=matrix.dtype)

In [220]:
current_matrix.shape

(450, 0)

In [221]:
col_idx = 2
np.allclose(matrix[:, col_idx:col_idx + 1], matrix[:, col_idx:col_idx+1])
test = np.hstack([current_matrix, matrix[:,col_idx]])

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [None]:
for col_idx in range(matrix.shape[1]):
    candidate_matrix = np.hstack([current_matrix, matrix[:, col_idx]])
    if np.linalg.matrix_rank(candidate_matrix) > np.linalg.matrix_rank(current_matrix):
        independent_columns.append(col_idx)
        current_matrix = candidate_matrix