In [1]:
from rascal.neighbourlist.structure_manager import mask_center_atoms_by_species
from ase.io import read
from rascal.representations import SphericalInvariants
from rascal.utils import get_optimal_radial_basis_hypers
import numpy as np
from helpers import *

In [2]:
PATH_TRAIN = "./make_tensor_data/train_tensor/CSD-3k+S546_shift_tensors.xyz"

structures_train = read(PATH_TRAIN,format="extxyz",index=":")

for structure in structures_train: mask_center_atoms_by_species(structure,species_select=[1])
for structure in structures_train: structure.wrap(eps=1e-12)

structures_train = filter_by_status(structures_train,status="PASSING")

In [27]:
PATH_TEST = "./make_tensor_data/test_tensor/CSD-500+104-7_shift_tensors.xyz"

structures_test = read(PATH_TEST,format="extxyz",index=":")

for structure in structures_test: mask_center_atoms_by_species(structure,species_select=[1])
for structure in structures_test: structure.wrap(eps=1e-12)

In [28]:
shifts_test = np.array([tensor for structure in structures_test for tensor in structure.arrays["cs_iso"][structure.arrays["center_atoms_mask"]]])

In [3]:
shifts = train_tensors = np.array([tensor for structure in structures_train for tensor in structure.arrays["cs_iso"][structure.arrays["center_atoms_mask"]]])

In [29]:
shifts_test.shape

(35289,)

In [5]:
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=4.5,
              max_radial=8,
              max_angular=5,
              gaussian_sigma_constant=0.3,
              gaussian_sigma_type="Constant",
              cutoff_function_type="RadialScaling",
              cutoff_smooth_width=0.5,
              cutoff_function_parameters=
                    dict(
                            rate=1,
                            scale=3.0,
                            exponent=6
                        ),
              radial_basis="GTO",
              normalize=True,
              optimization=
                    dict(
                            Spline=dict(
                               accuracy=1.0e-05
                            )
                        ),
              compute_gradients=False
              )

hypers = get_optimal_radial_basis_hypers(hypers, structures_train, expanded_max_radial=20)

In [6]:
calculator = SphericalInvariants(**hypers)

In [7]:
X_train = calculator.transform(structures_train).get_features(calculator)

In [30]:
X_test = calculator.transform(structures_test).get_features(calculator)

In [31]:
X_test.shape

(35289, 5760)

In [18]:
from skcosmo.sample_selection import FPS

In [19]:
selector = FPS(
                    n_to_select=5000,
                    progress_bar=True,
                    score_threshold=1e-12,
                    full=False,

                    # int or 'random', default=0
                    # Index of the first selection.
                    # If ‘random’, picks a random value when fit starts.
                    initialize = 0,
                    )
selector.fit(X_train)

  0%|          | 0/4999 [00:00<?, ?it/s]

FPS()

In [22]:
from sklearn.kernel_approximation import Nystroem

In [23]:
feature_map_nystroem = Nystroem(kernel="linear",
                                n_components=5000)

In [9]:
selected_idx_ = np.load("selected_sample_ids_H_n8_l5_PASSING.npy")

In [10]:
KNM = X_train @ X_train[selected_idx_].T

In [12]:
from sklearn.linear_model import RidgeCV

In [13]:
clf = RidgeCV(alphas=np.logspace(-8,3,15)).fit(KNM, shifts)


In [32]:
KTM = (X_test @ X_train[selected_idx_].T)

In [35]:
y_pred = clf.predict(KTM)

In [34]:
from sklearn.metrics import mean_squared_error

In [39]:
mean_squared_error(y_pred,shifts_test,squared=False)

0.5901255807888104

In [26]:
X_train[selector.selected_idx_].shape

(5000, 5760)

In [21]:
np.save("selected_sample_ids_H_n8_l5_PASSING",selector.selected_idx_)

In [16]:
def build_structure_dict(PATH):
    """Builds a dictionary from an extended xyz file 
    containing key: CSD-NAME value: atoms object pairs
    """
    extxyz = read(PATH,format="extxyz",index=':')
    status_dict = {atom.info["NAME"]: atom for atom in extxyz}
    return status_dict

MemoryError: Unable to allocate 135. GiB for an array with shape (134692, 134692) and data type float64

In [120]:
train_structures[0].info

NameError: name 'train_structures' is not defined

In [116]:
train_struct_nice = train_structures[:100]

NameError: name 'train_structures' is not defined

In [37]:
y_pred

array([26.23856941, 26.23859742, 26.23857163, ..., 23.92599114,
       23.92454931, 23.92598499])

In [38]:
shifts_test

array([25.97, 25.97, 25.97, ..., 23.13, 23.13, 23.13])

In [40]:
import tqdm
from nice.blocks import *
from nice.utilities import *

In [117]:
indices = np.arange(len(structures_train))
indices = np.random.permutation(indices)
indices = indices[:200]

In [118]:
train_struct_nice = [structures_train[ind] for ind in indices]

In [119]:
structures_train 

[Atoms(symbols='N4O4C20H20', pbc=True, cell=[13.564047596, 5.79502231063, 5.60401339238], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C18H36N8O8', pbc=True, cell=[[7.131018008, 0.0, 0.0], [-3.35710500272, 9.26783468512, 0.0], [-2.0474364974, -1.27908356908, 9.61766816655]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C28H16N32O32', pbc=True, cell=[[12.722757232, 0.0, 0.0], [0.0, 11.8045558426, 0.0], [-0.879600543992, 0.0, 8.09690265277]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C64H118N14O4', pbc=True, cell=[[6.71344207, 0.0, 0.0], [-0.499359248051, 7.71360395615, 0.0], [-1.05722614374, -2.25817365532, 31.890877292]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C32H32N16O16', pbc=True, cell=[[13.24907966, 0.0, 0.0], [0.0, 11.1910736183, 0.0], [-0.519960131257, 0.0, 6.25747407802]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C40H28O18', pbc=True, cell=[[12.009052166, 0

In [60]:
len(train_struct_nice)

200

In [99]:
def get_nice():
    return StandardSequence([
        StandardBlock(ThresholdExpansioner(num_expand=150),
                      CovariantsPurifierBoth(max_take=10),
                      IndividualLambdaPCAsBoth(n_components=50),
                      ThresholdExpansioner(num_expand=300, mode='invariants'),
                      InvariantsPurifier(max_take=50),
                      InvariantsPCA(n_components=200)),
        ],
                            initial_scaler=InitialScaler(
                                mode='signal integral', individually=True))

In [121]:
all_species = get_all_species(train_struct_nice)

In [122]:
all_species

array([ 1,  6,  7,  8, 16])

In [134]:
hypers_nice = dict(
              interaction_cutoff=4.5,
              max_radial=12,
              max_angular=9,
              gaussian_sigma_constant=0.3,
              gaussian_sigma_type="Constant",
              cutoff_function_type="RadialScaling",
              cutoff_smooth_width=0.5,
              cutoff_function_parameters=
                    dict(
                            rate=1,
                            scale=3.0,
                            exponent=6
                        ),
              radial_basis="GTO",
              optimization=
                    dict(
                            Spline=dict(
                               accuracy=1.0e-05
                            )
                        ),
              compute_gradients=False
              )


train_coefficients = get_spherical_expansion(train_struct_nice, hypers_nice,
                                             all_species)

100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
100%|██████████| 5/5 [00:00<00:00, 25.58it/s]


In [135]:
train_coefficients[1]

array([[[[ 1.61886645e-002,  0.00000000e+000,  0.00000000e+000, ...,
           0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
         [-1.38337269e-005, -4.64445284e-005, -7.06225793e-005, ...,
           0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
         [ 8.96138692e-005, -1.32753929e-004, -3.89724305e-005, ...,
           0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
         ...,
         [ 1.25849890e-006,  2.34016083e-005,  3.49897854e-005, ...,
           0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
         [-1.78366335e-005,  2.77922478e-005, -5.03899564e-006, ...,
          -1.13631022e-006,  0.00000000e+000,  0.00000000e+000],
         [-3.60204132e-006,  4.04167391e-006, -2.45717198e-005, ...,
           2.97940450e-005,  8.91311757e-007, -1.11719426e-006]],

        [[ 6.29108318e-003,  0.00000000e+000,  0.00000000e+000, ...,
           0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
         [ 6.46501014e-006,  6.58577446e-005, 

In [136]:
nice = {}
nice[1] = get_nice()

In [137]:
train_coefficients[1].shape

(7435, 60, 10, 19)

In [140]:
nice[1].fit(train_coefficients[1][:1000])



In [139]:
train_features = {}

train_features[1] = nice[1].transform(
        train_coefficients[1], return_only_invariants=True)

  result = 1.0 / np.sqrt(np.sum(coefficients[:, :, 0, 0]**2, axis=1))
  return Data(even_coefficients,
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  even_coefficients_sizes), Data(odd_coefficients,
  return Data(new_even,
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  new_even_actual_sizes), Data(new_odd,
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [129]:
train_coefficients.keys()

dict_keys([1, 6, 7, 8, 16])

In [133]:
train_features[1][2].shape

(7435, 200)