In [2]:
from ase.io import read
import numpy as np

In [3]:
from rascal.models import Kernel, train_gap_model, compute_KNM, KRR
#from rascal.utils import from_dict, to_dict, CURFilter, dump_obj, load_obj
try:
    from tqdm.notebook import tqdm
except ImportError:
    tqdm = (lambda i, **kwargs: i)


In [2]:
from time import sleep

In [3]:
from tqdm.auto import tqdm, trange

In [1]:
# Progress-bar smoke test. trange(n) is already shorthand for tqdm(range(n)),
# so wrapping it in tqdm() again would draw two nested bars for one loop.
# Requires the `sleep` and `tqdm.auto` import cells to have been run first
# (running this cell before them gives the NameError recorded below).
for i in trange(10):
    sleep(1)

NameError: name 'tqdm' is not defined

In [3]:
# File holding the structures; it is read with ase.io.read(..., format="extxyz") in the cells below.
PATH = "CSD-2k_relaxed_shifts.txt"

In [None]:
def load_CSD_data(PATH):
    """Read all frames of an extended-XYZ file and gather their "CS" arrays.

    Parameters
    ----------
    PATH : str
        Location of the file handed to ``read`` with ``format="extxyz"``.

    Returns
    -------
    tuple
        ``(frames, shifts)``: the list of structures read from the file (each
        one had ``wrap()`` called on it) and the concatenation of every
        structure's ``arrays["CS"]`` in file order.
    """
    # index=":" reads every frame stored in the file.
    frames = read(PATH, format="extxyz", index=":")

    per_frame_shifts = []
    for frame in frames:
        frame.wrap()
        per_frame_shifts.append(frame.arrays["CS"])

    return frames, np.concatenate(per_frame_shifts)

In [4]:
structures = read(PATH,format="extxyz",index=":")
# index=":" reads all frames from the file.
# format="extxyz" makes ASE parse the file as extended XYZ.
# Docs on the extended XYZ format: https://wiki.fysik.dtu.dk/ase/ase/io/formatoptions.html#ase.io.extxyz.read_extxyz

# ------------ extxyz format (from the ASE docs) --------
# The list of properties in the file is described by the Properties parameter,
# which should take the form of a series of colon-separated triplets giving the name,
# format (R for real, I for integer) and number of columns of each property. For example:
# Properties=species:S:1:pos:R:3:CS:R:2 (in the CS-500/2K file)
#
# How to access? The cells below read the per-atom "CS" property via atoms.arrays["CS"].
#
# Additional key-value pairs in the comment line are parsed
# into the ase.Atoms.atoms.info dictionary, with the following conventions...

# wrap() is called on every structure for its in-place effect; its return value is not used.
for atom in structures:
    atom.wrap()

In [5]:
from rascal.models.asemd import ASEMLCalculator
from rascal.utils import from_dict, to_dict, CURFilter, FPSFilter, dump_obj, load_obj, get_score, print_score

In [3]:
from rascal.representations import SphericalInvariants

In [5]:
# Hyper-parameters passed as keyword arguments to SphericalInvariants(**hypers).
hypers = {"soap_type": "PowerSpectrum",
          "interaction_cutoff": 3,
          "radial_basis": "GTO",
          "max_radial": 3,
          "max_angular": 3,
          "gaussian_sigma_constant": 0.3,
          "gaussian_sigma_type":"Constant",
          "cutoff_function_type":"RadialScaling",
          "cutoff_smooth_width": 0.5,
          "normalize": True,
          "compute_gradients":False,
          "cutoff_function_parameters":dict(rate=1,scale=3.5,exponent=4)
          #"expansion_by_species_method":'structure wise'
         }

# max_radial: number of radial basis functions
# max_angular: highest angular momentum number (l) in the expansion

# The number of species-pair blocks grows with the number of elements, e.g.
# H-H
# H-H, O-O, H-O
# H-H, O-O, N-N, H-O, H-N, N-O
# H-H, O-O, N-N, C-C, H-C, N-C, O-C, O-H, O-N, N-H,

# i.e. it scales as N_elements + binomial(N_elements, 2)
# (author's note: the bispectrum should scale as N_elements + binomial(n, 2) + binomial(n, 3))

NameError: name 'shifts' is not defined

In [9]:
# Per-atom "CS" values of the whole dataset, concatenated in structure order.
shifts = np.concatenate([atom.arrays["CS"] for atom in structures])

# Training set size: half of the structures.
N_dataset = len(structures)
N_train = int(0.5*N_dataset)

# Atomic numbers that occur anywhere in the dataset.
global_species = set()
for frame in structures:
    global_species.update(frame.get_atomic_numbers())
global_species = np.array(list(global_species))  # is array([8, 1, 6, 7]) in this set

# Reproducible shuffle of the *structure* indices; the first N_train form the training set.
ids = list(range(N_dataset))
np.random.seed(10)
np.random.shuffle(ids)

train_ids = ids[:N_train]
frames_train = [structures[ii] for ii in train_ids]

# Targets of the training set. `shifts` has one entry per atom while `train_ids`
# indexes structures, so indexing `shifts` with those ids would pick unrelated
# atoms. The targets are therefore rebuilt from the selected frames, which keeps
# them aligned with the per-atom environments computed from `frames_train`.
y_train_shifts = np.concatenate([atom.arrays["CS"] for atom in frames_train])
y_train = y_train_shifts  # same array, kept under both names used in this notebook

In [21]:
# Restrict the first structure to the atoms whose chemical symbol is "H" (boolean-mask indexing).
test_H = structures[0][structures[0].symbols == "H"]

In [22]:
test_H.arrays["CS"]

array([15.89, 15.89, 15.89, 15.89, 16.43, 16.43, 16.43, 16.43, 24.43,
       24.43, 24.43, 24.43, 23.86, 23.86, 23.86, 23.86, 23.95, 23.95,
       23.95, 23.95, 24.63, 24.63, 24.63, 24.63, 23.64, 23.64, 23.64,
       23.64, 23.29, 23.29, 23.29, 23.29, 24.35, 24.35, 24.35, 24.35,
       24.07, 24.07, 24.07, 24.07, 29.78, 29.78, 29.78, 29.78, 29.21,
       29.21, 29.21, 29.21, 29.19, 29.19, 29.19, 29.19, 29.55, 29.55,
       29.55, 29.55, 29.71, 29.71, 29.71, 29.71, 29.29, 29.29, 29.29,
       29.29, 30.81, 30.81, 30.81, 30.81, 30.67, 30.67, 30.67, 30.67,
       30.38, 30.38, 30.38, 30.38])

In [31]:
hel

KeyboardInterrupt: 

In [7]:
# Build the representation calculator from the `hypers` dictionary defined above.
soaps = SphericalInvariants(**hypers)
# Earlier experiments, kept for reference (not executed):
#managers = soaps.transform(frames_train)
#soaps.transform(frames_train).get_subset()

In [8]:
type(soaps)

rascal.representations.spherical_invariants.SphericalInvariants

In [67]:
managers.get_features()

TypeError: get_features() missing 1 required positional argument: 'calculator'

In [66]:
    managers.managers

<rascal.lib._rascal.neighbour_list.ManagerCollection_Strict_CenterContribution_NeighbourList_Centers at 0x7faa32c85260>

In [55]:
type(managers)

rascal.neighbourlist.structure_manager.AtomsList

In [11]:
from rascal.neighbourlist import structure_manager

In [52]:
y_train_shifts.shape

(94247,)

In [56]:
# get_representation_info() has as many rows as there are representations; the
# columns are the index of the structure, the index of the central atom and the
# atomic species of that atom. Comparing column 2 with 6 therefore yields a
# boolean mask that is True for the rows whose central atom has species 6.
ind = managers.get_representation_info()[:,2] == 6 #!!!

In [12]:
def make_element_wise_environments(calculator,frames,y=None):
    """Split per-environment features (and optional targets) by central-atom species.

    Parameters
    ----------
    calculator : object
        Representation calculator. It must provide ``transform(frames)`` whose
        result offers ``get_features(calculator)`` and
        ``get_representation_info()``.
    frames : sequence
        Structures handed to ``calculator.transform``.
    y : array-like, optional
        One target per environment, in the same order as the feature rows.
        Lists are accepted; they are converted with ``np.asarray``.

    Returns
    -------
    X_element_wise : dict
        Maps each species found in column 2 of ``get_representation_info()``
        to the feature rows whose central atom has that species.
    y_element_wise : dict
        The same split applied to ``y``; empty when ``y`` is None.
    """
    atoms_list = calculator.transform(frames)
    X_repr = atoms_list.get_features(calculator)

    # Column 2 of the info table is the species of the central atom. It is
    # fetched once here instead of once per element inside the loop.
    species = atoms_list.get_representation_info()[:, 2]

    # Boolean-mask indexing needs an array; a plain list would raise TypeError.
    targets = None if y is None else np.asarray(y)

    X_element_wise = {}
    y_element_wise = {}
    for element in np.unique(species):
        mask = species == element
        X_element_wise[element] = X_repr[mask]
        if targets is not None:
            y_element_wise[element] = targets[mask]

    return X_element_wise, y_element_wise

In [13]:
out = make_element_wise_environments(soaps,frames_train)

In [14]:
out[0][1].shape

(38407, 360)

In [15]:
make_element_wise_environments(managers)

TypeError: make_element_wise_environments() missing 1 required positional argument: 'frames'

In [57]:
ind.shape

(94247,)

In [37]:
help(structure_manager)

Help on module rascal.neighbourlist.structure_manager in rascal.neighbourlist:

NAME
    rascal.neighbourlist.structure_manager

CLASSES
    builtins.object
        AtomsList
    
    class AtomsList(builtins.object)
     |  AtomsList(frames, nl_options, start=None, length=None, managers=None)
     |  
     |  A container for the neighbourlist and representation data associated with a list of atomic structures.
     |  
     |  This is a wrapper class for the `StructureManagerCollection` that have between precompiled on the C++ side.
     |  
     |  Attributes
     |  ----------
     |  nl_options : dict
     |      Parameters for each layer of the wrapped structure manager. Parameters
     |      can be specified for these layers: center, neighbourlist and strict.
     |  managers : StructureManagerCollection
     |      C++ object from rascal that holds the neighbourlist and the data associated with representations.
     |  
     |  Methods defined here:
     |  
     |  __getitem__

In [36]:
managers.get_representation_info()

array([[    0,     0,     6],
       [    0,     1,     6],
       [    0,     2,     6],
       ...,
       [  999, 94244,     1],
       [  999, 94245,     1],
       [  999, 94246,     1]], dtype=int32)

In [17]:
X_soaps.keys()

dict_keys([(1, 1), (1, 6), (1, 7), (1, 8), (6, 6), (6, 7), (6, 8), (7, 7), (7, 8), (8, 8)])

In [13]:
# Run the calculator over the training frames; `managers` is the AtomsList
# container that the get_features / get_representation_info cells operate on.
managers = soaps.transform(frames_train)

In [69]:
help(KRR)

Help on class KRR in module rascal.models.krr:

class KRR(rascal.utils.io.BaseIO)
 |  KRR(weights, kernel, X_train, self_contributions, description='KRR potential model', units=None)
 |  
 |  Kernel Ridge Regression model. Only supports sparse GPR
 |  training for the moment.
 |  
 |  Parameters
 |  ----------
 |  weights : np.array
 |      weights of the model
 |  
 |  kernel : Kernel
 |      kernel class used to train the model
 |  
 |  X_train : SparsePoints
 |      reference samples used for the training
 |  
 |  self_contributions : dictionary
 |      map atomic number to the property baseline, e.g. isolated atoms
 |      energies when the model has been trained on total energies.
 |  
 |  description : string
 |      User-defined string used to describe the model for future reference
 |  
 |  units : dict
 |      Energy and length units used by the model (default: eV and Å (aka AA),
 |      same as used in ASE)
 |  
 |  Method resolution order:
 |      KRR
 |      rascal.utils.io

In [30]:
managers.get_features(soaps).shape

(94247, 360)

In [14]:
# Number of sparse points requested per atomic number (keys 1, 6, 7, 8).
n_sparse = {1:100, 6:100, 7:100, 8:100}
# Select the sparse points with FPSFilter, acting on samples per species, and
# keep the selected environments of `managers` as X_sparse.
compressor = FPSFilter(soaps, n_sparse, act_on='sample per species')
X_sparse = compressor.select_and_filter(managers)

In [47]:
compressor.selected_ids

[[32, 0, 64, 80],
 [],
 [],
 [],
 [],
 [],
 [7],
 [109, 45, 78, 1],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [77, 9],
 [],
 [],
 [9],
 [],
 [],
 [26],
 [],
 [25],
 [],
 [],
 [],
 [],
 [2],
 [],
 [],
 [36],
 [66],
 [6],
 [38],
 [],
 [],
 [],
 [132],
 [],
 [],
 [],
 [7, 20],
 [],
 [],
 [66],
 [],
 [],
 [],
 [],
 [70, 20],
 [62],
 [3],
 [35],
 [],
 [65],
 [],
 [],
 [],
 [],
 [],
 [],
 [28],
 [],
 [],
 [37],
 [],
 [],
 [],
 [22],
 [],
 [],
 [],
 [],
 [],
 [91, 19],
 [],
 [57],
 [],
 [],
 [],
 [],
 [],
 [],
 [124],
 [],
 [],
 [],
 [18],
 [],
 [],
 [14],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [29],
 [13],
 [46],
 [],
 [],
 [],
 [],
 [22, 15],
 [84],
 [],
 [],
 [],
 [],
 [],
 [34],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [57],
 [],
 [],
 [],
 [],
 [],
 [0],
 [17],
 [],
 [49, 8, 20],
 [],
 [57],
 [],
 [62],
 [25],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [28],
 [3],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [128],
 [],
 [],
 [],
 [],
 [],
 [],
 [127],
 [16],
 []

In [1]:
import numpy as np
# Size of a dense 15000 x 15000 float64 matrix (the default dtype of np.zeros),
# computed from the shape alone so that no ~1.8 GB array has to be allocated.
# NOTE(review): the scratch array `n` is no longer created; no visible cell uses it.
matrix_shape = (15000, 15000)
n_bytes = matrix_shape[0] * matrix_shape[1] * np.dtype(np.float64).itemsize
print("%d megabytes" % (n_bytes / 1e06))

1800 megabytes


In [21]:
# NOTE(review): this mutates the shared `hypers` dict in place, so every later
# use of `hypers` sees compute_gradients=True, and `soaps` is rebound to a new
# calculator; cells run before and after this one work with different objects.
hypers['compute_gradients'] = True
soaps = SphericalInvariants(**hypers)

In [22]:
# zeta is the exponent of the kernel.
zeta = 2
# Sparse 'GAP' kernel on top of the `soaps` calculator, with per-atom targets.
kernel = Kernel(soaps, name='GAP', zeta=zeta, target_type='Atom', kernel_type='Sparse')

In [48]:
X_feat = X_sparse.get_features()

In [49]:
X_feat.shape

(400, 360)

In [52]:
# Hand-built kernels: element-wise square of feature dot products.
# KNM compares X_soaps with the sparse-point features X_feat; KMM compares the
# sparse-point features with each other.
# NOTE(review): X_soaps is not assigned in any visible cell and must be a 2-D
# feature array here (presumably managers.get_features(soaps)) — confirm.
KNM = (X_soaps @ X_feat.T)**2
KMM = (X_feat @ X_feat.T)**2

In [18]:
def GPR(X_N,X_M,y_train,sigma=1e-04,zeta=2):
    """Fit the weights of a sparse kernel regression with a polynomial kernel.

    The kernels are element-wise powers of feature dot products,
    ``K_NM = (X_N @ X_M.T) ** zeta`` and ``K_MM = (X_M @ X_M.T) ** zeta``.
    The returned weights are the least-squares solution ``c`` of
    ``(K_NM.T @ K_NM + sigma**2 * K_MM) @ c = K_NM.T @ y_train``.

    Parameters
    ----------
    X_N : np.ndarray
        Feature matrix of the training samples, one row per sample.
    X_M : np.ndarray
        Feature matrix of the sparse (reference) points, one row per point.
    y_train : np.ndarray
        Targets, one per row of ``X_N``.
    sigma : float, optional
        Regularisation strength; it enters the equations as ``sigma**2``.
    zeta : int or float, optional
        Exponent applied element-wise to the dot products. The default of 2
        reproduces the previously hard-coded kernel.

    Returns
    -------
    np.ndarray
        Weights, one per row of ``X_M``.
    """
    K_NM = (X_N @ X_M.T) ** zeta
    K_MM = (X_M @ X_M.T) ** zeta

    lhs = K_NM.T @ K_NM + sigma ** 2 * K_MM
    rhs = K_NM.T @ y_train

    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution is needed.
    weights = np.linalg.lstsq(lhs, rhs, rcond=None)[0]
    return weights

In [19]:
def evaluate_GPR(c,X_test,X_M,zeta=2):
    """Predict targets from fitted weights and a polynomial kernel.

    Parameters
    ----------
    c : np.ndarray
        Weights, one per row of ``X_M``.
    X_test : np.ndarray
        Feature matrix of the samples to predict, one row per sample.
    X_M : np.ndarray
        Feature matrix of the sparse (reference) points the weights belong to.
    zeta : int or float, optional
        Exponent applied element-wise to the dot products. It has to match the
        exponent the weights were fitted with; the default of 2 reproduces the
        previously hard-coded kernel.

    Returns
    -------
    np.ndarray
        ``((X_test @ X_M.T) ** zeta) @ c``, one prediction per row of ``X_test``.
    """
    K_TM = (X_test @ X_M.T) ** zeta
    return K_TM @ c

In [53]:
KMM.shape

(400, 400)

In [51]:
KNM.shape

(94247, 400)

In [37]:
KNM = compute_KNM(frames_train, X_sparse, kernel, soaps)


ValueError: could not broadcast input array from shape (96,400) into shape (400,)

In [47]:
help(CURFilter)

Help on class CURFilter in module rascal.utils.filter:

class CURFilter(Filter)
 |  CURFilter(representation, Nselect, act_on='sample per species', selector_args={}, **kwargs)
 |  
 |  A super class for filtering representations based upon a standard
 |  sample or feature selection class.
 |  
 |  This is mainly a wrapper around selectors (implemented e.g. in
 |  scikit-cosmo) that handles the semantic-index transformations
 |  required after selection.
 |  
 |  Parameters
 |  ----------
 |  
 |  representation : Calculator
 |      Representation calculator associated with the kernel
 |  
 |  Nselect: int
 |      number of points to select. If act_on='sample per species' then it should
 |      be a dictionary mapping atom type to the number of samples, e.g.
 |      Nselect = {1:200,6:100,8:50}.
 |  
 |  selector: selector to use for filtering. The selector should
 |          have a `fit` function, which when called will select from the input
 |          matrix the desired features / sa

(188100,)

In [None]:
def get_feature_indices():
    """Placeholder for a helper that was never written.

    The original cell contained only the ``def`` line, which is a SyntaxError.
    This stub keeps the cell runnable and fails loudly if the function is called.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("get_feature_indices() has not been implemented yet")
















































In [9]:
class DataContainer:
    """Hold the structures, targets and features of one dataset.

    Parameters
    ----------
    DATAPATH : str, optional
        Extended-XYZ file to load on construction. Nothing is loaded when it
        is omitted (or otherwise falsy).
    hypers : dict, optional
        Keyword arguments for ``SphericalInvariants``. When both ``DATAPATH``
        and ``hypers`` are given, the features are computed on construction.
    """

    def __init__(self, DATAPATH=None, hypers=None):
        # Y: targets, X: features, atoms: structures. All stay None until loaded/built.
        self.Y = None
        self.X = None
        self.atoms = None
        self.hypers = hypers

        if DATAPATH:
            self.load_dataset(DATAPATH)
            if hypers:
                self.build_representations(hypers)

    def load_dataset(self,DATAPATH):
        """Read all frames from ``DATAPATH`` and extract the targets.

        Sets ``self.atoms`` to the structures (each had ``wrap()`` called on
        it) and ``self.Y`` to one value per atom taken from the "CS" arrays.
        """
        atoms = read(DATAPATH,format="extxyz",index=":")

        for atom in atoms:
            atom.wrap()

        self.atoms = atoms

        # Author's note, from the SI: the file holds Cartesian coordinates, the
        # GIPAW-calculated isotropic chemical shielding and the ShiftML-calculated
        # isotropic chemical shielding; the GIPAW values are always the first column.
        shifts = np.concatenate([atom.arrays["CS"] for atom in atoms])

        # Files with a single "CS" column give a 1-D array, for which
        # shifts[:, 0] would raise IndexError; use the values as they are.
        self.Y = shifts if shifts.ndim == 1 else shifts[:, 0]

    def build_representations(self,hypers):
        """Compute the features of the loaded structures.

        Stores ``hypers`` on the instance and sets ``self.X`` to the feature
        matrix returned by the ``SphericalInvariants`` calculator.

        Raises
        ------
        ValueError
            If no structures have been loaded yet.
        """
        if self.atoms is None:
            raise ValueError(
                "no structures loaded: call load_dataset() before build_representations()"
            )

        self.hypers = hypers
        soap = SphericalInvariants(**hypers)
        # The original read `self.stoms`, a typo that always raised AttributeError.
        self.X = soap.transform(self.atoms).get_features(soap)
    

In [33]:
# NOTE(review): `soap` (singular) is not assigned at top level in any visible
# cell — other cells use `soaps`; confirm which calculator is meant.
kernel = Kernel(soap,name="GAP",kernel_type="Sparse",target_type="Atom",zeta=2,grad=(False,False)) # zeta is the exponent of the kernel

# Known issue when zeta is not given: the name is then not accepted, see https://github.com/cosmo-epfl/librascal/issues/330

In [49]:
kernel.shape

AttributeError: 'Kernel' object has no attribute 'shape'

In [24]:
# Number of sparse points requested per atomic number (keys 1, 6, 7, 8).
n_sparse = {1:100,6:100,7:100,8:100}
# Same selection as the FPSFilter cell, but with CURFilter.
# NOTE(review): `soap` (singular) is not assigned at top level in any visible
# cell — other cells use `soaps`; confirm which calculator is meant.
compressor = CURFilter(soap, n_sparse, act_on="sample per species")
X_sparse = compressor.select_and_filter(managers)

In [38]:
KNM = kernel(managers, X_sparse)


In [41]:
X_sparse.size

<bound method SparsePoints.size of <rascal.models.sparse_points.SparsePoints object at 0x7f95bf3b1a90>>

In [44]:
data = DataContainer("CSD-500.txt")

In [None]:
model = KRR()

In [47]:
# NOTE(review): `this` is not assigned in any visible cell, and the slice bounds
# 450 / 58048 are hard-coded; presumably 58048 is the number of atoms in the
# first 450 structures — confirm before re-running.
# The recorded run of this cell ended in
# "LinAlgError: SVD did not converge in Linear Least Squares".
model = train_gap_model(kernel, this[:450], KNM, X_sparse, data.Y[:58048], self_contributions={1:0,6:0,7:0,8:0}, lambdas=[0.1, 0.01], jitter=1e-13)

  KNM[:n_centers] /= lambdas[0] / delta * np.sqrt(Natoms)[:, None]
  KNM[:n_centers] /= lambdas[0] / delta * np.sqrt(Natoms)[:, None]
  Y /= lambdas[0] / delta * np.sqrt(Natoms)[:, None]
  Y /= lambdas[0] / delta * np.sqrt(Natoms)[:, None]


LinAlgError: SVD did not converge in Linear Least Squares

[Atoms(symbols='C88H96O8', pbc=True, cell=[[8.43116035, 0.0, 0.0], [0.158219155128, 14.5042431863, 0.0], [1.16980663624, 4.4685149855, 14.9100096405]], CS=...),
 Atoms(symbols='C26H28O8', pbc=True, cell=[[6.265332446, 0.0, 0.0], [1.69572475717, 7.62373170428, 0.0], [0.677006762785, 0.474085175524, 12.0657897552]], CS=...),
 Atoms(symbols='C24H56N8O20', pbc=True, cell=[[11.983069428, 0.0, 0.0], [0.0, 6.48024023448, 0.0], [-1.34408098239, 0.0, 11.1755303792]], CS=...),
 Atoms(symbols='C40H88N8O32', pbc=True, cell=[[6.810229092, 0.0, 0.0], [0.0, 19.6040820995, 0.0], [-2.93884540099, 0.0, 11.3874385466]], CS=...),
 Atoms(symbols='C56H76N4O12', pbc=True, cell=[[12.0044483, 0.0, 0.0], [0.0, 9.16633262402, 0.0], [-0.572168019323, 0.0, 12.0746022959]], CS=...),
 Atoms(symbols='C84H76N12O12', pbc=True, cell=[[8.80105717, 0.0, 0.0], [0.0, 20.9461376191, 0.0], [-1.37174157157, 0.0, 9.87928348495]], CS=...),
 Atoms(symbols='C48H48N8O12', pbc=True, cell=[[7.408043738, 0.0, 0.0], [0.0, 10.8920615241

In [50]:
help(train_gap_model)

Help on function train_gap_model in module rascal.models.krr:

train_gap_model(kernel, frames, KNM_, X_sparse, y_train, self_contributions, grad_train=None, lambdas=None, jitter=1e-08)
        Defines the procedure to train a SOAP-GAP model [1]:
        .. math::
            Y(A) = \sum_{i \in A} y_{a_i}(X_i),
        where :math:`Y(A)` is the predicted property function associated with the
        atomic structure :math:`A$, :math:`i` and :math:`a_i` are the index and
        species of the atoms in structure :math:`X` and :math:`y_a(A_i)` is the
        atom centered model that depends on the central atomic species.
        The individual predictions are given by:
        .. math::
            y_{a_i(A_i) = \sum_m^{M} lpha_m \delta_{b_m a_i} k(A_i,T_m),
        where :math:`k(\cdot,\cdot)` is a kernel function, :math:`lpha_m` are the
        weights of the model and :math:`b_m is the atom type associated with the
        sparse point :math:`T_m`.
        Hence a kernel element for 