In [7]:
from helpers import load_CSD_data, make_element_wise_environments

In [8]:
#------ load a random subsample of the training set ------

# Dataset files; PATH_TEST is only consumed by the evaluation cell further down.
PATH_TRAIN = "CSD-2k_relaxed_shifts.txt"
PATH_TEST = "CSD-500.txt"

# Disabled: subsampled test-set load, kept for reference.
#structures_test, shifts_test = load_CSD_data(PATH_TEST,random_subsample=100)

# Training structures and their "CS" property, restricted to a random subsample
# (random_subsample=200).
structures_train, shifts_train = load_CSD_data(
    PATH_TRAIN,
    prop_string="CS",
    random_subsample=200,
)

In [9]:
# Sanity check: size of the loaded training targets
# (presumably one entry per atomic environment — TODO confirm).
shifts_train.shape

(18140,)

In [10]:
from rascal.representations import SphericalInvariants

# Keyword arguments for the SphericalInvariants calculator (power-spectrum SOAP).
# Keys, values and insertion order are exactly those the calculator receives.
hypers = dict(
    soap_type="PowerSpectrum",
    interaction_cutoff=3,
    radial_basis="GTO",
    max_radial=9,
    max_angular=9,
    gaussian_sigma_constant=0.3,
    gaussian_sigma_type="Constant",
    cutoff_function_type="ShiftedCosine",
    cutoff_smooth_width=0.5,
    normalize=True,
    compute_gradients=False,
    cutoff_function_parameters={"rate": 1, "scale": 3.5, "exponent": 4},
    # Disabled options, kept for reference:
    #   "optimization": dict(Spline=dict(accuracy=1.0e-05))
    #   "expansion_by_species_method": 'structure wise'
)

In [11]:
# Build the representation calculator from the hyperparameter dictionary above.
calculator = SphericalInvariants(**hypers)

In [17]:
# NOTE(review): repeats the earlier shape check of shifts_train; this cell can
# probably be removed.
shifts_train.shape

(18140,)

In [21]:
import numpy as np

In [22]:
def make_element_wise_environments_DEBUG(calculator,frames,y=None,select=False):
    """Split per-atom representations (and optional targets) by atomic species.

    Debug variant: prints the shape of the boolean mask built for every
    species found in ``frames``.

    Parameters
    ----------
    calculator : rascal.representations calculator object
                 calculator object with hyperparameters; must provide
                 ``transform(frames)`` whose result offers
                 ``get_features(calculator)`` and ``get_representation_info()``

    frames     : list of ase.atoms objects
                 wrapped structures of the dataset

    y          : numpy array of shape (N_environments, ...) or None
                 atomic properties, one row per environment; when None no
                 properties are split

    select     : int, None or False
                 atomic number of the single species to return; ``None`` or
                 ``False`` (the default) returns all species as dictionaries

    Returns
    -------
    X_element_wise : dict or numpy.array
                     dict mapping species label -> representations, or the
                     representations of the selected species only
    y_element_wise : dict, numpy.array or None
                     dict mapping species label -> properties (empty when
                     ``y`` is None), or the properties of the selected species
                     (``None`` when ``y`` is None)

    Raises
    ------
    KeyError
        if ``select`` names a species that does not occur in ``frames``
    """
    atoms_list = calculator.transform(frames)
    X_repr = atoms_list.get_features(calculator)

    # Column 2 of the representation info is used as the per-row species label
    # (presumably the atomic number). Fetch it once instead of once per species.
    species = atoms_list.get_representation_info()[:, 2]
    elements = np.unique(species)

    # The default ``select=False`` means "no selection", exactly like ``None``.
    # The previous ``select is not None`` test sent the default into the
    # single-species branch, where it failed with ``KeyError: False``.
    return_all = select is None or select is False

    if not return_all and select not in elements:
        raise KeyError(
            "atomic number %r not present in frames; available species: %s"
            % (select, elements.tolist())
        )

    X_element_wise = {}
    y_element_wise = {}

    for element in elements:
        ind = species == element
        print(ind.shape)  # debug output: one mask shape per species

        # When a single species is requested, skip slicing (and copying) the
        # feature rows of every other species.
        if not return_all and element != select:
            continue

        if y is not None:
            y_element_wise[element] = y[ind]
        X_element_wise[element] = X_repr[ind]

    if return_all:
        return X_element_wise, y_element_wise

    # ``.get`` yields None when no properties were supplied (y is None).
    return X_element_wise[select], y_element_wise.get(select)

In [23]:
# Representations and shifts of the species with atomic number 6 only.
# NOTE(review): the variables are prefixed "H_" but select=6 is passed —
# confirm whether hydrogen (select=1) or carbon is the intended species.
H_environments_train, H_shifts_train = make_element_wise_environments_DEBUG(calculator,structures_train,shifts_train,select=6)

(18140,)
(18140,)
(18140,)
(18140,)


In [13]:
from rascal.models.kernels import Kernel

In [14]:
# Memory footprint of the training feature matrix: element count times bytes
# per element, reported in units of 1e6 bytes and truncated by "%d".
print("%d megabytes" % (H_environments_train.size/1e06 * H_environments_train.itemsize))

368 megabytes


In [22]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import polynomial_kernel

import numpy as np

# Feature matrix and regression targets of the selected species.
X = H_environments_train
y = H_shifts_train

# Kernel ridge regression with a degree-2 polynomial kernel.
clf = KernelRidge(
    kernel='poly',
    degree=2,
    coef0=1,
    alpha=1e-06,
)
clf.fit(X, y)

KernelRidge(alpha=1e-06, degree=2, kernel='poly')

In [16]:
#eval
structures_test, shifts_test = load_CSD_data(PATH_TEST)
H_environments_test, H_shifts_test = make_element_wise_environments(calculator,structures_test,shifts_test,select=6)

KeyboardInterrupt: 

In [15]:
# NOTE(review): incomplete leftover cell — `sh` is not defined anywhere in the
# visible notebook and this cell raises NameError; remove it or finish the
# expression.
sh

NameError: name 'shifts_test' is not defined

In [24]:
# Predict with the fitted kernel ridge model on the test representations
# (requires H_environments_test from the evaluation cell).
y_predicted = clf.predict(H_environments_test)

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [27]:
# Error of the predictions against the first column of H_shifts_test.
# NOTE(review): this evaluates sqrt(2 * MSE), i.e. sqrt(2) times the plain
# RMSE — confirm that the factor 2 is intended.
# `squared=True` is omitted: it was the default, and the keyword is deprecated
# and later removed in newer scikit-learn releases. The result is unchanged.
np.sqrt(2*mean_squared_error(H_shifts_test[:,0], y_predicted))

11.732554953455425

In [26]:
# Mean absolute error of the predictions against the first column of
# H_shifts_test.
mean_absolute_error(H_shifts_test[:,0],y_predicted)

5.788674852147157

In [39]:
def optimize_hypers(X_train,X_test,y_train,y_test,y_ML_old,alphas=np.logspace(-8, 0, num=25)):
    """Scan the ridge regularisation strength of a degree-2 polynomial KRR model.

    For every value in ``alphas`` a KernelRidge model is fitted on the training
    data and evaluated on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
                      feature matrices used for fitting / predicting
    y_train, y_test : array-like
                      targets used for fitting / for scoring the predictions
    y_ML_old        : array-like
                      reference predictions the new predictions are compared to
    alphas          : iterable of float
                      regularisation strengths to try

    Returns
    -------
    errors    : list of float
                sqrt(2 * MSE) between ``y_test`` and the predictions, one per alpha
    errors_ml : list of float
                sqrt(2 * MSE) between ``y_ML_old`` and the predictions, one per alpha
    alphas    : the ``alphas`` argument, returned unchanged
    """
    errors = []
    errors_ml = []

    for alpha in alphas:
        # kernel='poly' is required for `degree` to take effect: KernelRidge
        # defaults to a linear kernel, which ignores `degree`. The settings
        # match the model fitted earlier in this notebook.
        model = KernelRidge(alpha=alpha, kernel='poly', degree=2, coef0=1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # NOTE(review): sqrt(2 * MSE) is sqrt(2) times the plain RMSE — confirm
        # that the factor 2 is intended.
        # `squared=True` (the default) is omitted because the keyword is
        # deprecated and later removed in newer scikit-learn releases.
        errors.append(np.sqrt(2*mean_squared_error(y_test, y_pred)))
        errors_ml.append(np.sqrt(2*mean_squared_error(y_ML_old, y_pred)))
    return errors, errors_ml, alphas
    
    
    