In [1]:
from loader.loader import load_data
from sklearn.linear_model import Ridge, RidgeCV
from feature_utils.parallel import get_features_in_parallel, get_optimal_radial_basis_hypers_parallel
from rascal.representations import SphericalInvariants as SOAP
from sklearn.metrics import mean_squared_error, mean_absolute_error
from skcosmo.preprocessing import StandardFlexibleScaler
from sklearn.compose import TransformedTargetRegressor
from skcosmo.model_selection import atom_groups_by_frame
from sklearn.model_selection import GroupKFold
import numpy as np
import ase.io
from tqdm import tqdm, trange
from nice.blocks import *
from nice.utilities import *
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from joblib import dump
from helpers.helpers import return_relative_inds

In [2]:
from rascal.neighbourlist.structure_manager import mask_center_atoms_by_id

In [3]:
from rascal.utils import get_optimal_radial_basis_hypers

In [4]:
train_structures, test_structures, train_properties, test_properties = load_data("../make_tensor_data/train_tensor/CSD-3k+S546_shift_tensors.xyz",\
                                                                                    "../make_tensor_data/test_tensor/CSD-500+104-7_shift_tensors.xyz")

In [5]:
def transform_sequentially_modified(nice,
                           structures,
                           rascal_hypers,
                           all_species,
                           fit_species,
                           block_size=500,
                           show_progress=True):
    ''' transforming structures into structural features by chunks in order to use less amount of RAM
    
    Args:
        nice: dictionary where keys are species and entries are nice transformers.\
        If you want to use single nice transformer to all environments regardless of central\
        specie just pass {key : nice_single for specie in all_species}
        structures: list of Ase atoms objects
        rascal_hypers: dictionary with parameters for librascal controlling spherical expansion.\
        Should be the same as used for fitting nice transformers
        all_species: numpy array with ints of all unique species in the dataset. 
        block_size: size of chunks measured in number of environments
        show_progress: whether or not show progress via tqdm
        
        
    Return:
        numpy array with shape [len(structures), number of structural features] with structural features
    '''

    pieces = []

    for i in tqdm.tqdm(range(0, len(structures), block_size),
                       disable=not show_progress):
        now = {}
        coefficients = get_spherical_expansion(structures[i:i + block_size],
                                               rascal_hypers,
                                               all_species,
                                               show_progress=False)
        for specie in fit_species:
            if (coefficients[specie].shape[0] != 0):
                now[specie] = nice[specie].transform(
                    coefficients[specie], return_only_invariants=True)
            else:
                # determining size of output
                dummy_shape = coefficients[specie].shape
                dummy_shape = list(dummy_shape)
                dummy_shape[0] = 1
                dummy_data = np.ones(dummy_shape)
                dummy_output = nice[specie].transform(
                    dummy_data, return_only_invariants=True)
                current_block = {}
                for key in dummy_output.keys():
                    current_block[key] = np.zeros(
                        [0, dummy_output[key].shape[1]])
                now[specie] = current_block

        pieces.append(now)

    max_v = len(pieces[0][int(fit_species[0])])

    new_feat = {}
    for specie in fit_species:
        new_feat[int(specie)] = {val: [] for val in range(1,max_v+1)}

    for specie in fit_species:
        for block in pieces:
            for v, feat in block[int(specie)].items():
                new_feat[int(specie)][v].append(feat)

        for key_v in new_feat[int(specie)].keys():
            new_feat[int(specie)][key_v] = np.concatenate(new_feat[int(specie)][key_v])
        
        
    return new_feat

In [6]:
HYPERS = {
'interaction_cutoff': 6.3,
'max_radial': 5,
'max_angular': 5,
'gaussian_sigma_type': 'Constant',
'gaussian_sigma_constant': 0.05,
'cutoff_smooth_width': 0.3,
"cutoff_function_type":"RadialScaling",
 'cutoff_function_parameters': dict(rate=1.,
                    scale= 2.0,
                    exponent=3.
                        ),
'radial_basis': 'GTO'
}

def get_transformer():
    return StandardSequence([
        StandardBlock(ThresholdExpansioner(num_expand=150),
                      CovariantsPurifierBoth(max_take=10),
                      IndividualLambdaPCAsBoth(n_components=50),
                      ThresholdExpansioner(num_expand=300, mode='invariants'),
                      InvariantsPurifier(max_take=50),
                      InvariantsPCA(n_components=200)),
        StandardBlock(ThresholdExpansioner(num_expand=150),
                      CovariantsPurifierBoth(max_take=10),
                      IndividualLambdaPCAsBoth(n_components=50),
                      ThresholdExpansioner(num_expand=300, mode='invariants'),
                      InvariantsPurifier(max_take=50),
                      InvariantsPCA(n_components=200)),
        StandardBlock(None, None, None,
                      ThresholdExpansioner(num_expand=300, mode='invariants'),
                      InvariantsPurifier(max_take=50),
                      InvariantsPCA(n_components=200))
    ],
                            initial_scaler=InitialScaler(
                                mode='signal integral', individually=True))

In [7]:
all_species = get_all_species(train_structures + test_structures)
fit_species = [1,6,7,8]

fit_ind = {}
for specie in [1,6,7,8]:
    fit_ind[specie] = np.load("../PCov-FPS-sample-ids/PCOV_FPS_selected_sample_ids_{}_selected_20000_n8_l8_PASSING.npy".format(specie))
    
    
train_coefficients = get_spherical_expansion(train_structures, HYPERS, all_species)
test_coefficients = get_spherical_expansion(test_structures, HYPERS, all_species)


transformers = {}
for key in train_coefficients.keys():
    transformers[key] = get_transformer()

print(transformers.keys())

for specie in [1,6,7,8]:
        ind = fit_ind[specie]
        transformers[specie].fit(train_coefficients[specie][ind])
        print("fitted")
        
train_features = {}
for specie in [1,6,7,8]:
    train_features[specie] = transformers[specie].transform(train_coefficients[specie], return_only_invariants=True)

test_features = {}
for specie in [1,6,7,8]:
    test_features[specie] = transformers[specie].transform(test_coefficients[specie],
                                                   return_only_invariants=True)

100%|███████████████████████████████████████████| 35/35 [00:12<00:00,  2.79it/s]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  6.60it/s]
100%|█████████████████████████████████████████████| 6/6 [00:03<00:00,  1.51it/s]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 27.80it/s]


dict_keys([1, 6, 7, 8, 16])
fitted
fitted
fitted
fitted


In [8]:
this = transform_sequentially_modified({1:transformers[1], 6:transformers[6]},test_structures,HYPERS,np.array([1,6,7,8,16]),np.array([1, 6]),block_size=50)

100%|███████████████████████████████████████████| 12/12 [00:52<00:00,  4.39s/it]


In [13]:
this[1][].shape

(31446, 200)

In [16]:
new_this = this.copy()

In [20]:
[print(a) for i in range(1,6) for a in range(i) ]

0
0
1
0
1
2
0
1
2
3
0
1
2
3
4


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [24]:
new_this[1]

{1: {1: array([[ 9.52380949e-01,  7.82997514e-02, -2.24303347e-01, ...,
           3.87404416e-03,  9.80443260e-05,  1.19503899e-04],
         [ 9.52380782e-01,  7.82997780e-02, -2.24303517e-01, ...,
           3.87415339e-03,  9.80599606e-05,  1.19502369e-04],
         [ 9.52380962e-01,  7.82997334e-02, -2.24303260e-01, ...,
           3.87405288e-03,  9.80459620e-05,  1.19503925e-04],
         ...,
         [ 9.80052854e-01, -1.72693760e-01,  5.87731086e-02, ...,
          -1.58678037e-05, -1.82004648e-04,  2.89683750e-05],
         [ 9.80052749e-01, -1.72694178e-01,  5.87736813e-02, ...,
          -1.57088529e-05, -1.81989955e-04,  2.89639057e-05],
         [ 9.80030086e-01, -1.72827231e-01,  5.88581736e-02, ...,
          -4.54925361e-07, -1.80953646e-04,  2.86133403e-05]]),
  2: array([[-1.11727001e-01, -7.63402573e-03,  4.90459847e-03, ...,
           4.96379914e-18,  7.50193780e-18, -1.05659282e-17],
         [-1.11728538e-01, -7.63163188e-03,  4.90148543e-03, ...,
           4.

In [91]:
fit_species = np.array([1,6])

max_v = len(new_this[0][int(fit_species[0])])

new_feat = {}
for specie in fit_species:
    new_feat[specie] = {val: [] for val in range(1,max_v+1)}
    
for specie in fit_species:
    for block in this:
        for v, feat in block[specie].items():
            new_feat[specie][v].append(feat)

    for key_v in new_feat[specie].keys():
        new_feat[specie][key_v] = np.concatenate(new_feat[specie][key_v])


In [86]:
new_feat[6][1].shape

(31446, 25)

In [72]:
this[0][6]

{1: array([[ 9.71176310e-01, -1.20317536e-01, -1.45874314e-01, ...,
          4.60990362e-04, -1.32776683e-03, -3.85676367e-04],
        [ 9.71176270e-01, -1.20317360e-01, -1.45874474e-01, ...,
          4.60983589e-04, -1.32778650e-03, -3.85681409e-04],
        [ 9.71176278e-01, -1.20317478e-01, -1.45874462e-01, ...,
          4.60992841e-04, -1.32777227e-03, -3.85676571e-04],
        ...,
        [ 8.86965994e-01, -1.94275624e-01,  3.79754539e-01, ...,
          5.08065188e-04, -2.57350113e-03,  9.06237349e-04],
        [ 8.86986361e-01, -1.94228851e-01,  3.79737539e-01, ...,
          5.07804791e-04, -2.57605218e-03,  9.04899297e-04],
        [ 8.86985602e-01, -1.94229607e-01,  3.79738667e-01, ...,
          5.07809407e-04, -2.57608990e-03,  9.04892298e-04]]),
 2: array([[-3.55381374e-02,  9.00472744e-02, -9.57336358e-02, ...,
         -7.06105295e-18, -1.82261296e-18,  2.45394227e-18],
        [-3.55388371e-02,  9.00470223e-02, -9.57344379e-02, ...,
         -7.53995458e-18, -1.206

In [53]:
max_v

4

In [54]:
new_feat[5]

KeyError: 5

In [39]:
new_feat[6].shape

KeyError: 6

In [30]:
train_coefficients[1].shape

(134692, 40, 9, 17)

In [16]:
train_coefficients[8].shape

(43354, 25, 6, 11)

In [11]:
all_species

array([ 1,  6,  7,  8, 16])

In [16]:
fit_species = np.load("../PCov-FPS-sample-ids/PCOV_FPS_selected_sample_ids_7_selected_20000_n8_l8_PASSING.npy")
relative_inds = return_relative_inds(train_structures,fit_species,7)
map_dict = {key.info["NAME"]:index for index,key in enumerate(train_structures)}
for pairs in relative_inds:
    struct_ind = map_dict[pairs[0]]
    mask_center_atoms_by_id(train_structures[struct_ind],id_select=int(pairs[1]))


In [31]:
train_structures[2].arrays['center_atoms_mask']

array([False, False, False, False, False, False,  True,  True, False,
       False, False, False, False,  True,  True, False, False, False,
       False, False,  True, False, False,  True, False, False, False,
       False,  True, False, False,  True, False, False, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False, False])

In [32]:
train_structures[2].numbers == 7

array([False, False, False, False,  True,  True,  True,  True, False,
       False, False, False,  True,  True,  True,  True, False, False,
       False, False,  True,  True,  True,  True, False, False, False,
       False,  True,  True,  True,  True, False, False, False, False,
        True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True,  True,  True,
        True, False, False, False, False, False, False, False, False])

In [25]:
np.sort(np.load("../PCov-FPS-sample-ids/PCOV_FPS_selected_sample_ids_7_selected_20000_n8_l8_PASSING.npy"))[:25]

array([ 0,  1,  2,  3,  7,  9, 11, 14, 15, 17, 18, 20, 23, 24, 27, 28, 29,
       33, 34, 40, 43, 45, 47, 48, 50])

In [None]:
for key in [1,6,7,8]:
        fit_species = np.load("../PCov-FPS-sample-ids/PCOV_FPS_selected_sample_ids_{}_selected_20000_n8_l8_PASSING.npy".format(key))
        relatvie 
        for frame in mask_center_atoms_by_id()

In [None]:
all_species = get_all_species(train_structures + test_structures)

train_coefficients = get_spherical_expansion(train_structures, HYPERS, all_species)
test_coefficients = get_spherical_expansion(test_structures, HYPERS, all_species)
print(test_coefficients.keys())
transformers = {}
for key in train_coefficients.keys():
    transformers[key] = get_transformer()

#TODO: Pass here train_coefficients:

fit_species = [1,6,7,8]




for key in fit_species:
        fit_species = np.load("../PCov-FPS-sample-ids/PCOV_FPS_selected_sample_ids_{}_selected_20000_n8_l8_PASSING.npy".format(key))
        print(train_coefficients[key][fit_species].shape)
        transformers[key].fit(train_coefficients[key][fit_species])

train_features = {}
for specie in [1,6,7,8]:
    train_features[specie] = transformers[specie].transform(train_coefficients[specie], return_only_invariants=True)

test_features = {}
for specie in [1,6,7,8]:
    test_features[specie] = transformers[specie].transform(test_coefficients[specie],
                                                   return_only_invariants=True)

for specie in [1,6,7,8]:
    Strain, Stest, Ytrain, Ytest = load_data("../make_tensor_data/train_tensor/CSD-3k+S546_shift_tensors.xyz",\
                                                                                    "../make_tensor_data/test_tensor/CSD-500+104-7_shift_tensors.xyz", selected_species=int(specie))
    
    
    Xtrain = np.concatenate([value for key, value in train_features[specie].items()],axis=1)
    Xtest = np.concatenate([value for key, value in test_features[specie].items()],axis=1)
    
    
    
    groups = atom_groups_by_frame(Strain)
    splits = list(GroupKFold(n_splits=5).split(Xtrain,Ytrain,groups=groups))
    model = RidgeCV(alphas=np.logspace(-6,3,10),cv=splits,scoring="neg_mean_squared_error")
    model.fit(Xtrain,Ytrain)
    print("{} species ridges' alpha is: {}".format(specie,model.alpha_))
    
    Ypred_test = model.predict(Xtest)
    Ypred_train = model.predict(Xtrain)
    
    rmse_test = mean_squared_error(Ytest,Ypred_test,squared=False)
    
    rmse_train = mean_squared_error(Ytrain,Ypred_train,squared=False)
    
    mae_test = mean_absolute_error(Ypred_test,Ytest)
    mae_train = mean_absolute_error(Ypred_train,Ytrain)
    
    print("test-RMSE: {} \n   train-RMSE: {}:\n  test-MAE:{}\n  train-MAE:{}".format(rmse_test,rmse_train,mae_test,mae_train))
    
    
    dump(transformers[specie],str(specie) + "_NICE_transformer.pkl")
    
    
    

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:34<00:00,  1.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.77it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:10<00:00,  1.81s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.75it/s]


dict_keys([1, 6, 7, 8, 16])
(20000, 40, 9, 17)
(20000, 40, 9, 17)
(20000, 40, 9, 17)
(20000, 40, 9, 17)


In [14]:
type(int(all_species[0]))

int

In [23]:
H_feats = np.concatenate([value for key, value in train_features[1].items()],axis=1)

In [51]:
H_feats_test = np.concatenate([value for key, value in test_features[1].items()],axis=1)

In [52]:
int_structs_test = deepcopy(test_structures)
[mask_center_atoms_by_species(frame,species_select=[1]) for frame in int_structs_test]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [39]:
train_properties_H = np.concatenate([structure.arrays["cs_iso"][structure.numbers == 1]for structure in train_structures])

In [53]:
test_properties_H = np.concatenate([structure.arrays["cs_iso"][structure.numbers == 1]for structure in test_structures])

In [54]:
test_properties_H.shape

(5428,)

In [55]:
H_feats_test.shape

(5428, 625)

In [31]:
from copy import deepcopy

In [32]:
from rascal.neighbourlist.structure_manager import mask_center_atoms_by_species

In [38]:
int_structs = deepcopy(train_structures)
[mask_center_atoms_by_species(frame,species_select=[1]) for frame in int_structs]
groups = atom_groups_by_frame(int_structs)

In [35]:
int_structs

[Atoms(symbols='C32H40N16O20', pbc=True, cell=[[9.176034118, 0.0, 0.0], [0.0, 15.1320602235, 0.0], [-2.59975398631, 0.0, 7.49494796345]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C40H28N4', pbc=True, cell=[[5.752027846, 0.0, 0.0], [0.585579442834, 8.8607170474, 0.0], [1.18489472816, 0.511320763342, 13.1036716279]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C56H52N12O12', pbc=True, cell=[[11.986138672, 0.0, 0.0], [0.0, 7.70902892056, 0.0], [-2.70475609431, 0.0, 13.8708031584]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C32H16N64O48', pbc=True, cell=[12.129281862, 12.4461836092, 12.6773798508], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C28H46N10O12', pbc=True, cell=[[8.584040452, 0.0, 0.0], [0.304948037057, 12.5673614153, 0.0], [-3.45549689991, -0.639588270038, 7.76044469083]], center_atoms_mask=..., cs_iso=..., cs_tensor=...),
 Atoms(symbols='C16H24N16O20S4', pbc=True, cell=[[7.308028718

In [45]:
splits = list(GroupKFold(n_splits=3).split(H_feats,train_properties_H,groups=groups))

In [49]:
model = RidgeCV(alphas=np.logspace(-6,3,10),cv=splits,scoring="neg_mean_squared_error")

In [50]:
model.fit(H_feats,train_properties_H)

RidgeCV(alphas=array([1.00000000e-06, 2.97635144e-06, 8.85866790e-06, 2.63665090e-05,
       7.84759970e-05, 2.33572147e-04, 6.95192796e-04, 2.06913808e-03,
       6.15848211e-03, 1.83298071e-02, 5.45559478e-02, 1.62377674e-01,
       4.83293024e-01, 1.43844989e+00, 4.28133240e+00, 1.27427499e+01,
       3.79269019e+01, 1.12883789e+02, 3.35981829e+02, 1.00000000e+03]),
        cv=[(array([   68,    69,    70, ..., 19443, 19444, 19445]),
             array([    0,     1,     2, ..., 19519, 19520, 19521])),
            (array([    0,     1,     2, ..., 19519, 19520, 19521]),
             array([   68,    69,    70, ..., 19443, 19444, 19445])),
            (array([    0,     1,     2, ..., 19519, 19520, 19521]),
             array([  182,   183,   184, ..., 19419, 19420, 19421]))],
        scoring='neg_mean_squared_error')

In [59]:
model.alpha_

0.05455594781168514

In [56]:
y_pred = model.predict(H_feats_test)

In [57]:
from sklearn.metrics import mean_squared_error

In [58]:
mean_squared_error(test_properties_H,y_pred,squared="False")

1.3084624824108166

In [28]:
train_structures[0].arrays["cs_iso"][train_structures[0].numbers == 1]

array([27.96, 27.95, 27.96, 27.95, 26.71, 26.71, 26.71, 26.71, 27.46,
       27.46, 27.46, 27.46, 27.15, 27.15, 27.15, 27.15, 28.06, 28.06,
       28.06, 28.06, 27.61, 27.61, 27.61, 27.61, 26.97, 26.97, 26.97,
       26.97, 26.3 , 26.3 , 26.3 , 26.3 , 27.72, 27.71, 27.72, 27.71,
       18.26, 18.26, 18.26, 18.26])

In [25]:
train_properties.shape

(48217,)

In [14]:
train_features.shape

AttributeError: 'dict' object has no attribute 'shape'

In [6]:
atom_group = atom_groups_by_frame(train_structures)

In [7]:
import time

In [8]:
model = Ridge(alpha=0.0002587533653942811)
start_time = time.time()
#RidgeCV(alphas=np.logspace(-8,0,50),cv=splits,scoring="neg_mean_squared_error")#TransformedTargetRegressor(regressor=,transformer=StandardFlexibleScaler())
model.fit(Xtrain, train_properties)
print("--- 10 steps took %s seconds ---" % (time.time() - start_time))

--- 10 steps took 30.988507747650146 seconds ---
