In [1]:
from helpers import filter_by_status
from rascal.neighbourlist.structure_manager import mask_center_atoms_by_species, mask_center_atoms_by_id
from ase.io import read
import numpy as np
from helpers import return_relative_inds

In [None]:
# Bare name: evaluating this cell only displays the imported function object.
mask_center_atoms_by_species

In [44]:
# Print the docstring of mask_center_atoms_by_id (id_select / id_blacklist semantics).
help(mask_center_atoms_by_id)

Help on function mask_center_atoms_by_id in module rascal.neighbourlist.structure_manager:

mask_center_atoms_by_id(frame, id_select=None, id_blacklist=None)
    Mask the centers (center-select) of an ASE atoms object, by index
    
    Parameters
    ----------
    frame: ase.Atoms
        Atomic structure to mask
    
    id_select: list of int
        List of atom IDs to select
    
    id_blacklist: list of int
        List of atom IDs to exclude
    
    Returns
    -------
    None (the Atoms object is modified directly)
    
    Notes
    -----
    The default is to select all atoms.  If `id_select` is provided,
    select only those atoms.  If only `id_blacklist` is provided, select
    all atoms *except* those in the blacklist.  If both are provided,
    atoms are first selected based on `id_select` and then excluded based
    on `id_blacklist`.  If the atoms object already has a mask, then
    `id_select` is applied first using the `or` operation, then
    `id_blacklist` is a

In [15]:
# Turn off Jedi-based tab completion for this IPython session.
%config IPCompleter.use_jedi = False

In [78]:
# load_data loads the CSD-2K or CSD-3K dataset and prepares the "workbench".
#
# Planned workflow:
#   1. Pass the TRAIN and TEST paths.
#        - check that these paths are valid
#   2. Filter by IDs.
#   3. Dispatch on the selection: if IDs, elif species, elif ... (remaining case still open)
#   4. After all of this, generate the subselection.

def _subsample_frames(frames, n_keep, rng):
    """Return up to ``n_keep`` randomly chosen entries of ``frames`` (shuffled order)."""
    ids = list(range(len(frames)))
    rng.shuffle(ids)
    return [frames[ii] for ii in ids[:n_keep]]


def _collect_property(frames, physical_property, masked):
    """Stack the per-atom ``physical_property`` of all frames, optionally applying the center mask."""
    values = []
    for frame in frames:
        per_atom = frame.arrays[physical_property]
        if masked:
            per_atom = per_atom[frame.arrays["center_atoms_mask"]]
        values.extend(per_atom)
    return np.array(values)


def load_data(TRAINPATH, TESTPATH, physical_property="cs_iso", filter_by="PASSING", selected_ids=None,
              selected_species=None, random_subsample_train=None, random_subsample_test=None,
              random_seed=None):
    """Load train/test structures, mask the selected center atoms and extract a property.

    Parameters
    ----------
    TRAINPATH, TESTPATH : str
        Paths of the extended-XYZ files holding the train and test structures.
    physical_property : str
        Key of the per-atom array (``structure.arrays[...]``) to extract.
    filter_by : str or None
        Status passed to ``filter_by_status``; ``None`` disables the filter.
        Only the train structures are filtered.
    selected_ids : sequence or None
        Sample IDs forwarded to ``return_relative_inds`` to pick individual
        train atoms.  Requires ``selected_species``.
    selected_species : int or None
        Single species (atomic number) whose atoms are used as centers.
    random_subsample_train, random_subsample_test : int or None
        If given, keep at most this many randomly chosen structures.
    random_seed : int or None
        Seed for the subsampling.  ``None`` (default) draws from the global
        ``np.random`` state, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(train_structures, test_structures, train_properties, test_properties)``;
        the property arrays only contain the masked atoms when a selection is active.

    Raises
    ------
    NotImplementedError
        If ``selected_ids`` is given without ``selected_species``.
    """
    # Reject the unsupported combination before any (potentially slow) file I/O.
    if selected_ids is not None and selected_species is None:
        raise NotImplementedError(
            "Selecting by IDs without a species is not implemented; "
            "pass selected_species together with selected_ids."
        )

    # load train and test structures
    train_structures = read(TRAINPATH, format="extxyz", index=":")
    test_structures = read(TESTPATH, format="extxyz", index=":")

    # Filtering happens before masking because the ID selection must not be
    # resolved against unfiltered structures.
    # NOTE(review): the test structures are never status-filtered -- confirm this is intended.
    if filter_by is not None:
        train_structures = filter_by_status(train_structures, status=filter_by)

    # wrap all atoms back into their cells
    for frames in (train_structures, test_structures):
        for structure in frames:
            structure.wrap(eps=1e-12)

    use_mask = selected_species is not None
    if use_mask:
        if selected_ids is None:
            # species-only selection: every atom of that species is a center
            for structure in train_structures:
                mask_center_atoms_by_species(structure, species_select=[selected_species])
        else:
            # Map structure name -> list position.
            # NOTE(review): duplicate NAME entries would silently collapse to the last frame.
            dict_CSD_identifiers = {frame.info["NAME"]: n for n, frame in enumerate(train_structures)}
            relative_inds = return_relative_inds(train_structures, selected_ids, selected_species)

            # Start from an all-False mask so only the explicitly selected atoms end up True.
            for structure in train_structures:
                structure.arrays["center_atoms_mask"] = np.zeros(len(structure), dtype=bool)

            # Each row is a (structure name, atom index) pair.
            for name, atom_index in relative_inds:
                frame_number = dict_CSD_identifiers[name]
                mask_center_atoms_by_id(train_structures[frame_number], id_select=int(atom_index))

        # the test set is always masked by species only
        for structure in test_structures:
            mask_center_atoms_by_species(structure, species_select=[selected_species])

    # Random subsampling of whole structures (after masking).
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)
    if random_subsample_train is not None:
        train_structures = _subsample_frames(train_structures, random_subsample_train, rng)
    if random_subsample_test is not None:
        test_structures = _subsample_frames(test_structures, random_subsample_test, rng)

    # extract selected property
    train_properties = _collect_property(train_structures, physical_property, use_mask)
    test_properties = _collect_property(test_structures, physical_property, use_mask)

    return train_structures, test_structures, train_properties, test_properties
    

In [79]:
# Build the train/test sets with centers restricted to species 7 and, for the
# train set, to the pre-selected sample IDs.
# NOTE: `filtered_inds` must already exist; it is loaded with np.load in a cell further down.
train_structures, test_structures, train_properties, test_properties = load_data(
    "./make_tensor_data/train_tensor/CSD-3k+S546_shift_tensors.xyz",
    "./make_tensor_data/test_tensor/CSD-500+104-7_shift_tensors.xyz",
    selected_ids=filtered_inds,
    selected_species=7,
)

In [80]:
# Inspect the per-atom center mask of the first training structure.
train_structures[0].arrays["center_atoms_mask"]

array([ True, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

In [81]:
# Number of property entries extracted for the masked training atoms.
len(train_properties)

8000

In [7]:
# Load the pre-selected sample IDs. The load_data call earlier in the notebook reads
# `filtered_inds`, so on a fresh kernel this cell has to be executed before it.
filtered_inds = np.load("./selected_ids/selected_sample_ids_7_n12_l9_PASSING.npy")

In [18]:
# Show the loaded sample IDs (numpy abbreviates the array).
filtered_inds

array([    0, 21213, 18712, ...,   630,  6222, 42278])

In [11]:
# Number of selected sample IDs.
len(filtered_inds)

8000

In [6]:
from helpers import return_relative_inds

In [9]:
# Resolve the sample IDs for species 7 against the training structures; the result is
# an array of [structure name, atom index] string pairs.
this = return_relative_inds(train_structures,filtered_inds,7)

In [31]:
# Print every [structure name, atom index] row. This prints the whole array;
# a slice such as this[:10] is enough for a spot check.
for i in this: print(i)

['PYRIDO04' '0']
['TANPEE' '6']
['DAJVUG01' '158']
['TANPEE' '7']
['SAPDAQ' '44']
['JEMFUD' '27']
['IFAYEV' '17']
['WAGMOH01' '10']
['DAJVUG01' '145']
['SENRUZ' '30']
['RIGJOG' '45']
['CIDROW' '28']
['DZBASK' '70']
['PEKSAZ' '26']
['DZBASK' '67']
['PYRCOX01' '21']
['HIPXAF' '8']
['IXAQAB' '8']
['SENRUZ' '31']
['PEKSAZ' '27']
['PYRZIN14' '1']
['CIDROW' '29']
['MOLJED' '11']
['JACPOS01' '11']
['DOCMEO' '85']
['BIZBET' '0']
['ZUHKIW' '0']
['NSMACM' '90']
['XODBAU' '21']
['TAFKIU' '32']
['TAFKIU' '33']
['BUYZAW' '42']
['GUHROQ01' '3']
['BATWUO' '31']
['YEHPIK' '9']
['WOQPEY' '11']
['VIBCEQ' '71']
['TAFKIU' '38']
['APENUG' '64']
['DUDNAR' '17']
['SENRUZ' '24']
['NSMACM' '91']
['DAJVUG01' '155']
['ERIVAC' '23']
['LEKZEH' '178']
['QEXJIN' '73']
['RIGJOG' '36']
['BILXUR' '135']
['CIKGAE' '4']
['DAJVUG01' '156']
['CMPDZB' '6']
['BUYZAW' '20']
['VUHSOI' '138']
['UMEREJ' '65']
['APENUG' '63']
['SAPDAQ' '47']
['BUYZAW' '23']
['CEWGUH' '5']
['BUYZAW' '43']
['AZOBEN04' '51']
['LOWBIK' '9']
['GEYSEI'

In [10]:
# Show the (abbreviated) array of [structure name, atom index] pairs.
this

array([['PYRIDO04', '0'],
       ['TANPEE', '6'],
       ['DAJVUG01', '158'],
       ...,
       ['EXIGAU', '96'],
       ['FULTEM', '66'],
       ['MMCPUR', '48']], dtype='<U21')

In [17]:
# Atomic numbers of the first training structure, for comparison with its center mask.
train_structures[0].get_atomic_numbers()

array([7, 7, 7, 7, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [26]:
# Metadata dictionary of one training structure (includes the STATUS and NAME keys).
train_structures[25].info

{'magres_units': {'cs_tensor': 'ppm', 'cs_iso': 'ppm'},
 'STATUS': 'PASSING',
 'NAME': 'TABFAG',
 'ENERGY': -2746.49423079}

In [28]:
# Look up the training structure named "DAJVUG01"; if several match, the last
# one wins, and `a` stays None when there is no match.
a = None
for frame in train_structures:
    if frame.info["NAME"] == "DAJVUG01":
        a = frame

In [30]:
# Atomic number of atom 158 in the structure found above, to cross-check the
# ['DAJVUG01', '158'] row printed earlier.
a.get_atomic_numbers()[158]

7

In [32]:
# Map each structure's NAME to its position in train_structures.
dict_CSD_identifiers = {
    frame.info["NAME"]: position
    for position, frame in enumerate(train_structures)
}

In [33]:
# Show the name -> list position mapping.
dict_CSD_identifiers

{'PYRIDO04': 0,
 'MODYOU': 1,
 'ELIGOX': 2,
 'TUHHUZ': 3,
 'CSURCD10': 4,
 'DAVARO': 5,
 'BOZGIH': 6,
 'FEHCUQ': 7,
 'DAZVEF': 8,
 'PUBMUU23': 9,
 'MOKPUX': 10,
 'TEOXDE01': 11,
 'RIGJOG': 12,
 'BINROF': 13,
 'UPONAQ': 14,
 'HMBFUR': 15,
 'RAYLAE': 16,
 'VABLIT': 17,
 'UREJAD': 18,
 'XURCIW': 19,
 'MCBPCX': 20,
 'DOXKOR': 21,
 'UDURUH': 22,
 'DALGON03': 23,
 'XUHPIB': 24,
 'TABFAG': 25,
 'MEWSUE': 26,
 'KENMOF': 27,
 'BUXWIA': 28,
 'KIFJAK': 29,
 'JEXZUH': 30,
 'GAFWIV': 31,
 'EXIGAU': 32,
 'ULEZES': 33,
 'RIZZEH': 34,
 'XIBTUY': 35,
 'SITQET': 36,
 'FIZJON01': 37,
 'NEPBEQ': 38,
 'TRZPUR': 39,
 'HOFLAQ01': 40,
 'BUWCAX': 41,
 'DIHIXL10': 42,
 'ZIPSIC': 43,
 'BOQQUT08': 44,
 'SEYJAH': 45,
 'NEHQAU': 46,
 'TUPWEI': 47,
 'YITKAP': 48,
 'VOBJEB': 49,
 'AZOXCH': 50,
 'GIVXUE': 51,
 'INEZUY': 52,
 'XOMWAX': 53,
 'XOKXIE': 54,
 'NERJIE': 55,
 'EFUMAU02': 56,
 'APENOA': 57,
 'GACLEE': 58,
 'NTRACD': 59,
 'SIBFOA': 60,
 'EVIQAB': 61,
 'OCPNDN': 62,
 'FUGLEA': 63,
 'DAFNON': 64,
 'QIZBIL': 65,


In [47]:
# Mark every selected atom as a center in the structure it belongs to.
for csd_name, atom_index in this:
    target_frame = train_structures[dict_CSD_identifiers[csd_name]]
    mask_center_atoms_by_id(target_frame, id_select=int(atom_index))

In [55]:
# Re-inspect the center mask of the first training structure after applying the ID masks.
train_structures[0].arrays["center_atoms_mask"]

array([ True, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

In [4]:
# Two independently written hyperparameter sets; they are compared for equality
# in the next cell, so each one is spelled out in full and shares no nested objects.
hypers_a = {
    "interaction_cutoff": 4.5,
    "max_radial": 12,
    "max_angular": 9,
    "gaussian_sigma_constant": 0.3,
    "gaussian_sigma_type": "Constant",
    "cutoff_function_type": "RadialScaling",
    "cutoff_smooth_width": 0.5,
    "cutoff_function_parameters": {
        "rate": 1,
        "scale": 3.0,
        "exponent": 6,
    },
    "radial_basis": "GTO",
    "optimization": {
        "Spline": {
            "accuracy": 1.0e-05,
        },
    },
    "compute_gradients": False,
}

hypers_b = {
    "interaction_cutoff": 4.5,
    "max_radial": 12,
    "max_angular": 9,
    "gaussian_sigma_constant": 0.3,
    "gaussian_sigma_type": "Constant",
    "cutoff_function_type": "RadialScaling",
    "cutoff_smooth_width": 0.5,
    "cutoff_function_parameters": {
        "rate": 1,
        "scale": 3.0,
        "exponent": 6,
    },
    "radial_basis": "GTO",
    "optimization": {
        "Spline": {
            "accuracy": 1.0e-05,
        },
    },
    "compute_gradients": False,
}


In [5]:
# Check that the two hyperparameter dictionaries are equal.
hypers_a == hypers_b

True