# Dataset creation for "Crystal-structure identification using Bayesian deep learning"

Author: Angelo Ziletti (angelo.ziletti@gmail.com; ziletti@fhi-berlin.mpg.de)


### Brief summary
This notebook shows how to create the datasets starting from the tar files that contain the descriptors.    

The tar files for different displacements and vacancies are extracted, and a dataset is created using the function *prepare_dataset* from *ai4materials.dataprocessing.preprocessing*; a dataset consists of a numpy array containing the diffraction intensity in spherical harmonics (DISH) descriptor, a numpy array containing the correct labels (the label of the pristine structure), and a *json* file providing a human-readable summary of the content of the dataset.     

Moreover, an ASE database containing structures for each defective transformation (displacements and vacancies) is written to disk using the *write_ase_db* function from *ai4materials.utils.utils_data_retrieval*.

In [2]:
# load libraries
from ai4materials.dataprocessing.preprocessing import load_dataset_from_file
from ai4materials.wrappers import load_descriptor
from ai4materials.utils.utils_config import set_configs
from ai4materials.utils.utils_config import setup_logger
from ai4materials.dataprocessing.preprocessing import load_dataset_from_file
from ai4materials.dataprocessing.preprocessing import prepare_dataset
from ai4materials.utils.utils_data_retrieval import write_ase_db
from ase.spacegroup import get_spacegroup as ase_get_spacegroup
from collections import Counter
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
%matplotlib inline  

# Root folder of the calculation; the datasets and the descriptor archives
# are kept in two sub-folders of it.
main_folder = '/home/ziletti/Documents/calc_nomadml/rot_inv_3d/'
dataset_folder, desc_folder = [
    os.path.abspath(os.path.normpath(os.path.join(main_folder, subfolder)))
    for subfolder in ('datasets', 'desc_folder')]

# Build the configuration for this main folder and a logger that only reports errors.
configs = set_configs(main_folder=main_folder)
logger = setup_logger(configs, level='ERROR', display_configs=False)

# Register both folders in the 'io' section of the configuration,
# where the cells below read them back.
configs['io']['dataset_folder'] = dataset_folder
configs['io']['desc_folder'] = desc_folder

## 1. Pristine structures

In [3]:
# Archive names (relative to the descriptor folder) of the pristine structures.
# There is one archive per rotation id: 5 rotations for hcp and 25 rotations
# for each of the cubic lattices (sc, fcc, diam, bcc).

# hcp - spacegroup 194
filenames_pristine_hcp = [
    'hcp/pristine/A_hP2_194_c_target_nb_atoms128_rotid{}_pristine.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# sc - spacegroup 221
filenames_pristine_sc = [
    'sc/pristine/A_cP1_221_a_target_nb_atoms128_rotid{}_pristine.tar.gz'.format(rot_id)
    for rot_id in range(25)]

# fcc - spacegroup 225
filenames_pristine_fcc = [
    'fcc/pristine/A_cF4_225_a_target_nb_atoms128_rotid{}_pristine.tar.gz'.format(rot_id)
    for rot_id in range(25)]

# diam - spacegroup 227
filenames_pristine_diam = [
    'diam/pristine/A_cF8_227_a_target_nb_atoms128_rotid{}_pristine.tar.gz'.format(rot_id)
    for rot_id in range(25)]

# bcc - spacegroup 229
filenames_pristine_bcc = [
    'bcc/pristine/A_cI2_229_a_target_nb_atoms128_rotid{}_pristine.tar.gz'.format(rot_id)
    for rot_id in range(25)]
                         
# Turn the relative archive names into full paths below the descriptor folder.
desc_files_pristine_hcp = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_pristine_hcp]
desc_files_pristine_sc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_pristine_sc]
desc_files_pristine_fcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_pristine_fcc]
desc_files_pristine_diam = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_pristine_diam]
desc_files_pristine_bcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_pristine_bcc]

# Load the descriptors lattice by lattice; every call yields a target list
# and a structure list for the given archives.
target_list_hcp_pristine, structure_list_hcp_pristine = load_descriptor(
    desc_files=desc_files_pristine_hcp, configs=configs)
target_list_sc_pristine, structure_list_sc_pristine = load_descriptor(
    desc_files=desc_files_pristine_sc, configs=configs)
target_list_fcc_pristine, structure_list_fcc_pristine = load_descriptor(
    desc_files=desc_files_pristine_fcc, configs=configs)
target_list_diam_pristine, structure_list_diam_pristine = load_descriptor(
    desc_files=desc_files_pristine_diam, configs=configs)
target_list_bcc_pristine, structure_list_bcc_pristine = load_descriptor(
    desc_files=desc_files_pristine_bcc, configs=configs)

# True labels: the space group number of the parent lattice, repeated once per
# loaded structure, in the same hcp, sc, fcc, diam, bcc order as the loading above.
y_true = (
    [194] * len(structure_list_hcp_pristine)
    + [221] * len(structure_list_sc_pristine)
    + [225] * len(structure_list_fcc_pristine)
    + [227] * len(structure_list_diam_pristine)
    + [229] * len(structure_list_bcc_pristine)
)

In [4]:
# Merge the per-lattice lists in the order hcp, sc, fcc, diam, bcc,
# i.e. the same order in which y_true was assembled.
structure_list_pristine = (
    structure_list_hcp_pristine
    + structure_list_sc_pristine
    + structure_list_fcc_pristine
    + structure_list_diam_pristine
    + structure_list_bcc_pristine
)
target_list_pristine = (
    target_list_hcp_pristine
    + target_list_sc_pristine
    + target_list_fcc_pristine
    + target_list_diam_pristine
    + target_list_bcc_pristine
)

# Write the true space group number into every descriptor entry ...
for position, descriptor_entry in enumerate(target_list_pristine):
    descriptor_entry['data'][0]['target'] = y_true[position]

# ... and into the `info` attribute of every structure.
for position, atoms in enumerate(structure_list_pristine):
    atoms.info['target'] = y_true[position]

# Disabled: space group determination with ASE (kept for reference only).
#y_pred_disp04 = []
#for structure in structure_list_04:
#    y_pred_disp04.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [5]:
# Make the dataset for the pristine structures.
# prepare_dataset returns the paths to the descriptor array, the label array and
# the json summary; the `notes` text is passed along as a free-text description.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_pristine,
    target_list=target_list_pristine,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_pristine-large',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth lattice type is bcc (the note used to list sc twice)
    notes="Hcp, sc, fcc, diam and bcc structures pristine. 25 rotations for cubic, 5 for hcp.")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because of a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_pristine, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_pristine-large',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per class label in the dataset.
Counter(y)

Counter({0: 4500, 1: 4500, 2: 4500, 3: 4500, 4: 4500})

## 2. Structures with displacements

### 2.1 Displacements 0.1%

In [None]:
# Archive names (relative to the descriptor folder) of the structures with
# 0.1% displacement; five rotation ids (0-4) per lattice type.

# hcp - spacegroup 194
filenames_disp01_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid{}_disp01.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# sc - spacegroup 221
filenames_disp01_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid{}_disp01.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# fcc - spacegroup 225
filenames_disp01_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid{}_disp01.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# diam - spacegroup 227
filenames_disp01_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid{}_disp01.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# bcc - spacegroup 229
filenames_disp01_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid{}_disp01.tar.gz'.format(rot_id)
    for rot_id in range(5)]
                         
                         
# Turn the relative archive names into full paths below the descriptor folder.
desc_files_disp01_hcp = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp01_hcp]
desc_files_disp01_sc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp01_sc]
desc_files_disp01_fcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp01_fcc]
desc_files_disp01_diam = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp01_diam]
desc_files_disp01_bcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp01_bcc]

# Load the descriptors lattice by lattice; every call yields a target list
# and a structure list for the given archives.
target_list_hcp_disp01, structure_list_hcp_disp01 = load_descriptor(
    desc_files=desc_files_disp01_hcp, configs=configs)
target_list_sc_disp01, structure_list_sc_disp01 = load_descriptor(
    desc_files=desc_files_disp01_sc, configs=configs)
target_list_fcc_disp01, structure_list_fcc_disp01 = load_descriptor(
    desc_files=desc_files_disp01_fcc, configs=configs)
target_list_diam_disp01, structure_list_diam_disp01 = load_descriptor(
    desc_files=desc_files_disp01_diam, configs=configs)
target_list_bcc_disp01, structure_list_bcc_disp01 = load_descriptor(
    desc_files=desc_files_disp01_bcc, configs=configs)

# True labels: the space group number of the parent lattice, repeated once per
# loaded structure, in the same hcp, sc, fcc, diam, bcc order as the loading above.
y_true = (
    [194] * len(structure_list_hcp_disp01)
    + [221] * len(structure_list_sc_disp01)
    + [225] * len(structure_list_fcc_disp01)
    + [227] * len(structure_list_diam_disp01)
    + [229] * len(structure_list_bcc_disp01)
)

In [None]:
# Merge the per-lattice lists in the order hcp, sc, fcc, diam, bcc,
# i.e. the same order in which y_true was assembled.
structure_list_disp01 = (
    structure_list_hcp_disp01
    + structure_list_sc_disp01
    + structure_list_fcc_disp01
    + structure_list_diam_disp01
    + structure_list_bcc_disp01
)
target_list_disp01 = (
    target_list_hcp_disp01
    + target_list_sc_disp01
    + target_list_fcc_disp01
    + target_list_diam_disp01
    + target_list_bcc_disp01
)

# Write the true space group number into every descriptor entry ...
for position, descriptor_entry in enumerate(target_list_disp01):
    descriptor_entry['data'][0]['target'] = y_true[position]

# ... and into the `info` attribute of every structure.
for position, atoms in enumerate(structure_list_disp01):
    atoms.info['target'] = y_true[position]

# Disabled: space group determination with ASE (kept for reference only).
#y_pred_disp04 = []
#for structure in structure_list_04:
#    y_pred_disp04.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# Make the dataset for the structures with 0.1% displacement.
# prepare_dataset returns the paths to the descriptor array, the label array and
# the json summary; the `notes` text is passed along as a free-text description.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp01,
    target_list=target_list_disp01,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-0.1%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth lattice type is bcc (the note used to list sc twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 0.1% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because of a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_disp01, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-0.1%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per class label in the dataset.
Counter(y)

### 2.2 Displacements 0.2%

In [2]:
# Archive names (relative to the descriptor folder) of the structures with
# 0.2% displacement; five rotation ids (0-4) per lattice type.

# hcp - spacegroup 194
filenames_disp02_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid{}_disp02.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# sc - spacegroup 221
filenames_disp02_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid{}_disp02.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# fcc - spacegroup 225
filenames_disp02_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid{}_disp02.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# diam - spacegroup 227
filenames_disp02_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid{}_disp02.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# bcc - spacegroup 229
filenames_disp02_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid{}_disp02.tar.gz'.format(rot_id)
    for rot_id in range(5)]
                         
                         
# Turn the relative archive names into full paths below the descriptor folder.
desc_files_disp02_hcp = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp02_hcp]
desc_files_disp02_sc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp02_sc]
desc_files_disp02_fcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp02_fcc]
desc_files_disp02_diam = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp02_diam]
desc_files_disp02_bcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp02_bcc]

# Load the descriptors lattice by lattice; every call yields a target list
# and a structure list for the given archives.
target_list_hcp_disp02, structure_list_hcp_disp02 = load_descriptor(
    desc_files=desc_files_disp02_hcp, configs=configs)
target_list_sc_disp02, structure_list_sc_disp02 = load_descriptor(
    desc_files=desc_files_disp02_sc, configs=configs)
target_list_fcc_disp02, structure_list_fcc_disp02 = load_descriptor(
    desc_files=desc_files_disp02_fcc, configs=configs)
target_list_diam_disp02, structure_list_diam_disp02 = load_descriptor(
    desc_files=desc_files_disp02_diam, configs=configs)
target_list_bcc_disp02, structure_list_bcc_disp02 = load_descriptor(
    desc_files=desc_files_disp02_bcc, configs=configs)

# True labels: the space group number of the parent lattice, repeated once per
# loaded structure, in the same hcp, sc, fcc, diam, bcc order as the loading above.
y_true = (
    [194] * len(structure_list_hcp_disp02)
    + [221] * len(structure_list_sc_disp02)
    + [225] * len(structure_list_fcc_disp02)
    + [227] * len(structure_list_diam_disp02)
    + [229] * len(structure_list_bcc_disp02)
)

In [3]:
# Merge the per-lattice lists in the order hcp, sc, fcc, diam, bcc,
# i.e. the same order in which y_true was assembled.
structure_list_disp02 = (
    structure_list_hcp_disp02
    + structure_list_sc_disp02
    + structure_list_fcc_disp02
    + structure_list_diam_disp02
    + structure_list_bcc_disp02
)
target_list_disp02 = (
    target_list_hcp_disp02
    + target_list_sc_disp02
    + target_list_fcc_disp02
    + target_list_diam_disp02
    + target_list_bcc_disp02
)

# Write the true space group number into every descriptor entry ...
for position, descriptor_entry in enumerate(target_list_disp02):
    descriptor_entry['data'][0]['target'] = y_true[position]

# ... and into the `info` attribute of every structure.
for position, atoms in enumerate(structure_list_disp02):
    atoms.info['target'] = y_true[position]

# Disabled: space group determination with ASE (kept for reference only).
#y_pred_disp04 = []
#for structure in structure_list_04:
#    y_pred_disp04.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [4]:
# Make the dataset for the structures with 0.2% displacement.
# prepare_dataset returns the paths to the descriptor array, the label array and
# the json summary; the `notes` text is passed along as a free-text description.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp02,
    target_list=target_list_disp02,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-0.2%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth lattice type is bcc (the note used to list sc twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 0.2% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because of a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_disp02, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-0.2%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per class label in the dataset.
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

### 2.3 Displacements 0.6%

In [None]:
# Archive names (relative to the descriptor folder) of the structures with
# 0.6% displacement; five rotation ids (0-4) per lattice type.

# hcp - spacegroup 194
filenames_disp06_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid{}_disp06.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# sc - spacegroup 221
filenames_disp06_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid{}_disp06.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# fcc - spacegroup 225
filenames_disp06_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid{}_disp06.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# diam - spacegroup 227
filenames_disp06_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid{}_disp06.tar.gz'.format(rot_id)
    for rot_id in range(5)]

# bcc - spacegroup 229
filenames_disp06_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid{}_disp06.tar.gz'.format(rot_id)
    for rot_id in range(5)]
                         
                         
# Turn the relative archive names into full paths below the descriptor folder.
desc_files_disp06_hcp = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp06_hcp]
desc_files_disp06_sc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp06_sc]
desc_files_disp06_fcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp06_fcc]
desc_files_disp06_diam = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp06_diam]
desc_files_disp06_bcc = [os.path.join(configs['io']['desc_folder'], archive) for archive in filenames_disp06_bcc]

# Load the descriptors lattice by lattice; every call yields a target list
# and a structure list for the given archives.
target_list_hcp_disp06, structure_list_hcp_disp06 = load_descriptor(
    desc_files=desc_files_disp06_hcp, configs=configs)
target_list_sc_disp06, structure_list_sc_disp06 = load_descriptor(
    desc_files=desc_files_disp06_sc, configs=configs)
target_list_fcc_disp06, structure_list_fcc_disp06 = load_descriptor(
    desc_files=desc_files_disp06_fcc, configs=configs)
target_list_diam_disp06, structure_list_diam_disp06 = load_descriptor(
    desc_files=desc_files_disp06_diam, configs=configs)
target_list_bcc_disp06, structure_list_bcc_disp06 = load_descriptor(
    desc_files=desc_files_disp06_bcc, configs=configs)

# True labels: the space group number of the parent lattice, repeated once per
# loaded structure, in the same hcp, sc, fcc, diam, bcc order as the loading above.
y_true = (
    [194] * len(structure_list_hcp_disp06)
    + [221] * len(structure_list_sc_disp06)
    + [225] * len(structure_list_fcc_disp06)
    + [227] * len(structure_list_diam_disp06)
    + [229] * len(structure_list_bcc_disp06)
)

In [None]:
# Merge the per-lattice lists in the order hcp, sc, fcc, diam, bcc,
# i.e. the same order in which y_true was assembled.
structure_list_disp06 = (
    structure_list_hcp_disp06
    + structure_list_sc_disp06
    + structure_list_fcc_disp06
    + structure_list_diam_disp06
    + structure_list_bcc_disp06
)
target_list_disp06 = (
    target_list_hcp_disp06
    + target_list_sc_disp06
    + target_list_fcc_disp06
    + target_list_diam_disp06
    + target_list_bcc_disp06
)

# Write the true space group number into every descriptor entry ...
for position, descriptor_entry in enumerate(target_list_disp06):
    descriptor_entry['data'][0]['target'] = y_true[position]

# ... and into the `info` attribute of every structure.
for position, atoms in enumerate(structure_list_disp06):
    atoms.info['target'] = y_true[position]

# Disabled: space group determination with ASE (kept for reference only).
#y_pred_disp04 = []
#for structure in structure_list_04:
#    y_pred_disp04.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# Create the dataset files (descriptor array, labels and json summary) for the 0.6%
# displacement structures; prepare_dataset returns the paths of the three files.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp06,
    target_list=target_list_disp06,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-0.6%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 0.6% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# Write the structures to an ASE database. The 'db' type is used because writing a json
# database does not work here, probably because of an ASE bug that prevents writing
# json databases from Jupyter notebooks.
write_ase_db(structure_list_disp06, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-0.6%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per label in the reloaded dataset.
Counter(y)

### 2.4 Displacements 1.0%

In [None]:
# Descriptor archives of the 1% displacement set, one list per prototype
# (rotid0 to rotid4). Names are relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp1_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid0_disp1.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid1_disp1.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid2_disp1.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid3_disp1.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid4_disp1.tar.gz',
]

# sc - spacegroup 221
filenames_disp1_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid0_disp1.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid1_disp1.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid2_disp1.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid3_disp1.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid4_disp1.tar.gz',
]

# fcc - spacegroup 225
filenames_disp1_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid0_disp1.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid1_disp1.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid2_disp1.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid3_disp1.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid4_disp1.tar.gz',
]

# diam - spacegroup 227
filenames_disp1_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid0_disp1.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid1_disp1.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid2_disp1.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid3_disp1.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid4_disp1.tar.gz',
]

# bcc - spacegroup 229
filenames_disp1_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid0_disp1.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid1_disp1.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid2_disp1.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid3_disp1.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid4_disp1.tar.gz',
]

# Load one prototype at a time and append one space-group label per loaded entry.
# NOTE: in this section the label count follows the target lists, whereas the other
# displacement sections use the structure lists.
y_true = []

desc_files_disp1_hcp = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp1_hcp]
target_list_hcp_disp1, structure_list_hcp_disp1 = load_descriptor(
    desc_files=desc_files_disp1_hcp, configs=configs)
y_true.extend([194] * len(target_list_hcp_disp1))

desc_files_disp1_sc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp1_sc]
target_list_sc_disp1, structure_list_sc_disp1 = load_descriptor(
    desc_files=desc_files_disp1_sc, configs=configs)
y_true.extend([221] * len(target_list_sc_disp1))

desc_files_disp1_fcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp1_fcc]
target_list_fcc_disp1, structure_list_fcc_disp1 = load_descriptor(
    desc_files=desc_files_disp1_fcc, configs=configs)
y_true.extend([225] * len(target_list_fcc_disp1))

desc_files_disp1_diam = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp1_diam]
target_list_diam_disp1, structure_list_diam_disp1 = load_descriptor(
    desc_files=desc_files_disp1_diam, configs=configs)
y_true.extend([227] * len(target_list_diam_disp1))

desc_files_disp1_bcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp1_bcc]
target_list_bcc_disp1, structure_list_bcc_disp1 = load_descriptor(
    desc_files=desc_files_disp1_bcc, configs=configs)
y_true.extend([229] * len(target_list_bcc_disp1))

In [None]:
# Merge the per-prototype lists for the 1% displacement set. The order (hcp, sc, fcc,
# diam, bcc) must match the order in which y_true was filled in the loading cell.
structure_list_disp1 = (structure_list_hcp_disp1 + structure_list_sc_disp1 + structure_list_fcc_disp1
                        + structure_list_diam_disp1 + structure_list_bcc_disp1)
target_list_disp1 = (target_list_hcp_disp1 + target_list_sc_disp1 + target_list_fcc_disp1
                     + target_list_diam_disp1 + target_list_bcc_disp1)

# y_true is a notebook-level name that every displacement section rebuilds; check that
# the list currently in scope belongs to this section before labelling with it.
assert len(y_true) == len(target_list_disp1) == len(structure_list_disp1), (
    "y_true does not match the 1% displacement lists; re-run the loading cell of this section")

# Store the space-group label of the pristine structure on each descriptor entry ...
for item, label in zip(target_list_disp1, y_true):
    item['data'][0]['target'] = label

# ... and on each structure.
for structure, label in zip(structure_list_disp1, y_true):
    structure.info['target'] = label

In [None]:
# Create the dataset files (descriptor array, labels and json summary) for the 1%
# displacement structures; prepare_dataset returns the paths of the three files.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp1,
    target_list=target_list_disp1,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-1%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 1% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# Write the structures to an ASE database. The 'db' type is used because writing a json
# database does not work here, probably because of an ASE bug that prevents writing
# json databases from Jupyter notebooks.
write_ase_db(structure_list_disp1, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-1%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per label in the reloaded dataset.
Counter(y)

### 2.5 Displacements 2.0%

In [None]:
# Descriptor archives of the 2% displacement set, one list per prototype
# (rotid0 to rotid4). Names are relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp2_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid0_disp2.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid1_disp2.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid2_disp2.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid3_disp2.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid4_disp2.tar.gz',
]

# sc - spacegroup 221
filenames_disp2_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid0_disp2.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid1_disp2.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid2_disp2.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid3_disp2.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid4_disp2.tar.gz',
]

# fcc - spacegroup 225
filenames_disp2_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid0_disp2.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid1_disp2.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid2_disp2.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid3_disp2.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid4_disp2.tar.gz',
]

# diam - spacegroup 227
filenames_disp2_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid0_disp2.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid1_disp2.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid2_disp2.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid3_disp2.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid4_disp2.tar.gz',
]

# bcc - spacegroup 229
filenames_disp2_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid0_disp2.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid1_disp2.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid2_disp2.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid3_disp2.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid4_disp2.tar.gz',
]

# Load one prototype at a time and append one space-group label per loaded structure.
y_true = []

desc_files_disp2_hcp = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp2_hcp]
target_list_hcp_disp2, structure_list_hcp_disp2 = load_descriptor(
    desc_files=desc_files_disp2_hcp, configs=configs)
y_true.extend([194] * len(structure_list_hcp_disp2))

desc_files_disp2_sc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp2_sc]
target_list_sc_disp2, structure_list_sc_disp2 = load_descriptor(
    desc_files=desc_files_disp2_sc, configs=configs)
y_true.extend([221] * len(structure_list_sc_disp2))

desc_files_disp2_fcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp2_fcc]
target_list_fcc_disp2, structure_list_fcc_disp2 = load_descriptor(
    desc_files=desc_files_disp2_fcc, configs=configs)
y_true.extend([225] * len(structure_list_fcc_disp2))

desc_files_disp2_diam = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp2_diam]
target_list_diam_disp2, structure_list_diam_disp2 = load_descriptor(
    desc_files=desc_files_disp2_diam, configs=configs)
y_true.extend([227] * len(structure_list_diam_disp2))

desc_files_disp2_bcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp2_bcc]
target_list_bcc_disp2, structure_list_bcc_disp2 = load_descriptor(
    desc_files=desc_files_disp2_bcc, configs=configs)
y_true.extend([229] * len(structure_list_bcc_disp2))

In [None]:
# Merge the per-prototype lists for the 2% displacement set. The order (hcp, sc, fcc,
# diam, bcc) must match the order in which y_true was filled in the loading cell.
structure_list_disp2 = (structure_list_hcp_disp2 + structure_list_sc_disp2 + structure_list_fcc_disp2
                        + structure_list_diam_disp2 + structure_list_bcc_disp2)
target_list_disp2 = (target_list_hcp_disp2 + target_list_sc_disp2 + target_list_fcc_disp2
                     + target_list_diam_disp2 + target_list_bcc_disp2)

# y_true is a notebook-level name that every displacement section rebuilds; check that
# the list currently in scope belongs to this section before labelling with it.
assert len(y_true) == len(target_list_disp2) == len(structure_list_disp2), (
    "y_true does not match the 2% displacement lists; re-run the loading cell of this section")

# Store the space-group label of the pristine structure on each descriptor entry ...
for item, label in zip(target_list_disp2, y_true):
    item['data'][0]['target'] = label

# ... and on each structure.
for structure, label in zip(structure_list_disp2, y_true):
    structure.info['target'] = label

In [None]:
# Create the dataset files (descriptor array, labels and json summary) for the 2%
# displacement structures; prepare_dataset returns the paths of the three files.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp2,
    target_list=target_list_disp2,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-2%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 2% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# Write the structures to an ASE database. The 'db' type is used because writing a json
# database does not work here, probably because of an ASE bug that prevents writing
# json databases from Jupyter notebooks.
write_ase_db(structure_list_disp2, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-2%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per label in the reloaded dataset.
Counter(y)

### 2.6 Displacements 4.0%

In [None]:
# Descriptor archives of the 4% displacement set, one list per prototype
# (rotid0 to rotid4). Names are relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp4_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid0_disp4.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid1_disp4.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid2_disp4.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid3_disp4.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid4_disp4.tar.gz',
]

# sc - spacegroup 221
filenames_disp4_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid0_disp4.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid1_disp4.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid2_disp4.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid3_disp4.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid4_disp4.tar.gz',
]

# fcc - spacegroup 225
filenames_disp4_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid0_disp4.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid1_disp4.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid2_disp4.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid3_disp4.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid4_disp4.tar.gz',
]

# diam - spacegroup 227
filenames_disp4_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid0_disp4.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid1_disp4.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid2_disp4.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid3_disp4.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid4_disp4.tar.gz',
]

# bcc - spacegroup 229
filenames_disp4_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid0_disp4.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid1_disp4.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid2_disp4.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid3_disp4.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid4_disp4.tar.gz',
]

# Load one prototype at a time and append one space-group label per loaded structure.
y_true = []

desc_files_disp4_hcp = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp4_hcp]
target_list_hcp_disp4, structure_list_hcp_disp4 = load_descriptor(
    desc_files=desc_files_disp4_hcp, configs=configs)
y_true.extend([194] * len(structure_list_hcp_disp4))

desc_files_disp4_sc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp4_sc]
target_list_sc_disp4, structure_list_sc_disp4 = load_descriptor(
    desc_files=desc_files_disp4_sc, configs=configs)
y_true.extend([221] * len(structure_list_sc_disp4))

desc_files_disp4_fcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp4_fcc]
target_list_fcc_disp4, structure_list_fcc_disp4 = load_descriptor(
    desc_files=desc_files_disp4_fcc, configs=configs)
y_true.extend([225] * len(structure_list_fcc_disp4))

desc_files_disp4_diam = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp4_diam]
target_list_diam_disp4, structure_list_diam_disp4 = load_descriptor(
    desc_files=desc_files_disp4_diam, configs=configs)
y_true.extend([227] * len(structure_list_diam_disp4))

desc_files_disp4_bcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp4_bcc]
target_list_bcc_disp4, structure_list_bcc_disp4 = load_descriptor(
    desc_files=desc_files_disp4_bcc, configs=configs)
y_true.extend([229] * len(structure_list_bcc_disp4))

In [None]:
# Merge the per-prototype lists for the 4% displacement set. The order (hcp, sc, fcc,
# diam, bcc) must match the order in which y_true was filled in the loading cell.
structure_list_disp4 = (structure_list_hcp_disp4 + structure_list_sc_disp4 + structure_list_fcc_disp4
                        + structure_list_diam_disp4 + structure_list_bcc_disp4)
target_list_disp4 = (target_list_hcp_disp4 + target_list_sc_disp4 + target_list_fcc_disp4
                     + target_list_diam_disp4 + target_list_bcc_disp4)

# y_true is a notebook-level name that every displacement section rebuilds; check that
# the list currently in scope belongs to this section before labelling with it.
assert len(y_true) == len(target_list_disp4) == len(structure_list_disp4), (
    "y_true does not match the 4% displacement lists; re-run the loading cell of this section")

# Store the space-group label of the pristine structure on each descriptor entry ...
for item, label in zip(target_list_disp4, y_true):
    item['data'][0]['target'] = label

# ... and on each structure.
for structure, label in zip(structure_list_disp4, y_true):
    structure.info['target'] = label

In [None]:
# Create the dataset files (descriptor array, labels and json summary) for the 4%
# displacement structures; prepare_dataset returns the paths of the three files.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp4,
    target_list=target_list_disp4,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-4%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 4% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# Write the structures to an ASE database. The 'db' type is used because writing a json
# database does not work here, probably because of an ASE bug that prevents writing
# json databases from Jupyter notebooks.
write_ase_db(structure_list_disp4, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-4%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per label in the reloaded dataset.
Counter(y)

### 2.7 Displacements 5.0%

In [None]:
# Descriptor archives of the 5% displacement set, one list per prototype
# (rotid0 to rotid4). Names are relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp5_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid0_disp5.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid1_disp5.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid2_disp5.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid3_disp5.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid4_disp5.tar.gz',
]

# sc - spacegroup 221
filenames_disp5_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid0_disp5.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid1_disp5.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid2_disp5.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid3_disp5.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid4_disp5.tar.gz',
]

# fcc - spacegroup 225
filenames_disp5_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid0_disp5.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid1_disp5.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid2_disp5.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid3_disp5.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid4_disp5.tar.gz',
]

# diam - spacegroup 227
filenames_disp5_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid0_disp5.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid1_disp5.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid2_disp5.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid3_disp5.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid4_disp5.tar.gz',
]

# bcc - spacegroup 229
filenames_disp5_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid0_disp5.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid1_disp5.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid2_disp5.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid3_disp5.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid4_disp5.tar.gz',
]

# Load one prototype at a time and append one space-group label per loaded structure.
y_true = []

desc_files_disp5_hcp = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp5_hcp]
target_list_hcp_disp5, structure_list_hcp_disp5 = load_descriptor(
    desc_files=desc_files_disp5_hcp, configs=configs)
y_true.extend([194] * len(structure_list_hcp_disp5))

desc_files_disp5_sc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp5_sc]
target_list_sc_disp5, structure_list_sc_disp5 = load_descriptor(
    desc_files=desc_files_disp5_sc, configs=configs)
y_true.extend([221] * len(structure_list_sc_disp5))

desc_files_disp5_fcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp5_fcc]
target_list_fcc_disp5, structure_list_fcc_disp5 = load_descriptor(
    desc_files=desc_files_disp5_fcc, configs=configs)
y_true.extend([225] * len(structure_list_fcc_disp5))

desc_files_disp5_diam = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp5_diam]
target_list_diam_disp5, structure_list_diam_disp5 = load_descriptor(
    desc_files=desc_files_disp5_diam, configs=configs)
y_true.extend([227] * len(structure_list_diam_disp5))

desc_files_disp5_bcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp5_bcc]
target_list_bcc_disp5, structure_list_bcc_disp5 = load_descriptor(
    desc_files=desc_files_disp5_bcc, configs=configs)
y_true.extend([229] * len(structure_list_bcc_disp5))

In [None]:
# Merge the per-prototype lists for the 5% displacement set. The order (hcp, sc, fcc,
# diam, bcc) must match the order in which y_true was filled in the loading cell.
structure_list_disp5 = (structure_list_hcp_disp5 + structure_list_sc_disp5 + structure_list_fcc_disp5
                        + structure_list_diam_disp5 + structure_list_bcc_disp5)
target_list_disp5 = (target_list_hcp_disp5 + target_list_sc_disp5 + target_list_fcc_disp5
                     + target_list_diam_disp5 + target_list_bcc_disp5)

# y_true is a notebook-level name that every displacement section rebuilds; check that
# the list currently in scope belongs to this section before labelling with it.
assert len(y_true) == len(target_list_disp5) == len(structure_list_disp5), (
    "y_true does not match the 5% displacement lists; re-run the loading cell of this section")

# Store the space-group label of the pristine structure on each descriptor entry ...
for item, label in zip(target_list_disp5, y_true):
    item['data'][0]['target'] = label

# ... and on each structure.
for structure, label in zip(structure_list_disp5, y_true):
    structure.info['target'] = label

In [None]:
# Create the dataset files (descriptor array, labels and json summary) for the 5%
# displacement structures; prepare_dataset returns the paths of the three files.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp5,
    target_list=target_list_disp5,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-5%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 5% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# Write the structures to an ASE database. The 'db' type is used because writing a json
# database does not work here, probably because of an ASE bug that prevents writing
# json databases from Jupyter notebooks.
write_ase_db(structure_list_disp5, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-5%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per label in the reloaded dataset.
Counter(y)

### 2.8 Displacements 8.0%

In [3]:
# Descriptor archives of the 8% displacement set, one list per prototype
# (rotid0 to rotid4). Names are relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp8_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid0_disp8.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid1_disp8.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid2_disp8.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid3_disp8.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid4_disp8.tar.gz',
]

# sc - spacegroup 221
filenames_disp8_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid0_disp8.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid1_disp8.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid2_disp8.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid3_disp8.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid4_disp8.tar.gz',
]

# fcc - spacegroup 225
filenames_disp8_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid0_disp8.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid1_disp8.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid2_disp8.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid3_disp8.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid4_disp8.tar.gz',
]

# diam - spacegroup 227
filenames_disp8_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid0_disp8.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid1_disp8.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid2_disp8.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid3_disp8.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid4_disp8.tar.gz',
]

# bcc - spacegroup 229
filenames_disp8_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid0_disp8.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid1_disp8.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid2_disp8.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid3_disp8.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid4_disp8.tar.gz',
]

# Load one prototype at a time and append one space-group label per loaded structure.
y_true = []

desc_files_disp8_hcp = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp8_hcp]
target_list_hcp_disp8, structure_list_hcp_disp8 = load_descriptor(
    desc_files=desc_files_disp8_hcp, configs=configs)
y_true.extend([194] * len(structure_list_hcp_disp8))

desc_files_disp8_sc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp8_sc]
target_list_sc_disp8, structure_list_sc_disp8 = load_descriptor(
    desc_files=desc_files_disp8_sc, configs=configs)
y_true.extend([221] * len(structure_list_sc_disp8))

desc_files_disp8_fcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp8_fcc]
target_list_fcc_disp8, structure_list_fcc_disp8 = load_descriptor(
    desc_files=desc_files_disp8_fcc, configs=configs)
y_true.extend([225] * len(structure_list_fcc_disp8))

desc_files_disp8_diam = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp8_diam]
target_list_diam_disp8, structure_list_diam_disp8 = load_descriptor(
    desc_files=desc_files_disp8_diam, configs=configs)
y_true.extend([227] * len(structure_list_diam_disp8))

desc_files_disp8_bcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp8_bcc]
target_list_bcc_disp8, structure_list_bcc_disp8 = load_descriptor(
    desc_files=desc_files_disp8_bcc, configs=configs)
y_true.extend([229] * len(structure_list_bcc_disp8))

In [5]:
# Merge the per-prototype lists for the 8% displacement set. The order (hcp, sc, fcc,
# diam, bcc) must match the order in which y_true was filled in the loading cell.
structure_list_disp8 = (structure_list_hcp_disp8 + structure_list_sc_disp8 + structure_list_fcc_disp8
                        + structure_list_diam_disp8 + structure_list_bcc_disp8)
target_list_disp8 = (target_list_hcp_disp8 + target_list_sc_disp8 + target_list_fcc_disp8
                     + target_list_diam_disp8 + target_list_bcc_disp8)

# y_true is a notebook-level name that every displacement section rebuilds; check that
# the list currently in scope belongs to this section before labelling with it.
assert len(y_true) == len(target_list_disp8) == len(structure_list_disp8), (
    "y_true does not match the 8% displacement lists; re-run the loading cell of this section")

# Store the space-group label of the pristine structure on each descriptor entry ...
for item, label in zip(target_list_disp8, y_true):
    item['data'][0]['target'] = label

# ... and on each structure.
for structure, label in zip(structure_list_disp8, y_true):
    structure.info['target'] = label

In [6]:
# Create the dataset files (descriptor array, labels and json summary) for the 8%
# displacement structures; prepare_dataset returns the paths of the three files.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp8,
    target_list=target_list_disp8,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-8%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 8% displacement")

# Read the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# Write the structures to an ASE database. The 'db' type is used because writing a json
# database does not work here, probably because of an ASE bug that prevents writing
# json databases from Jupyter notebooks.
write_ase_db(structure_list_disp8, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-8%',
             db_type='db', overwrite=True, folder_name='db_ase')

# Number of samples per label in the reloaded dataset.
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

### 2.9 Displacements 10.0%

In [5]:
# Descriptor archives of the 10% displacement set, one list per prototype
# (rotid0 to rotid4). Names are relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp10_hcp = [
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid0_disp10.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid1_disp10.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid2_disp10.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid3_disp10.tar.gz',
    'hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid4_disp10.tar.gz',
]

# sc - spacegroup 221
filenames_disp10_sc = [
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid0_disp10.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid1_disp10.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid2_disp10.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid3_disp10.tar.gz',
    'sc/disp/A_cP1_221_a_target_nb_atoms128_rotid4_disp10.tar.gz',
]

# fcc - spacegroup 225
filenames_disp10_fcc = [
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid0_disp10.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid1_disp10.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid2_disp10.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid3_disp10.tar.gz',
    'fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid4_disp10.tar.gz',
]

# diam - spacegroup 227
filenames_disp10_diam = [
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid0_disp10.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid1_disp10.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid2_disp10.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid3_disp10.tar.gz',
    'diam/disp/A_cF8_227_a_target_nb_atoms128_rotid4_disp10.tar.gz',
]

# bcc - spacegroup 229
filenames_disp10_bcc = [
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid0_disp10.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid1_disp10.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid2_disp10.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid3_disp10.tar.gz',
    'bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid4_disp10.tar.gz',
]

# Load one prototype at a time and append one space-group label per loaded structure.
y_true = []

desc_files_disp10_hcp = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp10_hcp]
target_list_hcp_disp10, structure_list_hcp_disp10 = load_descriptor(
    desc_files=desc_files_disp10_hcp, configs=configs)
y_true.extend([194] * len(structure_list_hcp_disp10))

desc_files_disp10_sc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp10_sc]
target_list_sc_disp10, structure_list_sc_disp10 = load_descriptor(
    desc_files=desc_files_disp10_sc, configs=configs)
y_true.extend([221] * len(structure_list_sc_disp10))

desc_files_disp10_fcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp10_fcc]
target_list_fcc_disp10, structure_list_fcc_disp10 = load_descriptor(
    desc_files=desc_files_disp10_fcc, configs=configs)
y_true.extend([225] * len(structure_list_fcc_disp10))

desc_files_disp10_diam = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp10_diam]
target_list_diam_disp10, structure_list_diam_disp10 = load_descriptor(
    desc_files=desc_files_disp10_diam, configs=configs)
y_true.extend([227] * len(structure_list_diam_disp10))

desc_files_disp10_bcc = [os.path.join(configs['io']['desc_folder'], fname) for fname in filenames_disp10_bcc]
target_list_bcc_disp10, structure_list_bcc_disp10 = load_descriptor(
    desc_files=desc_files_disp10_bcc, configs=configs)
y_true.extend([229] * len(structure_list_bcc_disp10))

In [6]:
# Merge the per-prototype lists for the 10% displacement set. The order (hcp, sc, fcc,
# diam, bcc) must match the order in which y_true was filled in the loading cell.
structure_list_disp10 = (structure_list_hcp_disp10 + structure_list_sc_disp10 + structure_list_fcc_disp10
                         + structure_list_diam_disp10 + structure_list_bcc_disp10)
target_list_disp10 = (target_list_hcp_disp10 + target_list_sc_disp10 + target_list_fcc_disp10
                      + target_list_diam_disp10 + target_list_bcc_disp10)

# y_true is a notebook-level name that every displacement section rebuilds; check that
# the list currently in scope belongs to this section before labelling with it.
assert len(y_true) == len(target_list_disp10) == len(structure_list_disp10), (
    "y_true does not match the 10% displacement lists; re-run the loading cell of this section")

# Store the space-group label of the pristine structure on each descriptor entry ...
for item, label in zip(target_list_disp10, y_true):
    item['data'][0]['target'] = label

# ... and on each structure.
for structure, label in zip(structure_list_disp10, y_true):
    structure.info['target'] = label

In [7]:
# make dataset

# Write the dataset for the 10% displacement structures; the returned paths point to the
# descriptor array, the label array and the json summary.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp10,
    target_list=target_list_disp10,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-10%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 10% displacement")

# Load the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_disp10, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-10%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per class label
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

### 2.10 Displacements 12.0%

In [10]:
# Descriptor archives for 12% displacement: five rotations (rotid0 ... rotid4) per prototype,
# given relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp12_hcp = ['hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid{}_disp12.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# sc - spacegroup 221
filenames_disp12_sc = ['sc/disp/A_cP1_221_a_target_nb_atoms128_rotid{}_disp12.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# fcc - spacegroup 225
filenames_disp12_fcc = ['fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid{}_disp12.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# diam - spacegroup 227
filenames_disp12_diam = ['diam/disp/A_cF8_227_a_target_nb_atoms128_rotid{}_disp12.tar.gz'.format(rot_id)
                         for rot_id in range(5)]
# bcc - spacegroup 229
filenames_disp12_bcc = ['bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid{}_disp12.tar.gz'.format(rot_id)
                        for rot_id in range(5)]

# Prefix every archive name with the descriptor folder taken from the configuration.
desc_files_disp12_hcp = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp12_hcp]
desc_files_disp12_sc = [os.path.join(configs['io']['desc_folder'], name)
                        for name in filenames_disp12_sc]
desc_files_disp12_fcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp12_fcc]
desc_files_disp12_diam = [os.path.join(configs['io']['desc_folder'], name)
                          for name in filenames_disp12_diam]
desc_files_disp12_bcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp12_bcc]

# Load each prototype in turn and extend y_true with one space-group number per loaded
# structure; the order (hcp, sc, fcc, diam, bcc) must match the concatenation done later.
y_true = []

target_list_hcp_disp12, structure_list_hcp_disp12 = load_descriptor(
    desc_files=desc_files_disp12_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_disp12)

target_list_sc_disp12, structure_list_sc_disp12 = load_descriptor(
    desc_files=desc_files_disp12_sc, configs=configs)
y_true += [221] * len(structure_list_sc_disp12)

target_list_fcc_disp12, structure_list_fcc_disp12 = load_descriptor(
    desc_files=desc_files_disp12_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_disp12)

target_list_diam_disp12, structure_list_diam_disp12 = load_descriptor(
    desc_files=desc_files_disp12_diam, configs=configs)
y_true += [227] * len(structure_list_diam_disp12)

target_list_bcc_disp12, structure_list_bcc_disp12 = load_descriptor(
    desc_files=desc_files_disp12_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_disp12)

In [11]:
# read structures
# Concatenate the per-prototype lists in the order in which y_true was built
# (hcp, sc, fcc, diam, bcc), so that entry i of y_true labels entry i of these lists.
structure_list_disp12 = (structure_list_hcp_disp12 + structure_list_sc_disp12 + structure_list_fcc_disp12
                         + structure_list_diam_disp12 + structure_list_bcc_disp12)
target_list_disp12 = (target_list_hcp_disp12 + target_list_sc_disp12 + target_list_fcc_disp12
                      + target_list_diam_disp12 + target_list_bcc_disp12)

# y_true is a notebook-global that every section rebuilds: fail loudly if it does not belong
# to these lists (e.g. cells run out of order) instead of attaching labels by position blindly.
if not (len(y_true) == len(structure_list_disp12) == len(target_list_disp12)):
    raise ValueError(
        "disp12: {} labels, {} structures and {} targets do not match; "
        "re-run the cell that loads the disp12 descriptors.".format(
            len(y_true), len(structure_list_disp12), len(target_list_disp12)))

# Store the space-group number of the pristine structure as the classification target.
for item, label in zip(target_list_disp12, y_true):
    item['data'][0]['target'] = label

for structure, label in zip(structure_list_disp12, y_true):
    structure.info['target'] = label

# Optional cross-check (disabled): space-group numbers as determined by ASE
#y_pred_disp12 = []
#for structure in structure_list_disp12:
#    y_pred_disp12.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [12]:
# make dataset

# Write the dataset for the 12% displacement structures; the returned paths point to the
# descriptor array, the label array and the json summary.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp12,
    target_list=target_list_disp12,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-12%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 12% displacement")

# Load the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_disp12, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-12%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per class label
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

### 2.11 Displacements 20.0%

In [6]:
# Descriptor archives for 20% displacement: five rotations (rotid0 ... rotid4) per prototype,
# given relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp20_hcp = ['hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid{}_disp20.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# sc - spacegroup 221
filenames_disp20_sc = ['sc/disp/A_cP1_221_a_target_nb_atoms128_rotid{}_disp20.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# fcc - spacegroup 225
filenames_disp20_fcc = ['fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid{}_disp20.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# diam - spacegroup 227
filenames_disp20_diam = ['diam/disp/A_cF8_227_a_target_nb_atoms128_rotid{}_disp20.tar.gz'.format(rot_id)
                         for rot_id in range(5)]
# bcc - spacegroup 229
filenames_disp20_bcc = ['bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid{}_disp20.tar.gz'.format(rot_id)
                        for rot_id in range(5)]

# Prefix every archive name with the descriptor folder taken from the configuration.
desc_files_disp20_hcp = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp20_hcp]
desc_files_disp20_sc = [os.path.join(configs['io']['desc_folder'], name)
                        for name in filenames_disp20_sc]
desc_files_disp20_fcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp20_fcc]
desc_files_disp20_diam = [os.path.join(configs['io']['desc_folder'], name)
                          for name in filenames_disp20_diam]
desc_files_disp20_bcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp20_bcc]

# Load each prototype in turn and extend y_true with one space-group number per loaded
# structure; the order (hcp, sc, fcc, diam, bcc) must match the concatenation done later.
y_true = []

target_list_hcp_disp20, structure_list_hcp_disp20 = load_descriptor(
    desc_files=desc_files_disp20_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_disp20)

target_list_sc_disp20, structure_list_sc_disp20 = load_descriptor(
    desc_files=desc_files_disp20_sc, configs=configs)
y_true += [221] * len(structure_list_sc_disp20)

target_list_fcc_disp20, structure_list_fcc_disp20 = load_descriptor(
    desc_files=desc_files_disp20_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_disp20)

target_list_diam_disp20, structure_list_diam_disp20 = load_descriptor(
    desc_files=desc_files_disp20_diam, configs=configs)
y_true += [227] * len(structure_list_diam_disp20)

target_list_bcc_disp20, structure_list_bcc_disp20 = load_descriptor(
    desc_files=desc_files_disp20_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_disp20)

In [7]:
# read structures
# Concatenate the per-prototype lists in the order in which y_true was built
# (hcp, sc, fcc, diam, bcc), so that entry i of y_true labels entry i of these lists.
structure_list_disp20 = (structure_list_hcp_disp20 + structure_list_sc_disp20 + structure_list_fcc_disp20
                         + structure_list_diam_disp20 + structure_list_bcc_disp20)
target_list_disp20 = (target_list_hcp_disp20 + target_list_sc_disp20 + target_list_fcc_disp20
                      + target_list_diam_disp20 + target_list_bcc_disp20)

# y_true is a notebook-global that every section rebuilds: fail loudly if it does not belong
# to these lists (e.g. cells run out of order) instead of attaching labels by position blindly.
if not (len(y_true) == len(structure_list_disp20) == len(target_list_disp20)):
    raise ValueError(
        "disp20: {} labels, {} structures and {} targets do not match; "
        "re-run the cell that loads the disp20 descriptors.".format(
            len(y_true), len(structure_list_disp20), len(target_list_disp20)))

# Store the space-group number of the pristine structure as the classification target.
for item, label in zip(target_list_disp20, y_true):
    item['data'][0]['target'] = label

for structure, label in zip(structure_list_disp20, y_true):
    structure.info['target'] = label

# Optional cross-check (disabled): space-group numbers as determined by ASE
#y_pred_disp20 = []
#for structure in structure_list_disp20:
#    y_pred_disp20.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [8]:
# make dataset

# Write the dataset for the 20% displacement structures; the returned paths point to the
# descriptor array, the label array and the json summary.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp20,
    target_list=target_list_disp20,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-20%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 20% displacement")

# Load the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_disp20, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-20%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per class label
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

### 2.12 Displacements 30.0%

In [8]:
# Descriptor archives for 30% displacement: five rotations (rotid0 ... rotid4) per prototype,
# given relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp30_hcp = ['hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid{}_disp30.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# sc - spacegroup 221
filenames_disp30_sc = ['sc/disp/A_cP1_221_a_target_nb_atoms128_rotid{}_disp30.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# fcc - spacegroup 225
filenames_disp30_fcc = ['fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid{}_disp30.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# diam - spacegroup 227
filenames_disp30_diam = ['diam/disp/A_cF8_227_a_target_nb_atoms128_rotid{}_disp30.tar.gz'.format(rot_id)
                         for rot_id in range(5)]
# bcc - spacegroup 229
filenames_disp30_bcc = ['bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid{}_disp30.tar.gz'.format(rot_id)
                        for rot_id in range(5)]

# Prefix every archive name with the descriptor folder taken from the configuration.
desc_files_disp30_hcp = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp30_hcp]
desc_files_disp30_sc = [os.path.join(configs['io']['desc_folder'], name)
                        for name in filenames_disp30_sc]
desc_files_disp30_fcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp30_fcc]
desc_files_disp30_diam = [os.path.join(configs['io']['desc_folder'], name)
                          for name in filenames_disp30_diam]
desc_files_disp30_bcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp30_bcc]

# Load each prototype in turn and extend y_true with one space-group number per loaded
# structure; the order (hcp, sc, fcc, diam, bcc) must match the concatenation done later.
y_true = []

target_list_hcp_disp30, structure_list_hcp_disp30 = load_descriptor(
    desc_files=desc_files_disp30_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_disp30)

target_list_sc_disp30, structure_list_sc_disp30 = load_descriptor(
    desc_files=desc_files_disp30_sc, configs=configs)
y_true += [221] * len(structure_list_sc_disp30)

target_list_fcc_disp30, structure_list_fcc_disp30 = load_descriptor(
    desc_files=desc_files_disp30_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_disp30)

target_list_diam_disp30, structure_list_diam_disp30 = load_descriptor(
    desc_files=desc_files_disp30_diam, configs=configs)
y_true += [227] * len(structure_list_diam_disp30)

target_list_bcc_disp30, structure_list_bcc_disp30 = load_descriptor(
    desc_files=desc_files_disp30_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_disp30)

In [9]:
# read structures
# Concatenate the per-prototype lists in the order in which y_true was built
# (hcp, sc, fcc, diam, bcc), so that entry i of y_true labels entry i of these lists.
structure_list_disp30 = (structure_list_hcp_disp30 + structure_list_sc_disp30 + structure_list_fcc_disp30
                         + structure_list_diam_disp30 + structure_list_bcc_disp30)
target_list_disp30 = (target_list_hcp_disp30 + target_list_sc_disp30 + target_list_fcc_disp30
                      + target_list_diam_disp30 + target_list_bcc_disp30)

# y_true is a notebook-global that every section rebuilds: fail loudly if it does not belong
# to these lists (e.g. cells run out of order) instead of attaching labels by position blindly.
if not (len(y_true) == len(structure_list_disp30) == len(target_list_disp30)):
    raise ValueError(
        "disp30: {} labels, {} structures and {} targets do not match; "
        "re-run the cell that loads the disp30 descriptors.".format(
            len(y_true), len(structure_list_disp30), len(target_list_disp30)))

# Store the space-group number of the pristine structure as the classification target.
for item, label in zip(target_list_disp30, y_true):
    item['data'][0]['target'] = label

for structure, label in zip(structure_list_disp30, y_true):
    structure.info['target'] = label

# Optional cross-check (disabled): space-group numbers as determined by ASE
#y_pred_disp30 = []
#for structure in structure_list_disp30:
#    y_pred_disp30.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [10]:
# make dataset

# Write the dataset for the 30% displacement structures; the returned paths point to the
# descriptor array, the label array and the json summary.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp30,
    target_list=target_list_disp30,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-30%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 30% displacement")

# Load the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_disp30, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-30%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per class label
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

### 2.13 Displacements 50.0%

In [3]:
# Descriptor archives for 50% displacement: five rotations (rotid0 ... rotid4) per prototype,
# given relative to the descriptor folder.

# hcp - spacegroup 194
filenames_disp50_hcp = ['hcp/disp/A_hP2_194_c_target_nb_atoms128_rotid{}_disp50.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# sc - spacegroup 221
filenames_disp50_sc = ['sc/disp/A_cP1_221_a_target_nb_atoms128_rotid{}_disp50.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# fcc - spacegroup 225
filenames_disp50_fcc = ['fcc/disp/A_cF4_225_a_target_nb_atoms128_rotid{}_disp50.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# diam - spacegroup 227
filenames_disp50_diam = ['diam/disp/A_cF8_227_a_target_nb_atoms128_rotid{}_disp50.tar.gz'.format(rot_id)
                         for rot_id in range(5)]
# bcc - spacegroup 229
filenames_disp50_bcc = ['bcc/disp/A_cI2_229_a_target_nb_atoms128_rotid{}_disp50.tar.gz'.format(rot_id)
                        for rot_id in range(5)]

# Prefix every archive name with the descriptor folder taken from the configuration.
desc_files_disp50_hcp = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp50_hcp]
desc_files_disp50_sc = [os.path.join(configs['io']['desc_folder'], name)
                        for name in filenames_disp50_sc]
desc_files_disp50_fcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp50_fcc]
desc_files_disp50_diam = [os.path.join(configs['io']['desc_folder'], name)
                          for name in filenames_disp50_diam]
desc_files_disp50_bcc = [os.path.join(configs['io']['desc_folder'], name)
                         for name in filenames_disp50_bcc]

# Load each prototype in turn and extend y_true with one space-group number per loaded
# structure; the order (hcp, sc, fcc, diam, bcc) must match the concatenation done later.
y_true = []

target_list_hcp_disp50, structure_list_hcp_disp50 = load_descriptor(
    desc_files=desc_files_disp50_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_disp50)

target_list_sc_disp50, structure_list_sc_disp50 = load_descriptor(
    desc_files=desc_files_disp50_sc, configs=configs)
y_true += [221] * len(structure_list_sc_disp50)

target_list_fcc_disp50, structure_list_fcc_disp50 = load_descriptor(
    desc_files=desc_files_disp50_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_disp50)

target_list_diam_disp50, structure_list_diam_disp50 = load_descriptor(
    desc_files=desc_files_disp50_diam, configs=configs)
y_true += [227] * len(structure_list_diam_disp50)

target_list_bcc_disp50, structure_list_bcc_disp50 = load_descriptor(
    desc_files=desc_files_disp50_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_disp50)

In [4]:
# read structures
# Concatenate the per-prototype lists in the order in which y_true was built
# (hcp, sc, fcc, diam, bcc), so that entry i of y_true labels entry i of these lists.
structure_list_disp50 = (structure_list_hcp_disp50 + structure_list_sc_disp50 + structure_list_fcc_disp50
                         + structure_list_diam_disp50 + structure_list_bcc_disp50)
target_list_disp50 = (target_list_hcp_disp50 + target_list_sc_disp50 + target_list_fcc_disp50
                      + target_list_diam_disp50 + target_list_bcc_disp50)

# y_true is a notebook-global that every section rebuilds: fail loudly if it does not belong
# to these lists (e.g. cells run out of order) instead of attaching labels by position blindly.
if not (len(y_true) == len(structure_list_disp50) == len(target_list_disp50)):
    raise ValueError(
        "disp50: {} labels, {} structures and {} targets do not match; "
        "re-run the cell that loads the disp50 descriptors.".format(
            len(y_true), len(structure_list_disp50), len(target_list_disp50)))

# Store the space-group number of the pristine structure as the classification target.
for item, label in zip(target_list_disp50, y_true):
    item['data'][0]['target'] = label

for structure, label in zip(structure_list_disp50, y_true):
    structure.info['target'] = label

# Optional cross-check (disabled): space-group numbers as determined by ASE
#y_pred_disp50 = []
#for structure in structure_list_disp50:
#    y_pred_disp50.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [9]:
# make dataset

# Write the dataset for the 50% displacement structures; the returned paths point to the
# descriptor array, the label array and the json summary.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_disp50,
    target_list=target_list_disp50,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_displacement-50%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 50% displacement")

# Load the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_disp50, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_displacement-50%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per class label
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

## 3. Structures with vacancies

### 3.1 Vacancies 1.0%

In [None]:
# Descriptor archives for 1% vacancies: five rotations (rotid0 ... rotid4) per prototype,
# given relative to the descriptor folder.

# hcp - spacegroup 194
filenames_vac1_hcp = ['hcp/vac/A_hP2_194_c_target_nb_atoms128_rotid{}_vac01.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# sc - spacegroup 221
filenames_vac1_sc = ['sc/vac/A_cP1_221_a_target_nb_atoms128_rotid{}_vac01.tar.gz'.format(rot_id)
                     for rot_id in range(5)]
# fcc - spacegroup 225
filenames_vac1_fcc = ['fcc/vac/A_cF4_225_a_target_nb_atoms128_rotid{}_vac01.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# diam - spacegroup 227
filenames_vac1_diam = ['diam/vac/A_cF8_227_a_target_nb_atoms128_rotid{}_vac01.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# bcc - spacegroup 229
filenames_vac1_bcc = ['bcc/vac/A_cI2_229_a_target_nb_atoms128_rotid{}_vac01.tar.gz'.format(rot_id)
                      for rot_id in range(5)]

# Prefix every archive name with the descriptor folder taken from the configuration.
desc_files_vac1_hcp = [os.path.join(configs['io']['desc_folder'], name)
                       for name in filenames_vac1_hcp]
desc_files_vac1_sc = [os.path.join(configs['io']['desc_folder'], name)
                      for name in filenames_vac1_sc]
desc_files_vac1_fcc = [os.path.join(configs['io']['desc_folder'], name)
                       for name in filenames_vac1_fcc]
desc_files_vac1_diam = [os.path.join(configs['io']['desc_folder'], name)
                        for name in filenames_vac1_diam]
desc_files_vac1_bcc = [os.path.join(configs['io']['desc_folder'], name)
                       for name in filenames_vac1_bcc]

# Load each prototype in turn and extend y_true with one space-group number per loaded
# structure; the order (hcp, sc, fcc, diam, bcc) must match the concatenation done later.
y_true = []

target_list_hcp_vac1, structure_list_hcp_vac1 = load_descriptor(
    desc_files=desc_files_vac1_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_vac1)

target_list_sc_vac1, structure_list_sc_vac1 = load_descriptor(
    desc_files=desc_files_vac1_sc, configs=configs)
y_true += [221] * len(structure_list_sc_vac1)

target_list_fcc_vac1, structure_list_fcc_vac1 = load_descriptor(
    desc_files=desc_files_vac1_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_vac1)

target_list_diam_vac1, structure_list_diam_vac1 = load_descriptor(
    desc_files=desc_files_vac1_diam, configs=configs)
y_true += [227] * len(structure_list_diam_vac1)

target_list_bcc_vac1, structure_list_bcc_vac1 = load_descriptor(
    desc_files=desc_files_vac1_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_vac1)

In [None]:
# read structure list
# Concatenate the per-prototype lists in the order in which y_true was built
# (hcp, sc, fcc, diam, bcc), so that entry i of y_true labels entry i of these lists.
structure_list_vac1 = (structure_list_hcp_vac1 + structure_list_sc_vac1 + structure_list_fcc_vac1
                       + structure_list_diam_vac1 + structure_list_bcc_vac1)
target_list_vac1 = (target_list_hcp_vac1 + target_list_sc_vac1 + target_list_fcc_vac1
                    + target_list_diam_vac1 + target_list_bcc_vac1)

# y_true is a notebook-global that every section rebuilds: fail loudly if it does not belong
# to these lists (e.g. cells run out of order) instead of attaching labels by position blindly.
if not (len(y_true) == len(structure_list_vac1) == len(target_list_vac1)):
    raise ValueError(
        "vac1: {} labels, {} structures and {} targets do not match; "
        "re-run the cell that loads the vac1 descriptors.".format(
            len(y_true), len(structure_list_vac1), len(target_list_vac1)))

# Store the space-group number of the pristine structure as the classification target.
for item, label in zip(target_list_vac1, y_true):
    item['data'][0]['target'] = label

for structure, label in zip(structure_list_vac1, y_true):
    structure.info['target'] = label

# Optional cross-check (disabled): space-group numbers as determined by ASE
#y_pred_vac1 = []
#for structure in structure_list_vac1:
#    y_pred_vac1.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# make dataset

# Write the dataset for the 1% vacancies structures; the returned paths point to the
# descriptor array, the label array and the json summary.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_vac1,
    target_list=target_list_vac1,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_vacancies-1%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="Hcp, sc, fcc, diam and bcc structures with 1% vacancies")

# Load the dataset back from the files that were just written.
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_vac1, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_vacancies-1%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per class label
Counter(y)

### 3.2 Vacancies 2.0%

In [None]:
# Descriptor archives for the 2% vacancy datasets.
# Every prototype has five archives (rotid0 ... rotid4); the names are generated
# instead of typed out so that the vacancy tag appears once per prototype.
# Paths are relative to configs['io']['desc_folder'].

# hcp - spacegroup 194
filenames_vac2_hcp = ['hcp/vac/A_hP2_194_c_target_nb_atoms128_rotid{}_vac02.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# sc - spacegroup 221
filenames_vac2_sc = ['sc/vac/A_cP1_221_a_target_nb_atoms128_rotid{}_vac02.tar.gz'.format(rot_id)
                     for rot_id in range(5)]
# fcc - spacegroup 225
filenames_vac2_fcc = ['fcc/vac/A_cF4_225_a_target_nb_atoms128_rotid{}_vac02.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# diam - spacegroup 227
filenames_vac2_diam = ['diam/vac/A_cF8_227_a_target_nb_atoms128_rotid{}_vac02.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# bcc - spacegroup 229
filenames_vac2_bcc = ['bcc/vac/A_cI2_229_a_target_nb_atoms128_rotid{}_vac02.tar.gz'.format(rot_id)
                      for rot_id in range(5)]

# prepend the descriptor folder to every archive name
desc_files_vac2_hcp = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac2_hcp]
desc_files_vac2_sc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac2_sc]
desc_files_vac2_fcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac2_fcc]
desc_files_vac2_diam = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac2_diam]
desc_files_vac2_bcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac2_bcc]

# Load the prototypes one after the other and append one space-group label per
# loaded structure, so y_true is ordered hcp, sc, fcc, diam, bcc.
y_true = []
target_list_hcp_vac2, structure_list_hcp_vac2 = load_descriptor(desc_files=desc_files_vac2_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_vac2)

target_list_sc_vac2, structure_list_sc_vac2 = load_descriptor(desc_files=desc_files_vac2_sc, configs=configs)
y_true += [221] * len(structure_list_sc_vac2)

target_list_fcc_vac2, structure_list_fcc_vac2 = load_descriptor(desc_files=desc_files_vac2_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_vac2)

target_list_diam_vac2, structure_list_diam_vac2 = load_descriptor(desc_files=desc_files_vac2_diam, configs=configs)
y_true += [227] * len(structure_list_diam_vac2)

target_list_bcc_vac2, structure_list_bcc_vac2 = load_descriptor(desc_files=desc_files_vac2_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_vac2)

In [None]:
# Merge the per-prototype lists (hcp, sc, fcc, diam, bcc) for 2% vacancies.
# Labels are assigned by position below, so this order has to match the order
# in which y_true was built.
structure_list_vac2 = (structure_list_hcp_vac2 + structure_list_sc_vac2 + structure_list_fcc_vac2
                       + structure_list_diam_vac2 + structure_list_bcc_vac2)
target_list_vac2 = (target_list_hcp_vac2 + target_list_sc_vac2 + target_list_fcc_vac2
                    + target_list_diam_vac2 + target_list_bcc_vac2)

# store the label on every descriptor entry ...
for position, entry in enumerate(target_list_vac2):
    entry['data'][0]['target'] = y_true[position]

# ... and on every structure
for position, struct in enumerate(structure_list_vac2):
    struct.info['target'] = y_true[position]

# Disabled cross-check: space group number that ASE detects for each structure.
#y_pred_vac2 = []
#for structure in structure_list_vac2:
#    y_pred_vac2.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# make dataset
# prepare_dataset is given the merged structures/targets for 2% vacancies and
# returns the paths of the descriptor array, the label array and the summary file.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_vac2,
    target_list=target_list_vac2,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_vacancies-2%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth prototype is bcc (the original note listed "sc" twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 2% vacancies")

# read the dataset back from the files that were just written
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_vac2, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_vacancies-2%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per label value in the reloaded dataset
Counter(y)

### 3.3 Vacancies 5%

In [None]:
# Descriptor archives for the 5% vacancy datasets.
# Every prototype has five archives (rotid0 ... rotid4); the names are generated
# instead of typed out so that the vacancy tag appears once per prototype.
# Paths are relative to configs['io']['desc_folder'].

# hcp - spacegroup 194
filenames_vac5_hcp = ['hcp/vac/A_hP2_194_c_target_nb_atoms128_rotid{}_vac05.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# sc - spacegroup 221
filenames_vac5_sc = ['sc/vac/A_cP1_221_a_target_nb_atoms128_rotid{}_vac05.tar.gz'.format(rot_id)
                     for rot_id in range(5)]
# fcc - spacegroup 225
filenames_vac5_fcc = ['fcc/vac/A_cF4_225_a_target_nb_atoms128_rotid{}_vac05.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# diam - spacegroup 227
filenames_vac5_diam = ['diam/vac/A_cF8_227_a_target_nb_atoms128_rotid{}_vac05.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# bcc - spacegroup 229
filenames_vac5_bcc = ['bcc/vac/A_cI2_229_a_target_nb_atoms128_rotid{}_vac05.tar.gz'.format(rot_id)
                      for rot_id in range(5)]

# prepend the descriptor folder to every archive name
desc_files_vac5_hcp = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac5_hcp]
desc_files_vac5_sc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac5_sc]
desc_files_vac5_fcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac5_fcc]
desc_files_vac5_diam = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac5_diam]
desc_files_vac5_bcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac5_bcc]

# Load the prototypes one after the other and append one space-group label per
# loaded structure, so y_true is ordered hcp, sc, fcc, diam, bcc.
y_true = []
target_list_hcp_vac5, structure_list_hcp_vac5 = load_descriptor(desc_files=desc_files_vac5_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_vac5)

target_list_sc_vac5, structure_list_sc_vac5 = load_descriptor(desc_files=desc_files_vac5_sc, configs=configs)
y_true += [221] * len(structure_list_sc_vac5)

target_list_fcc_vac5, structure_list_fcc_vac5 = load_descriptor(desc_files=desc_files_vac5_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_vac5)

target_list_diam_vac5, structure_list_diam_vac5 = load_descriptor(desc_files=desc_files_vac5_diam, configs=configs)
y_true += [227] * len(structure_list_diam_vac5)

target_list_bcc_vac5, structure_list_bcc_vac5 = load_descriptor(desc_files=desc_files_vac5_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_vac5)

In [None]:
# Merge the per-prototype lists (hcp, sc, fcc, diam, bcc) for 5% vacancies.
# Labels are assigned by position below, so this order has to match the order
# in which y_true was built.
structure_list_vac5 = (structure_list_hcp_vac5 + structure_list_sc_vac5 + structure_list_fcc_vac5
                       + structure_list_diam_vac5 + structure_list_bcc_vac5)
target_list_vac5 = (target_list_hcp_vac5 + target_list_sc_vac5 + target_list_fcc_vac5
                    + target_list_diam_vac5 + target_list_bcc_vac5)

# store the label on every descriptor entry ...
for position, entry in enumerate(target_list_vac5):
    entry['data'][0]['target'] = y_true[position]

# ... and on every structure
for position, struct in enumerate(structure_list_vac5):
    struct.info['target'] = y_true[position]

# Disabled cross-check: space group number that ASE detects for each structure.
#y_pred_vac5 = []
#for structure in structure_list_vac5:
#    y_pred_vac5.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# make dataset
# prepare_dataset is given the merged structures/targets for 5% vacancies and
# returns the paths of the descriptor array, the label array and the summary file.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_vac5,
    target_list=target_list_vac5,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_vacancies-5%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth prototype is bcc (the original note listed "sc" twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 5% vacancies")

# read the dataset back from the files that were just written
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_vac5, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_vacancies-5%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per label value in the reloaded dataset
Counter(y)

### 3.4 Vacancies 10%

In [None]:
# Descriptor archives for the 10% vacancy datasets.
# Every prototype has five archives (rotid0 ... rotid4); the names are generated
# instead of typed out so that the vacancy tag appears once per prototype.
# Paths are relative to configs['io']['desc_folder'].

# hcp - spacegroup 194
filenames_vac10_hcp = ['hcp/vac/A_hP2_194_c_target_nb_atoms128_rotid{}_vac10.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# sc - spacegroup 221
filenames_vac10_sc = ['sc/vac/A_cP1_221_a_target_nb_atoms128_rotid{}_vac10.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# fcc - spacegroup 225
filenames_vac10_fcc = ['fcc/vac/A_cF4_225_a_target_nb_atoms128_rotid{}_vac10.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# diam - spacegroup 227
filenames_vac10_diam = ['diam/vac/A_cF8_227_a_target_nb_atoms128_rotid{}_vac10.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# bcc - spacegroup 229
filenames_vac10_bcc = ['bcc/vac/A_cI2_229_a_target_nb_atoms128_rotid{}_vac10.tar.gz'.format(rot_id)
                       for rot_id in range(5)]

# prepend the descriptor folder to every archive name
desc_files_vac10_hcp = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac10_hcp]
desc_files_vac10_sc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac10_sc]
desc_files_vac10_fcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac10_fcc]
desc_files_vac10_diam = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac10_diam]
desc_files_vac10_bcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac10_bcc]

# Load the prototypes one after the other and append one space-group label per
# loaded structure, so y_true is ordered hcp, sc, fcc, diam, bcc.
y_true = []
target_list_hcp_vac10, structure_list_hcp_vac10 = load_descriptor(desc_files=desc_files_vac10_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_vac10)

target_list_sc_vac10, structure_list_sc_vac10 = load_descriptor(desc_files=desc_files_vac10_sc, configs=configs)
y_true += [221] * len(structure_list_sc_vac10)

target_list_fcc_vac10, structure_list_fcc_vac10 = load_descriptor(desc_files=desc_files_vac10_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_vac10)

target_list_diam_vac10, structure_list_diam_vac10 = load_descriptor(desc_files=desc_files_vac10_diam, configs=configs)
y_true += [227] * len(structure_list_diam_vac10)

target_list_bcc_vac10, structure_list_bcc_vac10 = load_descriptor(desc_files=desc_files_vac10_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_vac10)

In [None]:
# Merge the per-prototype lists (hcp, sc, fcc, diam, bcc) for 10% vacancies.
# Labels are assigned by position below, so this order has to match the order
# in which y_true was built.
structure_list_vac10 = (structure_list_hcp_vac10 + structure_list_sc_vac10 + structure_list_fcc_vac10
                        + structure_list_diam_vac10 + structure_list_bcc_vac10)
target_list_vac10 = (target_list_hcp_vac10 + target_list_sc_vac10 + target_list_fcc_vac10
                     + target_list_diam_vac10 + target_list_bcc_vac10)

# store the label on every descriptor entry ...
for position, entry in enumerate(target_list_vac10):
    entry['data'][0]['target'] = y_true[position]

# ... and on every structure
for position, struct in enumerate(structure_list_vac10):
    struct.info['target'] = y_true[position]

# Disabled cross-check: space group number that ASE detects for each structure.
#y_pred_vac10 = []
#for structure in structure_list_vac10:
#    y_pred_vac10.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# make dataset
# prepare_dataset is given the merged structures/targets for 10% vacancies and
# returns the paths of the descriptor array, the label array and the summary file.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_vac10,
    target_list=target_list_vac10,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_vacancies-10%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth prototype is bcc (the original note listed "sc" twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 10% vacancies")

# read the dataset back from the files that were just written
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_vac10, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_vacancies-10%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per label value in the reloaded dataset
Counter(y)

### 3.5 Vacancies 20%

In [7]:
# Descriptor archives for the 20% vacancy datasets.
# Every prototype has five archives (rotid0 ... rotid4); the names are generated
# instead of typed out so that the vacancy tag appears once per prototype.
# Paths are relative to configs['io']['desc_folder'].

# hcp - spacegroup 194
filenames_vac20_hcp = ['hcp/vac/A_hP2_194_c_target_nb_atoms128_rotid{}_vac20.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# sc - spacegroup 221
filenames_vac20_sc = ['sc/vac/A_cP1_221_a_target_nb_atoms128_rotid{}_vac20.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# fcc - spacegroup 225
filenames_vac20_fcc = ['fcc/vac/A_cF4_225_a_target_nb_atoms128_rotid{}_vac20.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# diam - spacegroup 227
filenames_vac20_diam = ['diam/vac/A_cF8_227_a_target_nb_atoms128_rotid{}_vac20.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# bcc - spacegroup 229
filenames_vac20_bcc = ['bcc/vac/A_cI2_229_a_target_nb_atoms128_rotid{}_vac20.tar.gz'.format(rot_id)
                       for rot_id in range(5)]

# prepend the descriptor folder to every archive name
desc_files_vac20_hcp = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac20_hcp]
desc_files_vac20_sc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac20_sc]
desc_files_vac20_fcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac20_fcc]
desc_files_vac20_diam = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac20_diam]
desc_files_vac20_bcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac20_bcc]

# Load the prototypes one after the other and append one space-group label per
# loaded structure, so y_true is ordered hcp, sc, fcc, diam, bcc.
y_true = []
target_list_hcp_vac20, structure_list_hcp_vac20 = load_descriptor(desc_files=desc_files_vac20_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_vac20)

target_list_sc_vac20, structure_list_sc_vac20 = load_descriptor(desc_files=desc_files_vac20_sc, configs=configs)
y_true += [221] * len(structure_list_sc_vac20)

target_list_fcc_vac20, structure_list_fcc_vac20 = load_descriptor(desc_files=desc_files_vac20_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_vac20)

target_list_diam_vac20, structure_list_diam_vac20 = load_descriptor(desc_files=desc_files_vac20_diam, configs=configs)
y_true += [227] * len(structure_list_diam_vac20)

target_list_bcc_vac20, structure_list_bcc_vac20 = load_descriptor(desc_files=desc_files_vac20_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_vac20)

In [8]:
# Merge the per-prototype lists (hcp, sc, fcc, diam, bcc) for 20% vacancies.
# Labels are assigned by position below, so this order has to match the order
# in which y_true was built.
structure_list_vac20 = (structure_list_hcp_vac20 + structure_list_sc_vac20 + structure_list_fcc_vac20
                        + structure_list_diam_vac20 + structure_list_bcc_vac20)
target_list_vac20 = (target_list_hcp_vac20 + target_list_sc_vac20 + target_list_fcc_vac20
                     + target_list_diam_vac20 + target_list_bcc_vac20)

# store the label on every descriptor entry ...
for position, entry in enumerate(target_list_vac20):
    entry['data'][0]['target'] = y_true[position]

# ... and on every structure
for position, struct in enumerate(structure_list_vac20):
    struct.info['target'] = y_true[position]

# Disabled cross-check: space group number that ASE detects for each structure.
#y_pred_vac20 = []
#for structure in structure_list_vac20:
#    y_pred_vac20.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [9]:
# make dataset
# prepare_dataset is given the merged structures/targets for 20% vacancies and
# returns the paths of the descriptor array, the label array and the summary file.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_vac20,
    target_list=target_list_vac20,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_vacancies-20%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth prototype is bcc (the original note listed "sc" twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 20% vacancies")

# read the dataset back from the files that were just written
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_vac20, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_vacancies-20%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per label value in the reloaded dataset
Counter(y)

Counter({0: 4500, 1: 900, 2: 900, 3: 900, 4: 900})

### 3.6 Vacancies 25%

In [None]:
# Descriptor archives for the 25% vacancy datasets.
# Every prototype has five archives (rotid0 ... rotid4); the names are generated
# instead of typed out so that the vacancy tag appears once per prototype.
# Paths are relative to configs['io']['desc_folder'].

# hcp - spacegroup 194
filenames_vac25_hcp = ['hcp/vac/A_hP2_194_c_target_nb_atoms128_rotid{}_vac25.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# sc - spacegroup 221
filenames_vac25_sc = ['sc/vac/A_cP1_221_a_target_nb_atoms128_rotid{}_vac25.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# fcc - spacegroup 225
filenames_vac25_fcc = ['fcc/vac/A_cF4_225_a_target_nb_atoms128_rotid{}_vac25.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# diam - spacegroup 227
filenames_vac25_diam = ['diam/vac/A_cF8_227_a_target_nb_atoms128_rotid{}_vac25.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# bcc - spacegroup 229
filenames_vac25_bcc = ['bcc/vac/A_cI2_229_a_target_nb_atoms128_rotid{}_vac25.tar.gz'.format(rot_id)
                       for rot_id in range(5)]

# prepend the descriptor folder to every archive name
desc_files_vac25_hcp = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac25_hcp]
desc_files_vac25_sc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac25_sc]
desc_files_vac25_fcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac25_fcc]
desc_files_vac25_diam = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac25_diam]
desc_files_vac25_bcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac25_bcc]

# Load the prototypes one after the other and append one space-group label per
# loaded structure, so y_true is ordered hcp, sc, fcc, diam, bcc.
y_true = []
target_list_hcp_vac25, structure_list_hcp_vac25 = load_descriptor(desc_files=desc_files_vac25_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_vac25)

target_list_sc_vac25, structure_list_sc_vac25 = load_descriptor(desc_files=desc_files_vac25_sc, configs=configs)
y_true += [221] * len(structure_list_sc_vac25)

target_list_fcc_vac25, structure_list_fcc_vac25 = load_descriptor(desc_files=desc_files_vac25_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_vac25)

target_list_diam_vac25, structure_list_diam_vac25 = load_descriptor(desc_files=desc_files_vac25_diam, configs=configs)
y_true += [227] * len(structure_list_diam_vac25)

target_list_bcc_vac25, structure_list_bcc_vac25 = load_descriptor(desc_files=desc_files_vac25_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_vac25)

In [None]:
# Merge the per-prototype lists (hcp, sc, fcc, diam, bcc) for 25% vacancies.
# Labels are assigned by position below, so this order has to match the order
# in which y_true was built.
structure_list_vac25 = (structure_list_hcp_vac25 + structure_list_sc_vac25 + structure_list_fcc_vac25
                        + structure_list_diam_vac25 + structure_list_bcc_vac25)
target_list_vac25 = (target_list_hcp_vac25 + target_list_sc_vac25 + target_list_fcc_vac25
                     + target_list_diam_vac25 + target_list_bcc_vac25)

# store the label on every descriptor entry ...
for position, entry in enumerate(target_list_vac25):
    entry['data'][0]['target'] = y_true[position]

# ... and on every structure
for position, struct in enumerate(structure_list_vac25):
    struct.info['target'] = y_true[position]

# Disabled cross-check: space group number that ASE detects for each structure.
#y_pred_vac25 = []
#for structure in structure_list_vac25:
#    y_pred_vac25.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# make dataset
# prepare_dataset is given the merged structures/targets for 25% vacancies and
# returns the paths of the descriptor array, the label array and the summary file.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_vac25,
    target_list=target_list_vac25,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_vacancies-25%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth prototype is bcc (the original note listed "sc" twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 25% vacancies")

# read the dataset back from the files that were just written
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_vac25, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_vacancies-25%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per label value in the reloaded dataset
Counter(y)

### 3.7 Vacancies 50%

In [None]:
# Descriptor archives for the 50% vacancy datasets.
# Every prototype has five archives (rotid0 ... rotid4); the names are generated
# instead of typed out so that the vacancy tag appears once per prototype.
# Paths are relative to configs['io']['desc_folder'].

# hcp - spacegroup 194
filenames_vac50_hcp = ['hcp/vac/A_hP2_194_c_target_nb_atoms128_rotid{}_vac50.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# sc - spacegroup 221
filenames_vac50_sc = ['sc/vac/A_cP1_221_a_target_nb_atoms128_rotid{}_vac50.tar.gz'.format(rot_id)
                      for rot_id in range(5)]
# fcc - spacegroup 225
filenames_vac50_fcc = ['fcc/vac/A_cF4_225_a_target_nb_atoms128_rotid{}_vac50.tar.gz'.format(rot_id)
                       for rot_id in range(5)]
# diam - spacegroup 227
filenames_vac50_diam = ['diam/vac/A_cF8_227_a_target_nb_atoms128_rotid{}_vac50.tar.gz'.format(rot_id)
                        for rot_id in range(5)]
# bcc - spacegroup 229
filenames_vac50_bcc = ['bcc/vac/A_cI2_229_a_target_nb_atoms128_rotid{}_vac50.tar.gz'.format(rot_id)
                       for rot_id in range(5)]

# prepend the descriptor folder to every archive name
desc_files_vac50_hcp = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac50_hcp]
desc_files_vac50_sc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac50_sc]
desc_files_vac50_fcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac50_fcc]
desc_files_vac50_diam = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac50_diam]
desc_files_vac50_bcc = [os.path.join(configs['io']['desc_folder'], item) for item in filenames_vac50_bcc]

# Load the prototypes one after the other and append one space-group label per
# loaded structure, so y_true is ordered hcp, sc, fcc, diam, bcc.
y_true = []
target_list_hcp_vac50, structure_list_hcp_vac50 = load_descriptor(desc_files=desc_files_vac50_hcp, configs=configs)
y_true += [194] * len(structure_list_hcp_vac50)

target_list_sc_vac50, structure_list_sc_vac50 = load_descriptor(desc_files=desc_files_vac50_sc, configs=configs)
y_true += [221] * len(structure_list_sc_vac50)

target_list_fcc_vac50, structure_list_fcc_vac50 = load_descriptor(desc_files=desc_files_vac50_fcc, configs=configs)
y_true += [225] * len(structure_list_fcc_vac50)

target_list_diam_vac50, structure_list_diam_vac50 = load_descriptor(desc_files=desc_files_vac50_diam, configs=configs)
y_true += [227] * len(structure_list_diam_vac50)

target_list_bcc_vac50, structure_list_bcc_vac50 = load_descriptor(desc_files=desc_files_vac50_bcc, configs=configs)
y_true += [229] * len(structure_list_bcc_vac50)

In [None]:
# Merge the per-prototype lists (hcp, sc, fcc, diam, bcc) for 50% vacancies.
# Labels are assigned by position below, so this order has to match the order
# in which y_true was built.
structure_list_vac50 = (structure_list_hcp_vac50 + structure_list_sc_vac50 + structure_list_fcc_vac50
                        + structure_list_diam_vac50 + structure_list_bcc_vac50)
target_list_vac50 = (target_list_hcp_vac50 + target_list_sc_vac50 + target_list_fcc_vac50
                     + target_list_diam_vac50 + target_list_bcc_vac50)

# store the label on every descriptor entry ...
for position, entry in enumerate(target_list_vac50):
    entry['data'][0]['target'] = y_true[position]

# ... and on every structure
for position, struct in enumerate(structure_list_vac50):
    struct.info['target'] = y_true[position]

# Disabled cross-check: space group number that ASE detects for each structure.
#y_pred_vac50 = []
#for structure in structure_list_vac50:
#    y_pred_vac50.append(ase_get_spacegroup(structure, symprec=1e-1).no)

In [None]:
# make dataset
# prepare_dataset is given the merged structures/targets for 50% vacancies and
# returns the paths of the descriptor array, the label array and the summary file.
path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list_vac50,
    target_list=target_list_vac50,
    desc_metadata='diffraction_3d_sh_spectrum',
    dataset_name='hcp-sc-fcc-diam-bcc_vacancies-50%',
    target_name='target',
    target_categorical=True,
    input_dims=(50, 32),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    # the fifth prototype is bcc (the original note listed "sc" twice)
    notes="Hcp, sc, fcc, diam and bcc structures with 50% vacancies")

# read the dataset back from the files that were just written
x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x, path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

# write ASE database (writing a json database does not work,
# probably because there is a bug in ASE that prevents writing a json database from Jupyter Notebooks)
write_ase_db(structure_list_vac50, main_folder=main_folder, db_name='hcp-sc-fcc-diam-bcc_vacancies-50%', db_type='db',
             overwrite=True, folder_name='db_ase')

# number of samples per label value in the reloaded dataset
Counter(y)