## Notebook for data curation

This notebook generates two datasets in the extended .xyz format: one for the CSD-2k, CSD-1k and CSD-S546 data (CSD-3k+S546_shift_tensors.xyz) and one for the CSD-500 and CSD-S104 data (CSD-500+104-7_shift_tensors.xyz). Each file contains the atom-wise isotropic chemical shifts and shift tensors extracted from the .magres files of the original CSD-X data
(archived at https://archive.materialscloud.org/record/2019.0023/v1).

The CSD-500+104-7_shift_tensors.xyz file lacks 7 structures from the originally reported CSD-500+104.xyz file. Seven structures in the CSD-500+104.xyz file (‘JEKPIZ’, ‘NECFIM’, ‘CUMZAM’, ‘ODEJAJ01’, ‘ITOFEE’, ‘WEPBAV’, ‘FOQREK’) have no corresponding .magres file. These structures were excluded from the tensor-containing .xyz file by taking the intersection of two dicts of {key (CSD identifier) : value (e.g. atoms object)} pairs.

A second file, helpers.py, contains helper functions that build CSD-identifier:property dicts, testing functions, and a modified version of the .magres parser from the ASE library. The modification is necessary because either the .magres parsers of ASE and of http://tfgg.me/magres-format/build/html/index.html are broken, or the .magres files were not written according to the specification.

The problem arises because large isotropic shifts and tensor values are not whitespace-separated, for example:
'm N-6.0351   57.0173   -6.0146   39.7698   98.8410   49.6738   13.0950   46.7860-115.2668'

The not-so-nice fix is to replace each "-" with " -" in the file contents and then restore the few other strings in the .magres file that legitimately contain "-" (the format header, the unit declaration and the calculator version string).

The modified ASE parser takes a modified file object as input instead of a file path.

In [6]:
import numpy as np

In [30]:
def get_macroscopic_tensor(lines):
    """Retrieve the macroscopic shape contribution from a GIPAW output.

    Parameters
    ----------
    lines : sequence of str
        The GIPAW output file as a list of lines (e.g. from ``readlines()``).

    Returns
    -------
    macroscopic_iso : float
        Last whitespace-separated token of the line containing
        "Macroscopic shape contribution in ppm:".
    macroscopic_tensor : numpy.ndarray
        The (up to) three lines following that header parsed as floats,
        one row per line.

    Raises
    ------
    ValueError
        If no line contains the header. Previously this surfaced as an
        accidental ``UnboundLocalError`` at the ``return`` statement.
    """
    marker = "Macroscopic shape contribution in ppm:"
    macroscopic_iso = None
    macroscopic_tensor = None

    # Scan every line; if the header occurs more than once the last one wins.
    for n, line in enumerate(lines):
        if marker in line:
            macroscopic_iso = float(line.split()[-1])
            rows = [row.split() for row in lines[n + 1:n + 4]]
            macroscopic_tensor = np.array(rows, dtype=float)

    if macroscopic_tensor is None:
        raise ValueError("no '%s' line found in GIPAW output" % marker)

    return macroscopic_iso, macroscopic_tensor

def get_contributions(lines, propstring, istens):
    """Collect one shielding contribution from a GIPAW output.

    Every line containing ``propstring`` contributes one entry: its last
    whitespace-separated token is taken as the isotropic value.

    When ``istens`` is exactly ``True`` the three lines after the match are
    parsed as the tensor rows. Otherwise (used for the "core" contribution,
    which has no tensor written) the isotropic value is placed on the
    diagonal of a 3x3 matrix.

    Returns a tuple ``(iso, tensors)``: ``iso`` is an array reshaped to
    ``(-1, 1)`` and ``tensors`` is the stacked array of per-match tensors.
    """
    iso_values = []
    tensors = []

    for index, text in enumerate(lines):
        if propstring not in text:
            continue

        value = float(text.split()[-1])

        if istens is True:
            rows = [row.split() for row in lines[index + 1:index + 4]]
            block = np.array(rows, dtype=float)
        else:
            # No tensor in the file: isotropic value on the diagonal.
            block = np.eye(3) * value

        iso_values.append(value)
        tensors.append(block)

    iso_column = np.array(iso_values).reshape(-1, 1)
    return iso_column, np.array(tensors)

def get_nmr_contributions(lines):
    """Gather the individual contributions of a GIPAW shift calculation.

    Returns two dicts keyed by contribution name: the first maps
    ``"<name>_iso"`` to the isotropic values, the second maps
    ``"<name>_tensor"`` to the tensors, both as returned by
    ``get_contributions``. The macroscopic shape contribution is parsed
    but not stored in either dict.
    """
    # Parsed here as in the original flow; the result is not added to the output.
    iso_makro, tens_makro = get_macroscopic_tensor(lines)

    # (marker text in the GIPAW file, key prefix in the returned dicts)
    contributions = (
        ("core sigma:", "core_sigma"),
        ("para_oo sigma:", "para_oo_sigma"),
        ("para_lq sigma:", "para_lq_sigma"),
        ("para sigma:", "para_sigma"),
        ("dia sigma:", "dia_sigma"),
        ("bare sigma:", "bare_sigma"),
    )

    tot_iso = {}
    tot_tens = {}

    for marker, label in contributions:
        # Only the core contribution has no tensor written in the file.
        has_tensor = marker != "core sigma:"
        iso, tens = get_contributions(lines, marker, has_tensor)
        tot_iso[label + "_iso"] = iso
        tot_tens[label + "_tensor"] = tens

    return tot_iso, tot_tens

In [8]:
import glob

In [9]:
from ase.io import read,write

In [111]:
# Build a {CSD identifier: GIPAW output path} lookup over all datasets.
datasets_v1 = ["../ShiftMLv1_datasets/CSD-2k","../ShiftMLv1_datasets/CSD-500"]
datasets_v11 = ["../ShiftMLv1.1_datasets/CSD-1k","../ShiftMLv1.1_datasets/CSD-S104","../ShiftMLv1.1_datasets/CSD-S546"]

paths = []
for datadir in datasets_v1 + datasets_v11:
    paths += glob.glob(datadir + "/gipaw/*")

gipaw_suffix = ".nmr.out"
pathdict = {}
for path in paths:
    csd_name = path.split("/")[-1]
    # Remove the literal suffix. str.rstrip(".nmr.out") strips any trailing run
    # of the characters ". n m r o u t" and could eat into the identifier.
    if csd_name.endswith(gipaw_suffix):
        csd_name = csd_name[:-len(gipaw_suffix)]
    pathdict[csd_name] = path

data_train = read("../make_tensor_data/train_tensor/CSD-3k+S546_shift_tensors.xyz",":")
data_test = read("../make_tensor_data/test_tensor/CSD-500+104-7_shift_tensors.xyz",":")

# Per-atom arrays that are summed and compared against cs_iso / cs_tensor.
iso_keys = ['core_sigma_iso', 'para_oo_sigma_iso', 'para_lq_sigma_iso', 'para_sigma_iso', 'dia_sigma_iso', 'bare_sigma_iso', 'shape_contribution_iso']
tensor_keys = ['core_sigma_tensor', 'para_oo_sigma_tensor', 'para_lq_sigma_tensor', 'para_sigma_tensor', 'dia_sigma_tensor', 'bare_sigma_tensor', 'shape_contribution_tensor']

check_consistent_iso = []
check_consistent_tensor = []

for data in [data_train,data_test]:
    for frame in data:
        csd_identifier = frame.info["NAME"]
        frame_gipaw_path = pathdict[csd_identifier]

        # Only reading needs the open handle; parse after the file is closed.
        with open(frame_gipaw_path,"r") as f:
            gipaw_file_lines = f.readlines()

        tot_iso, tot_tens = get_nmr_contributions(gipaw_file_lines)
        shape_iso, shape_tens = get_macroscopic_tensor(gipaw_file_lines)

        # The macroscopic term is parsed once per file; replicate it per atom.
        n_atoms = len(frame)
        shape_iso_array = np.array([shape_iso] * n_atoms)
        shape_tens_array = np.array([shape_tens] * n_atoms)

        for key, value in tot_iso.items():
            frame.arrays[key] = value

        # Tensors are stored flattened, one row of 9 values per atom.
        for key, value in tot_tens.items():
            frame.arrays[key] = value.reshape(-1,9)

        frame.arrays["shape_contribution_iso"] = shape_iso_array
        frame.arrays["shape_contribution_tensor"] = shape_tens_array.reshape(-1,9)

        iso_shifts = frame.arrays['cs_iso'].reshape(-1,1)
        tensor_shifts = frame.arrays['cs_tensor']

        int_iso = [frame.arrays[i].reshape(-1,1) for i in iso_keys]
        int_tens = [frame.arrays[i] for i in tensor_keys]

        # The summed contributions must reproduce the stored shifts within 0.035.
        check_consistent_iso.append(np.allclose(iso_shifts,np.sum(int_iso,axis=0),atol=0.035))
        check_consistent_tensor.append(np.allclose(tensor_shifts,np.sum(int_tens,axis=0),atol=0.035))

# Writing the component files is disabled (kept inside a string literal below).
"""
if (False in check_consistent_tensor) is False:
    if (False in check_consistent_iso) is False:
        write("CSD-3k+S546_shift_tensors_components.xyz",data_train,format="extxyz")        
        write("CSD-500+104-7_shift_tensors_components.xyz",data_test,format="extxyz")
"""
#loop through CSD-3K/CSD-500+100 Dataset and get CSD identifier from .info dict
#

In [112]:
# Re-load the component-augmented datasets and repeat the consistency check:
# the sum of all stored contributions must match cs_iso / cs_tensor per frame.
data_train = read("CSD-3k+S546_shift_tensors_components.xyz",":")
data_test = read("CSD-500+104-7_shift_tensors_components.xyz",":")

check_consistent_iso = []
check_consistent_tensor = []

for data in (data_train, data_test):
    for frame in data:
        arrays = frame.arrays

        iso_shifts = arrays['cs_iso'].reshape(-1,1)
        tensor_shifts = arrays['cs_tensor']

        int_iso = []
        for i in ('core_sigma_iso', 'para_oo_sigma_iso', 'para_lq_sigma_iso', 'para_sigma_iso', 'dia_sigma_iso', 'bare_sigma_iso', 'shape_contribution_iso'):
            int_iso.append(arrays[i].reshape(-1,1))

        int_tens = []
        for i in ('core_sigma_tensor', 'para_oo_sigma_tensor', 'para_lq_sigma_tensor', 'para_sigma_tensor', 'dia_sigma_tensor', 'bare_sigma_tensor', 'shape_contribution_tensor'):
            int_tens.append(arrays[i])

        iso_total = np.sum(int_iso,axis=0)
        tensor_total = np.sum(int_tens,axis=0)

        check_consistent_iso.append(np.allclose(iso_shifts,iso_total,atol=0.035))
        check_consistent_tensor.append(np.allclose(tensor_shifts,tensor_total,atol=0.035))

In [114]:
# True here would mean at least one frame failed the isotropic consistency check.
False in check_consistent_iso

False

In [103]:
False in [True,True,True]

False

In [110]:
# True here would mean at least one frame failed the tensor consistency check.
False in check_consistent_tensor

False

In [80]:
frame = data_train[0]


True

In [60]:
iso_shifts.reshape(-1,1)

array([[35.89],
       [35.89],
       [35.89],
       [35.89],
       [ 2.43],
       [ 2.43],
       [ 2.43],
       [ 2.43],
       [ 3.99],
       [ 3.99],
       [ 3.99],
       [ 3.99],
       [43.06],
       [43.06],
       [43.06],
       [43.06],
       [22.7 ],
       [22.7 ],
       [22.7 ],
       [22.7 ],
       [58.12],
       [58.12],
       [58.12],
       [58.12],
       [27.33],
       [27.33],
       [27.33],
       [27.33],
       [18.39],
       [18.39],
       [18.39],
       [18.39],
       [24.28],
       [24.28],
       [24.28],
       [24.28],
       [24.71],
       [24.71],
       [24.71],
       [24.71],
       [25.05],
       [25.05],
       [25.05],
       [25.05],
       [25.07],
       [25.07],
       [25.07],
       [25.07]])

In [55]:
int_tens[-1].shape

(48, 9)

In [57]:
np.sum(int_tens,axis=0).shape

(48, 9)

In [58]:
np.sum(int_iso,axis=0).shape

(48, 1)

In [44]:
iso_shifts

array([35.89, 35.89, 35.89, 35.89,  2.43,  2.43,  2.43,  2.43,  3.99,
        3.99,  3.99,  3.99, 43.06, 43.06, 43.06, 43.06, 22.7 , 22.7 ,
       22.7 , 22.7 , 58.12, 58.12, 58.12, 58.12, 27.33, 27.33, 27.33,
       27.33, 18.39, 18.39, 18.39, 18.39, 24.28, 24.28, 24.28, 24.28,
       24.71, 24.71, 24.71, 24.71, 25.05, 25.05, 25.05, 25.05, 25.07,
       25.07, 25.07, 25.07])

array([[35.89],
       [35.89],
       [35.89],
       [35.89],
       [ 2.42],
       [ 2.42],
       [ 2.42],
       [ 2.42],
       [ 3.99],
       [ 3.99],
       [ 3.99],
       [ 3.99],
       [43.07],
       [43.07],
       [43.07],
       [43.07],
       [22.69],
       [22.69],
       [22.69],
       [22.69],
       [58.1 ],
       [58.1 ],
       [58.1 ],
       [58.1 ],
       [27.34],
       [27.34],
       [27.34],
       [27.34],
       [18.39],
       [18.39],
       [18.39],
       [18.39],
       [24.28],
       [24.28],
       [24.28],
       [24.28],
       [24.7 ],
       [24.7 ],
       [24.7 ],
       [24.7 ],
       [25.05],
       [25.05],
       [25.05],
       [25.05],
       [25.07],
       [25.07],
       [25.07],
       [25.07]])

In [41]:
np.hstack(int_iso)

(48, 7)

In [33]:
data_train[0].arrays.keys()

dict_keys(['numbers', 'positions', 'cs_tensor', 'cs_iso', 'core_sigma_iso', 'para_oo_sigma_iso', 'para_lq_sigma_iso', 'para_sigma_iso', 'dia_sigma_iso', 'bare_sigma_iso', 'core_sigma_tensor', 'para_oo_sigma_tensor', 'para_lq_sigma_tensor', 'para_sigma_tensor', 'dia_sigma_tensor', 'bare_sigma_tensor', 'shape_contribution_iso', 'shape_contribution_tensor'])

In [18]:
ma = np.eye(3)

In [28]:
this = np.array([ma for i in range(30)])

In [29]:
this.shape

(30, 3, 3)

In [27]:
np.tile(ma,10).shape

(3, 30)

In [11]:
adict = {"a":"c","b":"d"}

In [15]:
for key,value in adict.items():
    print(key)
    print(value)

a
c
b
d


In [53]:
data_train[0].

Atom('S', [4.37754024, 2.95767432, 3.0668442], index=0)

In [48]:
# Presumably the nominal sizes of the five datasets (2k + 1k + S104 + S546 + 500)
# added up for comparison with len(paths) — TODO confirm.
2e03 + 1e03 + 104 + 546 + 500

4150.0

In [49]:
# Number of files matched by the "<dataset>/gipaw/*" globs over all datasets.
len(paths)

4150

In [3]:
import os
import glob
from ase.io import read, write
import numpy as np

from helpers import *

In [40]:
# Build a dictionary {CSD structure name: atoms object} from the reference .xyz file.
extyz_dict = build_extxy_dict("CSD-500+S104.xyz")

datasets = ["CSD-500","CSD-S104"] #test directories
contained_in = "magres"
extension = "*magres"
magres_suffix = ".nmr.magres"

# Names of all structures for which a .magres file exists; the dict is only
# used for membership tests, so the values are None.
# Glob pattern per dataset: <cwd>/<dataset>/magres/*magres
extyz_dict_tens = {}

for dataset in datasets:
    DATASETPATH = os.path.join(os.getcwd(),dataset,contained_in,extension)
    files = glob.glob(DATASETPATH)
    print(DATASETPATH)
    for file in files:
        structname = file.split("/")[-1]
        # Remove the literal suffix. str.rstrip(".nmr.magres") strips any trailing
        # run of the characters ". n m r a g e s" and could eat into the name.
        if structname.endswith(magres_suffix):
            structname = structname[:-len(magres_suffix)]
        extyz_dict_tens[structname] = None

# Keep only structures present in both the .xyz file and the .magres directories
# (needed here because 7 structures have no .magres file).
final_dict = {x:extyz_dict[x] for x in extyz_dict
                              if x in extyz_dict_tens}

# else, just do:
# final_dict = {x:extyz_dict[x] for x in extyz_dict}

/ssd/scratch/kellner/COSMO_project/make_tensor_data/CSD-500/magres/*magres
/ssd/scratch/kellner/COSMO_project/make_tensor_data/CSD-S104/magres/*magres


In [41]:
# Build a dictionary {CSD structure name: STATUS label}.
# Every structure starts as "PASSING"; entries from the suspicious and
# blatant-outlier files then overwrite that label, in this order.
all_ids = generate_status_dict("CSD-3k+S546.xyz","PASSING")
souspicious = generate_status_dict("./frames_status/frames_suspicious.xyz","SUSPICIOUS")
outliers = generate_status_dict("./frames_status/frames_blatant_outliers.xyz","FAIL")

for override in (souspicious, outliers):
    all_ids.update(override)

In [6]:
# For every .magres file in the dataset directories:
#   - derive the CSD name from the file name
#   - repair the missing whitespace before "-" and parse the text into an atoms object
#   - if set_status is True (training set): store the STATUS label in atoms.info
#   - store the shift tensor flattened to 9 columns as "cs_tensor"
#   - drop arrays/info entries that are not wanted in the output
#   - take NAME, ENERGY, positions, cell and cs_iso from the reference .xyz entry
#     (the original notes state that the .xyz coordinates have higher precision)

# Directories where the .magres files are located
datasets= ["CSD-500","CSD-S104"] #["CSD-2k","CSD-1k","CSD-S546"]
contained_in = "magres"
extension = "*magres"
magres_suffix = ".nmr.magres"

# Strings that legitimately contain "-" and get mangled by the blanket
# "-" -> " -" replacement: (mangled form, original form).
restore_after_padding = (
    ("units sus 10^ -6.cm^3.mol^ -1", "units sus 10^-6.cm^3.mol^-1"),
    ("#$magres -abinitio -v1.0", "#$magres-abinitio-v1.0"),
    ("QE -GIPAW 5.x", "QE-GIPAW 5.x"),
)

structs = []

set_status=False

for dataset in datasets:
    # glob pattern: <cwd>/<dataset>/magres/*magres
    DATASETPATH = os.path.join(os.getcwd(),dataset,contained_in,extension)
    files = glob.glob(DATASETPATH)
    print(DATASETPATH)

    for file in files:
        structname = file.split("/")[-1]
        # Remove the literal suffix. str.rstrip(".nmr.magres") strips any trailing
        # run of the characters ". n m r a g e s" and could eat into the name.
        if structname.endswith(magres_suffix):
            structname = structname[:-len(magres_suffix)]

        # Only reading needs the open handle; repair and parse afterwards.
        with open(file) as f:
            fd = f.read()

        # Negative values are sometimes glued to the previous column, so pad
        # every "-" with a space and then undo the damage to known strings.
        fd = fd.replace("-"," -")
        for mangled, original in restore_after_padding:
            fd = fd.replace(mangled, original)
        atoms = read_magres_modified(fd)

        if set_status:
            atoms.info["STATUS"] = all_ids[structname]

        #-----flatten TENSOR-----
        atoms.arrays["cs_tensor"] = atoms.arrays["ms"].reshape((-1,9))
        atoms.info["magres_units"] = {'cs_tensor': 'ppm', 'cs_iso': 'ppm'}

        #----remove the unflattened tensor, labels and indices
        atoms.arrays.pop("ms")
        atoms.arrays.pop("indices")
        atoms.arrays.pop("labels")

        #remove garbage from comments
        atoms.info.pop("magresblock_calculation")

        # Only keep structures that are also in final_dict
        # (necessary for the 7 structures without a .magres file).
        if structname in final_dict:
            reference = final_dict[structname]
            atoms.info["NAME"] = structname
            atoms.info["ENERGY"] = reference.info["ENERGY"]
            atoms.set_positions(reference.get_positions())
            atoms.set_cell(reference.get_cell())
            atoms.arrays["cs_iso"] = reference.arrays["CS"]
            structs.append(atoms)

/Users/matthiaskellner/Desktop/EPFL_2021/COSMO_project/make_tensor_data/CSD-500/magres/*magres
/Users/matthiaskellner/Desktop/EPFL_2021/COSMO_project/make_tensor_data/CSD-S104/magres/*magres


In [7]:
#write("CSD-500+104-7_shift_tensors.xyz",structs,format="extxyz")

In [8]:
# Compare the written tensor dataset against the reference .xyz file.
# NOTE(review): check_plausibility is defined in helpers.py (not shown here);
# the description below is carried over from the original notes — confirm there.
check_plausibility("./test_tensor/CSD-500+104-7_shift_tensors.xyz","CSD-500+S104.xyz") 
# Checks whether PBC, cell, coordinates and shifts were transferred correctly.
# Diagonalizes the shift tensor, averages the eigenvalues and compares that
# average with the isotropic shift to ensure the tensor values were written correctly.

In [10]:
# Can be used to check whether the STATUS labels were written correctly.
# NOTE(review): test_status is defined in helpers.py (not shown here); all_ids is
# the {CSD name: STATUS} dict built in an earlier cell.
test_status("./train_tensor/CSD-3k+S546_shift_tensors.xyz",all_ids) 

In [239]:
"""
struct_iso_good = read("teststructs_iso_good.xyz",format="extxyz")
struct_tensor_good = read("teststructs_tens_good.xyz",format="extxyz")

#-----testing the comparison helper functions
bad_structs_iso = [read(this,format="extxyz") for this in ["teststructs_iso_bad_no_PBC.xyz","teststructs_iso_bad_cell.xyz","teststructs_iso_bad_coordinates.xyz","teststructs_iso_bad_shift.xyz"]]
bad_structs_tens = [read(this,format="extxyz") for this in ["teststructs_tens_bad_no_PBC.xyz","teststructs_tens_bad_cell.xyz", "teststructs_tens_bad_shift.xyz","teststructs_tens_bad_coordinates.xyz"]]

for bad_struct in bad_structs_iso:
    print(compaire(bad_struct,struct_tensor_good))
for bad_struct in bad_structs_tens:
    print(compaire(struct_iso_good,bad_struct))
    
compaire(struct_iso_bad,struct_tensor_bad)
"""

[True, False, False, True]
False
[True, False, True, True]
False
[False, True, True, True]
False
[True, True, True, False]
False
[True, False, True, True]
False
[True, False, True, True]
False
[True, True, True, False]
False
[False, True, True, True]
False
