In [3]:
import requests
import json
import numpy as np
import pandas as pd
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty, ElementFraction, Meredig, Stoichiometry

## Curating Dataset from NOMAD Database

In [4]:
base_url = 'http://nomad-lab.eu/prod/v1/api/v1'

In [None]:
# this first cell filters the data in the NOMAD database, finding entries containing the element Gallium with band-gap values within the given bound
# the output contains 2 datapoints of interest for each entry, the entry ID and the band gap value
response = requests.post(
    'http://nomad-lab.eu/prod/v1/api/v1/entries/query',
    json={
        'query': {
            'results.material.elements': {
                'all': ['Ga']
                },
            'results.properties.electronic.band_structure_electronic.band_gap.value': {
                'gt': 0,
                'lte': 20e-18,  # in SI units (joule)
            }
        },
        'pagination': {
            'page_size': 3000
        },
        'required': {
            'include': [
                'entry_id',
                'results.properties.electronic.band_structure_electronic.band_gap.value',
                'results.properties.electronic.band_structure_electronic.band_gap.type'
            ]
        }
    })
 
response_json = response.json()
print(json.dumps(response.json(), indent=2))

In [6]:
# we build a list containing the band gap values
band_gap_vals = []

for entry in response_json['data']:
    band_gap_vals.append(entry['results']['properties']['electronic']['band_structure_electronic']['band_gap'][0]['value'])
    
band_gap_vals

[1.4211306743580145e-20,
 2.3189904600516005e-19,
 2.3975131368839396e-19,
 8.725934601754213e-20,
 2.3072945706233993e-19,
 2.6720300813534994e-19,
 1.9487274399342003e-19,
 5.619153890764795e-20,
 2.3689463274997194e-19,
 5.4480414262536e-19,
 9.521735735862006e-20,
 5.439229454766596e-20,
 1.9775666193461993e-19,
 1.9752755067595787e-19,
 2.2339789678515597e-19,
 1.10710405409401e-20,
 1.525272155567996e-20,
 1.12471197530166e-19,
 2.9238121393866006e-19,
 2.4004771636568403e-19,
 2.4039058216536e-19,
 1.1255771506840201e-19,
 2.2339789678515587e-19,
 2.65439011661316e-19,
 2.6737123668192e-19,
 1.6796739177865808e-19,
 2.40544391122224e-19,
 4.9403116509390043e-20,
 2.6594049294775793e-19,
 8.920919498111994e-20,
 2.6572099474889983e-19,
 5.231106710010004e-20,
 2.233882837253519e-19,
 2.418966282013201e-19,
 2.4061488689411997e-19,
 3.627327899375997e-20,
 2.4059886512778004e-19,
 4.9058648533080094e-20,
 1.8232930312583396e-19,
 2.4059245642124397e-19,
 2.233898859019859e-19,
 2.

In [7]:
len(band_gap_vals)

2531

In [None]:
# we build a list containing the entry IDs
entry_IDs = []

for entry in response_json['data']:
    entry_IDs.append(entry['entry_id'])
    
entry_IDs

In [9]:
# this second cell is used to query each entry and obtain useful molecular information
# this can be used to obtain the information for single entry from the entry list

# response = requests.post(
#     f'{base_url}/entries/{entry_IDs[0]}/archive/query',
#     json={
#           "required": {
#         "results": {
#           "material": "*",
#           "method": "*",
#           "properties": {
#             "structures": "*"
#             }
#           }
#         }
#     })

# response_json2 = response.json()
# print(json.dumps(response.json(), indent=2))

In [None]:
# here we set up a loop where each entry is queried and the desired information is captured and recorded in separate lists

compositions = [] # this includes the chemical composition
# space_group_symbols = [] # the space group symbols
# cartesian_cords = [] # the xyz coordinates of each atom in the molecule
# atoms = [] # the atom that each set of xyz coordinates corresponds to in the molecule

for entryid in entry_IDs:
    
    response = requests.post(
        f'{base_url}/entries/{entryid}/archive/query',
        json={
              "required": {
            "results": {
              "material": "*",
              "method": "*",
              "properties": {
                "structures": "*"
                }
              }
            }
        })

    response_json2 = response.json()
    print(json.dumps(response.json(), indent=2))
    compositions.append(response_json2['data']['archive']['results']['material']['chemical_formula_descriptive'])
#     space_group_symbols.append(response_json2['data']['archive']['results']['material']['symmetry']['space_group_symbol'])
#     cartesian_cords.append(response_json2['data']['archive']['results']['properties']['structures']['structure_conventional']['cartesian_site_positions'])
#     atoms.append(response_json2['data']['archive']['results']['properties']['structures']['structure_conventional']['species_at_sites'])

Cells to check each list of data:

In [13]:
compositions

['GaTeV',
 'Ga4P4',
 'Ga4P4',
 'As4Ga4',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'As4Ga4',
 'Ga4P4',
 'Cl2GaK',
 'As4Ga4',
 'As4Ga4',
 'Ga4P4',
 'Ga4P4',
 'GaP',
 'CrGaPd',
 'ClGaNi',
 'Ga8N8',
 'AgCl2Ga',
 'Ga4P4',
 'Ga4P4',
 'Ga16N16',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'As4Ga4',
 'Ga4P4',
 'As4Ga4',
 'Ga4P4',
 'As4Ga4',
 'Ga16P16',
 'Ga4P4',
 'Ga4P4',
 'As4Ga4',
 'Ga4P4',
 'Ca2ClGa',
 'GaN',
 'Ga4P4',
 'Ga32P32',
 'Ga4P4',
 'Cl6Cs2GaNa',
 'Ga16P16',
 'GaHgSc',
 'GaP',
 'Ga4P4',
 'Ga16Sb16',
 'GaO3Rh',
 'GaO3Sr',
 'GaPdRe',
 'Ga4P4',
 'As4Ga4',
 'As4Ga4',
 'Ga4P4',
 'GaKSi',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'As4Ga4',
 'Ga4Sb4',
 'BiGaO3',
 'As4Ga4',
 'As4Ga4',
 'Ga4P4',
 'Br2GaRu',
 'As4Ga4',
 'Ga4P4',
 'GaReZr2',
 'AsGa',
 'Ga4P4',
 'AlBr2Ga',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'Ga16N16',
 'Ga4P4',
 'Ga4P4',
 'BrGaRu',
 'Ga4P4',
 'GaWZr2',
 'Ga4P4',
 'Ga4P4',
 'As4Ga4',
 'Ga4P4',
 'Ga4P4',
 'Ga4P4',
 'BGaLi2',
 'As4Ga4',
 'Ga4P4',
 'Ga32Sb32',
 'Ga4P4'

In [None]:
# space_group_symbols

In [None]:
# cartesian_cords

In [None]:
# atoms

In [15]:
# build a dataframe with all the obtained data
dataframe = pd.DataFrame(compositions, columns = ["Compositions"])
# dataframe["space_group_symbols"] = space_group_symbols
# dataframe["cartesian_cords"] = cartesian_cords
dataframe["band_gap_vals"] = band_gap_vals
# dataframe["atoms"] = atoms_array

In [16]:
dataframe

Unnamed: 0,Compositions,band_gap_vals
0,GaTeV,1.421131e-20
1,Ga4P4,2.318990e-19
2,Ga4P4,2.397513e-19
3,As4Ga4,8.725935e-20
4,Ga4P4,2.307295e-19
...,...,...
2526,Ga4P4,2.649263e-19
2527,BaBrGa,0.000000e+00
2528,Ga4P4,1.974266e-19
2529,As4Ga4,4.839695e-20


## Convert band gap values to eV

In [17]:
def eV(x):
    return x/(1.6022e-19)

In [18]:
dataframe["band_gap_vals"] = dataframe["band_gap_vals"].apply(eV)
dataframe

Unnamed: 0,Compositions,band_gap_vals
0,GaTeV,0.088699
1,Ga4P4,1.447379
2,Ga4P4,1.496388
3,As4Ga4,0.544622
4,Ga4P4,1.440079
...,...,...
2526,Ga4P4,1.653516
2527,BaBrGa,0.000000
2528,Ga4P4,1.232222
2529,As4Ga4,0.302066


In [19]:
# produce csv file
dataframe.to_csv("dataset.csv", sep=';') # store the data in a csv file

## Featurization of the data

In [None]:
semiconductors = StrToComposition().featurize_dataframe(dataframe, "Compositions")

In [None]:
# # Define the featuriser
ep_feat = ElementProperty.from_preset(preset_name="magpie")
X_desc = ep_feat.featurize_dataframe(semiconductors, col_id="composition").iloc[:,6:]
print(X_desc.shape)
X_desc.head(5)

In [None]:
# Drop columns (descriptors) with nan values
X_desc = X_desc.dropna(how='any',axis=1)
# Ensure dtype is float32
columns = X_desc.columns
X_desc = pd.DataFrame(np.array(X_desc,dtype=np.float32),columns=columns)
print(X_desc.shape)
X_desc.head(5)

In [None]:
X_desc.to_csv("descriptors.csv", sep=';')

## Reduced Dataset: removing duplicates

The following code can be used to filter the dataset and remove duplicate entries (i.e., same chemical compositions).

This reduced dataset was then featurised and used to see the effect of including duplicate data entries on the regression analysis. 

In [None]:
dataset_reduced = dataset.drop_duplicates(subset="Compositions", keep='first')
dataset_reduced

In [None]:
dataset_reduced.to_csv("dataset_reduced.csv", sep=';')

In [None]:
# featurisation:

In [None]:
semiconductors = StrToComposition().featurize_dataframe(dataset_reduced, "Compositions")

In [None]:
# # Define the featuriser
ep_feat = ElementProperty.from_preset(preset_name="magpie")
X_desc = ep_feat.featurize_dataframe(semiconductors, col_id="composition").iloc[:,6:]
print(X_desc.shape)
X_desc.head(5)

In [None]:
# Drop columns (descriptors) with nan values
X_desc = X_desc.dropna(how='any',axis=1)
# Ensure dtype is float32
columns = X_desc.columns
X_desc = pd.DataFrame(np.array(X_desc,dtype=np.float32),columns=columns)
print(X_desc.shape)
X_desc.head(5)

In [None]:
X_desc.to_csv("descriptors_reduced.csv", sep=';')

## Notes:

In [44]:
# to obtain space group symbol
response_json2['data']['archive']['results']['material']['symmetry']['space_group_symbol']

'F-43m'

In [48]:
# to obtain the composition
response_json2['data']['archive']['results']['material']['chemical_formula_descriptive']

'Ga4P4'

In [57]:
# to obtain the conventional structure cartesian coordinates
response_json2['data']['archive']['results']['properties']['structures']['structure_conventional']['cartesian_site_positions']

[[0.0, 2.77008663e-10, 2.77008663e-10],
 [4.155129945000001e-10, 4.155129945000001e-10, 4.155129945000001e-10],
 [0.0, 0.0, 0.0],
 [4.155129945000001e-10, 1.385043315e-10, 1.385043315e-10],
 [2.77008663e-10, 2.77008663e-10, 0.0],
 [1.385043315e-10, 4.155129945000001e-10, 1.385043315e-10],
 [2.77008663e-10, 0.0, 2.77008663e-10],
 [1.385043315e-10, 1.385043315e-10, 4.155129945000001e-10]]