In [6]:
from urllib.parse import urlencode
import httplib2
import json
import time
import os

# Output directory for fetched data (property JSON files and CIF structures)
data_dir = "data_linear/"

# Search criteria for properties; later JSON-encoded into the request's `q` parameter
search = {
    "classes": "oxide",
    "props": "linear thermal expansion coefficient",
    #"lattices": "cubic"  # optional lattice filter, currently disabled
}

# Create the output directory if it is missing. makedirs(exist_ok=True) avoids
# the check-then-create race of isdir() + mkdir() and also handles nested paths.
os.makedirs(data_dir, exist_ok=True)

# Search for Files with TEC Labels

In [7]:
# API KEYS
# The key is read from the MPDS_KEY environment variable so that the credential
# is never stored in (or committed with) the notebook.
api_key = os.environ.get("MPDS_KEY")
if not api_key:
    raise RuntimeError("Set the MPDS_KEY environment variable to your MPDS API key")
endpoint = "https://api.mpds.io/v0/download/facet"

# API TEC SEARCH CRITERIA
# API HITS

# linear TEC, all: 1597, 791 unique, 97 no structure
# volume TEC, all: 288, 177 unique, 17 no structure
# ~ 860 structures total

# linear TEC, cubic: 356, 250 unique
# volume TEC, cubic: 73, 38 unique


# Remember which files were saved: maps entry name -> phase_id
entry_log = dict()

## DOWNLOAD PROPERTIES ##
req = httplib2.Http()
# Cycle through available pages
num_pages = 1  # Placeholder; replaced by the 'npages' value of each response
pg = 0
while pg < num_pages:
    response, content = req.request(
        uri=endpoint + '?' + urlencode({
            'q': json.dumps(search),
            'pagesize': 100,
            'dtype': 1, # PEER REVIEWED  1, MACHINE_LEARNING 2,  MPDSDataTypes.AB_INITIO 4, and ALL 7
            'page' : pg
        }),
        method='GET',
        headers={'Key': api_key}
    )
    pg += 1
    # Check for Errors in the Request
    if response.status != 200:
        # NB 400 means wrong input, 403 means authorization issue etc.
        # see https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
        raise RuntimeError("Error code %s" % response.status)
    content = json.loads(content)
    if content.get('error'):
        raise RuntimeError(content['error'])

    # Sift through Data
    num_pages = content['npages']

    # Save each record as its own JSON file, named after the entry
    for record in content['out']:
        material = record['sample']['material']
        filename = material['entry']
        entry_log[filename] = material['phase_id']  # Log the phase ID for finding structural stuff later
        # The with-block closes the file; no explicit close() is needed.
        with open(os.path.join(data_dir, filename + ".json"), "w") as outfile:
            json.dump(record, outfile)
    time.sleep(1)  # Pause between page requests
    print(content['count'])

1597
1597
1597
1597
1597
1597
1597
1597
1597
1597
1597
1597
1597
1597
1597
1597


# Determine how many unique material phases are in this data

In [8]:
# Group entry names by phase_id: unique_mat maps phase_id -> list of entries #
unique_mat = dict()
for entry_name, phase_num in entry_log.items():
    # setdefault creates the empty list the first time a phase_id is seen
    unique_mat.setdefault(phase_num, []).append(entry_name)
print("Number of entries: ", len(entry_log))
print("Unique Materials: ", len(unique_mat))

Number of entries:  1597
Unique Materials:  588


In [9]:
# Show the phase_id -> list of entry names mapping built in the previous cell
unique_mat

{4621: ['P1312068-2'],
 5251: ['P904030-5', 'P1116243-7', 'P1116243-8'],
 5342: ['P1702936-4', 'P1702936-5'],
 5343: ['P601140-2', 'P1800510-1', 'P1800510-2', 'P1800510-3', 'P1800516-3'],
 6123: ['P606820-1', 'P1116975-1', 'P1713142-2', 'P1713142-3'],
 6131: ['P800740-1', 'P800740-2'],
 6133: ['P800730-1',
  'P800730-2',
  'P800730-3',
  'P800730-4',
  'P800739-1',
  'P800739-2',
  'P800739-3'],
 6375: ['P601465-2', 'P601465-3', 'P601465-4'],
 6475: ['P601534-3',
  'P601534-4',
  'P601534-5',
  'P601534-6',
  'P601534-7',
  'P601534-8'],
 6476: ['P601617-2', 'P601617-3', 'P601618-2', 'P601618-3'],
 6479: ['P601323-9',
  'P601323-10',
  'P601323-11',
  'P601323-12',
  'P601323-13',
  'P601323-14',
  'P1318843-4',
  'P1707466-3',
  'P1707466-4'],
 6597: ['P601689-3',
  'P601689-4',
  'P606714-1',
  'P606714-2',
  'P606714-3',
  'P606714-4',
  'P606714-5',
  'P606714-6',
  'P606714-7',
  'P606714-8',
  'P606714-9',
  'P606714-10',
  'P606714-11',
  'P606714-12',
  'P606714-13',
  'P606714

# Now Request Structure File for Each Unique Structure

In [10]:
req = httplib2.Http()
## FETCH STRUCTURAL FILES ##
# API TEC SEARCH CRITERIA
# Built once (it never changes inside the loop) and kept under its own name so
# the property-search `search` dict from the setup cell is not overwritten.
structure_search = { "props": "crystal structure" }
print("Searching for ", len(unique_mat)," structure files in MPDS...")
dead_files = 0  # Number of phases for which no usable CIF was returned
for phase_id in unique_mat.keys():
    print(phase_id)
    response, content = req.request(
        uri=endpoint + '?' + urlencode({
            'q': json.dumps(structure_search),
            'pagesize': 1,
            'phases': phase_id,
            'dtype': 1, # PEER REVIEWED  1, MACHINE_LEARNING 2,  MPDSDataTypes.AB_INITIO 4, and ALL 7
            'fmt' : 'cif'
        }),
        method='GET',
        headers={'Key': api_key}
    )

    # Check for Errors in the Request
    if response.status != 200:
        # NB 400 means wrong input, 403 means authorization issue etc.
        # see https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
        raise RuntimeError("Error code %s" % response.status)

    # Save each cif
    # TODO: in future check for same source as data
    cif_str = content.decode("utf-8").split("end of data item")
    # Pick the first chunk that actually has data in it: it must not carry the
    # "no atom coordinates" marker and must be at least 20 characters long.
    good_cif = next(
        (chunk for chunk in cif_str
         if "no atom coordinates" not in chunk and len(chunk) >= 20),
        None,
    )
    if good_cif is None:
        print("NO CIF FILES ARE GOOD!!")
        dead_files += 1
    else:
        filename = str(phase_id)+".cif"
        with open(os.path.join(data_dir, filename), "w") as outfile:
            outfile.write(good_cif)
    # Throttle after EVERY request, including ones that yielded no usable CIF,
    # to avoid error code 429. TODO: get around this
    time.sleep(2)

print("Searched for ", len(unique_mat), " files,  no results found for ", dead_files)

Searching for  588  structure files in MPDS...
4621
5251
5342
5343
NO CIF FILES ARE GOOD!!
6123
6131
6133
6375
6475
6476
6479
6597
7128
7151
7183
7268
7278
NO CIF FILES ARE GOOD!!
7282
7583
NO CIF FILES ARE GOOD!!
8411
8713
9013
9227
9490
9769
9770
9782
NO CIF FILES ARE GOOD!!
9953
9968
10004
10188
10200
10607
10680
10682
10741
10826
10830
10868
10874
10875
11526
11536
11546
11746
NO CIF FILES ARE GOOD!!
12219
NO CIF FILES ARE GOOD!!
12253
12274
12354
NO CIF FILES ARE GOOD!!
12432
12450
NO CIF FILES ARE GOOD!!
12667
12699
12734
12735
NO CIF FILES ARE GOOD!!
12736
12924
12926
12936
12954
12960
13049
13143
13145
13147
13167
13172
13174
13364
13388
13577
13595
13606
13639
NO CIF FILES ARE GOOD!!
13668
NO CIF FILES ARE GOOD!!
13703
13801
13884
13898
13920
13994
13996
NO CIF FILES ARE GOOD!!
14010
14013
14037
NO CIF FILES ARE GOOD!!
14038
14051
14071
14097
14652
14774
14901
14907
15284
15305
NO CIF FILES ARE GOOD!!
15483
15498
NO CIF FILES ARE GOOD!!
15545
15571
15774
NO CIF FILES ARE GOOD!

## Next block: retrieve just one structure by phase ID ##

In [6]:
# Phase you want to retrieve
target_phase = 95569

# NOTE: this cell uses os, json, urlencode, httplib2 and data_dir, so the
# import/setup cell at the top of the notebook must have been run first.

# The key is read from the MPDS_KEY environment variable so that the credential
# is never stored in (or committed with) the notebook.
api_key = os.environ.get("MPDS_KEY")
if not api_key:
    raise RuntimeError("Set the MPDS_KEY environment variable to your MPDS API key")
endpoint = "https://api.mpds.io/v0/download/facet"
# Kept under its own name so the property-search `search` dict from the setup
# cell is not overwritten.
structure_search = { "props": "crystal structure" }
req = httplib2.Http()
response, content = req.request(
    uri=endpoint + '?' + urlencode({
        'q': json.dumps(structure_search),
        'pagesize': 1,
        'phases': target_phase,
        'dtype': 1, # PEER REVIEWED  1, MACHINE_LEARNING 2,  MPDSDataTypes.AB_INITIO 4, and ALL 7
        'fmt' : 'cif'
    }),
    method='GET',
    headers={'Key': api_key}
)

# Check for Errors in the Request
if response.status != 200:
    # NB 400 means wrong input, 403 means authorization issue etc.
    # see https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
    raise RuntimeError("Error code %s" % response.status)

# Save the cif
# TODO: in future check for same source as data
cif_str = content.decode("utf-8").split("end of data item")
# Pick the first chunk that actually has data in it: it must not carry the
# "no atom coordinates" marker and must be at least 20 characters long.
good_cif = next(
    (chunk for chunk in cif_str
     if "no atom coordinates" not in chunk and len(chunk) >= 20),
    None,
)
if good_cif is None:
    print("NO CIF FILES ARE GOOD!!")
else:
    filename = str(target_phase)+".cif"
    print(good_cif)
    with open(os.path.join(data_dir, filename), "w") as outfile:
        outfile.write(good_cif)

NO CIF FILES ARE GOOD!!
