# Generate lists of compounds in certain categories

This notebook assembles `compound_categories.json`, which contains lists of compounds in different chemical spaces (e.g. Group I halides)

---
## Header

#### Global variables

In [1]:
PROJECT = 'SCAN project'

#### Imports

In [2]:
import os
from pathlib import Path
import re
import numpy as np
import xlrd
from matplotlib import pyplot as plt

In [3]:
from pymatgen.ext.matproj import MPRester, MPRestError
from monty.serialization import loadfn, dumpfn
from pymatgen import Structure, Composition, Element
from pymatgen.analysis.reaction_calculator import ComputedEntry, ComputedReaction
from pymatgen.util.plotting import pretty_plot, periodic_table_heatmap
from pymatgen.core import periodic_table as pt

from scipy.stats import linregress
from adjustText import adjust_text
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error

#### Settings and utility functions

In [4]:
%load_ext autoreload
%autoreload 2

#### Set Working Directory

In [5]:
workdir = Path(re.sub("(?<={})[\w\W]*".format(PROJECT), "", str(Path.cwd())))
os.chdir(workdir)

data_dir = workdir / '2_raw data'
pipeline_dir = workdir / '3_data analysis' / '2_pipeline'
output_dir = workdir / '3_data analysis' / '3_output'

---
## Main Code

### Build lists of element categories

In [6]:
els_list = pt._pt_data.keys()

alkalis = [e for e in els_list if Element(e).is_alkali]
alkalines = [e for e in els_list if Element(e).is_alkaline]
chalcogens = [e for e in els_list if Element(e).is_chalcogen]
halogens = [e for e in els_list if Element(e).is_halogen]
lanthanoids = [e for e in els_list if Element(e).is_lanthanoid]
actinoids = [e for e in els_list if Element(e).is_actinoid]
metals = [e for e in els_list if Element(e).is_metal]
metalloids = [e for e in els_list if Element(e).is_metalloid]
noble_gases = [e for e in els_list if Element(e).is_noble_gas]
post_transition_metals = [e for e in els_list if Element(e).is_post_transition_metal]
rare_earths = [e for e in els_list if Element(e).is_rare_earth_metal]
transition_metals = [e for e in els_list if Element(e).is_transition_metal]

other_elements = list( set(els_list) - set(alkalis + alkalines + chalcogens + halogens + metals + metalloids + transition_metals + lanthanoids + actinoids +noble_gases))

transition_metals_3d = [e for e in transition_metals if Element(e).is_transition_metal and Element(e).row==4]
transition_metals_4d = [e for e in transition_metals if Element(e).is_transition_metal and Element(e).row==5]
transition_metals_5d = [e for e in transition_metals if Element(e).is_transition_metal and Element(e).row==6]

#### Use MPRester to identify all compounds in specific chemical spaces

In [8]:
compound_categories = {}

def add_compound_category(desc,query):
    '''
    desc (str): plain english description of the category e.g. 'Group I halides'
    query (str): MongoDB query to pass to MPRester
    '''   
    with MPReste as a:
        compound_list = list(set(e.composition.reduced_formula for e in a.get_entries(query)))

    compound_categories.update({desc:compound_list})

In [20]:
query={"elements":{"$in":transition_metals, "$all": ['O']}}
with MPRester(MP_API_KEY) as a:
    compound_list = [e.composition.reduced_formula for e in a.get_entries(query)]
    
len(compound_list)

HBox(children=(IntProgress(value=0, max=44943), HTML(value='')))

44930

In [21]:
len(set(compound_list))

28064

In [15]:
query={"elements":{"$in":transition_metals, "$all": ['F']}}
with MPRester(MP_API_KEY) as a:
    compound_list = [e.composition.reduced_formula for e in a.get_entries(query)]
    
len(compound_list)

HBox(children=(IntProgress(value=0, max=6726), HTML(value='')))

6726

In [19]:
len(set(compound_list))

3726

### Group I and II halides, by cation

In [8]:
add_compound_category('Li halides',{"elements":{"$in":halogens, "$all": ["Li"]}, "nelements":{'$in':[2]}})
add_compound_category('Na halides',{"elements":{"$in":halogens, "$all": ["Na"]}, "nelements":{'$in':[2]}})
add_compound_category('K halides',{"elements":{"$in":halogens, "$all": ["K"]}, "nelements":{'$in':[2]}})
add_compound_category('Rb halides',{"elements":{"$in":halogens, "$all": ["Rb"]}, "nelements":{'$in':[2]}})
add_compound_category('Cs halides',{"elements":{"$in":halogens, "$all": ["Cs"]}, "nelements":{'$in':[2]}})

add_compound_category('Be halides',{"elements":{"$in":halogens, "$all": ["Be"]}, "nelements":{'$in':[2]}})
add_compound_category('Mg halides',{"elements":{"$in":halogens, "$all": ["Mg"]}, "nelements":{'$in':[2]}})
add_compound_category('Ca halides',{"elements":{"$in":halogens, "$all": ["Ca"]}, "nelements":{'$in':[2]}})
add_compound_category('Sr halides',{"elements":{"$in":halogens, "$all": ["Sr"]}, "nelements":{'$in':[2]}})
add_compound_category('Ba halides',{"elements":{"$in":halogens, "$all": ["Ba"]}, "nelements":{'$in':[2]}})

In [9]:
add_compound_category('Li chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Li"]}, "nelements":{'$in':[2]}})
add_compound_category('Na chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Na"]}, "nelements":{'$in':[2]}})
add_compound_category('K chalcogenides',{"elements":{"$in":chalcogens, "$all": ["K"]}, "nelements":{'$in':[2]}})
add_compound_category('Rb chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Rb"]}, "nelements":{'$in':[2]}})
add_compound_category('Cs chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Cs"]}, "nelements":{'$in':[2]}})

add_compound_category('Be chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Be"]}, "nelements":{'$in':[2]}})
add_compound_category('Mg chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Mg"]}, "nelements":{'$in':[2]}})
add_compound_category('Ca chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Ca"]}, "nelements":{'$in':[2]}})
add_compound_category('Sr chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Sr"]}, "nelements":{'$in':[2]}})
add_compound_category('Ba chalcogenides',{"elements":{"$in":chalcogens, "$all": ["Ba"]}, "nelements":{'$in':[2]}})

### Group I and II halides and chalcogens, by anion

In [10]:
add_compound_category('Group I chlorides',{"elements":{"$in":alkalis, "$all": ["Cl"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II chlorides',{"elements":{"$in":alkalines, "$all": ["Cl"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I bromides',{"elements":{"$in":alkalis, "$all": ["Br"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II bromides',{"elements":{"$in":alkalines, "$all": ["Br"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I iodides',{"elements":{"$in":alkalis, "$all": ["I"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II iodides',{"elements":{"$in":alkalines, "$all": ["I"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I fluorides',{"elements":{"$in":alkalis, "$all": ["F"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II fluorides',{"elements":{"$in":alkalines, "$all": ["F"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I oxides',{"elements":{"$in":alkalis, "$all": ["O"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II oxides',{"elements":{"$in":alkalines, "$all": ["O"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I sulfides',{"elements":{"$in":alkalis, "$all": ["S"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II sulfides',{"elements":{"$in":alkalines, "$all": ["S"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I selenides',{"elements":{"$in":alkalis, "$all": ["Se"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II selenides',{"elements":{"$in":alkalines, "$all": ["Se"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I tellurides',{"elements":{"$in":alkalis, "$all": ["Te"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II tellurides',{"elements":{"$in":alkalines, "$all": ["Te"]}, "nelements":{'$in':[2]}})

### Group I and II P, N, and H compounds

In [11]:
add_compound_category('Group I hydrides',{"elements":{"$in":alkalis, "$all": ["H"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II hydrides',{"elements":{"$in":alkalines, "$all": ["H"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I phosphides',{"elements":{"$in":alkalis, "$all": ["P"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II phosphides',{"elements":{"$in":alkalines, "$all": ["P"]}, "nelements":{'$in':[2]}})

add_compound_category('Group I nitrides',{"elements":{"$in":alkalis, "$all": ["N"]}, "nelements":{'$in':[2]}})
add_compound_category('Group II nitrides',{"elements":{"$in":alkalines, "$all": ["N"]}, "nelements":{'$in':[2]}})

### Main group (non-transition metal) compounds

In [12]:
main_group = list(set(alkalis + alkalines + chalcogens + halogens + metals + metalloids + other_elements) - set(transition_metals) - set(lanthanoids) - set(actinoids))

## matches compounds that contain ONLY elements in the list
add_compound_category('main group binaries',{'$nor': [{'elements': {'$elemMatch': {'$nin': main_group}}}],"nelements":{'$in':[2]}})
add_compound_category('main group ternaries',{'$nor': [{'elements': {'$elemMatch': {'$nin': main_group}}}],"nelements":{'$in':[3]}})

HBox(children=(IntProgress(value=0, max=4896), HTML(value='')))

HBox(children=(IntProgress(value=0, max=9106), HTML(value='')))

In [13]:
'Fe' in main_group

False

### Main Group group compounds, by anion

In [14]:
add_compound_category('main group binary & ternary oxides',{'$nor': [{'elements': {'$elemMatch': {'$nin': main_group}}}],"nelements":{'$in':[2,3]},"elements":{"$all":["O"]}})
add_compound_category('main group binary & ternary phosphides & phosphates',{'$nor': [{'elements': {'$elemMatch': {'$nin': main_group}}}],"nelements":{'$in':[2,3]},"elements":{"$all":["P"]}})
add_compound_category('main group binary & ternary sulfides & sulfates',{'$nor': [{'elements': {'$elemMatch': {'$nin': main_group}}}],"nelements":{'$in':[2,3]},"elements":{"$all":["S"]}})
add_compound_category('main group binary & ternary nitrides & nitrates',{'$nor': [{'elements': {'$elemMatch': {'$nin': main_group}}}],"nelements":{'$in':[2,3]},"elements":{"$all":["N"]}})

HBox(children=(IntProgress(value=0, max=5200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1004), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1297), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1276), HTML(value='')))

### Intermetallics

In [15]:
## matches compounds that contain ONLY elements in the list
add_compound_category('Intermetallics',{'$nor': [{'elements': {'$elemMatch': {'$nin': post_transition_metals + metalloids}}}],"nelements":{'$nin':[1]}})

HBox(children=(IntProgress(value=0, max=507), HTML(value='')))

### Noble Gases

In [16]:
## matches compounds that contain ONLY elements in the list
add_compound_category('Noble gases',{'$nor': [{'elements': {'$elemMatch': {'$nin': noble_gases}}}],"nelements":{'$in':[1]}})

  % self.symbol)
  % self.symbol)
  % self.symbol)


### Polyanions

In [17]:
add_compound_category('Polyanions',{"elements":{"$in":['Cl','N','P','S'], "$all": ["O"]}, "nelements":{'$nin':[1,2]}})
add_compound_category('main group polyanions',{"elements":{"$in":['Cl','N','P','S'], "$all": ["O"],"$nin": transition_metals}, "nelements":{'$nin':[1,2]}})
add_compound_category('Transition metal polyanions',{"elements":{"$in":['Cl','N','P','S'], "$all": ["O"],"$in": transition_metals}, "nelements":{'$nin':[1,2]}})

HBox(children=(IntProgress(value=0, max=17124), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5477), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42755), HTML(value='')))

### Transition metal oxides, by valence structure

In [18]:
add_compound_category('Transition metal binary & ternary oxides',{"elements":{"$in":transition_metals, "$all": ['O']},"nelements":{'$in':[2,3]}})
add_compound_category('3d binary & ternary oxides',{"elements":{"$in":transition_metals_3d, "$all": ['O']},"nelements":{'$in':[2,3]}})
add_compound_category('4d binary & ternary oxides',{"elements":{"$in":transition_metals_4d, "$all": ['O']},"nelements":{'$in':[2,3]}})
add_compound_category('5d binary & ternary oxides',{"elements":{"$in":transition_metals_5d, "$all": ['O']},"nelements":{'$in':[2,3]}})

HBox(children=(IntProgress(value=0, max=15208), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10220), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3269), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2394), HTML(value='')))

### Transition metal compounds, by anion

In [19]:
#add_compound_category('Transition metal oxides',{"elements":{"$in":transition_metals, "$all": ["O"]}})
add_compound_category('Transition metal phosphides & phosphates',{"elements":{"$in":transition_metals, "$all": ["P"]}})
add_compound_category('Transition metal sulfides & sulfates',{"elements":{"$in":transition_metals, "$all": ["S"]}})
add_compound_category('Transition metal nitrides & nitrates',{"elements":{"$in":transition_metals, "$all": ["N"]}})

add_compound_category('Transition metal binary & ternary sulfides',{"elements":{"$in":transition_metals, "$all": ['S']},"nelements":{'$in':[2,3]}})
add_compound_category('Transition metal binary & ternary selenides',{"elements":{"$in":transition_metals, "$all": ['Se']},"nelements":{'$in':[2,3]}})
add_compound_category('Transition metal binary & ternary tellurides',{"elements":{"$in":transition_metals, "$all": ['Te']},"nelements":{'$in':[2,3]}})

HBox(children=(IntProgress(value=0, max=9576), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6298), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4211), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2911), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1627), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1187), HTML(value='')))

In [20]:
add_compound_category('Transition metal binary & ternary chalcogens',{"elements":{"$in":transition_metals, "$in": chalcogens},"nelements":{'$in':[2,3]}})

HBox(children=(IntProgress(value=0, max=31681), HTML(value='')))

In [21]:
dumpfn(compound_categories,Path(pipeline_dir / 'compound_categories.json'))