### 0.1 导入库

In [90]:
# basic tools
import os
import math
import pickle
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
from tqdm import notebook as tqdm
from tqdm.auto import tqdm as tqdm_pandas

tqdm_pandas.pandas()






# structure modification
from pymatgen.core.periodic_table import Specie
from pymatgen.transformations.standard_transformations import AutoOxiStateDecorationTransformation, OxidationStateDecorationTransformation

from mp_api.client.mprester import MPRester

In [2]:
# Materials Project credentials must never be committed with the notebook.
# The key is read from the MP_API_KEY environment variable; when it is unset,
# None is passed through so MPRester can fall back to its own configuration.
# NOTE(review): the key that used to be hardcoded here has been exposed in the
# notebook history and should be revoked/regenerated on the Materials Project site.
API_KEY = os.environ.get("MP_API_KEY")
mpr = MPRester(API_KEY)

# Treat +/-inf as missing values in pandas null checks.
# NOTE(review): this option is deprecated in recent pandas releases — confirm
# it is still needed before upgrading pandas.
pd.set_option('mode.use_inf_as_na', True)


In [11]:
# Fetch summary documents for every Li-containing entry, restricted to the
# handful of fields the rest of the notebook actually reads.
docs = mpr.summary.search(
    elements=["Li"],
    all_fields=False,
    fields=[
        "material_id",
        "formula_pretty",
        "structure",
        "symmetry",
        "band_gap",
        "energy_above_hull",
    ],
)

Retrieving SummaryDoc documents:   0%|          | 0/21686 [00:00<?, ?it/s]

### 0.2 数据处理函数
包括数据处理的所有函数

(1)structure_simplifications()；
将结构简化,用于生成描述符；
structure_input待简化结构；
simplification_dict 填入与下面简化结构对应的字典；
* __structure:__ The original structure. 
* __structure_A:__ Only the anions. All anions are modeled as S atoms. 
* __structure_AM:__ Only the anions and mobile atoms. Anions and mobile atoms are modeled as S and Li, respectively. 
* __structure_CAN:__ Only the cations, anions, and neutral atoms. Cations, anions, and neutral atoms are modeled as Al, S, and Mg, respectively. 
* __structure_CAMN:__ All atoms are retained. Cations, anions, mobile atoms, and neutral atoms are modeled as Al, S, Li, and Mg, respectively. 
* __structure_A40:__ As with structure_A but the lattice volume is scaled to 40 cubic angstroms per anion. 
* __structure_AM40:__ As with structure_AM but the lattice volume is scaled to 40 cubic angstroms per anion. 
* __structure_CAN40:__ As with structure_CAN but the lattice volume is scaled to 40 cubic angstroms per anion. 
* __structure_CAMN40:__ As with structure_CAMN but the lattice volume is scaled to 40 cubic angstroms per anion.

(2)apply_charge_decoration()
化合价修饰，输入为pymatgen结构，输出为带化合价修饰的pymatgen结构


In [118]:
# Every representation is described by the same five switches:
#   'C' keep cations, 'A' keep anions, 'M' keep mobile (Li) atoms,
#   'N' keep neutral atoms, '40' rescale the lattice volume.
_SIMPLIFICATION_KEYS = ('C', 'A', 'M', 'N', '40')

simplification_dict_A = dict(zip(_SIMPLIFICATION_KEYS, (False, True, False, False, False)))
simplification_dict_AM = dict(zip(_SIMPLIFICATION_KEYS, (False, True, True, False, False)))
simplification_dict_CAN = dict(zip(_SIMPLIFICATION_KEYS, (True, True, False, True, False)))
simplification_dict_CAMN = dict(zip(_SIMPLIFICATION_KEYS, (True, True, True, True, False)))
simplification_dict_A40 = dict(zip(_SIMPLIFICATION_KEYS, (False, True, False, False, True)))
simplification_dict_AM40 = dict(zip(_SIMPLIFICATION_KEYS, (False, True, True, False, True)))
simplification_dict_CAN40 = dict(zip(_SIMPLIFICATION_KEYS, (True, True, False, True, True)))
simplification_dict_CAMN40 = dict(zip(_SIMPLIFICATION_KEYS, (True, True, True, True, True)))

def structure_simplifications(structure_input, simplification_dict):
    """Return a simplified copy of an oxidation-state-decorated structure.

    Every site is classified as mobile (element ``Li``), neutral (charge 0),
    cation (charge > 0) or anion (otherwise). Neutral, cation and anion sites
    are relabelled as Mg, Al and S respectively while keeping their oxidation
    state; Li sites are left untouched. Site classes whose flag in
    ``simplification_dict`` is False are then removed.

    If that would leave no anion-like site (Li-only structures, or structures
    where every site would be removed), one class of sites is promoted to "S"
    instead so the result is never empty. Promoted sites keep their *own*
    oxidation state.

    Parameters
    ----------
    structure_input
        Structure whose sites each expose ``species.elements[0].name`` and
        ``specie.oxi_state``. It is copied, never modified.
    simplification_dict : dict
        Boolean flags under the keys 'C', 'A', 'M', 'N' (keep cations, anions,
        mobile, neutral sites) and '40' (rescale the lattice).

    Returns
    -------
    The simplified copy. With '40' set, its lattice volume is scaled to
    40 times the number of anion + neutral sites found in the input
    (no scaling when that count is zero).
    """
    # Work on a copy so the caller's structure is never modified.
    structure = structure_input.copy()

    # Site indices per class: cation, anion, mobile, neutral.
    cation_list = []
    anion_list = []
    mobile_list = []
    neutral_list = []

    # Oxidation state of every site, recorded before any relabelling so the
    # fallback branches below can reuse the site's own charge.
    site_charges = {}

    # Number of anion + neutral sites; drives the '40' lattice scaling.
    scaling_counter = 0

    for idx, site in enumerate(structure):
        element = site.species.elements[0].name
        charge = site.specie.oxi_state
        site_charges[idx] = charge

        if element == 'Li':
            # mobile atom: keep as is
            mobile_list.append(idx)
        elif charge == 0:
            # neutral atom -> Mg
            neutral_list.append(idx)
            scaling_counter += 1
            structure.replace(idx, Specie("Mg", oxidation_state=charge))
        elif charge > 0:
            # cation -> Al
            cation_list.append(idx)
            structure.replace(idx, Specie("Al", oxidation_state=charge))
        else:
            # anion -> S
            anion_list.append(idx)
            scaling_counter += 1
            structure.replace(idx, Specie("S", oxidation_state=charge))

    # Collect every site class that the requested representation drops.
    removal_list = []
    if not simplification_dict['C']:
        removal_list += cation_list
    if not simplification_dict['A']:
        removal_list += anion_list
    if not simplification_dict['M']:
        removal_list += mobile_list
    if not simplification_dict['N']:
        removal_list += neutral_list

    def _promote_to_anions(indices):
        # Relabel the given sites as "S", each with its own recorded charge
        # (previously the charge of the last site visited in the loop above
        # was reused for all of them).
        for promoted_idx in indices:
            structure.replace(
                promoted_idx,
                Specie("S", oxidation_state=site_charges[promoted_idx]),
            )

    if len(structure) == len(mobile_list):
        # Li-only structure: when Li is not kept, treat the Li sites as anions
        # (every representation includes anions).
        if not simplification_dict['M']:
            _promote_to_anions(mobile_list)
    elif len(structure) == len(removal_list):
        # Everything would be removed: promote one class to anions, in the
        # order neutral > mobile > cation, and drop the other classes.
        if len(neutral_list) > 0:
            _promote_to_anions(neutral_list)
            structure.remove_sites(cation_list + mobile_list)
        elif len(mobile_list) > 0:
            _promote_to_anions(mobile_list)
            structure.remove_sites(cation_list + neutral_list)
        elif len(cation_list) > 0:
            _promote_to_anions(cation_list)
            structure.remove_sites(neutral_list + mobile_list)
    else:
        # Regular case: just drop the unwanted classes.
        structure.remove_sites(removal_list)

    # NOTE(review): sites promoted by the fallbacks above are not added to
    # scaling_counter, so Li-only / cation-only structures are never rescaled.
    # Left unchanged; confirm whether that is intended.
    if simplification_dict['40'] and scaling_counter > 0:
        structure.scale_lattice(40 * scaling_counter)

    return structure

def apply_charge_decoration(structure):
    """Return an oxidation-state-decorated version of ``structure``.

    Three strategies are tried in order, each on a fresh copy of the input:

    1. the module-level ``oxidation_decorator`` (fixed element -> charge map),
    2. the module-level ``oxidation_auto_decorator``,
    3. ``add_oxidation_state_by_guess()`` on the copy.

    The result of step 1 or 2 is accepted only if the absolute total charge
    of the decorated structure is below 0.5. Step 3 is accepted whenever it
    does not raise. If every strategy fails, the original, undecorated
    structure is returned. The input structure is never modified.
    """
    # 1) manual decoration from the fixed oxidation-state table
    try:
        manually_transformed_structure = oxidation_decorator.apply_transformation(structure.copy())
        if abs(manually_transformed_structure.charge) < 0.5:
            return manually_transformed_structure
    except Exception:
        # best effort: fall through to the next strategy
        pass

    # 2) automatic decoration
    try:
        auto_transformed_structure = oxidation_auto_decorator.apply_transformation(structure.copy())
        if abs(auto_transformed_structure.charge) < 0.5:
            return auto_transformed_structure
    except Exception:
        pass

    # 3) let the structure guess its own oxidation states. This runs on a
    # copy so a failed or partial guess cannot corrupt the caller's object.
    guessed_structure = structure.copy()
    try:
        guessed_structure.add_oxidation_state_by_guess()
        return guessed_structure
    except Exception:
        pass

    # Nothing worked: hand back the input untouched.
    return structure

### 1.1 结构化合价修饰

In [65]:
# Flatten each summary document into a plain tuple:
# (structure, material id, pretty formula, space-group symbol, band gap, energy above hull).
pymatgen_structures = [
    (
        material.structure,
        material.material_id,
        material.formula_pretty,
        material.symmetry.symbol,
        material.band_gap,
        material.energy_above_hull,
    )
    for material in tqdm.tqdm(docs)
]

  0%|          | 0/21686 [00:00<?, ?it/s]

In [107]:
# Build the working table directly with its column names. Passing `columns=`
# at construction also works when the query returned nothing, whereas
# assigning `.columns` afterwards raises a length mismatch on an empty frame.
structures_df = pd.DataFrame(
    pymatgen_structures,
    columns=["structure", "material_id", "formula_pretty", "spacegroup", "band_gap", "e_hull"],
)
# Full formula of each structure. The column name "compsition" (sic) is kept
# unchanged because it is part of the saved table's schema.
structures_df["compsition"] = structures_df["structure"].map(lambda entry: entry.formula)



In [110]:
# Show the assembled table (structures not yet charge-decorated).
structures_df

Unnamed: 0,structure,material_id,formula_pretty,spacegroup,band_gap,e_hull,compsition
0,"[[ 1.23176784 4.16350682 -0.05711363] Li, [-1...",mp-763425,LiMn2F7,C2/c,1.0465,0.015323,Li2 Mn4 F14
1,"[[3.69926098 8.41643016 2.75210722] Li, [2.320...",mp-1235216,LiP4(Br3O)4,Pm,0.5519,0.077328,Li1 P4 Br12 O4
2,"[[ 3.98114357 -2.51993556 5.39309484] Li, [ 3...",mp-768922,LiVBO4,R-3,0.0000,0.102854,Li6 V6 B6 O24
3,"[[0. 3.85342 4.59949125] Li, [3.853...",mp-1211135,LiNd(SO4)2,P-4n2,5.3668,0.130943,Li2 Nd2 S4 O16
4,"[[-1.08594717 7.96719939 0.08555979] Li, [-2...",mp-768447,LiMn3(PO4)3,P1,0.0000,0.031609,Li2 Mn6 P6 O24
...,...,...,...,...,...,...,...
21681,"[[1.74146743 2.45301237 8.73754608] Li, [1.711...",mp-1001069,Li48P16S61,Pm,2.2955,0.018961,Li48 P16 S61
21682,"[[ 0. 0. 48.81585546] Li, [ 0...",mp-676829,Li49Cu8N19,P6/mmm,0.0000,0.010936,Li49 Cu8 N19
21683,"[[5.72960091 9.90138534 2.10488774] Li, [-0.07...",mp-722623,Li66Be24H35,P1,0.0000,0.227316,Li66 Be24 H35
21684,"[[12.07035636 11.60824897 7.6898112 ] Ba, [-1...",mp-647275,Ba39Li80N9,I-42m,0.0000,0.060914,Ba39 Li80 N9


In [111]:
# Charge decoration of the structures (makes the structure simplification possible).
# Fixed element -> oxidation state table used by the manual decoration strategy.
oxidation_dictionary = {
    "H": 1, "Li": 1, "Na": 1, "K": 1, "Rb": 1, "Cs": 1, "Be": 2, "Mg": 2, "Ca": 2,
    "Sr": 2, "Ba": 2, "Ra": 2, "B": 3, "Al": 3, "Ga": 3, "In": 3, "Tl": 3,
    "C": 4, "Si": 4, "Ge": 4, "Sn": 4, "Pb": 4, "N": -3, "P": 5, "As": 5,
    "Sb": 5, "Bi": 5, "O": -2, "S": -2, "Se": -2, "Te": -2, "Po": -2, "F": -1,
    "Cl": -1, "Br": -1, "I": -1, "Sc": 3, "Y": 3, "Lu": 3, "Ti": 4, "Zr": 4, "Hf": 4,
    "V": 5, "Nb": 5, "Ta": 5, "Cr": 6, "Mo": 4, "W": 6, "Mn": 7, "Tc": 7, "Re": 7,
    "Fe": 3, "Ru": 3, "Os": 3, "Co": 3, "Rh": 3, "Ir": 3, "Cu": 2, "Ag": 1, "Au": 3,
    "Zn": 2, "Ni": 2, "Cd": 2, "Hg": 2, "La": 3, "Ce": 3, "Pd": 2, "Pm": 3, "Ho": 3,
    "Eu": 3, "Np": 3, "Pu": 4, "Gd": 3, "Sm": 2, "Tb": 3, "Tm": 3, "Yb": 3, "Ac": 3,
    "Dy": 3, "Er": 3, "Pr": 3, "U": 6, "Pt": 2, "Nd": 3, "Th": 4, "Pa": 5,
}

# Manual strategy: assign the fixed oxidation states from the table above.
oxidation_decorator = OxidationStateDecorationTransformation(
    oxidation_states=oxidation_dictionary,
)
# Automatic strategy: let pymatgen derive the oxidation states itself.
oxidation_auto_decorator = AutoOxiStateDecorationTransformation(
    distance_scale_factor=1,
)


In [112]:
# Replace every structure with the return value of apply_charge_decoration,
# showing a progress bar; the "structure" column is overwritten in place.
structures_df.loc[:, "structure"] = structures_df["structure"].progress_apply(apply_charge_decoration)

  0%|          | 0/21686 [00:00<?, ?it/s]

Structure charge (0) is set to be not equal to the sum of oxidation states (16). Use `unset_charge` if this is not desired.
Structure charge (0) is set to be not equal to the sum of oxidation states (1). Use `unset_charge` if this is not desired.
Structure charge (0) is set to be not equal to the sum of oxidation states (6). Use `unset_charge` if this is not desired.
Structure charge (0) is set to be not equal to the sum of oxidation states (-32). Use `unset_charge` if this is not desired.
Structure charge (0) is set to be not equal to the sum of oxidation states (26). Use `unset_charge` if this is not desired.
Structure charge (0) is set to be not equal to the sum of oxidation states (4). Use `unset_charge` if this is not desired.
Structure charge (0) is set to be not equal to the sum of oxidation states (2). Use `unset_charge` if this is not desired.
Structure charge (0) is set to be not equal to the sum of oxidation states (32). Use `unset_charge` if this is not desired.
Structure c

In [113]:
# Show the table again after the charge-decoration step.
structures_df

Unnamed: 0,structure,material_id,formula_pretty,spacegroup,band_gap,e_hull,compsition
0,"[[ 1.23176784 4.16350682 -0.05711363] Li+, [-...",mp-763425,LiMn2F7,C2/c,1.0465,0.015323,Li2 Mn4 F14
1,"[[3.69926098 8.41643016 2.75210722] Li+, [2.32...",mp-1235216,LiP4(Br3O)4,Pm,0.5519,0.077328,Li1 P4 Br12 O4
2,"[[ 3.98114357 -2.51993556 5.39309484] Li+, [ ...",mp-768922,LiVBO4,R-3,0.0000,0.102854,Li6 V6 B6 O24
3,"[[0. 3.85342 4.59949125] Li+, [3.85...",mp-1211135,LiNd(SO4)2,P-4n2,5.3668,0.130943,Li2 Nd2 S4 O16
4,"[[-1.08594717 7.96719939 0.08555979] Li+, [-...",mp-768447,LiMn3(PO4)3,P1,0.0000,0.031609,Li2 Mn6 P6 O24
...,...,...,...,...,...,...,...
21681,"[[1.74146743 2.45301237 8.73754608] Li+, [1.71...",mp-1001069,Li48P16S61,Pm,2.2955,0.018961,Li48 P16 S61
21682,"[[ 0. 0. 48.81585546] Li+, [ ...",mp-676829,Li49Cu8N19,P6/mmm,0.0000,0.010936,Li49 Cu8 N19
21683,"[[5.72960091 9.90138534 2.10488774] Li+, [-0.0...",mp-722623,Li66Be24H35,P1,0.0000,0.227316,Li66 Be24 H35
21684,"[[12.07035636 11.60824897 7.6898112 ] Ba2+, [...",mp-647275,Ba39Li80N9,I-42m,0.0000,0.060914,Ba39 Li80 N9


In [114]:
# Drop structures the simplification step cannot handle: disordered ones and
# ones whose sites carry no oxidation state.
unordered_count = 0
unordered = []
no_oxidation_count = 0
no_oxidation_state = []

# Iterate over the actual index labels so the labels collected here are valid
# for `drop` even if the frame's index is not 0..n-1.
for idx in tqdm.tqdm(structures_df.index, desc="Remove structure with problem of charge balance"):
    candidate = structures_df.at[idx, "structure"]

    # Disordered structures were previously never detected, so the report
    # below always said 0; record them explicitly.
    if not candidate.is_ordered:
        unordered_count += 1
        unordered.append(idx)
        continue

    try:
        for site in candidate:
            # Attribute access only: raises when the site has no oxidation state.
            site.specie.oxi_state
    except Exception:
        no_oxidation_count += 1
        no_oxidation_state.append(idx)

print("{} structures are disordered.".format(unordered_count))
print("{} structures are not charge decorated.".format(no_oxidation_count))

for_removal = np.unique(unordered+no_oxidation_state)
structures_df = structures_df.drop(for_removal)
# reset_index() keeps the previous labels in a new "index" column.
structures_df = structures_df.reset_index()

Remove structure with problem of charge balance:   0%|          | 0/21686 [00:00<?, ?it/s]

0 structures are disordered.
0 structures are not charge decorated.


### 1.2 结构简化

In [119]:
# One output column per simplified representation, generated in the same
# order as before; each pass shows its own progress bar.
_representation_flags = {
    'structure_A': simplification_dict_A,
    'structure_AM': simplification_dict_AM,
    'structure_CAN': simplification_dict_CAN,
    'structure_CAMN': simplification_dict_CAMN,
    'structure_A40': simplification_dict_A40,
    'structure_AM40': simplification_dict_AM40,
    'structure_CAN40': simplification_dict_CAN40,
    'structure_CAMN40': simplification_dict_CAMN40,
}

for _column_name, _flags in _representation_flags.items():
    structures_df[_column_name] = structures_df['structure'].progress_apply(
        structure_simplifications,
        simplification_dict=_flags,
    )

  0%|          | 0/21686 [00:00<?, ?it/s]

  0%|          | 0/21686 [00:00<?, ?it/s]

  0%|          | 0/21686 [00:00<?, ?it/s]

  0%|          | 0/21686 [00:00<?, ?it/s]

  0%|          | 0/21686 [00:00<?, ?it/s]

  0%|          | 0/21686 [00:00<?, ?it/s]

  0%|          | 0/21686 [00:00<?, ?it/s]

  0%|          | 0/21686 [00:00<?, ?it/s]

In [123]:
# Preview the first five rows, including the eight simplified-structure columns.
structures_df.iloc[:5]

Unnamed: 0,index,structure,material_id,formula_pretty,spacegroup,band_gap,e_hull,compsition,structure_A,structure_AM,structure_CAN,structure_CAMN,structure_A40,structure_AM40,structure_CAN40,structure_CAMN40
0,0,"[[ 1.23176784 4.16350682 -0.05711363] Li+, [-...",mp-763425,LiMn2F7,C2/c,1.0465,0.015323,Li2 Mn4 F14,"[[-0.4125833 7.11603295 0.35091376] S-, [0....","[[ 1.23176784 4.16350682 -0.05711363] Li+, [-...","[[-1.20302303 6.61623195 4.26318658] Al7+, [...","[[ 1.23176784 4.16350682 -0.05711363] Li+, [-...","[[-0.55956998 9.65118655 0.47593009] S-, [0....","[[ 1.67059671 5.64679525 -0.07746089] Li+, [-...","[[-1.63161129 8.97332675 5.78198685] Al7+, [...","[[ 1.67059671 5.64679525 -0.07746089] Li+, [-..."
1,1,"[[3.69926098 8.41643016 2.75210722] Li+, [2.32...",mp-1235216,LiP4(Br3O)4,Pm,0.5519,0.077328,Li1 P4 Br12 O4,"[[4.9213472 6.90288122 6.62062873] S-, [ 5.55...","[[3.69926098 8.41643016 2.75210722] Li+, [4.92...","[[2.32056132 3.27613999 2.74487635] Al5+, [3.5...","[[3.69926098 8.41643016 2.75210722] Li+, [2.32...","[[4.66525695 6.54367866 6.27611363] S-, [ 5.26...","[[3.50676397 7.97846764 2.60889688] Li+, [4.66...","[[2.19980717 3.10566076 2.60204228] Al5+, [3.4...","[[3.50676397 7.97846764 2.60889688] Li+, [2.19..."
2,2,"[[ 3.98114357 -2.51993556 5.39309484] Li+, [ ...",mp-768922,LiVBO4,R-3,0.0,0.102854,Li6 V6 B6 O24,"[[ 3.44211653 -4.23477597 0.34037411] S2-, [ ...","[[ 3.98114357 -2.51993556 5.39309484] Li+, [ ...","[[ 3.70978256 -4.61007675 2.10069859] Al5+, [...","[[ 3.98114357 -2.51993556 5.39309484] Li+, [ ...","[[ 4.47439084 -5.50476506 0.44245068] S2-, [ ...","[[ 5.17506951 -3.2756522 7.01045822] Li+, [ ...","[[ 4.82232863 -5.99261675 2.73068806] Al5+, [...","[[ 5.17506951 -3.2756522 7.01045822] Li+, [ ..."
3,3,"[[0. 3.85342 4.59949125] Li+, [3.85...",mp-1211135,LiNd(SO4)2,P-4n2,5.3668,0.130943,Li2 Nd2 S4 O16,"[[1.70277235 2.15064765 4.59949125] S2-, [6.00...","[[0. 3.85342 4.59949125] Li+, [3.85...","[[0. 3.85342 1.53316375] Al3+, [3.8...","[[0. 3.85342 4.59949125] Li+, [3.85...","[[2.21336402 2.79553877 5.97869024] S2-, [7.80...","[[0. 5.00890279 5.97869024] Li+, [5.00...","[[0. 5.00890279 1.99289675] Al3+, [5.0...","[[0. 5.00890279 5.97869024] Li+, [5.00..."
4,4,"[[-1.08594717 7.96719939 0.08555979] Li+, [-...",mp-768447,LiMn3(PO4)3,P1,0.0,0.031609,Li2 Mn6 P6 O24,"[[-2.10385611 5.8851766 6.60487769] S2-, [3...","[[-1.08594717 7.96719939 0.08555979] Li+, [-...","[[-1.92169457 6.93965893 4.96614429] Al7+, [...","[[-1.08594717 7.96719939 0.08555979] Li+, [-...","[[-2.73182117 7.64180113 8.57632069] S2-, [4...","[[-1.41008383 10.3452721 0.11109792] Li+, [-...","[[-2.49528752 9.01102839 6.44845339] Al7+, [...","[[-1.41008383 10.3452721 0.11109792] Li+, [-..."


### 数据保存

注：由于保存文件有点大，不要上传到github，源文件位置：家里电脑位置——D:\1-study\work\lesson\ml_for_chemistry

In [None]:
# Optional: persist the processed table as a pickle in the current working
# directory. Left disabled because the resulting file is large; uncomment the
# four lines below to write it.
# save_path = os.path.join(os.getcwd(), 'structures_df.pkl')
# save_file = open(save_path, 'wb')
# pickle.dump(structures_df, save_file)
# save_file.close()