In [1]:
import pandas as pd
import numpy as np
from pymatgen.core import Composition

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns

# Splitting data into training and testing
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

config = {
    "mathtext.fontset":'stix',
    "font.family":'serif',
    "font.serif": ['Times New Roman'],
    "font.size": 24,
    'axes.unicode_minus': False
}
rcParams.update(config)
plt.rcParams['axes.unicode_minus'] = False 
large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (8, 6),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.rcParams['figure.dpi'] = 300 

In [2]:
a_data = pd.read_csv('./data/a.csv',index_col=0)
b_data = pd.read_csv('./data/b.csv',index_col=0)
a_data = a_data[a_data['spacegroup_number']==227]
b_data = b_data[b_data['spacegroup_number']==227]

# Get formula

In [3]:
a_data = a_data.reset_index(drop=True)
index_to_drop = []
formula = []
for i,row in a_data.iterrows():
    A1 = row['A1']
    A2 = row['A2']
    a1 = row['a1']
    a2 = row['a2']
    B = row['B']
    if A1==A2 or A1== B or A2==B:
        index_to_drop.append(i)
        continue
    if A1 != '0' and A2 != '0':
        row['formula'] = A1+str(a1)+A2+str(a2)+B+'2'+'O4'
        formula.append(row['formula'])
    elif A1 == '0' and A2 != '0':
        row['formula'] = A2+str(a2)+B+'2'+'O4'
        formula.append(row['formula'])
    elif A2 == '0' and A1 != '0':
        row['formula'] = A1+str(a1)+B+'2'+'O4'
        formula.append(row['formula'])
a_data = a_data.drop(index=index_to_drop)
a_data['formula'] = formula
a_data

Unnamed: 0,A1,A2,a1,a2,B,C,spacegroup_number,formula
0,Na,Ag,0.1,0.9,Mn,O,227,Na0.1Ag0.9Mn2O4
1,Na,Al,0.1,0.9,Mn,O,227,Na0.1Al0.9Mn2O4
2,Na,Ba,0.1,0.9,Mn,O,227,Na0.1Ba0.9Mn2O4
3,Na,Be,0.1,0.9,Mn,O,227,Na0.1Be0.9Mn2O4
4,Na,Ca,0.1,0.9,Mn,O,227,Na0.1Ca0.9Mn2O4
...,...,...,...,...,...,...,...,...
40459,Li,0,1.0,0.0,Ni,O,227,Li1.0Ni2O4
40460,Na,0,1.0,0.0,Sn,O,227,Na1.0Sn2O4
40461,Mn,0,1.0,0.0,Rh,O,227,Mn1.0Rh2O4
40462,Li,0,1.0,0.0,V,O,227,Li1.0V2O4


In [4]:
a_data['formula'] = a_data['formula'].apply(lambda x:Composition(x).reduced_formula)
a_data

Unnamed: 0,A1,A2,a1,a2,B,C,spacegroup_number,formula
0,Na,Ag,0.1,0.9,Mn,O,227,Na0.1Mn2Ag0.9O4
1,Na,Al,0.1,0.9,Mn,O,227,Na0.1Mn2Al0.9O4
2,Na,Ba,0.1,0.9,Mn,O,227,Ba0.9Na0.1Mn2O4
3,Na,Be,0.1,0.9,Mn,O,227,Na0.1Mn2Be0.9O4
4,Na,Ca,0.1,0.9,Mn,O,227,Na0.1Ca0.9Mn2O4
...,...,...,...,...,...,...,...,...
40459,Li,0,1.0,0.0,Ni,O,227,Li(NiO2)2
40460,Na,0,1.0,0.0,Sn,O,227,Na(SnO2)2
40461,Mn,0,1.0,0.0,Rh,O,227,Mn(RhO2)2
40462,Li,0,1.0,0.0,V,O,227,LiV2O4


In [5]:
a_data = a_data.drop_duplicates(subset="formula",keep='first')
a_data

Unnamed: 0,A1,A2,a1,a2,B,C,spacegroup_number,formula
0,Na,Ag,0.1,0.9,Mn,O,227,Na0.1Mn2Ag0.9O4
1,Na,Al,0.1,0.9,Mn,O,227,Na0.1Mn2Al0.9O4
2,Na,Ba,0.1,0.9,Mn,O,227,Ba0.9Na0.1Mn2O4
3,Na,Be,0.1,0.9,Mn,O,227,Na0.1Mn2Be0.9O4
4,Na,Ca,0.1,0.9,Mn,O,227,Na0.1Ca0.9Mn2O4
...,...,...,...,...,...,...,...,...
40457,Li,0,1.0,0.0,Mo,O,227,Li(MoO2)2
40458,Eu,0,1.0,0.0,La,O,227,La2EuO4
40459,Li,0,1.0,0.0,Ni,O,227,Li(NiO2)2
40460,Na,0,1.0,0.0,Sn,O,227,Na(SnO2)2


In [6]:
b_data = b_data.reset_index(drop=True)
index_to_drop = []
formula = []
for i,row in b_data.iterrows():
    B1 = row['B1']
    B2 = row['B2']
    b1 = row['b1']
    b2 = row['b2']
    A = row['A']
    if B1==B2 or B1== A or B2==A:
        index_to_drop.append(i)
        continue
    if str(B1) != '0' and str(B2) != '0':
        formula.append(A+B1+str(b1)+B2+str(b2)+'O4')
    elif str(B1) == '0' and str(B2) != '0':
        formula.append(A+B2+str(b2)+'O4')
    elif str(B2) == '0' and str(B1) != '0':
        formula.append(A+B1+str(b1)+'O4')
b_data = b_data.drop(index=index_to_drop)
b_data['formula'] = formula
b_data

Unnamed: 0,A,B1,B2,b1,b2,C,spacegroup_number,formula
0,Na,Mn,Ag,0.1,1.9,O,227,NaMn0.1Ag1.9O4
1,Na,Mn,Al,0.1,1.9,O,227,NaMn0.1Al1.9O4
2,Na,Mn,Bi,0.1,1.9,O,227,NaMn0.1Bi1.9O4
3,Na,Mn,Ca,0.1,1.9,O,227,NaMn0.1Ca1.9O4
4,Na,Mn,Cd,0.1,1.9,O,227,NaMn0.1Cd1.9O4
...,...,...,...,...,...,...,...,...
129483,V,Tm,0,2.0,0.0,O,227,VTm2.0O4
129485,V,W,0,2.0,0.0,O,227,VW2.0O4
129486,V,Y,0,2.0,0.0,O,227,VY2.0O4
129487,V,Yb,0,2.0,0.0,O,227,VYb2.0O4


In [7]:
b_data['formula'] = b_data['formula'].apply(lambda x:Composition(x).reduced_formula)
b_data

Unnamed: 0,A,B1,B2,b1,b2,C,spacegroup_number,formula
0,Na,Mn,Ag,0.1,1.9,O,227,Na1Mn0.1Ag1.9O4
1,Na,Mn,Al,0.1,1.9,O,227,Na1Mn0.1Al1.9O4
2,Na,Mn,Bi,0.1,1.9,O,227,Na1Mn0.1Bi1.9O4
3,Na,Mn,Ca,0.1,1.9,O,227,Na1Ca1.9Mn0.1O4
4,Na,Mn,Cd,0.1,1.9,O,227,Na1Mn0.1Cd1.9O4
...,...,...,...,...,...,...,...,...
129483,V,Tm,0,2.0,0.0,O,227,Tm2VO4
129485,V,W,0,2.0,0.0,O,227,V(WO2)2
129486,V,Y,0,2.0,0.0,O,227,Y2VO4
129487,V,Yb,0,2.0,0.0,O,227,Yb2VO4


In [8]:
b_data = b_data.drop_duplicates(subset="formula",keep='first')
b_data

Unnamed: 0,A,B1,B2,b1,b2,C,spacegroup_number,formula
0,Na,Mn,Ag,0.1,1.9,O,227,Na1Mn0.1Ag1.9O4
1,Na,Mn,Al,0.1,1.9,O,227,Na1Mn0.1Al1.9O4
2,Na,Mn,Bi,0.1,1.9,O,227,Na1Mn0.1Bi1.9O4
3,Na,Mn,Ca,0.1,1.9,O,227,Na1Ca1.9Mn0.1O4
4,Na,Mn,Cd,0.1,1.9,O,227,Na1Mn0.1Cd1.9O4
...,...,...,...,...,...,...,...,...
129483,V,Tm,0,2.0,0.0,O,227,Tm2VO4
129485,V,W,0,2.0,0.0,O,227,V(WO2)2
129486,V,Y,0,2.0,0.0,O,227,Y2VO4
129487,V,Yb,0,2.0,0.0,O,227,Yb2VO4


# Coarse screening of spinel

In [9]:
spinels = pd.read_csv('./data/spinel_oxides.csv')
spinels = spinels[['formula_pretty','band_gap','spacegroup_number','is_gap_direct']]
spinels = spinels.rename(columns={"formula_pretty":"formula"})
spinels = spinels[['formula']]
mp_data = pd.read_csv('./data/data.csv')
mp_data_non_spinel = mp_data[mp_data['spacegroup']!=227]
mp_data_non_spinel = mp_data_non_spinel[mp_data_non_spinel['formula'].str.contains('O')][['formula']]
mp_data_non_spinel['select'] = mp_data_non_spinel['formula'].map(lambda x: 1 if 2<len(Composition(x))<5 else 0)
mp_data_non_spinel = mp_data_non_spinel[mp_data_non_spinel['select']==1]
mp_data_non_spinel = mp_data_non_spinel.drop(columns='select')
mp_data_non_spinel = mp_data_non_spinel.sample(200,random_state=40).reset_index(drop=True)

In [10]:
spinels['is_spinel'] = 1
mp_data_non_spinel['is_spinel'] = 0
data = pd.concat([spinels,mp_data_non_spinel],axis=0).reset_index(drop=True)
data

Unnamed: 0,formula,is_spinel
0,NaMn2O4,1
1,Cd(RhO2)2,1
2,CaIn2O4,1
3,Si(NiO2)2,1
4,Mg2FeO4,1
...,...,...
365,Ni4P2O9,0
366,Re2NiO6,0
367,MgMn4(Si4O11)2,0
368,KMnO2,0


In [11]:
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
feature_calculators = MultipleFeaturizer([cf.ElementProperty.from_preset("magpie"),
                                          cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])
feature_labels = feature_calculators.feature_labels()
data['composition'] = data['formula'].map(Composition)
data_features = feature_calculators.featurize_dataframe(data,col_id='composition')

MultipleFeaturizer:   0%|          | 0/370 [00:00<?, ?it/s]

In [12]:
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model
        to generalize and improves the interpretability of the model.
        
    Inputs: 
        threshold: any features with correlations greater than this value are removed
    
    Output: 
        dataframe that contains only the non-highly-collinear features
    '''  
    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)
            
            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                # print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns = drops)
    return x

In [13]:
X = data_features[feature_labels]
y = data['is_spinel']
X = remove_collinear_features(X, 0.8);
feature_labels = X.columns

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [15]:
from sklearn.model_selection import StratifiedKFold
from tpot import TPOTClassifier
cv = StratifiedKFold(n_splits=5) 

model_clf = TPOTClassifier(generations=10,population_size=50,cv=cv, scoring='roc_auc', verbosity=2, 
                       random_state=42, n_jobs=-1) 
model_clf.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/550 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9880076628352491


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2a33a0>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 2 - Current best internal CV score: 0.9881000098241477


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab259a60>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 3 - Current best internal CV score: 0.9915404263680125


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2f1c10>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 4 - Current best internal CV score: 0.991582178995972


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab259ca0>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2a3430>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 5 - Current best internal CV score: 0.9920620886138127


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2a3430>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2a3430>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 6 - Current best internal CV score: 0.9926299243540623


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2f1dc0>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab3351f0>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 7 - Current best internal CV score: 0.9926299243540623


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2f1790>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 8 - Current best internal CV score: 0.9926299243540623


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab335dc0>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab2f1af0>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 9 - Current best internal CV score: 0.9945141958935062


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab7ab335e50>
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/usr/local/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul


Generation 10 - Current best internal CV score: 0.9945141958935062

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=3, max_features=1.0, min_samples_leaf=17, min_samples_split=4, n_estimators=100, subsample=0.6500000000000001)


In [16]:
from sklearn.metrics import roc_auc_score
y_test_pred = model_clf.predict(X_test)
roc_auc = roc_auc_score(y_test,y_test_pred)
roc_auc

0.9373219373219374

In [17]:
a_data['composition'] = a_data['formula'].map(Composition)
b_data['composition'] = b_data['formula'].map(Composition)

In [18]:
a_data_features = feature_calculators.featurize_dataframe(a_data, col_id='composition',ignore_errors=True)

MultipleFeaturizer:   0%|          | 0/33941 [00:00<?, ?it/s]

In [19]:
a_X = a_data_features[feature_labels]
a_data['is_spinel'] = model_clf.predict(a_X)
a_data['is_spinel'].value_counts()

1    28975
0     4966
Name: is_spinel, dtype: int64

In [20]:
b_data_features = feature_calculators.featurize_dataframe(b_data, col_id='composition',ignore_errors=True)

MultipleFeaturizer:   0%|          | 0/109474 [00:00<?, ?it/s]

In [21]:
b_X = b_data_features[feature_labels]
b_data['is_spinel'] = model_clf.predict(b_X)
b_data['is_spinel'].value_counts()

1    82767
0    26707
Name: is_spinel, dtype: int64

In [22]:
a_data_spinel = a_data[a_data['is_spinel']==1]
b_data_spinel = b_data[b_data['is_spinel']==1]
print(a_data_spinel.shape,b_data_spinel.shape)

(28975, 10) (82767, 10)


# feature project

In [23]:
ele_df1 = pd.read_csv('./data/elements.csv')
ele_df2 = pd.read_csv('./data/elements2.csv')
spinels = pd.read_csv('./data/spinel_oxides.csv')
mp_data = pd.read_csv('./data/data.csv')

In [24]:
spinels = spinels[['formula_pretty','spacegroup_number','band_gap','is_gap_direct']]
spinels.columns = ['formula','spacegroup','GGA','is_gap_direct']
spinels

Unnamed: 0,formula,spacegroup,GGA,is_gap_direct
0,NaMn2O4,227,0.0000,False
1,Cd(RhO2)2,227,0.8367,False
2,CaIn2O4,227,2.0208,True
3,Si(NiO2)2,227,3.5770,True
4,Mg2FeO4,227,0.0000,False
...,...,...,...,...
165,MgCr2O4,227,2.5264,True
166,Mg(RhO2)2,227,1.1521,False
167,LiV2O4,227,0.0000,False
168,MnAl2O4,227,2.6578,True


In [25]:
spinels = pd.merge(spinels,mp_data, on=['formula','spacegroup'], how='inner')
spinels

Unnamed: 0,formula,spacegroup,GGA_x,is_gap_direct,0,GGA_y,target
0,NaMn2O4,227,0.0000,False,131,0.0000,0.000000
1,Cd(RhO2)2,227,0.8367,False,125609,0.8367,2.470591
2,CaIn2O4,227,2.0208,True,81767,2.0208,3.393936
3,Si(NiO2)2,227,3.5770,True,843,3.5770,3.577000
4,Mg2FeO4,227,0.0000,False,363,0.0000,0.000000
...,...,...,...,...,...,...,...
165,MgCr2O4,227,2.5264,True,27914,2.5264,2.526400
166,Mg(RhO2)2,227,1.1521,False,129217,1.1521,3.042557
167,LiV2O4,227,0.0000,False,28026,0.0000,0.000000
168,MnAl2O4,227,2.6578,True,28051,2.6578,2.657800


In [26]:
spinels = spinels[['formula','spacegroup','GGA_x','is_gap_direct','target']]
spinels.columns = ['formula','spacegroup','GGA','is_gap_direct','target']
spinels

Unnamed: 0,formula,spacegroup,GGA,is_gap_direct,target
0,NaMn2O4,227,0.0000,False,0.000000
1,Cd(RhO2)2,227,0.8367,False,2.470591
2,CaIn2O4,227,2.0208,True,3.393936
3,Si(NiO2)2,227,3.5770,True,3.577000
4,Mg2FeO4,227,0.0000,False,0.000000
...,...,...,...,...,...
165,MgCr2O4,227,2.5264,True,2.526400
166,Mg(RhO2)2,227,1.1521,False,3.042557
167,LiV2O4,227,0.0000,False,0.000000
168,MnAl2O4,227,2.6578,True,2.657800


In [27]:
def get_a_symbol(composition: Composition):
    comp = composition.as_dict()
    for k, v in comp.items():
        if v == 1: return k
    return None

def get_b_symbol(composition: Composition):
    comp = composition.as_dict()
    for k, v in comp.items():
        if v == 2: return k
    return None

spinels['composition'] = spinels['formula'].map(Composition)
spinels['A'] = spinels['composition'].map(get_a_symbol)
spinels['B'] = spinels['composition'].map(get_b_symbol)

In [28]:
def get_base_features_AB2O4(data:pd.DataFrame):
    result = []
    for i,row in data.iterrows():
        tmp = {}
        a = row['A']
        b = row['B']
        o = 'O'
        tmp["A_Density"] = ele_df1[ele_df1['symbol']==a]['Density'].values[0]
        tmp["B_Density"] = 2 * ele_df1[ele_df1['symbol']==b]['Density'].values[0]
        tmp["O_Density"] = ele_df1[ele_df1['symbol']==o]['Density'].values[0]
        tmp["A_dipole Polarizability"] = ele_df1[ele_df1['symbol']==a]['dipole_polarizability'].values[0]
        tmp["B_dipole Polarizability"] = 2 * ele_df1[ele_df1['symbol']==b]['dipole_polarizability'].values[0]
        tmp["O_dipole Polarizability"] = ele_df1[ele_df1['symbol']==o]['dipole_polarizability'].values[0]
        tmp["A_covalent Radius"] = ele_df1[ele_df1['symbol']==a]['covalent_radius'].values[0]
        tmp["B_covalent Radius"] = 2 * ele_df1[ele_df1['symbol']==b]['covalent_radius'].values[0]
        tmp["O_covalent Radius"] = ele_df1[ele_df1['symbol']==o]['covalent_radius'].values[0]
        tmp["A_atomic Radius"] = ele_df2[ele_df1['symbol']==a]['Atomic radius (Å)'].values[0]
        tmp["B_atomic Radius"] = 2 * ele_df2[ele_df1['symbol']==b]['Atomic radius (Å)'].values[0]
        tmp["O_atomic Radius"] = ele_df2[ele_df1['symbol']==o]['Atomic radius (Å)'].values[0]
        tmp["A_FirstIonization"] = ele_df1[ele_df1['symbol']==a]['FirstIonization'].values[0]
        tmp["B_FirstIonization"] = 2 * ele_df1[ele_df1['symbol']==b]['FirstIonization'].values[0]
        tmp["O_FirstIonization"] = ele_df1[ele_df1['symbol']==o]['FirstIonization'].values[0]
        tmp["A_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==a]['number_of_valence_electrons'].values[0]
        tmp["B_number of Valence Electrons"] = 2 * ele_df1[ele_df1['symbol']==b]['number_of_valence_electrons'].values[0]
        tmp["O_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==o]['number_of_valence_electrons'].values[0]
        tmp["A_number"] = ele_df1[ele_df1['symbol']==a]['number'].values[0]
        tmp["B_number"] = 2 * ele_df1[ele_df1['symbol']==b]['number'].values[0]
        tmp["O_number"] = ele_df1[ele_df1['symbol']==o]['number'].values[0]
        tmp["A_Period"] = ele_df1[ele_df1['symbol']==a]['Period'].values[0]
        tmp["B_Period"] = 2 * ele_df1[ele_df1['symbol']==b]['Period'].values[0]
        tmp["O_Period"] = ele_df1[ele_df1['symbol']==o]['Period'].values[0]
        tmp["A_Electronegativity"] = ele_df1[ele_df1['symbol']==a]['Electronegativity'].values[0]
        tmp["B_Electronegativity"] = 2 * ele_df1[ele_df1['symbol']==b]['Electronegativity'].values[0]
        tmp["O_Electronegativity"] = ele_df1[ele_df1['symbol']==o]['Electronegativity'].values[0]
        tmp["A_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of s electrons'].values[0] + ele_df2[ele_df2['symbol']==a]['Number of p electrons'].values[0]
        tmp["B_number of s+p Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of s electrons'].values[0] + 2 * ele_df2[ele_df2['symbol']==b]['Number of p electrons'].values[0]
        tmp["O_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of s electrons'].values[0] + ele_df2[ele_df2['symbol']==o]['Number of p electrons'].values[0]
        tmp["A_number of d Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of d electrons'].values[0]
        tmp["B_number of d Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of d electrons'].values[0]
        tmp["O_number of d Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of d electrons'].values[0]
        tmp["A_Mulliken EN"] = ele_df2[ele_df2['symbol']==a]['Mulliken EN'].values[0]
        tmp["B_Mulliken EN"] = 2 * ele_df2[ele_df2['symbol']==b]['Mulliken EN'].values[0]
        tmp["O_Mulliken EN"] = ele_df2[ele_df2['symbol']==o]['Mulliken EN'].values[0]
        result.append(tmp)
    return pd.DataFrame(result)
base_features_AB2O4 = get_base_features_AB2O4(spinels)
base_features_AB2O4

Unnamed: 0,A_Density,B_Density,O_Density,A_dipole Polarizability,B_dipole Polarizability,O_dipole Polarizability,A_covalent Radius,B_covalent Radius,O_covalent Radius,A_atomic Radius,B_atomic Radius,O_atomic Radius,A_FirstIonization,B_FirstIonization,O_FirstIonization,A_number of Valence Electrons,B_number of Valence Electrons,O_number of Valence Electrons,A_number,B_number,O_number,A_Period,B_Period,O_Period,A_Electronegativity,B_Electronegativity,O_Electronegativity,A_number of s+p Electrons,B_number of s+p Electrons,O_number of s+p Electrons,A_number of d Electrons,B_number of d Electrons,O_number of d Electrons,A_Mulliken EN,B_Mulliken EN,O_Mulliken EN
0,0.971,14.88,0.00143,162.7000,136.0,5.3,155,238,63,1.80,2.8,0.6,495.8,1434.6,1313.9,1,14,6,11,50,8,3,8,2,0.93,3.10,3.44,1,4,6,0,10,0,2.85,7.44,7.54
1,8.690,24.80,0.00143,46.0000,132.0,5.3,136,250,63,1.55,2.7,0.6,867.8,1439.4,1313.9,12,18,6,48,90,8,5,10,2,1.69,4.56,3.44,2,2,6,10,16,0,4.33,8.60,7.54
2,1.540,14.62,0.00143,160.8000,130.0,5.3,171,284,63,1.80,3.1,0.6,589.8,1116.6,1313.9,2,6,6,20,98,8,4,10,2,1.00,3.56,3.44,2,6,6,0,20,0,2.20,6.20,7.54
3,2.330,17.82,0.00143,37.3000,98.0,5.3,116,220,63,1.10,2.7,0.6,786.5,1474.2,1313.9,4,20,6,14,56,8,3,8,2,1.90,3.82,3.44,4,4,6,0,16,0,4.77,8.80,7.54
4,7.870,3.48,0.00143,62.0000,142.4,5.3,116,278,63,1.40,3.0,0.6,762.5,1475.4,1313.9,8,4,6,26,24,8,4,6,2,1.83,2.62,3.44,2,4,6,6,0,0,4.06,7.50,7.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,1.740,14.30,0.00143,71.2000,166.0,5.3,139,244,63,1.50,2.8,0.6,737.7,1305.8,1313.9,2,12,6,12,48,8,3,8,2,1.31,3.32,3.44,2,2,6,0,10,0,3.75,7.44,7.54
166,1.740,24.80,0.00143,71.2000,132.0,5.3,139,250,63,1.50,2.7,0.6,737.7,1439.4,1313.9,2,18,6,12,90,8,3,10,2,1.31,4.56,3.44,2,2,6,0,16,0,3.75,8.60,7.54
167,0.534,12.22,0.00143,164.1125,174.0,5.3,133,268,63,1.45,2.7,0.6,520.2,1301.8,1313.9,1,10,6,3,46,8,2,8,2,0.98,3.26,3.44,1,4,6,0,6,0,3.01,7.20,7.54
168,7.440,5.40,0.00143,68.0000,115.6,5.3,119,252,63,1.40,2.5,0.6,717.3,1155.0,1313.9,7,6,6,25,26,8,4,6,2,1.55,3.22,3.44,2,6,6,5,0,0,3.72,6.46,7.54


In [29]:
def get_base_features_AAB2O4(data:pd.DataFrame):
    result = []
    for i,row in data.iterrows():
        if row['formula'] in spinels['formula']:
            continue
        tmp = {}
        o = 'O'
        if row['A1'] != '0' and row['A2'] != '0':
            A1 = row['A1']
            A2 = row['A2']
            a1 = row['a1']
            a2 = row['a2']
            b = row['B']
            tmp["A_Density"] = a1 * ele_df1[ele_df1['symbol']==A1]['Density'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['Density'].values[0]
            tmp["B_Density"] = 2 * ele_df1[ele_df1['symbol']==b]['Density'].values[0]
            tmp["O_Density"] = ele_df1[ele_df1['symbol']==o]['Density'].values[0]
            tmp["A_dipole Polarizability"] = a1 * ele_df1[ele_df1['symbol']==A1]['dipole_polarizability'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['dipole_polarizability'].values[0]
            tmp["B_dipole Polarizability"] = 2 * ele_df1[ele_df1['symbol']==b]['dipole_polarizability'].values[0]
            tmp["O_dipole Polarizability"] = ele_df1[ele_df1['symbol']==o]['dipole_polarizability'].values[0]
            tmp["A_covalent Radius"] = a1 * ele_df1[ele_df1['symbol']==A1]['covalent_radius'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['covalent_radius'].values[0]
            tmp["B_covalent Radius"] = 2 * ele_df1[ele_df1['symbol']==b]['covalent_radius'].values[0]
            tmp["O_covalent Radius"] = ele_df1[ele_df1['symbol']==o]['covalent_radius'].values[0]
            tmp["A_atomic Radius"] = a1 * ele_df2[ele_df1['symbol']==A1]['Atomic radius (Å)'].values[0] + \
            a2 * ele_df2[ele_df1['symbol']==A2]['Atomic radius (Å)'].values[0]
            tmp["B_atomic Radius"] = 2 * ele_df2[ele_df1['symbol']==b]['Atomic radius (Å)'].values[0]
            tmp["O_atomic Radius"] = ele_df2[ele_df1['symbol']==o]['Atomic radius (Å)'].values[0]
            tmp["A_FirstIonization"] = a1 * ele_df1[ele_df1['symbol']==A1]['FirstIonization'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['FirstIonization'].values[0]
            tmp["B_FirstIonization"] = 2 * ele_df1[ele_df1['symbol']==b]['FirstIonization'].values[0]
            tmp["O_FirstIonization"] = ele_df1[ele_df1['symbol']==o]['FirstIonization'].values[0]
            tmp["A_number of Valence Electrons"] = a1 * ele_df1[ele_df1['symbol']==A1]['number_of_valence_electrons'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['number_of_valence_electrons'].values[0]
            tmp["B_number of Valence Electrons"] = 2 * ele_df1[ele_df1['symbol']==b]['number_of_valence_electrons'].values[0]
            tmp["O_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==o]['number_of_valence_electrons'].values[0]
            tmp["A_number"] = a1 * ele_df1[ele_df1['symbol']==A1]['number'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['number'].values[0]
            tmp["B_number"] = 2 * ele_df1[ele_df1['symbol']==b]['number'].values[0]
            tmp["O_number"] = ele_df1[ele_df1['symbol']==o]['number'].values[0]
            tmp["A_Period"] = a1 * ele_df1[ele_df1['symbol']==A1]['Period'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['Period'].values[0]
            tmp["B_Period"] = 2 * ele_df1[ele_df1['symbol']==b]['Period'].values[0]
            tmp["O_Period"] = ele_df1[ele_df1['symbol']==o]['Period'].values[0]
            tmp["A_Electronegativity"] = a1 * ele_df1[ele_df1['symbol']==A1]['Electronegativity'].values[0] + \
            a2 * ele_df1[ele_df1['symbol']==A2]['Electronegativity'].values[0]
            tmp["B_Electronegativity"] = 2 * ele_df1[ele_df1['symbol']==b]['Electronegativity'].values[0]
            tmp["O_Electronegativity"] = ele_df1[ele_df1['symbol']==o]['Electronegativity'].values[0]
            tmp["A_number of s+p Electrons"] = a1*(ele_df2[ele_df2['symbol']==A1]['Number of s electrons'].values[0] \
                                                   + ele_df2[ele_df2['symbol']==A1]['Number of p electrons'].values[0])+ \
            a2*(ele_df2[ele_df2['symbol']==A2]['Number of s electrons'].values[0] + \
                ele_df2[ele_df2['symbol']==A2]['Number of p electrons'].values[0])
            tmp["B_number of s+p Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of s electrons'].values[0] \
            + 2 * ele_df2[ele_df2['symbol']==b]['Number of p electrons'].values[0]
            tmp["O_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of s electrons'].values[0] \
            + ele_df2[ele_df2['symbol']==o]['Number of p electrons'].values[0]
            tmp["A_number of d Electrons"] = a1*ele_df2[ele_df2['symbol']==A1]['Number of d electrons'].values[0] + \
            a2*ele_df2[ele_df2['symbol']==A2]['Number of d electrons'].values[0]
            tmp["B_number of d Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of d electrons'].values[0]
            tmp["O_number of d Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of d electrons'].values[0]
            tmp["A_Mulliken EN"] = a1*ele_df2[ele_df2['symbol']==A1]['Mulliken EN'].values[0] \
            + a2*ele_df2[ele_df2['symbol']==A2]['Mulliken EN'].values[0]
            tmp["B_Mulliken EN"] = 2 * ele_df2[ele_df2['symbol']==b]['Mulliken EN'].values[0]
            tmp["O_Mulliken EN"] = ele_df2[ele_df2['symbol']==o]['Mulliken EN'].values[0]
            result.append(tmp)
        elif row['A1'] == '0' and row['A2'] != '0':
                a = row['A2']
                b = row['B']
                tmp["A_Density"] = ele_df1[ele_df1['symbol']==a]['Density'].values[0]
                tmp["B_Density"] = 2 * ele_df1[ele_df1['symbol']==b]['Density'].values[0]
                tmp["O_Density"] = ele_df1[ele_df1['symbol']==o]['Density'].values[0]
                tmp["A_dipole Polarizability"] = ele_df1[ele_df1['symbol']==a]['dipole_polarizability'].values[0]
                tmp["B_dipole Polarizability"] = 2 * ele_df1[ele_df1['symbol']==b]['dipole_polarizability'].values[0]
                tmp["O_dipole Polarizability"] = ele_df1[ele_df1['symbol']==o]['dipole_polarizability'].values[0]
                tmp["A_covalent Radius"] = ele_df1[ele_df1['symbol']==a]['covalent_radius'].values[0]
                tmp["B_covalent Radius"] = 2 * ele_df1[ele_df1['symbol']==b]['covalent_radius'].values[0]
                tmp["O_covalent Radius"] = ele_df1[ele_df1['symbol']==o]['covalent_radius'].values[0]
                tmp["A_atomic Radius"] = ele_df2[ele_df1['symbol']==a]['Atomic radius (Å)'].values[0]
                tmp["B_atomic Radius"] = 2 * ele_df2[ele_df1['symbol']==b]['Atomic radius (Å)'].values[0]
                tmp["O_atomic Radius"] = ele_df2[ele_df1['symbol']==o]['Atomic radius (Å)'].values[0]
                tmp["A_FirstIonization"] = ele_df1[ele_df1['symbol']==a]['FirstIonization'].values[0]
                tmp["B_FirstIonization"] = 2 * ele_df1[ele_df1['symbol']==b]['FirstIonization'].values[0]
                tmp["O_FirstIonization"] = ele_df1[ele_df1['symbol']==o]['FirstIonization'].values[0]
                tmp["A_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==a]['number_of_valence_electrons'].values[0]
                tmp["B_number of Valence Electrons"] = 2 * ele_df1[ele_df1['symbol']==b]['number_of_valence_electrons'].values[0]
                tmp["O_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==o]['number_of_valence_electrons'].values[0]
                tmp["A_number"] = ele_df1[ele_df1['symbol']==a]['number'].values[0]
                tmp["B_number"] = 2 * ele_df1[ele_df1['symbol']==b]['number'].values[0]
                tmp["O_number"] = ele_df1[ele_df1['symbol']==o]['number'].values[0]
                tmp["A_Period"] = ele_df1[ele_df1['symbol']==a]['Period'].values[0]
                tmp["B_Period"] = 2 * ele_df1[ele_df1['symbol']==b]['Period'].values[0]
                tmp["O_Period"] = ele_df1[ele_df1['symbol']==o]['Period'].values[0]
                tmp["A_Electronegativity"] = ele_df1[ele_df1['symbol']==a]['Electronegativity'].values[0]
                tmp["B_Electronegativity"] = 2 * ele_df1[ele_df1['symbol']==b]['Electronegativity'].values[0]
                tmp["O_Electronegativity"] = ele_df1[ele_df1['symbol']==o]['Electronegativity'].values[0]
                tmp["A_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==a]['Number of p electrons'].values[0]
                tmp["B_number of s+p Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of s electrons'].values[0] 
                + 2 * ele_df2[ele_df2['symbol']==b]['Number of p electrons'].values[0]
                tmp["O_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==o]['Number of p electrons'].values[0]
                tmp["A_number of d Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of d electrons'].values[0]
                tmp["B_number of d Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of d electrons'].values[0]
                tmp["O_number of d Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of d electrons'].values[0]
                tmp["A_Mulliken EN"] = ele_df2[ele_df2['symbol']==a]['Mulliken EN'].values[0]
                tmp["B_Mulliken EN"] = 2 * ele_df2[ele_df2['symbol']==b]['Mulliken EN'].values[0]
                tmp["O_Mulliken EN"] = ele_df2[ele_df2['symbol']==o]['Mulliken EN'].values[0]
                result.append(tmp)
        elif row['A2'] == '0' and row['A1'] != '0':
                a = row['A1']
                b = row['B']
                tmp["A_Density"] = ele_df1[ele_df1['symbol']==a]['Density'].values[0]
                tmp["B_Density"] = 2 * ele_df1[ele_df1['symbol']==b]['Density'].values[0]
                tmp["O_Density"] = ele_df1[ele_df1['symbol']==o]['Density'].values[0]
                tmp["A_dipole Polarizability"] = ele_df1[ele_df1['symbol']==a]['dipole_polarizability'].values[0]
                tmp["B_dipole Polarizability"] = 2 * ele_df1[ele_df1['symbol']==b]['dipole_polarizability'].values[0]
                tmp["O_dipole Polarizability"] = ele_df1[ele_df1['symbol']==o]['dipole_polarizability'].values[0]
                tmp["A_covalent Radius"] = ele_df1[ele_df1['symbol']==a]['covalent_radius'].values[0]
                tmp["B_covalent Radius"] = 2 * ele_df1[ele_df1['symbol']==b]['covalent_radius'].values[0]
                tmp["O_covalent Radius"] = ele_df1[ele_df1['symbol']==o]['covalent_radius'].values[0]
                tmp["A_atomic Radius"] = ele_df2[ele_df1['symbol']==a]['Atomic radius (Å)'].values[0]
                tmp["B_atomic Radius"] = 2 * ele_df2[ele_df1['symbol']==b]['Atomic radius (Å)'].values[0]
                tmp["O_atomic Radius"] = ele_df2[ele_df1['symbol']==o]['Atomic radius (Å)'].values[0]
                tmp["A_FirstIonization"] = ele_df1[ele_df1['symbol']==a]['FirstIonization'].values[0]
                tmp["B_FirstIonization"] = 2 * ele_df1[ele_df1['symbol']==b]['FirstIonization'].values[0]
                tmp["O_FirstIonization"] = ele_df1[ele_df1['symbol']==o]['FirstIonization'].values[0]
                tmp["A_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==a]['number_of_valence_electrons'].values[0]
                tmp["B_number of Valence Electrons"] = 2 * ele_df1[ele_df1['symbol']==b]['number_of_valence_electrons'].values[0]
                tmp["O_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==o]['number_of_valence_electrons'].values[0]
                tmp["A_number"] = ele_df1[ele_df1['symbol']==a]['number'].values[0]
                tmp["B_number"] = 2 * ele_df1[ele_df1['symbol']==b]['number'].values[0]
                tmp["O_number"] = ele_df1[ele_df1['symbol']==o]['number'].values[0]
                tmp["A_Period"] = ele_df1[ele_df1['symbol']==a]['Period'].values[0]
                tmp["B_Period"] = 2 * ele_df1[ele_df1['symbol']==b]['Period'].values[0]
                tmp["O_Period"] = ele_df1[ele_df1['symbol']==o]['Period'].values[0]
                tmp["A_Electronegativity"] = ele_df1[ele_df1['symbol']==a]['Electronegativity'].values[0]
                tmp["B_Electronegativity"] = 2 * ele_df1[ele_df1['symbol']==b]['Electronegativity'].values[0]
                tmp["O_Electronegativity"] = ele_df1[ele_df1['symbol']==o]['Electronegativity'].values[0]
                tmp["A_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==a]['Number of p electrons'].values[0]
                tmp["B_number of s+p Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of s electrons'].values[0] 
                + 2 * ele_df2[ele_df2['symbol']==b]['Number of p electrons'].values[0]
                tmp["O_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==o]['Number of p electrons'].values[0]
                tmp["A_number of d Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of d electrons'].values[0]
                tmp["B_number of d Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of d electrons'].values[0]
                tmp["O_number of d Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of d electrons'].values[0]
                tmp["A_Mulliken EN"] = ele_df2[ele_df2['symbol']==a]['Mulliken EN'].values[0]
                tmp["B_Mulliken EN"] = 2 * ele_df2[ele_df2['symbol']==b]['Mulliken EN'].values[0]
                tmp["O_Mulliken EN"] = ele_df2[ele_df2['symbol']==o]['Mulliken EN'].values[0]
                result.append(tmp)
    return pd.DataFrame(result)
base_features_AAB2O4 = get_base_features_AAB2O4(a_data_spinel)
base_features_AAB2O4

Unnamed: 0,A_Density,B_Density,O_Density,A_dipole Polarizability,B_dipole Polarizability,O_dipole Polarizability,A_covalent Radius,B_covalent Radius,O_covalent Radius,A_atomic Radius,B_atomic Radius,O_atomic Radius,A_FirstIonization,B_FirstIonization,O_FirstIonization,A_number of Valence Electrons,B_number of Valence Electrons,O_number of Valence Electrons,A_number,B_number,O_number,A_Period,B_Period,O_Period,A_Electronegativity,B_Electronegativity,O_Electronegativity,A_number of s+p Electrons,B_number of s+p Electrons,O_number of s+p Electrons,A_number of d Electrons,B_number of d Electrons,O_number of d Electrons,A_Mulliken EN,B_Mulliken EN,O_Mulliken EN
0,9.5471,14.88,0.00143,65.7700,136.0,5.3,130.7,238,63,1.620,2.8,0.6,707.48,1434.6,1313.9,10.0,14,6,43.4,50,8,4.8,8,2,1.830,3.10,3.44,1.0,4,6,9.0,10,0,4.281,7.44,7.54
1,2.5271,14.88,0.00143,68.2900,136.0,5.3,128.9,238,63,1.305,2.8,0.6,569.33,1434.6,1313.9,2.8,14,6,12.8,50,8,3.0,8,2,1.542,3.10,3.44,2.8,4,6,0.0,10,0,3.192,7.44,7.54
2,3.3281,14.88,0.00143,261.0700,136.0,5.3,191.9,238,63,2.115,2.8,0.6,502.19,1434.6,1313.9,1.9,14,6,51.5,50,8,5.7,8,2,0.894,3.10,3.44,1.9,4,6,0.0,10,0,2.445,7.44,7.54
3,1.7621,14.88,0.00143,50.2360,136.0,5.3,107.3,238,63,1.125,2.8,0.6,859.13,1434.6,1313.9,1.9,14,6,4.7,50,8,2.1,8,2,1.506,3.10,3.44,1.9,4,6,0.0,10,0,4.695,7.44,7.54
4,1.4831,14.88,0.00143,160.9900,136.0,5.3,169.4,238,63,1.800,2.8,0.6,580.40,1434.6,1313.9,1.9,14,6,19.1,50,8,3.9,8,2,0.993,3.10,3.44,1.9,4,6,0.0,10,0,2.265,7.44,7.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28970,2.6400,5.98,0.00143,197.2000,194.0,5.3,185.0,296,63,2.000,3.2,0.6,549.50,1266.2,1313.9,2.0,6,6,38.0,42,8,5.0,8,2,0.950,2.72,3.44,2.0,4,2,0.0,2,0,2.000,6.68,7.54
28971,0.5340,20.40,0.00143,164.1125,174.0,5.3,133.0,276,63,1.450,2.9,0.6,520.20,1368.6,1313.9,1.0,12,6,3.0,84,8,2.0,10,2,0.980,4.32,3.44,1.0,2,2,0.0,10,0,3.010,7.80,7.54
28972,5.2400,12.30,0.00143,184.0000,430.0,5.3,168.0,360,63,1.850,3.9,0.6,547.10,1076.2,1313.9,3.0,6,6,63.0,114,8,6.0,12,2,1.200,2.20,3.44,2.0,4,2,0.0,2,0,3.100,6.20,7.54
28973,0.5340,17.82,0.00143,164.1125,98.0,5.3,133.0,220,63,1.450,2.7,0.6,520.20,1474.2,1313.9,1.0,20,6,3.0,56,8,2.0,8,2,0.980,3.82,3.44,1.0,4,2,0.0,16,0,3.010,8.80,7.54


In [30]:
def get_base_features_ABB2O4(data:pd.DataFrame):
    result = []
    for i,row in data.iterrows():
        if row['formula'] in spinels['formula']:
            continue
        tmp = {}
        o = 'O'
        if str(row['B1']) != '0' and str(row['B2']) != '0':
            a = row['A']
            b1 = row['b1']
            b2 = row['b2']
            B1 = row['B1']
            B2 = row['B2']
            tmp["B_Density"] = b1 * ele_df1[ele_df1['symbol']==B1]['Density'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['Density'].values[0]
            tmp["A_Density"] = ele_df1[ele_df1['symbol']==a]['Density'].values[0]
            tmp["O_Density"] = ele_df1[ele_df1['symbol']==o]['Density'].values[0]
            tmp["B_dipole Polarizability"] = b1 * ele_df1[ele_df1['symbol']==B1]['dipole_polarizability'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['dipole_polarizability'].values[0]
            tmp["A_dipole Polarizability"] = ele_df1[ele_df1['symbol']==a]['dipole_polarizability'].values[0]
            tmp["O_dipole Polarizability"] = ele_df1[ele_df1['symbol']==o]['dipole_polarizability'].values[0]
            tmp["B_covalent Radius"] = b1 * ele_df1[ele_df1['symbol']==B1]['covalent_radius'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['covalent_radius'].values[0]
            tmp["A_covalent Radius"] = ele_df1[ele_df1['symbol']==a]['covalent_radius'].values[0]
            tmp["O_covalent Radius"] = ele_df1[ele_df1['symbol']==o]['covalent_radius'].values[0]
            tmp["B_atomic Radius"] = b1 * ele_df2[ele_df1['symbol']==B1]['Atomic radius (Å)'].values[0] + \
            b2 * ele_df2[ele_df1['symbol']==B2]['Atomic radius (Å)'].values[0]
            tmp["A_atomic Radius"] = ele_df2[ele_df1['symbol']==a]['Atomic radius (Å)'].values[0]
            tmp["O_atomic Radius"] = ele_df2[ele_df1['symbol']==o]['Atomic radius (Å)'].values[0]
            tmp["B_FirstIonization"] = b1 * ele_df1[ele_df1['symbol']==B1]['FirstIonization'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['FirstIonization'].values[0]
            tmp["A_FirstIonization"] = ele_df1[ele_df1['symbol']==a]['FirstIonization'].values[0]
            tmp["O_FirstIonization"] = ele_df1[ele_df1['symbol']==o]['FirstIonization'].values[0]
            tmp["B_number of Valence Electrons"] = b1 * ele_df1[ele_df1['symbol']==B1]['number_of_valence_electrons'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['number_of_valence_electrons'].values[0]
            tmp["A_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==a]['number_of_valence_electrons'].values[0]
            tmp["O_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==o]['number_of_valence_electrons'].values[0]
            tmp["B_number"] = b1 * ele_df1[ele_df1['symbol']==B1]['number'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['number'].values[0]
            tmp["A_number"] = ele_df1[ele_df1['symbol']==a]['number'].values[0]
            tmp["O_number"] = ele_df1[ele_df1['symbol']==o]['number'].values[0]
            tmp["B_Period"] = b1 * ele_df1[ele_df1['symbol']==B1]['Period'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['Period'].values[0]
            tmp["A_Period"] = ele_df1[ele_df1['symbol']==a]['Period'].values[0]
            tmp["O_Period"] = ele_df1[ele_df1['symbol']==o]['Period'].values[0]
            tmp["B_Electronegativity"] = b1 * ele_df1[ele_df1['symbol']==B1]['Electronegativity'].values[0] + \
            b2 * ele_df1[ele_df1['symbol']==B2]['Electronegativity'].values[0]
            tmp["A_Electronegativity"] = ele_df1[ele_df1['symbol']==a]['Electronegativity'].values[0]
            tmp["O_Electronegativity"] = ele_df1[ele_df1['symbol']==o]['Electronegativity'].values[0]
            tmp["B_number of s+p Electrons"] = b1*(ele_df2[ele_df2['symbol']==B1]['Number of s electrons'].values[0] \
                                                   + ele_df2[ele_df2['symbol']==B1]['Number of p electrons'].values[0])+ \
            b2*(ele_df2[ele_df2['symbol']==B2]['Number of s electrons'].values[0] + \
                ele_df2[ele_df2['symbol']==B2]['Number of p electrons'].values[0])
            tmp["A_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of s electrons'].values[0] \
            + ele_df2[ele_df2['symbol']==a]['Number of p electrons'].values[0]
            tmp["O_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of s electrons'].values[0] \
            + ele_df2[ele_df2['symbol']==o]['Number of p electrons'].values[0]
            tmp["B_number of d Electrons"] = b1*ele_df2[ele_df2['symbol']==B1]['Number of d electrons'].values[0] + \
            b2*ele_df2[ele_df2['symbol']==B2]['Number of d electrons'].values[0]
            tmp["A_number of d Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of d electrons'].values[0]
            tmp["O_number of d Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of d electrons'].values[0]
            tmp["B_Mulliken EN"] = b1*ele_df2[ele_df2['symbol']==B1]['Mulliken EN'].values[0] \
            + b2*ele_df2[ele_df2['symbol']==B2]['Mulliken EN'].values[0]
            tmp["A_Mulliken EN"] = ele_df2[ele_df2['symbol']==a]['Mulliken EN'].values[0]
            tmp["O_Mulliken EN"] = ele_df2[ele_df2['symbol']==o]['Mulliken EN'].values[0]
            result.append(tmp)
        elif str(row['B1']) == '0' and str(row['B2']) != '0':
                a = row['A']
                b = row['B2']
                tmp["A_Density"] = ele_df1[ele_df1['symbol']==a]['Density'].values[0]
                tmp["B_Density"] = 2 * ele_df1[ele_df1['symbol']==b]['Density'].values[0]
                tmp["O_Density"] = ele_df1[ele_df1['symbol']==o]['Density'].values[0]
                tmp["A_dipole Polarizability"] = ele_df1[ele_df1['symbol']==a]['dipole_polarizability'].values[0]
                tmp["B_dipole Polarizability"] = 2 * ele_df1[ele_df1['symbol']==b]['dipole_polarizability'].values[0]
                tmp["O_dipole Polarizability"] = ele_df1[ele_df1['symbol']==o]['dipole_polarizability'].values[0]
                tmp["A_covalent Radius"] = ele_df1[ele_df1['symbol']==a]['covalent_radius'].values[0]
                tmp["B_covalent Radius"] = 2 * ele_df1[ele_df1['symbol']==b]['covalent_radius'].values[0]
                tmp["O_covalent Radius"] = ele_df1[ele_df1['symbol']==o]['covalent_radius'].values[0]
                tmp["A_atomic Radius"] = ele_df2[ele_df1['symbol']==a]['Atomic radius (Å)'].values[0]
                tmp["B_atomic Radius"] = 2 * ele_df2[ele_df1['symbol']==b]['Atomic radius (Å)'].values[0]
                tmp["O_atomic Radius"] = ele_df2[ele_df1['symbol']==o]['Atomic radius (Å)'].values[0]
                tmp["A_FirstIonization"] = ele_df1[ele_df1['symbol']==a]['FirstIonization'].values[0]
                tmp["B_FirstIonization"] = 2 * ele_df1[ele_df1['symbol']==b]['FirstIonization'].values[0]
                tmp["O_FirstIonization"] = ele_df1[ele_df1['symbol']==o]['FirstIonization'].values[0]
                tmp["A_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==a]['number_of_valence_electrons'].values[0]
                tmp["B_number of Valence Electrons"] = 2 * ele_df1[ele_df1['symbol']==b]['number_of_valence_electrons'].values[0]
                tmp["O_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==o]['number_of_valence_electrons'].values[0]
                tmp["A_number"] = ele_df1[ele_df1['symbol']==a]['number'].values[0]
                tmp["B_number"] = 2 * ele_df1[ele_df1['symbol']==b]['number'].values[0]
                tmp["O_number"] = ele_df1[ele_df1['symbol']==o]['number'].values[0]
                tmp["A_Period"] = ele_df1[ele_df1['symbol']==a]['Period'].values[0]
                tmp["B_Period"] = 2 * ele_df1[ele_df1['symbol']==b]['Period'].values[0]
                tmp["O_Period"] = ele_df1[ele_df1['symbol']==o]['Period'].values[0]
                tmp["A_Electronegativity"] = ele_df1[ele_df1['symbol']==a]['Electronegativity'].values[0]
                tmp["B_Electronegativity"] = 2 * ele_df1[ele_df1['symbol']==b]['Electronegativity'].values[0]
                tmp["O_Electronegativity"] = ele_df1[ele_df1['symbol']==o]['Electronegativity'].values[0]
                tmp["A_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==a]['Number of p electrons'].values[0]
                tmp["B_number of s+p Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of s electrons'].values[0] 
                + 2 * ele_df2[ele_df2['symbol']==b]['Number of p electrons'].values[0]
                tmp["O_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==o]['Number of p electrons'].values[0]
                tmp["A_number of d Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of d electrons'].values[0]
                tmp["B_number of d Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of d electrons'].values[0]
                tmp["O_number of d Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of d electrons'].values[0]
                tmp["A_Mulliken EN"] = ele_df2[ele_df2['symbol']==a]['Mulliken EN'].values[0]
                tmp["B_Mulliken EN"] = 2 * ele_df2[ele_df2['symbol']==b]['Mulliken EN'].values[0]
                tmp["O_Mulliken EN"] = ele_df2[ele_df2['symbol']==o]['Mulliken EN'].values[0]
                result.append(tmp)
        elif str(row['B2']) == '0' and str(row['B1']) != '0':
                a = row['A']
                b = row['B1']
                tmp["A_Density"] = ele_df1[ele_df1['symbol']==a]['Density'].values[0]
                tmp["B_Density"] = 2 * ele_df1[ele_df1['symbol']==b]['Density'].values[0]
                tmp["O_Density"] = ele_df1[ele_df1['symbol']==o]['Density'].values[0]
                tmp["A_dipole Polarizability"] = ele_df1[ele_df1['symbol']==a]['dipole_polarizability'].values[0]
                tmp["B_dipole Polarizability"] = 2 * ele_df1[ele_df1['symbol']==b]['dipole_polarizability'].values[0]
                tmp["O_dipole Polarizability"] = ele_df1[ele_df1['symbol']==o]['dipole_polarizability'].values[0]
                tmp["A_covalent Radius"] = ele_df1[ele_df1['symbol']==a]['covalent_radius'].values[0]
                tmp["B_covalent Radius"] = 2 * ele_df1[ele_df1['symbol']==b]['covalent_radius'].values[0]
                tmp["O_covalent Radius"] = ele_df1[ele_df1['symbol']==o]['covalent_radius'].values[0]
                tmp["A_atomic Radius"] = ele_df2[ele_df1['symbol']==a]['Atomic radius (Å)'].values[0]
                tmp["B_atomic Radius"] = 2 * ele_df2[ele_df1['symbol']==b]['Atomic radius (Å)'].values[0]
                tmp["O_atomic Radius"] = ele_df2[ele_df1['symbol']==o]['Atomic radius (Å)'].values[0]
                tmp["A_FirstIonization"] = ele_df1[ele_df1['symbol']==a]['FirstIonization'].values[0]
                tmp["B_FirstIonization"] = 2 * ele_df1[ele_df1['symbol']==b]['FirstIonization'].values[0]
                tmp["O_FirstIonization"] = ele_df1[ele_df1['symbol']==o]['FirstIonization'].values[0]
                tmp["A_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==a]['number_of_valence_electrons'].values[0]
                tmp["B_number of Valence Electrons"] = 2 * ele_df1[ele_df1['symbol']==b]['number_of_valence_electrons'].values[0]
                tmp["O_number of Valence Electrons"] = ele_df1[ele_df1['symbol']==o]['number_of_valence_electrons'].values[0]
                tmp["A_number"] = ele_df1[ele_df1['symbol']==a]['number'].values[0]
                tmp["B_number"] = 2 * ele_df1[ele_df1['symbol']==b]['number'].values[0]
                tmp["O_number"] = ele_df1[ele_df1['symbol']==o]['number'].values[0]
                tmp["A_Period"] = ele_df1[ele_df1['symbol']==a]['Period'].values[0]
                tmp["B_Period"] = 2 * ele_df1[ele_df1['symbol']==b]['Period'].values[0]
                tmp["O_Period"] = ele_df1[ele_df1['symbol']==o]['Period'].values[0]
                tmp["A_Electronegativity"] = ele_df1[ele_df1['symbol']==a]['Electronegativity'].values[0]
                tmp["B_Electronegativity"] = 2 * ele_df1[ele_df1['symbol']==b]['Electronegativity'].values[0]
                tmp["O_Electronegativity"] = ele_df1[ele_df1['symbol']==o]['Electronegativity'].values[0]
                tmp["A_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==a]['Number of p electrons'].values[0]
                tmp["B_number of s+p Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of s electrons'].values[0] 
                + 2 * ele_df2[ele_df2['symbol']==b]['Number of p electrons'].values[0]
                tmp["O_number of s+p Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of s electrons'].values[0] 
                + ele_df2[ele_df2['symbol']==o]['Number of p electrons'].values[0]
                tmp["A_number of d Electrons"] = ele_df2[ele_df2['symbol']==a]['Number of d electrons'].values[0]
                tmp["B_number of d Electrons"] = 2 * ele_df2[ele_df2['symbol']==b]['Number of d electrons'].values[0]
                tmp["O_number of d Electrons"] = ele_df2[ele_df2['symbol']==o]['Number of d electrons'].values[0]
                tmp["A_Mulliken EN"] = ele_df2[ele_df2['symbol']==a]['Mulliken EN'].values[0]
                tmp["B_Mulliken EN"] = 2 * ele_df2[ele_df2['symbol']==b]['Mulliken EN'].values[0]
                tmp["O_Mulliken EN"] = ele_df2[ele_df2['symbol']==o]['Mulliken EN'].values[0]
                result.append(tmp)
    return pd.DataFrame(result)
base_features_ABB2O4 = get_base_features_ABB2O4(b_data_spinel)
base_features_ABB2O4

Unnamed: 0,B_Density,A_Density,O_Density,B_dipole Polarizability,A_dipole Polarizability,O_dipole Polarizability,B_covalent Radius,A_covalent Radius,O_covalent Radius,B_atomic Radius,A_atomic Radius,O_atomic Radius,B_FirstIonization,A_FirstIonization,O_FirstIonization,B_number of Valence Electrons,A_number of Valence Electrons,O_number of Valence Electrons,B_number,A_number,O_number,B_Period,A_Period,O_Period,B_Electronegativity,A_Electronegativity,O_Electronegativity,B_number of s+p Electrons,A_number of s+p Electrons,O_number of s+p Electrons,B_number of d Electrons,A_number of d Electrons,O_number of d Electrons,B_Mulliken EN,A_Mulliken EN,O_Mulliken EN
0,20.694,0.971,0.00143,111.30,162.7,5.3,255.1,155,63,3.180,1.80,0.6,1460.63,495.8,1313.9,21.6,1,6,91.8,11,8,9.9,3,2,3.822,0.93,3.44,2.1,1,6,19.5,0,0,8.808,2.85,7.54
1,3.670,0.971,0.00143,312.32,162.7,5.3,336.8,155,63,3.560,1.80,0.6,1192.35,495.8,1313.9,4.5,1,6,40.5,11,8,8.0,3,2,2.055,0.93,3.44,4.0,1,6,0.5,0,0,4.552,2.85,7.54
2,17.255,0.971,0.00143,94.20,162.7,5.3,270.3,155,63,3.085,1.80,0.6,1720.55,495.8,1313.9,23.5,1,6,93.7,11,8,9.9,3,2,3.366,0.93,3.44,4.0,1,6,19.5,0,0,8.599,2.85,7.54
3,17.578,0.971,0.00143,111.30,162.7,5.3,222.8,155,63,2.705,1.80,0.6,1516.49,495.8,1313.9,17.8,1,6,53.8,11,8,8.0,3,2,3.727,0.93,3.44,4.0,1,6,13.8,0,0,8.542,2.85,7.54
4,14.329,0.971,0.00143,164.50,162.7,5.3,243.7,155,63,2.800,1.80,0.6,1312.24,495.8,1313.9,12.1,1,6,48.1,11,8,8.0,3,2,3.309,0.93,3.44,2.1,1,6,10.0,0,0,7.440,2.85,7.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82762,18.640,6.110,0.00143,288.00,87.0,5.3,328.0,134,63,3.500,1.35,0.6,1193.40,650.9,1313.9,6.0,5,6,138.0,23,8,12.0,4,2,2.500,1.63,3.44,4.0,2,2,0.0,3,0,6.200,3.60,7.54
82763,38.600,6.110,0.00143,136.00,87.0,5.3,274.0,134,63,2.700,1.35,0.6,1540.00,650.9,1313.9,12.0,5,6,148.0,23,8,12.0,4,2,4.720,1.63,3.44,4.0,2,2,8.0,3,0,8.800,3.60,7.54
82764,8.940,6.110,0.00143,324.00,87.0,5.3,326.0,134,63,3.600,1.35,0.6,1200.00,650.9,1313.9,6.0,5,6,78.0,23,8,10.0,4,2,2.440,1.63,3.44,4.0,2,2,2.0,3,0,6.380,3.60,7.54
82765,13.940,6.110,0.00143,278.00,87.0,5.3,340.0,134,63,3.500,1.35,0.6,1206.80,650.9,1313.9,6.0,5,6,140.0,23,8,12.0,4,2,2.200,1.63,3.44,4.0,2,2,0.0,3,0,6.200,3.60,7.54


In [31]:
base_features_AB2O4['formula'] = spinels['formula']
base_features_AAB2O4['formula'] = a_data_spinel['formula'].values
base_features_ABB2O4['formula'] = b_data_spinel['formula'].values
base_features_AAB2O4 = base_features_AAB2O4[base_features_AB2O4.columns]
base_features_ABB2O4 = base_features_ABB2O4[base_features_AB2O4.columns]
result_features = pd.concat([base_features_AB2O4,base_features_AAB2O4,base_features_ABB2O4],axis=0).drop_duplicates(keep='first').reset_index(drop=True)
formula = result_features['formula'].values
result_features = result_features.drop(columns='formula')
result_features

Unnamed: 0,A_Density,B_Density,O_Density,A_dipole Polarizability,B_dipole Polarizability,O_dipole Polarizability,A_covalent Radius,B_covalent Radius,O_covalent Radius,A_atomic Radius,B_atomic Radius,O_atomic Radius,A_FirstIonization,B_FirstIonization,O_FirstIonization,A_number of Valence Electrons,B_number of Valence Electrons,O_number of Valence Electrons,A_number,B_number,O_number,A_Period,B_Period,O_Period,A_Electronegativity,B_Electronegativity,O_Electronegativity,A_number of s+p Electrons,B_number of s+p Electrons,O_number of s+p Electrons,A_number of d Electrons,B_number of d Electrons,O_number of d Electrons,A_Mulliken EN,B_Mulliken EN,O_Mulliken EN
0,0.971,14.88,0.00143,162.7,136.00,5.3,155.0,238.0,63,1.80,2.8,0.6,495.8,1434.6,1313.9,1.0,14.0,6,11.0,50.0,8,3.0,8.0,2,0.93,3.10,3.44,1.0,4.0,6,0.0,10.0,0,2.85,7.44,7.54
1,8.690,24.80,0.00143,46.0,132.00,5.3,136.0,250.0,63,1.55,2.7,0.6,867.8,1439.4,1313.9,12.0,18.0,6,48.0,90.0,8,5.0,10.0,2,1.69,4.56,3.44,2.0,2.0,6,10.0,16.0,0,4.33,8.60,7.54
2,1.540,14.62,0.00143,160.8,130.00,5.3,171.0,284.0,63,1.80,3.1,0.6,589.8,1116.6,1313.9,2.0,6.0,6,20.0,98.0,8,4.0,10.0,2,1.00,3.56,3.44,2.0,6.0,6,0.0,20.0,0,2.20,6.20,7.54
3,2.330,17.82,0.00143,37.3,98.00,5.3,116.0,220.0,63,1.10,2.7,0.6,786.5,1474.2,1313.9,4.0,20.0,6,14.0,56.0,8,3.0,8.0,2,1.90,3.82,3.44,4.0,4.0,6,0.0,16.0,0,4.77,8.80,7.54
4,7.870,3.48,0.00143,62.0,142.40,5.3,116.0,278.0,63,1.40,3.0,0.6,762.5,1475.4,1313.9,8.0,4.0,6,26.0,24.0,8,4.0,6.0,2,1.83,2.62,3.44,2.0,4.0,6,6.0,0.0,0,4.06,7.50,7.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111301,6.110,17.60,0.00143,87.0,312.00,5.3,134.0,332.0,63,1.35,3.5,0.6,650.9,1162.0,1313.9,5.0,6.0,6,23.0,134.0,8,4.0,12.0,2,1.63,2.46,3.44,2.0,4.0,2,3.0,0.0,0,3.60,6.20,7.54
111302,6.110,24.00,0.00143,87.0,52.28,5.3,134.0,240.0,63,1.35,2.8,0.6,650.9,1608.8,1313.9,5.0,20.0,6,23.0,92.0,8,4.0,10.0,2,1.63,4.40,3.44,2.0,0.0,2,3.0,20.0,0,3.60,8.90,7.54
111303,6.110,14.52,0.00143,87.0,400.00,5.3,134.0,346.0,63,1.35,3.7,0.6,650.9,1080.0,1313.9,5.0,6.0,6,23.0,122.0,8,4.0,12.0,2,1.63,2.26,3.44,2.0,4.0,2,3.0,0.0,0,3.60,6.20,7.54
111304,6.110,3.06,0.00143,87.0,639.60,5.3,134.0,420.0,63,1.35,4.7,0.6,650.9,806.0,1313.9,5.0,2.0,6,23.0,74.0,8,4.0,10.0,2,1.63,1.64,3.44,2.0,2.0,2,3.0,0.0,0,3.60,4.68,7.54


In [32]:
def get_combined_features(data: pd.DataFrame):
    columns = data.columns
    A_features = []
    B_features = []
    O_features = []
    for column in columns:
        if str(column).__contains__('A'):
            A_features.append(column)
        elif str(column).__contains__('B'):
            B_features.append(column)
        elif str(column).__contains__('O'):
            O_features.append(column)

    for a_feature, b_feature, o_feature in zip(A_features, B_features, O_features):
        feature_name = str(a_feature).split('_')[1]
        feature_name = feature_name
        data['A-B_'+feature_name] = np.abs(data[a_feature] - data[b_feature])
        data['(A+B)-O_'+feature_name] = np.abs((data[a_feature] + data[b_feature])- data[o_feature])


    data = data.drop(columns=O_features)
    return data

features = get_combined_features(result_features)
features

Unnamed: 0,A_Density,B_Density,A_dipole Polarizability,B_dipole Polarizability,A_covalent Radius,B_covalent Radius,A_atomic Radius,B_atomic Radius,A_FirstIonization,B_FirstIonization,A_number of Valence Electrons,B_number of Valence Electrons,A_number,B_number,A_Period,B_Period,A_Electronegativity,B_Electronegativity,A_number of s+p Electrons,B_number of s+p Electrons,A_number of d Electrons,B_number of d Electrons,A_Mulliken EN,B_Mulliken EN,A-B_Density,(A+B)-O_Density,A-B_dipole Polarizability,(A+B)-O_dipole Polarizability,A-B_covalent Radius,(A+B)-O_covalent Radius,A-B_atomic Radius,(A+B)-O_atomic Radius,A-B_FirstIonization,(A+B)-O_FirstIonization,A-B_number of Valence Electrons,(A+B)-O_number of Valence Electrons,A-B_number,(A+B)-O_number,A-B_Period,(A+B)-O_Period,A-B_Electronegativity,(A+B)-O_Electronegativity,A-B_number of s+p Electrons,(A+B)-O_number of s+p Electrons,A-B_number of d Electrons,(A+B)-O_number of d Electrons,A-B_Mulliken EN,(A+B)-O_Mulliken EN
0,0.971,14.88,162.7,136.00,155.0,238.0,1.80,2.8,495.8,1434.6,1.0,14.0,11.0,50.0,3.0,8.0,0.93,3.10,1.0,4.0,0.0,10.0,2.85,7.44,13.909,15.84957,26.70,293.40,83.0,330.0,1.00,4.00,938.8,616.5,13.0,9.0,39.0,53.0,5.0,9.0,2.17,0.59,3.0,1.0,10.0,10.0,4.59,2.75
1,8.690,24.80,46.0,132.00,136.0,250.0,1.55,2.7,867.8,1439.4,12.0,18.0,48.0,90.0,5.0,10.0,1.69,4.56,2.0,2.0,10.0,16.0,4.33,8.60,16.110,33.48857,86.00,172.70,114.0,323.0,1.15,3.65,571.6,993.3,6.0,24.0,42.0,130.0,5.0,13.0,2.87,2.81,0.0,2.0,6.0,26.0,4.27,5.39
2,1.540,14.62,160.8,130.00,171.0,284.0,1.80,3.1,589.8,1116.6,2.0,6.0,20.0,98.0,4.0,10.0,1.00,3.56,2.0,6.0,0.0,20.0,2.20,6.20,13.080,16.15857,30.80,285.50,113.0,392.0,1.30,4.30,526.8,392.5,4.0,2.0,78.0,110.0,6.0,12.0,2.56,1.12,4.0,2.0,20.0,20.0,4.00,0.86
3,2.330,17.82,37.3,98.00,116.0,220.0,1.10,2.7,786.5,1474.2,4.0,20.0,14.0,56.0,3.0,8.0,1.90,3.82,4.0,4.0,0.0,16.0,4.77,8.80,15.490,20.14857,60.70,130.00,104.0,273.0,1.60,3.20,687.7,946.8,16.0,18.0,42.0,62.0,5.0,9.0,1.92,2.28,0.0,2.0,16.0,16.0,4.03,6.03
4,7.870,3.48,62.0,142.40,116.0,278.0,1.40,3.0,762.5,1475.4,8.0,4.0,26.0,24.0,4.0,6.0,1.83,2.62,2.0,4.0,6.0,0.0,4.06,7.50,4.390,11.34857,80.40,199.10,162.0,331.0,1.60,3.80,712.9,924.0,4.0,6.0,2.0,42.0,2.0,8.0,0.79,1.01,2.0,0.0,6.0,6.0,3.44,4.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111301,6.110,17.60,87.0,312.00,134.0,332.0,1.35,3.5,650.9,1162.0,5.0,6.0,23.0,134.0,4.0,12.0,1.63,2.46,2.0,4.0,3.0,0.0,3.60,6.20,11.490,23.70857,225.00,393.70,198.0,403.0,2.15,4.25,511.1,499.0,1.0,5.0,111.0,149.0,8.0,14.0,0.83,0.65,2.0,4.0,3.0,3.0,2.60,2.26
111302,6.110,24.00,87.0,52.28,134.0,240.0,1.35,2.8,650.9,1608.8,5.0,20.0,23.0,92.0,4.0,10.0,1.63,4.40,2.0,0.0,3.0,20.0,3.60,8.90,17.890,30.10857,34.72,133.98,106.0,311.0,1.45,3.55,957.9,945.8,15.0,19.0,69.0,107.0,6.0,12.0,2.77,2.59,2.0,0.0,17.0,23.0,5.30,4.96
111303,6.110,14.52,87.0,400.00,134.0,346.0,1.35,3.7,650.9,1080.0,5.0,6.0,23.0,122.0,4.0,12.0,1.63,2.26,2.0,4.0,3.0,0.0,3.60,6.20,8.410,20.62857,313.00,481.70,212.0,417.0,2.35,4.45,429.1,417.0,1.0,5.0,99.0,137.0,8.0,14.0,0.63,0.45,2.0,4.0,3.0,3.0,2.60,2.26
111304,6.110,3.06,87.0,639.60,134.0,420.0,1.35,4.7,650.9,806.0,5.0,2.0,23.0,74.0,4.0,10.0,1.63,1.64,2.0,2.0,3.0,0.0,3.60,4.68,3.050,9.16857,552.60,721.30,286.0,491.0,3.35,5.45,155.1,143.0,3.0,1.0,51.0,89.0,6.0,12.0,0.01,0.17,0.0,2.0,3.0,3.0,1.08,0.74


In [33]:
features['formula'] = formula
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111306 entries, 0 to 111305
Data columns (total 49 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   A_Density                            111306 non-null  float64
 1   B_Density                            111306 non-null  float64
 2   A_dipole Polarizability              111306 non-null  float64
 3   B_dipole Polarizability              111306 non-null  float64
 4   A_covalent Radius                    111306 non-null  float64
 5   B_covalent Radius                    111306 non-null  float64
 6   A_atomic Radius                      111306 non-null  float64
 7   B_atomic Radius                      111306 non-null  float64
 8   A_FirstIonization                    111306 non-null  float64
 9   B_FirstIonization                    111306 non-null  float64
 10  A_number of Valence Electrons        111306 non-null  float64
 11  B_number of V

In [34]:
features = features.drop_duplicates(subset='formula',keep='first')
features

Unnamed: 0,A_Density,B_Density,A_dipole Polarizability,B_dipole Polarizability,A_covalent Radius,B_covalent Radius,A_atomic Radius,B_atomic Radius,A_FirstIonization,B_FirstIonization,A_number of Valence Electrons,B_number of Valence Electrons,A_number,B_number,A_Period,B_Period,A_Electronegativity,B_Electronegativity,A_number of s+p Electrons,B_number of s+p Electrons,A_number of d Electrons,B_number of d Electrons,A_Mulliken EN,B_Mulliken EN,A-B_Density,(A+B)-O_Density,A-B_dipole Polarizability,(A+B)-O_dipole Polarizability,A-B_covalent Radius,(A+B)-O_covalent Radius,A-B_atomic Radius,(A+B)-O_atomic Radius,A-B_FirstIonization,(A+B)-O_FirstIonization,A-B_number of Valence Electrons,(A+B)-O_number of Valence Electrons,A-B_number,(A+B)-O_number,A-B_Period,(A+B)-O_Period,A-B_Electronegativity,(A+B)-O_Electronegativity,A-B_number of s+p Electrons,(A+B)-O_number of s+p Electrons,A-B_number of d Electrons,(A+B)-O_number of d Electrons,A-B_Mulliken EN,(A+B)-O_Mulliken EN,formula
0,0.971,14.88,162.7,136.00,155.0,238.0,1.80,2.8,495.8,1434.6,1.0,14.0,11.0,50.0,3.0,8.0,0.93,3.10,1.0,4.0,0.0,10.0,2.85,7.44,13.909,15.84957,26.70,293.40,83.0,330.0,1.00,4.00,938.8,616.5,13.0,9.0,39.0,53.0,5.0,9.0,2.17,0.59,3.0,1.0,10.0,10.0,4.59,2.75,NaMn2O4
1,8.690,24.80,46.0,132.00,136.0,250.0,1.55,2.7,867.8,1439.4,12.0,18.0,48.0,90.0,5.0,10.0,1.69,4.56,2.0,2.0,10.0,16.0,4.33,8.60,16.110,33.48857,86.00,172.70,114.0,323.0,1.15,3.65,571.6,993.3,6.0,24.0,42.0,130.0,5.0,13.0,2.87,2.81,0.0,2.0,6.0,26.0,4.27,5.39,Cd(RhO2)2
2,1.540,14.62,160.8,130.00,171.0,284.0,1.80,3.1,589.8,1116.6,2.0,6.0,20.0,98.0,4.0,10.0,1.00,3.56,2.0,6.0,0.0,20.0,2.20,6.20,13.080,16.15857,30.80,285.50,113.0,392.0,1.30,4.30,526.8,392.5,4.0,2.0,78.0,110.0,6.0,12.0,2.56,1.12,4.0,2.0,20.0,20.0,4.00,0.86,CaIn2O4
3,2.330,17.82,37.3,98.00,116.0,220.0,1.10,2.7,786.5,1474.2,4.0,20.0,14.0,56.0,3.0,8.0,1.90,3.82,4.0,4.0,0.0,16.0,4.77,8.80,15.490,20.14857,60.70,130.00,104.0,273.0,1.60,3.20,687.7,946.8,16.0,18.0,42.0,62.0,5.0,9.0,1.92,2.28,0.0,2.0,16.0,16.0,4.03,6.03,Si(NiO2)2
4,7.870,3.48,62.0,142.40,116.0,278.0,1.40,3.0,762.5,1475.4,8.0,4.0,26.0,24.0,4.0,6.0,1.83,2.62,2.0,4.0,6.0,0.0,4.06,7.50,4.390,11.34857,80.40,199.10,162.0,331.0,1.60,3.80,712.9,924.0,4.0,6.0,2.0,42.0,2.0,8.0,0.79,1.01,2.0,0.0,6.0,6.0,3.44,4.02,Mg2FeO4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111301,6.110,17.60,87.0,312.00,134.0,332.0,1.35,3.5,650.9,1162.0,5.0,6.0,23.0,134.0,4.0,12.0,1.63,2.46,2.0,4.0,3.0,0.0,3.60,6.20,11.490,23.70857,225.00,393.70,198.0,403.0,2.15,4.25,511.1,499.0,1.0,5.0,111.0,149.0,8.0,14.0,0.83,0.65,2.0,4.0,3.0,3.0,2.60,2.26,Ho2VO4
111302,6.110,24.00,87.0,52.28,134.0,240.0,1.35,2.8,650.9,1608.8,5.0,20.0,23.0,92.0,4.0,10.0,1.63,4.40,2.0,0.0,3.0,20.0,3.60,8.90,17.890,30.10857,34.72,133.98,106.0,311.0,1.45,3.55,957.9,945.8,15.0,19.0,69.0,107.0,6.0,12.0,2.77,2.59,2.0,0.0,17.0,23.0,5.30,4.96,V(PdO2)2
111303,6.110,14.52,87.0,400.00,134.0,346.0,1.35,3.7,650.9,1080.0,5.0,6.0,23.0,122.0,4.0,12.0,1.63,2.26,2.0,4.0,3.0,0.0,3.60,6.20,8.410,20.62857,313.00,481.70,212.0,417.0,2.35,4.45,429.1,417.0,1.0,5.0,99.0,137.0,8.0,14.0,0.63,0.45,2.0,4.0,3.0,3.0,2.60,2.26,Pm2VO4
111304,6.110,3.06,87.0,639.60,134.0,420.0,1.35,4.7,650.9,806.0,5.0,2.0,23.0,74.0,4.0,10.0,1.63,1.64,2.0,2.0,3.0,0.0,3.60,4.68,3.050,9.16857,552.60,721.30,286.0,491.0,3.35,5.45,155.1,143.0,3.0,1.0,51.0,89.0,6.0,12.0,0.01,0.17,0.0,2.0,3.0,3.0,1.08,0.74,Rb2VO4


In [35]:
features.to_csv('./data/cation_replacement_features.csv')