In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Read the raw MPEA table and keep only the rows that carry a calculated
# Young's modulus value.
df = (
    pd.read_csv('./MPEA_dataset.csv')
    .dropna(subset=['PROPERTY: Calculated Young modulus (GPa)'])
)   # 729 Rows

In [3]:
# Bibliographic/reference metadata is treated as irrelevant and removed in one call.
reference_columns = [
    'REFERENCE: doi',
    'REFERENCE: year',
    'REFERENCE: title',
    'IDENTIFIER: Reference ID',
]
df.drop(columns=reference_columns, inplace=True)

In [4]:
# Snapshot the table as it stands after the reference columns were removed.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path('./data').mkdir(parents=True, exist_ok=True)
df.to_csv('./data/MPEA.csv')

In [4]:
# For every column, print how many entries are populated (non-NaN) and
# collect those counts, in column order, in `cols`.
cols = []
for name in df.columns:
    present = df[name].notna().sum()
    print(name, '->', present)
    cols.append(present)

FORMULA -> 729
PROPERTY: Microstructure -> 729
PROPERTY: Processing method -> 711
PROPERTY: BCC/FCC/other -> 729
PROPERTY: grain size ($\mu$m) -> 138
PROPERTY: Exp. Density (g/cm$^3$) -> 51
PROPERTY: Calculated Density (g/cm$^3$) -> 729
PROPERTY: HV -> 179
PROPERTY: Type of test -> 589
PROPERTY: Test temperature ($^\circ$C) -> 687
PROPERTY: YS (MPa) -> 555
PROPERTY: UTS (MPa) -> 257
PROPERTY: Elongation (%) -> 298
PROPERTY: Elongation plastic (%) -> 62
PROPERTY: Exp. Young modulus (GPa) -> 107
PROPERTY: Calculated Young modulus (GPa) -> 729
PROPERTY: O content (wppm) -> 45
PROPERTY: N content (wppm) -> 37
PROPERTY: C content (wppm) -> 1


In [5]:
# Show the per-column non-NaN counts in ascending order.
ordered_counts = sorted(cols)
print(ordered_counts)

[1, 37, 45, 51, 62, 107, 138, 179, 257, 298, 555, 589, 687, 711, 729, 729, 729, 729, 729]


In [6]:
# Not enough values: drop the sparsely populated columns.
# The trailing number on each line is that column's non-NaN count.
sparse_columns = [
    'PROPERTY: C content (wppm)',          # 1
    'PROPERTY: N content (wppm)',          # 37
    'PROPERTY: O content (wppm)',          # 45
    'PROPERTY: Exp. Density (g/cm$^3$)',   # 51
    'PROPERTY: Elongation plastic (%)',    # 62
    'PROPERTY: Exp. Young modulus (GPa)',  # 107
    # Raw string: '\m' is not a valid escape sequence in a normal literal and
    # triggers a warning on recent Python versions. The value is unchanged.
    r'PROPERTY: grain size ($\mu$m)',      # 138
    'PROPERTY: HV',                        # 179
    'PROPERTY: UTS (MPa)',                 # 257
    'PROPERTY: Elongation (%)',            # 298
]
df.drop(columns=sparse_columns, inplace=True)
# Deliberately kept: 'PROPERTY: YS (MPa)' (555) and 'PROPERTY: Type of test' (589).

In [7]:
# Report whether the two structure columns agree on every row, then remove
# 'PROPERTY: BCC/FCC/other' (the drop happens regardless of the printed result).
structures_match = (df['PROPERTY: BCC/FCC/other'] == df['PROPERTY: Microstructure']).all()
print(bool(structures_match))
df.drop(columns='PROPERTY: BCC/FCC/other', inplace=True)

True


In [8]:
# Remove every row that still has a missing value in any remaining column.
df.dropna(axis=0, how='any', inplace=True) # 553 Rows

In [9]:
# List the surviving column labels, followed by how many there are.
print(df.columns)
print(len(df.columns))

Index(['FORMULA', 'PROPERTY: Microstructure', 'PROPERTY: Processing method',
       'PROPERTY: Calculated Density (g/cm$^3$)', 'PROPERTY: Type of test',
       'PROPERTY: Test temperature ($^\circ$C)', 'PROPERTY: YS (MPa)',
       'PROPERTY: Calculated Young modulus (GPa)'],
      dtype='object')
8


In [10]:
# Persist the cleaned table (sparse columns and incomplete rows removed).
df.to_csv(path_or_buf='./data/MPEA_clean.csv')

In [12]:
# Display the dtype of each remaining column (bare expression -> cell output).
df.dtypes

FORMULA                                      object
PROPERTY: Microstructure                     object
PROPERTY: Processing method                  object
PROPERTY: Calculated Density (g/cm$^3$)     float64
PROPERTY: Type of test                       object
PROPERTY: Test temperature ($^\circ$C)      float64
PROPERTY: YS (MPa)                          float64
PROPERTY: Calculated Young modulus (GPa)    float64
dtype: object

In [12]:
# One-hot encode 'PROPERTY: Type of test' into one 0/1 integer column per label.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
test_type = df['PROPERTY: Type of test']
print(test_type.value_counts())

# Explicit label tuple keeps the generated column names and their order fixed.
for label in ('C', 'T'):
    df[f'PROPERTY: Type of test::{label}'] = (test_type == label).astype(int)

# The original categorical column is no longer needed once encoded.
df.drop('PROPERTY: Type of test', axis=1, inplace=True)

PROPERTY: Type of test
C    378
T    175
Name: count, dtype: int64


In [14]:
# One-hot encode 'PROPERTY: Processing method' into one 0/1 integer column per label.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
processing_method = df['PROPERTY: Processing method']
print(processing_method.value_counts())

# Explicit label tuple keeps the generated column names and their order fixed.
for label in ('CAST', 'WROUGHT', 'ANNEAL', 'OTHER', 'POWDER'):
    df[f'PROPERTY: Processing method::{label}'] = (processing_method == label).astype(int)

# The original categorical column is no longer needed once encoded.
df.drop('PROPERTY: Processing method', axis=1, inplace=True)

PROPERTY: Processing method
CAST       239
WROUGHT    153
ANNEAL      97
OTHER       59
POWDER       5
Name: count, dtype: int64


In [17]:
# One-hot encode 'PROPERTY: Microstructure' into one 0/1 integer column per label.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
microstructure = df['PROPERTY: Microstructure']
print(microstructure.value_counts())

# Explicit label tuple keeps the generated column names and their order fixed.
for label in ('BCC', 'FCC'):
    df[f'PROPERTY: Microstructure::{label}'] = (microstructure == label).astype(int)

# The original categorical column is no longer needed once encoded.
df.drop('PROPERTY: Microstructure', axis=1, inplace=True)

PROPERTY: Microstructure
BCC    372
FCC    181
Name: count, dtype: int64


In [55]:
# Build a mapping from element symbol to column index, numbered in order of
# first appearance across all formulas.
# Each whitespace-separated token is parsed as '<symbol><amount>', where the
# symbol is two characters if the second character is a letter, else one.
elementsToIdx = {}
idx = 0

for formula in df['FORMULA'].values:
    # split() with no argument ignores repeated, leading and trailing
    # whitespace, so no empty tokens are produced.
    for token in formula.split():
        # Slicing instead of indexing: a one-character token yields '' here
        # (not alphabetic) rather than raising IndexError.
        symbol_len = 2 if token[1:2].isalpha() else 1
        symbol = token[:symbol_len]
        if symbol not in elementsToIdx:
            elementsToIdx[symbol] = idx
            idx += 1

print(elementsToIdx)

{'Al': 0, 'Co': 1, 'Fe': 2, 'Ni': 3, 'Si': 4, 'Cr': 5, 'Mn': 6, 'Mo': 7, 'Nb': 8, 'Cu': 9, 'Ti': 10, 'V': 11, 'Ta': 12, 'Zr': 13, 'Hf': 14, 'W': 15, 'Zn': 16, 'Re': 17, 'Mg': 18}


In [70]:
# Build a (rows x elements) matrix of composition amounts: row r holds the
# amounts parsed from the r-th formula, placed at the column given by
# elementsToIdx; elements absent from a formula stay 0.
totaElems = len(elementsToIdx)

# float64 rather than float16: half precision keeps only about three
# significant digits and would silently round the parsed amounts.
# Preallocating also handles an empty frame, where vstack([]) would raise.
elemsArrays = np.zeros((len(df), totaElems), dtype=np.float64)

for row, formula in enumerate(df['FORMULA'].values):
    # split() with no argument ignores repeated, leading and trailing whitespace.
    for token in formula.split():
        # Slicing instead of indexing avoids IndexError on one-character tokens.
        symbol_len = 2 if token[1:2].isalpha() else 1
        symbol = token[:symbol_len]
        # float() raises ValueError if the token carries no numeric amount.
        amount = float(token[symbol_len:])
        elemsArrays[row, elementsToIdx[symbol]] = amount

In [71]:
# Add one 'FORMULA::<symbol>' column per element, taken from the matching
# column of the composition matrix, then remove the raw formula string.
for symbol, position in elementsToIdx.items():
    column_name = f'FORMULA::{symbol}'
    df[column_name] = elemsArrays[:, position]

df.drop(columns='FORMULA', inplace=True)

In [72]:
# Persist the final table after encoding and formula expansion.
df.to_csv(path_or_buf='./data/MPEA_numeric.csv')