# Supercon v2: Incorporating Additional Data

This notebook uses the cleaned supercon v2 dataset as a template imports additional data sources to use as a template

In [4]:
from sctk.materials import Material, Superconductor, PeriodicTable
from sctk.notebook import init_notebook

init_notebook()

## Loading SuperCon v2 data:

In [5]:
import pandas as pd
from tqdm import tqdm
import pickle as pkl
import os
import numpy as np
import sklearn

In [6]:
CLEANED_SUPERCON_V2_CSV = './data/Supercon-master/supercon2_cleaned.csv'

SUPERCON_V1_DOI = '10.1038/s41524-018-0085-8'
SUPERCON_V1_CSV = './data/Supercon-master/Supercon_data.csv'

CLEANED_SUPERCON_V1_CSV = './data/Supercon-master/supercon1_cleaned.csv'

In [7]:
supercon2_df = pd.read_csv(CLEANED_SUPERCON_V2_CSV, index_col=False)

## Convert to dataset:

In [8]:
classes = set(sc_class for s in supercon2_df['Classes'] for sc_class in eval(s))
class_idxs = { c : i for i,c in enumerate(classes) }
element_idxs = { elem : i for i, elem in enumerate(PeriodicTable) }


    
def parse_xy_data(row, logscale=True):
    x = np.zeros(len(element_idxs))
    y = np.zeros(len(classes))
    variables = eval(row['Substitutions'])
    subs = { k : v[0] if len(v) > 0 else 0.0 for k,v in variables.items() }
    comp = Material(row['Material']).substitute(subs).get_composition()
    row_classes = eval(row['Classes'])
    if None in comp.values():
        return None, None
    
    for k, v in comp.items():
        x[element_idxs[k]] = v
    for c in row_classes:
        y[class_idxs[c]] = 1.0
        
    return x, y

In [9]:
from sklearn.preprocessing import StandardScaler

X, Y = [], []
for _, row in supercon2_df.iterrows():
    x, y = parse_xy_data(row)
    if x is not None and y is not None:
        X.append(x)
        Y.append(y)

X = np.array(X)
Y = np.array(Y)

X_scaler = StandardScaler()
X_norm = X_scaler.fit_transform(X)

## Train catagorical classifier model based on composition:

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

class_models = {
    c : KNeighborsClassifier(n_neighbors=3)
    for c in classes
}

confusion_matrices = {
    c : np.zeros((2,2))
    for c in classes
}

for c, model in tqdm(list(class_models.items())):
    class_idx = class_idxs[c]
    X_train, X_test, Y_train, Y_test = train_test_split(X_norm, Y, test_size=0.2)
    model.fit(X_train, Y_train[:,class_idx])
    Y_test_pred = model.predict(X_test)
    
    for y_pred, y_true in zip(Y_test_pred, Y_test[:,class_idx]):
        confusion_matrices[c][int(1-y_true),int(1-y_pred)] += 1.0
        
    confusion_matrices[c] /= len(Y_test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:08<00:00,  1.39it/s]


In [12]:
import pickle as pkl

with open('./models/class_models.pkl', 'wb') as f:
    pkl.dump(class_models, f)

In [14]:
# for c, m in confusion_matrices.items():
#     print(c)
#     print(m)
#     print(f'Error probability: {(m[0,1] + m[1,0]):.2f}\n')

In [15]:
def predict_classes(materials):
    
    if not materials:
        return []
    
    mats_X = np.zeros((len(materials),len(element_idxs)))
    
    for i, m in enumerate(materials):
        comp = m.get_composition()
        for k, v in comp.items():
            mats_X[i, element_idxs[k]] = v
    
    mats_norm_X = X_scaler.transform(mats_X)
    mats_Y_pred = np.array([
        model.predict(mats_norm_X)
        for model in class_models.values()
    ]).T
    
    classes = [
        [ c for i, c in enumerate(class_models.keys()) 
          if y_pred[i] > 0.5]
        for y_pred in mats_Y_pred
    ]
        
    return classes

example_superconductor = 'YBCO'
classes = predict_classes([Superconductor(example_superconductor)])[0]
print(f'Predicted classes of {example_superconductor}:', classes)

Predicted classes of YBCO: ['Cuprates', 'Oxides']


## Load Supercon v1 Dataset:

In [16]:
supercon1_df = pd.read_csv(SUPERCON_V1_CSV, index_col=False)
supercon1_df = supercon1_df[supercon1_df['Tc'] > 0.0]

supercon1_df

Unnamed: 0,name,Tc
0,Ba0.4K0.6Fe2As2,31.20
1,Ca0.4Ba1.25La1.25Cu3O6.98,40.10
2,Mo0.39Ru0.61,6.90
3,Tm4Os6Sn19,1.10
4,Nd1Bi0.99Pb0.01S2F0.3O0.7,4.85
...,...,...
16408,La1.78Sr0.22Cu0.9975Zn0.0025O4,19.25
16410,Nb0.96Ta0.04,8.87
16411,Pb2Sr2Ho0.5Ca0.5Cu2.982Al0.018O8,63.60
16412,Yb0.5Pr0.5Ba2Cu3O6.9,34.80


In [17]:
sc_materials = []
sc_rows = []
for i, row in tqdm(list(supercon1_df.iterrows())):
    try:
        sc = Superconductor(row['name'])
        sc.squeeze()
        if None in sc.get_composition().values():
            raise Exception('None value in parsed composition.')
        
        sc_materials.append(sc)
        sc_rows.append(row)
    except:
        continue
    
sc_classes = predict_classes(sc_materials)

cleaned_supercon1_data = []
for sc, row, pred_classes in zip(sc_materials,sc_rows,sc_classes):
    material_str = sc.get_formula_string(fmt='cod')
    substitutions = {}
    Tc = row['Tc']
    pressure = 0.0
    shape = None
    substrate = None
    doi = SUPERCON_V1_DOI
    
    cleaned_supercon1_data.append(
        (material_str,substitutions,Tc,pressure,classes,shape,substrate,doi))


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12448/12448 [00:00<00:00, 12477.79it/s]


In [18]:
cleaned_supercon1_df = pd.DataFrame(cleaned_supercon1_data,
                                    columns=supercon2_df.columns)
cleaned_supercon1_df

Unnamed: 0,Material,Substitutions,Tc (K),Pressure (GPa),Classes,Shape,Substrate,DOI
0,Ba 0.4 K 0.6 Fe 2 As 2,{},31.20,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
1,Ca 0.4 Ba 1.25 La 1.25 Cu 3 O 6.98,{},40.10,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
2,Mo 0.39 Ru 0.61,{},6.90,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
3,Tm 4 Os 6 Sn 19,{},1.10,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
4,Nd Bi 0.99 Pb 0.01 S 2 F 0.3 O 0.7,{},4.85,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
...,...,...,...,...,...,...,...,...
12413,La 1.78 Sr 0.22 Cu 0.998 Zn 0.003 O 4,{},19.25,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
12414,Nb 0.96 Ta 0.04,{},8.87,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
12415,Pb 2 Sr 2 Ho 0.5 Ca 0.5 Cu 2.982 Al 0.018 O 8,{},63.60,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8
12416,Yb 0.5 Pr 0.5 Ba 2 Cu 3 O 6.9,{},34.80,0.0,"[Cuprates, Oxides]",,,10.1038/s41524-018-0085-8


In [19]:
#cleaned_supercon1_df.to_csv(CLEANED_SUPERCON_V1_CSV, index=False)

In [20]:
read_supercon1_df = pd.read_csv(CLEANED_SUPERCON_V1_CSV)
display(read_supercon1_df)

Unnamed: 0,Material,Substitutions,Tc (K),Pressure (GPa),Classes,Shape,Substrate,DOI
0,Ba 0.4 K 0.6 Fe 2 As 2,{},31.20,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
1,Ca 0.4 Ba 1.25 La 1.25 Cu 3 O 6.98,{},40.10,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
2,Mo 0.39 Ru 0.61,{},6.90,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
3,Tm 4 Os 6 Sn 19,{},1.10,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
4,Nd Bi 0.99 Pb 0.01 S 2 F 0.3 O 0.7,{},4.85,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
...,...,...,...,...,...,...,...,...
12413,La 1.78 Sr 0.22 Cu 0.998 Zn 0.003 O 4,{},19.25,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
12414,Nb 0.96 Ta 0.04,{},8.87,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
12415,Pb 2 Sr 2 Ho 0.5 Ca 0.5 Cu 2.982 Al 0.018 O 8,{},63.60,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8
12416,Yb 0.5 Pr 0.5 Ba 2 Cu 3 O 6.9,{},34.80,0.0,"['Oxides', 'Cuprates']",,,10.1038/s41524-018-0085-8


# Analysis of composition support of superconducting systems:

In [21]:
from sklearn.svm import OneClassSVM

# reuse old supercon v2 dataset:
X_v2 = X

# parse supercon v1 dataset:
X_v1 = []

classes = set(sc_class for s in supercon2_df['Classes'] for sc_class in eval(s))
class_idxs = { c : i for i,c in enumerate(classes) }
element_idxs = { elem : i for i, elem in enumerate(PeriodicTable) }
for _, row in read_supercon1_df.iterrows():
    x, y = parse_xy_data(row)
    if x is not None and y is not None:
        X_v1.append(x)
X_v1 = np.array(X_v1)
        
# combine v1 and v2 supercon datasets:
X_all = np.concatenate((X_v1,X_v2))
X_all_scaler = StandardScaler()
X_all_norm = X_all_scaler.fit_transform(X_all)

# train a 1-class classifier to recognize the support of known data:
support_svm = OneClassSVM()
support_svm.fit(X_all_norm)



In [23]:
with open('./models/support_svm_model.pkl', 'wb') as f:
    pkl.dump(support_svm, f)

# Generate Non-Superconducting Data

In [None]:
def predict_support(materials):
    
    if not materials:
        return []
    
    mats_X = np.zeros((len(materials),len(element_idxs)))
    
    for i, m in enumerate(materials):
        comp = m.get_composition()
        for k, v in comp.items():
            mats_X[i, element_idxs[k]] = v
    
    mats_norm_X = X_all_scaler.transform(mats_X)
        
    return support_svm.predict(mats_norm_X)

In [2]:
from random import sample, choice
from sctk.databases import MaterialsProject
from itertools import product

N_system_samples = 10000
X_all_maxes = np.max(X_all, axis=0)
elem_list = [ v for v in PeriodicTable.values()
              if v.atomic_number < 100 and 
              X_all_maxes[v.atomic_number-1] > 0 ]

elem_systems = \
    list(product(elem_list, elem_list))

print('# systems: ', len(elem_systems))

elem_system_ids = set()
with MaterialsProject() as mp:
    for sys in tqdm(elem_systems):
        if ids := mp.get_structure_ids_with_elements(sys):
            elem_system_ids |= set(ids)

NameError: name 'np' is not defined

In [None]:
OUTPUT_ID_FILE = './data/sctk_structures/screening/screen_ids4.txt'
elem_system_ids = list(set(elem_system_ids))
with open(OUTPUT_ID_FILE, 'w') as f:
    for _id in elem_system_ids:
        f.write(_id.strip() + '\n')

In [None]:
MP_RANDOM_ID_FILE = './data/sctk_structures/random/random_ids.txt'
MP_RANDOM_ID_FILE2 = './data/sctk_structures/random/random_ids2.txt'
MP_RANDOM_ID_FILE3 = './data/sctk_structures/random/random_ids3.txt'

MP_RANDOM_SCREEN_FILE = './data/sctk_structures/screening/screen_ids.txt'
MP_RANDOM_SCREEN_FILE2 = './data/sctk_structures/screening/screen_ids2.txt'
MP_RANDOM_SCREEN_FILE3 = './data/sctk_structures/screening/screen_ids3.txt'
MP_RANDOM_SCREEN_FILE4 = './data/sctk_structures/screening/screen_ids4.txt'

MP_RANDOM_MATERIAL_FILE = './data/sctk_structures/random/random_materials.csv'
MP_RANDOM_DATASET_FILE = './data/sctk_structures/random/random_materials_dataset.csv'
MP_RANDOM_MATERIAL_DIR = './data/sctk_structures/random/'

MP_SCREENING_MATERIAL_DIR = './data/sctk_structures/screening'

In [3]:
# read list of random material ids:
lines = []
with open(MP_RANDOM_SCREEN_FILE, 'r') as f:
    lines.extend(f.readlines())

with open(MP_RANDOM_SCREEN_FILE2, 'r') as f:
    lines.extend(f.readlines())

with open(MP_RANDOM_SCREEN_FILE3, 'r') as f:
    lines.extend(f.readlines())
    
with open(MP_RANDOM_SCREEN_FILE4, 'r') as f:
    lines.extend(f.readlines())

lines = list(set(lines))

#write materials to VASP format:
with MaterialsProject() as mp:
    for line in tqdm(lines):
        _id = line.strip()
        filepath = os.path.join(MP_SCREENING_MATERIAL_DIR, f'{_id}.poscar')
        if not os.path.exists(filepath):
            structure = mp.get_structure_by_id(_id)
            structure.write(filepath)

NameError: name 'MP_RANDOM_SCREEN_FILE' is not defined

In [None]:
from ase.io import read
from glob import glob

# material_ids = []
# for file in glob(os.path.join(MP_RANDOM_MATERIAL_DIR,'*.poscar')):
#     mp_id = os.path.splitext(os.path.split(file)[-1])[0]
#     atoms = read(file)
#     material_ids.append(mp_id)
    
# with MaterialsProject() as mp:
#     results = mp.query({'material_ids' : material_ids, 'fields' : ['formula_pretty', 'material_id']})

In [None]:
# with open(MP_RANDOM_MATERIAL_FILE, 'w') as f:
#     f.write('mp_id,formula\n')
#     for _id, doc in results.items():
#         f.write(_id + ', ' + doc['formula_pretty'] + '\n')

In [None]:
random_df = pd.read_csv(MP_RANDOM_MATERIAL_FILE)

random_materials = [ Material(f) for f in random_df['formula'] ]
svc_supports = predict_support(random_materials)
random_df['support'] = svc_supports

In [None]:
random_df['similar classes'] = predict_classes(random_materials)

In [None]:
random_df[random_df['support'] < 0.0]

In [None]:
# Note: these are the similar materials that 
random_df[random_df['support'] > 0.0]