# SuperCon Dataset: Initial Exploration
---

This is a notebook containing some of my initial explorations of the SuperCon dataset.

### Dependencies:

To run this notebook you will need to run `pip3 install <dependency>` for all of the packages listed below. These dependencies are preinstalled in this project's associated Docker container:

In [3]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import tqdm
import pymatgen

### Datasets:
Download each of the following and configure the path variables below:

* [Tc SuperCon Dataset (CSV)](https://github.com/vstanev1/Supercon).
* [Periodic Table of Elements (CSV)](https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee)

In [4]:
# Locations of the input datasets. Each path is written as two adjacent
# string literals, which Python joins into a single string.
SUPERCON_CSV = (
    '/media/colin/Shared/colin/git/materials-ml/'
    'jupyter/supercond-exploration/data/Supercon-master/Supercon_data.csv'
)

SUPERCON_V2_CSV = (
    '/media/colin/Shared/colin/git/materials-ml/'
    'jupyter/supercond-exploration/data/Supercon-master/supercon2_v22.12.03.csv'
)

PTABLE_CSV = (
    '/media/colin/Shared/colin/git/materials-ml/'
    'jupyter/supercond-exploration/data/PTable/PeriodicTable.csv'
)

This is the path that the cleaned data will be exported to:

In [5]:
# Destination file for the cleaned dataset (two adjacent literals are joined
# by Python into one path string).
CLEANED_SUPERCON_V2_CSV = (
    '/media/colin/Shared/colin/git/materials-ml/'
    'jupyter/supercond-exploration/data/Supercon-master/supercon2_cleaned.csv'
)

### Configure Matplotlib:

In [6]:
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap

# if set to None, figures will not be saved:
FIGURE_OUTPUT_DIR = './figures'

# This sets the pdf rendering to be pdflatex-compatible:
plt.rcParams.update({
    'pgf.texsystem' : 'pdflatex'
})

# use seaborn plot style:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# newer releases no longer accept the bare 'seaborn' name, so prefer the new
# name when the installed version provides it and fall back to the legacy
# name on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## Load Periodic Table Data:

In [7]:
# Read the periodic-table CSV into a DataFrame.
ptable_df = pd.read_csv(filepath_or_buffer=PTABLE_CSV)

# A bare trailing expression makes the notebook render the frame.
ptable_df

Unnamed: 0,AtomicNumber,Element,Symbol,AtomicMass,NumberofNeutrons,NumberofProtons,NumberofElectrons,Period,Group,Phase,...,FirstIonization,Density,MeltingPoint,BoilingPoint,NumberOfIsotopes,Discoverer,Year,SpecificHeat,NumberofShells,NumberofValence
0,1,Hydrogen,H,1.007,0,1,1,1,1.0,gas,...,13.5984,0.000090,14.175,20.28,3.0,Cavendish,1766.0,14.304,1,1.0
1,2,Helium,He,4.002,2,2,2,1,18.0,gas,...,24.5874,0.000179,,4.22,5.0,Janssen,1868.0,5.193,1,
2,3,Lithium,Li,6.941,4,3,3,2,1.0,solid,...,5.3917,0.534000,453.850,1615.00,5.0,Arfvedson,1817.0,3.582,2,1.0
3,4,Beryllium,Be,9.012,5,4,4,2,2.0,solid,...,9.3227,1.850000,1560.150,2742.00,6.0,Vaulquelin,1798.0,1.825,2,2.0
4,5,Boron,B,10.811,6,5,5,2,13.0,solid,...,8.2980,2.340000,2573.150,4200.00,6.0,Gay-Lussac,1808.0,1.026,2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,114,Flerovium,Fl,289.000,175,114,114,7,14.0,artificial,...,,,,,,,1999.0,,7,4.0
114,115,Moscovium,Mc,288.000,173,115,115,7,15.0,artificial,...,,,,,,,2010.0,,7,5.0
115,116,Livermorium,Lv,292.000,176,116,116,7,16.0,artificial,...,,,,,,,2000.0,,7,6.0
116,117,Tennessine,Ts,295.000,178,117,117,7,17.0,artificial,...,,,,,,,2010.0,,7,7.0


In [8]:
# Read both SuperCon CSV files (the v1 table first, then the v2 table).
supercon_df, supercon2_df = (
    pd.read_csv(csv_path) for csv_path in (SUPERCON_CSV, SUPERCON_V2_CSV)
)

# A bare trailing expression makes the notebook render the v2 frame.
supercon2_df

Unnamed: 0,id,rawMaterial,materialId,name,formula,doping,shape,materialClass,fabrication,substrate,...,appliedPressure,section,subsection,hash,title,doi,authors,publisher,journal,year
0,61ba959a7c13e4f711c50c3e,InO x films,1.942078e+09,,InO x,,films,Oxides,,,...,,body,figure,b7d13f231a,Infinite-randomness fixed point of the quantum...,10.1103/physrevb.99.054515,"Nicholas A Lewellyn, Ilana M Percher, J J Nels...",American Physical Society (APS),Physical Review B,2019.0
1,61ba959a0cb3d3a5e6c50c3e,"x = 0.04, 0.05, and 0.06)",1.064204e+09,,,,,,,,...,,body,paragraph,67f5567e75,Tuning the interplay between nematicity and sp...,10.1038/s41467-018-04471-7,"S-H Baek, Dilip Bhoi, Woohyun Nam, Bumsung Lee...",Springer Science and Business Media LLC,Nature Communications,2018.0
2,61ba959a720b8b3d36c50c3e,amorphous In-O films,-1.411819e+09,,In-O,,films,Alloys,,,...,,body,,e74ca47303,Scaling analysis of the magnetic field-tuned q...,10.1134/1.568304,"V F Gantmakher, M V Golubkov, V T Dolgopolov, ...",Pleiades Publishing Ltd,Journal of Experimental and Theoretical Physic...,1998.0
3,61ba959a6a35f302ddc50c3e,x= 0.108,4.969883e+08,,,,,,,,...,,body,figure,c8fb7503bc,Doping evolution of antiferromagnetism and tra...,,"Rui Zhang, Dongliang Gong, Xingye Lu, Shiliang...",,,2015.0
4,61ba959a85b12ae79ac50c3e,PrRu 2 Si 2,-5.030707e+08,,PrRu 2 Si 2,,,Alloys,,,...,,header,abstract,e8285dd0b7,Crystal-field interactions in PrRu2Si2,10.1088/0953-8984/12/34/307,"R Michalski, Z Ropka, R J Radwanski",IOP Publishing,Journal of Physics: Condensed Matter,2014.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40319,61ba9fbb3136e52869c51390,NbSe 2 nanobelts,1.325648e+09,,NbSe 2,,nanobelts,Chalcogenides,,,...,,body,paragraph,7767ae827b,Magnetic switching of phase-slip dissipation i...,10.1103/physrevb.75.020501,"Abram Falk, Mandar M Deshmukh, Amy L Prieto, J...",American Physical Society (APS),Physical Review B,2007.0
40320,61ba9fbb0a416aa3d9c51367,MgB 2,1.438579e+09,,MgB 2,,,"Alloys, Borides",,,...,,body,paragraph,eb762175eb,Effects of Al doping on the structural and ele...,10.1103/physrevb.66.012511,"O De La Peña, A Aguayo, R De Coss",American Physical Society (APS),Physical Review B,2014.0
40321,61ba9fbcab9e3e9324c5138b,CaBi2,-1.219075e+09,,CaBi2,,,Alloys,,,...,,header,abstract,04bc481d97,Observation of topological surface state in a ...,,"Gyanendra Dhakal, M Mofazzel Hosen, Ayana Ghos...",,,2019.0
40322,61ba9fbcab9e3e9324c5138c,single-crystalline CaBi 2,-2.079131e+09,,CaBi 2,,single-crystalline,Alloys,,,...,,body,paragraph,04bc481d97,Observation of topological surface state in a ...,,"Gyanendra Dhakal, M Mofazzel Hosen, Ayana Ghos...",,,2019.0


In [9]:
from pprint import pprint
# Pretty-print the v2 column names, one per line.
pprint(supercon2_df.columns.tolist())

['id',
 'rawMaterial',
 'materialId',
 'name',
 'formula',
 'doping',
 'shape',
 'materialClass',
 'fabrication',
 'substrate',
 'variables',
 'criticalTemperature',
 'criticalTemperatureMeasurementMethod',
 'appliedPressure',
 'section',
 'subsection',
 'hash',
 'title',
 'doi',
 'authors',
 'publisher',
 'journal',
 'year']


In [10]:
# Raise pandas' row-display limit so the preview below is not truncated.
pd.set_option("display.max_rows", 2000)

# Keep only the columns that the parsing code reads.
parse_test_df = supercon2_df.loc[
    :, ['rawMaterial', 'formula', 'variables', 'criticalTemperature', 'appliedPressure']
]

# Show the first 50 rows as a sample of the raw text that has to be parsed.
parse_test_df_samples = parse_test_df.iloc[:50]
display(parse_test_df_samples)

Unnamed: 0,rawMaterial,formula,variables,criticalTemperature,appliedPressure
0,InO x films,InO x,,2.8 K,
1,"x = 0.04, 0.05, and 0.06)",,"x=0.04, 0.05, 0.06",2 K,
2,amorphous In-O films,In-O,,2 K,
3,x= 0.108,,x=0.108,20 K,
4,PrRu 2 Si 2,PrRu 2 Si 2,,14K,
5,Lu3Os4Ge13,Lu3Os4Ge13,,2.85 K,
6,LaMnO 3.04,LaMnO 3.04,,125 K,
7,doping con- centrations 0.06 < x ≤ 0.1,,"x=≤ 0.1, 0.06 <",140 K,
8,La 0.8 Ca 0.2 MnO 3.015,La 0.8 Ca 0.2 MnO 3.015,,190 K,
9,LaMnO 3.06,LaMnO 3.06,,125 and 150 K,


In [11]:
from sctk.materials import Superconductor, PeriodicTable
import re

# Translation table that replaces the Unicode minus sign with an ASCII hyphen.
FORMULA_XLAT_TABLE = str.maketrans("−", "-")

# Placeholder-element definition such as "M = Fe, Co":
#   group 1: one or two upper-case letters (the placeholder),
#   group 2: the "= ..." part, i.e. '=' (optionally surrounded by '~'/spaces)
#            followed by one or more tokens of the form upper-case letter +
#            optional lower-case letter, each optionally followed by a comma
#            and spaces.
VARIABLE_ELEMENT_RE = r'([A-Z][A-Z]?)([~ ]*=[~ ]*([A-Z][a-z]?[,]?[ ]*)+)'

# Numeric variable assignment such as "x = 0.04" or "x=1/3":
#   group 1: a single lower-case Latin letter or one of α β γ δ,
#   group 2: a number that must contain a '.' or a '/'
#            (plain integers such as "x = 1" do not match).
# One of ≤ < > ~ may appear directly before and/or after the '='.
VARIABLE_ASSIGNMENT_RE = r'([a-zαβγδ])[ ]*[≤<>~]?=[≤<>~]?[ ]*(\d*[\./]\d+)'

# A number (group 1), optionally preceded by '~'/spaces and optionally
# followed by "± <number>" (that second number is group 4).
TC_RE = r'[~ ]*((\d*[\.])?\d+)([~ ]*±[~ ]*((\d*[\.])?\d+))?'

# Two TC_RE patterns back to back.
TC_RANGE_RE = TC_RE + TC_RE

# Alternation of every key of PeriodicTable, longest keys first so that a
# longer key is tried before a shorter key that is its prefix. sorted() is
# stable, so keys of equal length keep their original relative order.
ELEMENT_RE = '(' + '|'.join(sorted(PeriodicTable.keys(), key=len, reverse=True)) + ')'

def parse_Tc(s):
    """Extract a single numeric value from a critical-temperature string.

    Every match of TC_RE in `s` is collected; the value is returned only when
    there is exactly one match. Units in the text are not inspected.

    Args:
        s: raw cell value; anything that is not a str yields None.

    Returns:
        The parsed number as a float, or None when `s` is not a string or
        contains zero or more than one number.
    """
    if isinstance(s, str):
        found = [float(m.group(1)) for m in re.finditer(TC_RE, s)]
        if len(found) == 1:
            return found[0]
    return None

def parse_pressure(s):
    """Extract the largest numeric value from an applied-pressure string.

    Numbers are located with TC_RE. Units in the text are not inspected.

    Args:
        s: raw cell value; anything that is not a str yields 0.0.

    Returns:
        The largest number found, as a float, or 0.0 when `s` is not a string
        or contains no number.
    """
    if not isinstance(s, str):
        return 0.0

    numbers = (float(m.group(1)) for m in re.finditer(TC_RE, s))
    return max(numbers, default=0.0)

def parse_classes(s):
    """Split a comma-separated material-class string into a list of labels.

    Args:
        s: raw cell value; anything that is not a str yields an empty list.

    Returns:
        The comma-separated pieces of `s` with surrounding whitespace removed
        (an empty string therefore yields ['']).
    """
    if isinstance(s, str):
        return list(map(str.strip, s.split(',')))
    return []

def find_max_count(formula):
    """Return the largest numeric count found anywhere in a parsed formula.

    `formula` is a sequence of (elem, count) pairs. A nested group appears as
    a pair whose first item is itself a list of pairs, e.g.
    ``[(Ba, 1), ([(Fe, 0.95), (Co, 0.05)], 2), (As, 2)]``. Nested groups are
    searched recursively and their maximum is combined with the running
    maximum. Counts that are not int/float (for example the string 'x') are
    ignored.

    Args:
        formula: iterable of (elem, count) pairs, possibly nested.

    Returns:
        The largest numeric count, or 0 when there is none.
    """
    largest = 0
    for elem, count in formula:
        # Descend into a nested group regardless of which slot holds the list.
        for part in (elem, count):
            if isinstance(part, list):
                largest = max(largest, find_max_count(part))

        # The multiplier of the pair itself also counts.
        if isinstance(count, (int, float)):
            largest = max(largest, count)

    return largest

def _parse_variable_value(text):
    """Convert a numeric token such as '0.04', '.5' or '1/3' to a float.

    Used instead of eval() so that text taken from the dataset is never
    executed as Python code.

    Returns:
        The value as a float, or None when the token cannot be converted
        (for example '/3' or '1/0').
    """
    from fractions import Fraction  # local import: keeps the helper self-contained

    try:
        return float(Fraction(text))
    except (ValueError, ZeroDivisionError, OverflowError):
        return None


def parse_row_data(row, supercon_df):
    """Parse one database row into zero or more candidate materials.

    Steps:
      1. Read Tc, pressure, shape, classes, substrate and DOI from the row.
         Rows for which parse_Tc gives no value (None, or 0.0) are skipped.
      2. Try to build a Superconductor from the 'formula' field. If that
         fails, look through the 'variables' and 'rawMaterial' fields of rows
         with the same DOI for a placeholder-element definition
         (VARIABLE_ELEMENT_RE) and build one material per substituted element.
      3. For each material that has variables, collect candidate numeric
         values: first from this row's 'variables' field, otherwise from
         same-DOI rows that report the same Tc.
      4. Apply the manual composition/Tc/pressure filters below.

    Args:
        row: a row of the database (indexable by column name).
        supercon_df: the full DataFrame, used to find rows with the same DOI.

    Returns:
        A list of (material, substitutions, metadata) tuples, where
        `substitutions` maps each variable of the material to a list of
        candidate values (possibly empty) and `metadata` is a dict with keys
        'Tc', 'pressure', 'shape', 'classes', 'substrate' and 'doi'. The same
        metadata dict object is shared by every tuple returned for the row.
    """

    # parse row entries:
    formula = row['formula']
    variables = row['variables']
    row_Tc = parse_Tc(row['criticalTemperature'])
    pressure = parse_pressure(row['appliedPressure'])
    shape = row['shape'] if isinstance(row['shape'],str) else ''
    classes = parse_classes(row['materialClass'])
    substrate = row['substrate'] if isinstance(row['substrate'],str) else ''
    doi = row['doi']

    # ignore materials with no Tc listed:
    if not row_Tc:
        return []

    # compile metadata:
    metadata = {
        'Tc' : row_Tc,
        'pressure': pressure,
        'shape' : shape,
        'classes' : classes,
        'substrate' : substrate,
        'doi' : doi
    }

    # find database fields that came from the same paper:
    # (a NaN doi is truthy but compares unequal to everything, so it selects
    #  no rows; a falsy doi gets an empty frame so the column access below
    #  still works)
    if doi:
        samepaper_rows = supercon_df[supercon_df['doi'] == doi]
    else:
        samepaper_rows = supercon_df.iloc[0:0]
    samepaper_fields = samepaper_rows['variables'].tolist() + \
                       samepaper_rows['rawMaterial'].tolist()
    # doubled so that it lines up with samepaper_fields (variables + rawMaterial):
    samepaper_Tc = samepaper_rows['criticalTemperature'].tolist() * 2

    # try to parse formula directly:
    parsed_materials = []
    if isinstance(formula, str):
        formula = formula.translate(FORMULA_XLAT_TABLE)
        try:
            material = Superconductor(formula)
            parsed_materials.append(material)
        except Exception:
            # best effort: an unparseable formula falls through to the
            # variable-element search below (KeyboardInterrupt still propagates)
            pass

        # if parsing failed, try searching for variable elements elsewhere in the same paper:
        if not parsed_materials:
            for field in samepaper_fields:
                if isinstance(field, str) and (match := re.match(VARIABLE_ELEMENT_RE, field)):
                    variable = match.group(1)
                    sub_list = match.group(2)
                    substituted_elements = [
                        m.group(0) for m in re.finditer(ELEMENT_RE, sub_list)
                    ]

                    # try applying each substitution:
                    for sub in substituted_elements:
                        try:
                            material = Superconductor(formula, variable_elements=[variable])
                        except Exception:
                            continue

                        material.substitute_element(variable,sub)
                        if not material.variable_elements:
                            parsed_materials.append(material)

    # apply substitutions to variables in the formula:
    substituted_materials = []
    for material in parsed_materials:

        # ignore if material has large max count or too many variables:
        if find_max_count(material.formula) >= 100 or len(material.variables) > 6:
            continue

        possible_substitutions = {}
        substitutions = {}
        if material.variables:
            if isinstance(variables, str) and (match := re.match(VARIABLE_ASSIGNMENT_RE, variables)):
                v, val = match.group(1), _parse_variable_value(match.group(2))
                if val is not None:
                    possible_substitutions[v] = {val}
            else:
                # look for variables to substitute elsewhere in paper (with same reported Tc):
                for field, Tc_str in zip(samepaper_fields, samepaper_Tc):
                    if parse_Tc(Tc_str) == row_Tc and \
                       isinstance(field,str) and (match := re.match(VARIABLE_ASSIGNMENT_RE, field)):
                        v, val = match.group(1), _parse_variable_value(match.group(2))
                        if val is None:
                            continue

                        # add parsed value (from elsewhere in the paper) to set of possible substitutions:
                        possible_substitutions.setdefault(v, set()).add(val)

            for v in material.variables:
                substitutions[v] = list(possible_substitutions[v]) \
                                    if v in possible_substitutions else []

        #--------------------------------------------------------------------------------
        # apply manual filters based on composition:
        material_comp = material.get_composition()

        # 1. filter out corrupted Y entries (metallic Y has max Tc of 25K)
        if len(material_comp) == 1 and 'Y' in material_comp and row_Tc > 25.0:
            continue

        # 2. remove materials with undefined or ambiguous pristine structures (e.g. of the form "Al 0.01")
        if not material.get_pristine_material() or \
            None in material.get_pristine_material().get_composition().values():
            continue

        # 3. filter out any materials with Tc > 280 K without applied pressure or indicated features
        #    (this should screen out most of the Curie temperatures accidentally parsed):
        if row_Tc > 280 and pressure <= 1e-5:
            continue

        # account for unlisted pressure in high-Tc Hydrides:
        if row_Tc > 140 and pressure <= 1e-5:
            if 'Hydrides' in classes:
                # updates the shared metadata dict in place
                metadata['pressure'] = 170.0
            else:
                continue

        substituted_materials.append((material, substitutions, metadata))


    return substituted_materials
        
                
            
        

In [12]:
from tqdm import tqdm

# Parse every database row; each row contributes zero or more
# (material, substitutions, metadata) tuples.
supercon_list = []
for row_index, row in tqdm(supercon2_df.iterrows()):
    supercon_list += parse_row_data(row, supercon2_df)

# Keep only the entries in which every variable has at least one candidate
# value (entries with no variables at all are kept as well).
known_materials = [
    (mat, subs, md)
    for mat, subs, md in supercon_list
    if all(len(candidates) > 0 for candidates in subs.values())
]

40324it [03:13, 208.46it/s]


In [13]:
# Report how many entries survived parsing and how many of those have a
# candidate value for every variable.
for label, part, whole in (
    ('Parsed Materials: ', supercon_list, supercon2_df),
    ('Materials with specified doping: ', known_materials, supercon_list),
):
    print(label, len(part), 'of', len(whole))

Parsed Materials:  23881 of 40324
Materials with specified doping:  19466 of 23881


# Export Parsed Dataset to CSV:

In [14]:
# Build every record first and construct the DataFrame in a single call.
# Filling a pre-allocated frame row by row with .loc leaves every column
# (including 'Tc (K)') with object dtype; building from records lets pandas
# infer a numeric dtype for 'Tc (K)'.
# NOTE(review): 'Pressure (GPa)' is still stored as str(...) exactly as
# before; consider storing the float instead once downstream use is confirmed.
cleaned_supercon2_df = pd.DataFrame(
    [
        (
            material.get_formula_string(fmt='cod'),
            str(subs),
            metadata['Tc'],
            str(metadata['pressure']),
            str(metadata['classes']),
            str(metadata['shape']),
            str(metadata['substrate']),
            str(metadata['doi']),
        )
        for material, subs, metadata in tqdm(supercon_list)
    ],
    columns=[
        'Material', 'Substitutions', 'Tc (K)', 'Pressure (GPa)',
        'Classes', 'Shape', 'Substrate', 'DOI',
    ],
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23881/23881 [00:02<00:00, 8969.15it/s]


In [15]:
# Show the entries whose parsed Tc is above 150.
cleaned_supercon2_df.loc[lambda frame: frame['Tc (K)'] > 150]

Unnamed: 0,Material,Substitutions,Tc (K),Pressure (GPa),Classes,Shape,Substrate,DOI
14,La H 10,{},260.0,170.0,"['Hydrides', 'Alloys']",,,10.1021/acs.inorgchem.9b02709
16,H 3 S,{},203.0,170.0,"['Hydrides', 'Chalcogenides']",,,10.1021/acs.inorgchem.9b02709
218,H 3 S,{},203.0,150.0,[],,,
392,La H 10,{},260.0,170.0,"['Hydrides', 'Alloys']",,,10.1103/physrevb.46.12196
410,Hg Ba 2 Ca 2 Cu 3 O 8+δ,{'δ': []},160.0,30.0,['Alloys'],,,10.7567/1882-0786/ab0521
518,La H 10,{},265.0,190.0,"['Hydrides', 'Alloys']",,,10.1103/physrevb.99.220502
519,Y H 6,{},264.0,120.0,"['Hydrides', 'Alloys']",,,10.1103/physrevb.99.220502
520,Y H 6,{},290.0,300.0,"['Hydrides', 'Alloys']",,,10.1103/physrevb.99.220502
522,Y H 6,{},260.0,120.0,"['Hydrides', 'Alloys']",,,10.1103/physrevb.99.220502
540,H 2 S,{},203.0,170.0,"['Hydrides', 'Chalcogenides']",,,10.1002/andp.201700197


In [16]:
# Write the cleaned table to disk without the index column.
cleaned_supercon2_df.to_csv(path_or_buf=CLEANED_SUPERCON_V2_CSV, index=False)

In [17]:
# Read the exported file back and render it to check what was written.
display(pd.read_csv(filepath_or_buffer=CLEANED_SUPERCON_V2_CSV))

Unnamed: 0,Material,Substitutions,Tc (K),Pressure (GPa),Classes,Shape,Substrate,DOI
0,In O x,{'x': []},2.80,0.0,['Oxides'],films,,10.1103/physrevb.99.054515
1,In O,{},2.00,0.0,['Alloys'],films,,10.1134/1.568304
2,Pr Ru 2 Si 2,{},14.00,0.0,['Alloys'],,,10.1088/0953-8984/12/34/307
3,Lu 3 Os 4 Ge 13,{},2.85,0.0,['Alloys'],,,
4,La Mn O 3.04,{},125.00,0.0,['Oxides'],,,10.1103/physrevlett.87.127206
...,...,...,...,...,...,...,...,...
23876,Nb Se 2,{},7.00,0.0,['Chalcogenides'],nanobelts,,10.1103/physrevb.75.020501
23877,Mg B 2,{},40.00,0.0,"['Alloys', 'Borides']",,,10.1103/physrevb.66.012511
23878,Ca Bi 2,{},2.00,0.0,['Alloys'],,,
23879,Ca Bi 2,{},2.00,0.0,['Alloys'],single-crystalline,,


In [64]:
from pprint import pprint
# For every distinct formula string, print the parsed formula next to the
# formula of its pristine material (when one is returned).
formulas = set(cleaned_supercon2_df['Material'])
for f in formulas:
    if f:
        # Parse each formula once and reuse the object for both the lookup
        # and the printout.
        # NOTE(review): assumes get_pristine_material() does not modify the
        # object it is called on - confirm against sctk.
        parsed = Superconductor(f)
        if pm := parsed.get_pristine_material(with_variables=True):
            print(parsed.formula,'\n  ', pm.formula)
            

[(Cu, 3), (O, 6.67)] 
   [(Cu, 3), (O, '7-a')]
[(Bi, 2), (Pd, 1)] 
   [(Bi, 2), (Pd, 1)]
[([(In, 1), (Mn, 1)], 1), (As, 1)] 
   [(In, 1), (Mn, 1), (As, 1)]
[(Fe, '1-x'), (Cu, 'x'), (Se, 1)] 
   [(Fe, '1-x'), (Cu, 'x'), (Se, 1)]
[(Ca, 1), (Pd, 2), (As, 2)] 
   [(Ca, 1), (Pd, 2), (As, 2)]
[(Th, 1.8), (Sc, 0.2), (Ni, 1), (C, 2)] 
   [(Th, '2-a'), (Sc, 'a'), (Ni, 1), (C, 2)]
[(Ca, 1.88), (Na, 0.12), (Cu, 1), (O, 2), (Cl, 2)] 
   [(Ca, '2-a'), (Na, 'a'), (Cu, 1), (O, 2), (Cl, 2)]
[(Sn, 0.4), (Sb, 0.6)] 
   [(Sn, 'a'), (Sb, '1-a')]
[(La, 1), (H, 'x')] 
   [(La, 1), (H, 'x')]
[(Li, 0.75), (B, 1), (C, 1)] 
   [(Li, '1-a'), (B, 1), (C, 1)]
[(Ba, '1-x'), (Rb, 'x'), (Fe, 2), (As, 2)] 
   [(Ba, '1-x'), (Rb, 'x'), (Fe, 2), (As, 2)]
[(Ba, 1), (Fe, 1.8), (Co, 0.2), (Fe, 2), (As, 2)] 
   [(Ba, 1), (Fe, '2-a'), (Co, 'a'), (Fe, 2), (As, 2)]
[(La, 1), (O, 0.8), (F, 0.2), (Fe, 1), (As, 0.95), (Sb, 0.05)] 
   [(La, 1), (O, '1-a'), (F, 'a'), (Fe, 1), (As, '1-b'), (Sb, 'b')]
[(Ce, 1), (Pt, 2), (In, 7)] 
   [

[(Rb, 0.74), (Fe, 1.6), (Se, 2)] 
   [(Rb, '1-a'), (Fe, '2-b'), (Se, 2)]
[(Gd, 1), (O, 1), (Ni, 1), (Bi, 1)] 
   [(Gd, 1), (O, 1), (Ni, 1), (Bi, 1)]
[(Yb, 1), (In, 1), (Cu, 4)] 
   [(Yb, 1), (In, 1), (Cu, 4)]
[(Nb, 3), (Ge, 2)] 
   [(Nb, 3), (Ge, 2)]
[(La, 1), (O, 1), (Fe, 1), (As, '1-x'), (F, 1)] 
   [(La, 1), (O, 1), (Fe, 1), (As, '1-x'), (F, 1)]
[(Y, 1), (In, 3)] 
   [(Y, 1), (In, 3)]
[(Cu, 1), (N, 1), (C, 1), (S, 1)] 
   [(Cu, 1), (N, 1), (C, 1), (S, 1)]
[(Ba, 1), ([(Fe, 0.95), (Co, 0.05)], 2), (As, 2)] 
   [(Ba, 1), ([(Fe, '1-a'), (Co, 'a')], 2), (As, 2)]
[(Y, 1), (Ba, 2), (Cu, 4), (O, 8)] 
   [(Y, 1), (Ba, 2), (Cu, 4), (O, 8)]
[(Li, 1), (Ho, 0.44), (Y, 0.56), (F, 4)] 
   [(Li, 1), (Ho, 'a'), (Y, '1-a'), (F, 4)]
[(Na, 1), (Fe, 0.98), (Co, 0.02), (As, 1)] 
   [(Na, 1), (Fe, '1-a'), (Co, 'a'), (As, 1)]
[(La, 1.985), (Sr, 0.015), (Cu, 1), (O, '4+δ')] 
   [(La, '2-a'), (Sr, 'a'), (Cu, 1), (O, '4+δ')]
[(Fe, 1), (Te, 0.92)] 
   [(Fe, 1), (Te, '1-a')]
[(Sm, 1), (Fe, 1), (As, 1), (O, 0.9)

[(Ce, 1), (Cu, 2), ([(Si, 0.98), (Ge, 0.02)], 2)] 
   [(Ce, 1), (Cu, 2), ([(Si, '1-a'), (Ge, 'a')], 2)]
[(Ba, 1), (Ni, 2), (P, 2)] 
   [(Ba, 1), (Ni, 2), (P, 2)]
[([(La, 0.25), (Pr, 0.75)], 0.7), (Ca, 0.3), (Mn, 1), (O, 3)] 
   [([(La, 'a'), (Pr, '1-a')], '1-a'), (Ca, 'a'), (Mn, 1), (O, 3)]
[(Nb, 1), (Cu, 1), (Nb, 1)] 
   [(Nb, 1), (Cu, 1), (Nb, 1)]
[(W, 1), (O, 3)] 
   [(W, 1), (O, 3)]
[(Cu, 'c'), (Nb, 2), (S, 2), (C, 1)] 
   [(Cu, 'c'), (Nb, 2), (S, 2), (C, 1)]
[(Y, 1), (Ba, 2), (Cu, 3), (O, 6.7)] 
   [(Y, 1), (Ba, 2), (Cu, 3), (O, '7-a')]
[(Eu, 1), (Fe, '2-x'), (Co, 'x'), (As, 2)] 
   [(Eu, 1), (Fe, '2-x'), (Co, 'x'), (As, 2)]
[(Y, 1), (Ba, 2), (Cu, 3), (O, 6.69)] 
   [(Y, 1), (Ba, 2), (Cu, 3), (O, '7-a')]
[(Ba, 1), ([(Fe, 0.953), (Co, 0.047)], 2), (As, 2)] 
   [(Ba, 1), ([(Fe, '1-a'), (Co, 'a')], 2), (As, 2)]
[(Ba, 1), (Zn, 2), (As, 2)] 
   [(Ba, 1), (Zn, 2), (As, 2)]
[(W, 3), (N, 4)] 
   [(W, 3), (N, 4)]
[(Yb, 3), (Ta, 1), (O, 7)] 
   [(Yb, 3), (Ta, 1), (O, 7)]
[(Ca, 1), (Ru, '1-x

[([(Bi, 1), (Pb, 1)], 2), ([(Sr, 1), (La, 1)], 2), (Cu, 1), (O, '6+δ')] 
   [([(Bi, 1), (Pb, 1)], 2), ([(Sr, 1), (La, 1)], 2), (Cu, 1), (O, '6+δ')]
[(Li, 'x'), (Sn, 1), (Se, 2)] 
   [(Li, 'x'), (Sn, 1), (Se, 2)]
[(F, 1)] 
   [(F, 1)]
[(Re, 1), (O, '1-x'), (F, 'x'), (Fe, 1), (As, 1)] 
   [(Re, 1), (O, '1-x'), (F, 'x'), (Fe, 1), (As, 1)]
[(K, 'x'), (Fe, '2+δ'), (Se, 2)] 
   [(K, 'x'), (Fe, '2+δ'), (Se, 2)]
[(Th, 1), (N, 1), (O, 1), (Fe, 1), (As, 1)] 
   [(Th, 1), (N, 1), (O, 1), (Fe, 1), (As, 1)]
[(Cu, 0.05), (Zr, 1), (Te, 3)] 
   [(Cu, 'a'), (Zr, 1), (Te, 3)]
[(La, 1), (O, 0.89), (F, 0.11), (Fe, 1), (As, 1)] 
   [(La, 1), (O, '1-a'), (F, 'a'), (Fe, 1), (As, 1)]
[([(Cu, 1), (C, 1)], 1), (Ba, 2), (Ca, 4), (Cu, 5), (O, 'y')] 
   [(Cu, 1), (C, 1), (Ba, 2), (Ca, 4), (Cu, 5), (O, 'y')]
[([(Ca, 1), (Na, 1)], 1), (Fe, 2), (As, 2)] 
   [(Ca, 1), (Na, 1), (Fe, 2), (As, 2)]
[(Be, 1), ([(B, 1)], 2.75)] 
   [(Be, 1), ([(B, 1)], '3-a')]
[(Sr, 1), (Pd, 2), (Sb, 2)] 
   [(Sr, 1), (Pd, 2), (Sb, 2)]
[(Si

[(Cu, 1), (Ir, 2), (Te, 1.9), (I, 0.1)] 
   [(Cu, 1), (Ir, 2), (Te, '2-a'), (I, 'a')]
[(La, 1), (Ru, 2), (As, 2)] 
   [(La, 1), (Ru, 2), (As, 2)]
[(K, 72)] 
   [(K, 72)]
[([(Nd, 1), (Ce, 1)], 2), (Cu, 1), (O, 4)] 
   [([(Nd, 1), (Ce, 1)], 2), (Cu, 1), (O, 4)]
[(In, 2), (H, 1), (Nb, 1), (S, 2)] 
   [(In, 2), (H, 1), (Nb, 1), (S, 2)]
[(Bi, 2), (Sr, '2-x'), (La, 'x'), (Cu, 1), (O, '6-δ')] 
   [(Bi, 2), (Sr, '2-x'), (La, 'x'), (Cu, 1), (O, '6-δ')]
[(Li, 2), (Al, 1), (B, 4)] 
   [(Li, 2), (Al, 1), (B, 4)]
[(Pr, 1), (Fe, 1), (As, 1), (O, 0.7), (F, 0.3)] 
   [(Pr, 1), (Fe, 1), (As, 1), (O, '1-a'), (F, 'a')]
[(K, 0.62), (Fe, 1.7), (Se, 2)] 
   [(K, '1-a'), (Fe, '2-b'), (Se, 2)]
[([(Tl, 1), (Rb, 1)], 'y'), (Fe, '2-x'), (Se, 2)] 
   [([(Tl, 1), (Rb, 1)], 'y'), (Fe, '2-x'), (Se, 2)]
[(Li, 1), (Fe, 0.97), (Co, 0.03), (As, 1)] 
   [(Li, 1), (Fe, '1-a'), (Co, 'a'), (As, 1)]
[(Pb, 0.38), (Bi, 1.74), (Sr, 1.88), (Cu, 1), (O, '6+δ')] 
   [(Pb, 'a'), (Bi, '2-b'), (Sr, '2-c'), (Cu, 1), (O, '6+δ')]
[(C, 1

### Compare with Supercon v1 dataset:

In [18]:
# separate out known T_c from unknown T_c data:
# (rows whose Tc is exactly 0 are flagged as not known)
known_tc = supercon_df['Tc'].ne(0)
supercon_df['KnownTc'] = known_tc

# Compare the sizes of the v1 table and the cleaned v2 table.
for label, frame in (
    ('Supercon v1 materials: ', supercon_df),
    ('Supercon v2 materials: ', cleaned_supercon2_df),
):
    print(label, len(frame))

Supercon v1 materials:  16414
Supercon v2 materials:  23881
