In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# read data into DataFrame

def make_df(fin):
    """
    Args:
        fin (str) - file name with training or test data
    Returns:
        DataFrame with renamed columns (personal preference)
    """
    df = pd.read_csv(fin)
    df = df.rename(columns={'spacegroup' : 'spacegroup',
                            'number_of_total_atoms' : 'Natoms',
                            'percent_atom_al' : 'x_Al',
                            'percent_atom_ga' : 'x_Ga',
                            'percent_atom_in' : 'x_In',
                            'lattice_vector_1_ang' : 'a',
                            'lattice_vector_2_ang' : 'b',
                            'lattice_vector_3_ang' : 'c',
                            'lattice_angle_alpha_degree' : 'alpha',
                            'lattice_angle_beta_degree' : 'beta',
                            'lattice_angle_gamma_degree' : 'gamma',
                            'formation_energy_ev_natom' : 'formation_energy_ev_natom',
                            'bandgap_energy_ev' : 'bandgap_energy_ev'})
    return df

In [3]:
# retrieve list of elemental properties

def get_prop_list(path_to_element_data):
    """
    Args:
        path_to_element_data (str) - path to folder of elemental property files
    Returns:
        list of elemental properties (str) which have corresponding .csv files
    """
    return [f[:-4] for f in os.listdir(path_to_element_data)]

In [4]:
def get_prop(prop, path_to_element_data):
    """
    Args:
        prop (str) - name of elemental property
        path_to_element_data (str) - path to folder of elemental property files
    Returns:
        dictionary of {element (str) : property value (float)}
    """
    fin = os.path.join(path_to_element_data, prop+'.csv')
    with open(fin) as f:
        all_els = {line.split(',')[0] : float(line.split(',')[1][:-1]) for line in f}
        my_els = ['Al', 'Ga', 'In']
        return {el : all_els[el] for el in all_els if el in my_els}    

In [5]:
# average each property using the composition

def avg_prop(x_Al, x_Ga, x_In, prop):
    """
    Args:
        x_Al (float or DataFrame series) - concentration of Al
        x_Ga (float or DataFrame series) - concentration of Ga
        x_In (float or DataFrame series) - concentration of In
        prop (str) - name of elemental property
    Returns:
        average property for the compound (float or DataFrame series), 
        weighted by the elemental concentrations
    """
    els = ['Al', 'Ga', 'In']
    concentration_dict = dict(zip(els, [x_Al, x_Ga, x_In]))
    return np.sum(prop_dict[prop][el] * concentration_dict[el] for el in els)

In [6]:
# calculate the volume of the structure

def get_vol(a, b, c, alpha, beta, gamma):
    """
    Args:
        a (float) - lattice vector 1
        b (float) - lattice vector 2
        c (float) - lattice vector 3
        alpha (float) - lattice angle 1 [radians]
        beta (float) - lattice angle 2 [radians]
        gamma (float) - lattice angle 3 [radians]
    Returns:
        volume (float) of the parallelepiped unit cell
    """
    return a*b*c*np.sqrt(1 + 2*np.cos(alpha)*np.cos(beta)*np.cos(gamma)
                           - np.cos(alpha)**2
                           - np.cos(beta)**2
                           - np.cos(gamma)**2)

In [7]:
def get_atomic_density(df):
    return df['Natoms'] / df['vol']

In [8]:
def execute(df_train):    
    # folder which contains data folders
    input_dir = os.path.join('/home/agi/Desktop/NOMAD/', 'data')
    # folder which contains element data
    path_to_element_data = os.path.join(input_dir, 'elemental-properties')
    # get list of properties which have data files
    properties = get_prop_list(path_to_element_data)
    print(sorted(properties))

    # make nested dictionary which maps {property (str) : {element (str) : property value (float)}}
    prop_dict = {prop : get_prop(prop, path_to_element_data) for prop in properties}
    print('The mass of aluminum is %.2f amu' % prop_dict['mass']['Al'])

    # add averaged properties to DataFrame
    for prop in properties:
        df_train['_'.join(['avg', prop])] = avg_prop(df_train['x_Al'], 
                                                     df_train['x_Ga'],
                                                     df_train['x_In'],
                                                     prop)
    list(df_train)

    # convert lattice angles from degrees to radians for volume calculation
    lattice_angles = ['alpha', 'beta', 'gamma']
    for lang in lattice_angles:
        df_train['_'.join([lang, 'r'])] = np.pi * df_train[lang] / 180

    # compute the cell volumes 
    df_train['vol'] = get_vol(df_train['a'], df_train['b'], df_train['c'],
                              df_train['alpha_r'], df_train['beta_r'], df_train['gamma_r'])
    df_train[['a','b','c','alpha_r','beta_r','gamma_r','vol']].head()

    # calculate the atomic density

    # this is known to correlate with stability or bonding strength
    df_train['atomic_density'] = get_atomic_density(df_train)
    
    train_cols = ['id',
     'spacegroup',
     'Natoms',
     'x_Al',
     'x_Ga',
     'x_In',
     'a',
     'b',
     'c',
     'alpha',
     'beta',
     'gamma',
     'avg_HOMO',
     'avg_rs_max',
     'avg_rp_max',
     'avg_mass',
     'avg_rd_max',
     'avg_LUMO',
     'avg_EA',
     'avg_electronegativity',
     'avg_IP',
     'alpha_r',
     'beta_r',
     'gamma_r',
     'vol',
     'atomic_density',
     'formation_energy_ev_natom',
     'bandgap_energy_ev',]

    df_train=df_train.reindex(columns=train_cols)

    return df_train

In [9]:
# folder which contains competition data
path_to_train_data = os.path.join(input_dir, '')

# training data
f_train = os.path.join(path_to_train_data, 'train.csv')
f_test = os.path.join(path_to_train_data, 'test.csv')
# make DataFrame of training data
df_train = make_df(f_train)

train_prep = execute(df_train)
train_prep.to_csv("/home/agi/Desktop/NOMAD/train_prepared2.csv", index=False)

NameError: name 'input_dir' is not defined

In [10]:
test_prep = execute(make_df(f_test))
test_prep.to_csv("/home/agi/Desktop/NOMAD/test_prepared2.csv", index=False)

NameError: name 'f_test' is not defined