In [2]:
import os, sys
print (sys.path)
from mtds.reader import *
from mtds.helper import *
from mtds.plots import *

['', 'c:\\python35\\python35.zip', 'c:\\python35\\DLLs', 'c:\\python35\\lib', 'c:\\python35', 'c:\\python35\\lib\\site-packages', 'c:\\python35\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\1203087\\.ipython']


In [3]:
# Load the raw (downloaded) VASP-format data stored locally.

# The database root is resolved relative to the current working directory.
PATH_BASE = os.path.join(os.getcwd(), '../database')

# Both data folders live under <PATH_BASE>/vasp.
path_bin, path_bin_all = (
    os.path.join(PATH_BASE, sub_dir)
    for sub_dir in ('vasp/binary_part', 'vasp/binary')
)

bin_mts = read_vasp_from_dir(path_bin)
bin_mts_all = read_vasp_from_dir(path_bin_all)

# Keep both collections reachable by label.
all_materials = {'Binary': bin_mts, 'Binary_All': bin_mts_all}

In [6]:
# Prepare training and testing sets.

from sklearn.model_selection import train_test_split

# Filter Materials.
def mt_filter(mt):
    """Decide whether a material record is kept.

    Returns True when the record's 'band_gap' entry is strictly greater
    than zero and False otherwise. Edit this predicate to change which
    materials are used.
    """
    # To use all available materials instead: return mt
    band_gap = mt['band_gap']
    return band_gap > 0.0

# Start from the complete binary collection.
mts_all = bin_mts_all
print('Number of All Materials:{}'.format(len(mts_all)))

# Keep only the materials accepted by mt_filter.
mts_used = [mt for mt in mts_all if mt_filter(mt)]
print('Number of USED Materials:{}'.format(len(mts_used)))

# train_test_split is already imported at the top of this cell, so it is not
# imported a second time here. The fixed random_state makes the split
# reproducible; test_size=0.33 holds out 33% of the materials for testing.
mts_train, mts_test = train_test_split(mts_used, test_size=0.33, random_state=0)

print('# of Training Materials: {} [Filtered]'.format(len(mts_train)))
print('# of Testing  Materials: {} [Filtered]'.format(len(mts_test)))


Number of All Materials:10717
Number of USED Materials:3823
# of Training Materials: 2561 [Filtered]
# of Testing  Materials: 1262 [Filtered]


In [1]:
# Define functions for fetching features and targets.

def get_targets_mine(mt):
    """Return the user-defined regression target of a material record.

    The target is the record's 'band_gap' entry.
    """
    # Alternative target: mt['elasticity']['elastic_tensor'][0][0]
    target = mt['band_gap']
    return target

def get_specific_features_mine(mt):
    """Collect selected scalar properties of a material record.

    Reads 'density', 'energy_per_atom' and 'formation_energy_per_atom'
    from the record, in that order, and returns them in an array created
    with zeros(), one slot per property.
    """
    # Other keys that were tried as features:
    #     'volume', 'energy', 'nsites', 'e_above_hull'
    feature_keys = ('density', 'energy_per_atom', 'formation_energy_per_atom')

    values = zeros(len(feature_keys))
    for slot, key in enumerate(feature_keys):
        values[slot] = mt[key]

    return values
    
def get_features_mine(mt):
    """Assemble the complete feature vector of one material record.

    The result is the output of get_row_group_density_vec followed by the
    output of get_specific_features_mine, joined with np.concatenate.
    """
    # Row and group boundaries of the periodic table, required by the
    # row/group density helper.
    limits = row_group_limits()
    density_part = get_row_group_density_vec(mt, limits)
    # Alternative descriptor: getElementDensity(mt)

    scalar_part = get_specific_features_mine(mt)

    # To use the row/group part alone, return density_part instead.
    return np.concatenate((density_part, scalar_part))


def collecting(x, y, z):
    """Pair features with targets for a collection of materials.

    Parameters:
        x: collection of material records. It is iterated twice, so it must
           be re-iterable (e.g. a list).
        y: callable mapping one record to its features.
        z: callable mapping one record to its target.

    Returns:
        A two-item list [features, targets]; each item is a list that
        follows the iteration order of x.
    """
    return [[y(mt) for mt in x], [z(mt) for mt in x]]


features_train, targets_train = collecting(mts_train, get_features_mine, get_targets_mine)
features_test, targets_test = collecting(mts_test, get_features_mine, get_targets_mine)

print('Feature Vector Length: {}'.format(len(features_train[0])))
print('Train - # of features: {}, targets: {}'.format(len(features_train), len(targets_train)))
print('Test  - # of features: {}, targets: {}'.format(len(features_test), len(targets_test)))

# One row per test material: both formulas, the target value and the id.
table = [
    [mt['pretty_formula'], mt['full_formula'], get_targets_mine(mt), mt['material_id']]
    for mt in mts_test
]

import tabulate
print(tabulate.tabulate(table, headers=['Pretty Formula', 'Full Formula', 'Target', 'Material-ID']))

NameError: name 'mts_train' is not defined

In [None]:
## Feature Scaling/Preprocessing.

from sklearn.preprocessing import StandardScaler, RobustScaler
# http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

# Scaler in use; swap in RobustScaler() here to try the alternative.
scaler = StandardScaler()

# Fit on the training features ONLY, so nothing is learned from the test set.
scaler.fit(features_train)

# Apply the training-derived scaling to both feature sets.
features_train, features_test = (
    scaler.transform(features_train),
    scaler.transform(features_test),
)

In [None]:
# Build ML model(s) with scikit-learn.

from sklearn import linear_model, kernel_ridge
from sklearn.model_selection import cross_val_score

# Parameter values to sweep: 1, 2, 3, 4.
params = list(range(1, 5))

# One polynomial-kernel ridge regressor per parameter value p. Here p sets both
# the polynomial degree and gamma, while the regularization weight alpha stays
# at 1.0.
# scikit-learn built-in kernels: 'poly', 'rbf', 'laplacian', 'sigmoid'.
# refer to: http://scikit-learn.org/stable/modules/svm.html#svm-kernels
# Linear alternative: linear_model.Ridge(alpha=p, fit_intercept=True)
models = [
    kernel_ridge.KernelRidge(kernel='poly', degree=p, alpha=1.0, gamma=p)
    for p in params
]

# More about prediction models.
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge
# http://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html#sklearn.kernel_ridge.KernelRidge

In [None]:
%%time
# Train every candidate model on the training features and targets.
for regressor in models:
    regressor.fit(features_train, targets_train)

In [None]:
# Prepare training and testing set for later usage.

# Bundle the materials, features and targets of each split with aggregate_data.
training_set, testing_set = (
    aggregate_data(mts_train, features_train, targets_train),
    aggregate_data(mts_test, features_test, targets_test),
)

In [None]:
%%time
# Evaluate the results.
# Label each model with the parameter value it was built from.
estimators = [
    {'name': 'Parameter set {}'.format(param), 'regressor': fitted}
    for param, fitted in zip(params, models)
]

print('\nResults of Training')
show_correlations(estimators, features_train, targets_train)

# k-fold cross validation, run on the training set only.
k_fold = 10
print('\n============= {}-Fold Cross Validation on Training Set =========='.format(k_fold))
scores = []
for estimator in estimators:
    fold_scores = cross_val_score(estimator['regressor'], features_train, targets_train, cv=k_fold)
    scores.append(fold_scores)
    # Report the mean fold score with twice its standard deviation as the spread.
    print("{}: score = {} +- {}".format(estimator['name'], fold_scores.mean(), fold_scores.std()*2))

display_cv(scores)

print('\nResults of Testing')
show_correlations(estimators, features_test, targets_test)

# show_elements_on_ptable([mts_test, mts_train])

In [None]:
# Plot the regression results of every estimator on both data sets.

for fitted in estimators:
    plot_regression_results(training_set, testing_set, fitted, print_details=True)

In [None]:
# Plot the distribution of the signed errors (target minus prediction).

# One error array per estimator, in the order of `estimators`.
errors_train = []
for entry in estimators:
    errors_train.append(targets_train - entry['regressor'].predict(features_train))

errors_test = []
for entry in estimators:
    errors_test.append(targets_test - entry['regressor'].predict(features_test))

# Debug aid: print(len(errors_train))
plot_hist(errors_train, 'Training Error Distribution', 0.1)
plot_hist(errors_test, 'Testing Error Distribution', 0.1)

In [None]:
# Plot the distribution of the absolute errors |target - prediction|.

# One absolute-error array per estimator, in the order of `estimators`.
abs_error_train = [abs(targets_train - estimator['regressor'].predict(features_train)) for estimator in estimators]
abs_error_test  = [abs(targets_test  - estimator['regressor'].predict(features_test)) for estimator in estimators]

# The titles name the absolute error explicitly so that these figures are not
# confused with the signed-error histograms, which use the plain
# "... Error Distribution" titles.
plot_hist(abs_error_train, 'Training Absolute Error Distribution', 0.1)
plot_hist(abs_error_test, 'Testing Absolute Error Distribution', 0.1)