In [4]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
from sklearn import linear_model
import scipy.io as spio
from sklearn.multioutput import MultiOutputRegressor
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor, kernels
import shelve

In [5]:
lattice = 'cubic'
coeff=0
data = spio.loadmat(lattice+'-data-posd-with-den.mat')
X = data['xdata']
y = data['ydata'][:,coeff]

In [3]:
print(X[2,:])
data['mps'][2]

[ 1.4804800e+02  1.4727705e+01  0.0000000e+00  6.3059300e-01
  1.0400000e+00  1.8000000e+01  6.0000000e+00  3.1312620e+00
  2.8888900e-01  1.8000000e+01  8.0000000e+00  2.7513770e+00
  1.9636940e+01  2.7513770e+00  1.6975000e-02  1.6691922e+01
  3.6000000e+00  1.0000000e+00 -4.1742500e-01  1.6692338e+01]


'mp-861931'

In [4]:
X1=np.delete(X,18,axis=1)
X1.shape
X1[2,:]
X=X1

In [5]:
models_and_parameters = {
  'lasso': (linear_model.Lasso(),
              {'reg__alpha': [0.01, 0.1, 0.5, 1.,5.,10.]}),
    'elnet': (linear_model.ElasticNet(),
              {'reg__alpha':[0.01, 0.1, 0.5, 1, 5., 10.], 'reg__l1_ratio':[0.,0.1,0.5,1.,2.1]}),
    'krg': (KernelRidge(),
            {'reg__kernel':['rbf','linear'], 'reg__alpha': [1e0, 0.1, 1e-2, 1e-3], 'reg__gamma': np.logspace(-2, 2, 5)}),
    'gpr': (GaussianProcessRegressor(kernel = kernels.RBF()),
            {'reg__kernel__length_scale':[0.01, 0.1, 1., 2., 10., 100.], 'reg__kernel__length_scale_bounds':[(1e-2,1.),(1e-1,1.),(1e-1,10.),(1.,10.),(1.,100.)\
,(1e-2,1e2)]}),
    'gbr': (GradientBoostingRegressor(learning_rate=0.01, min_samples_split=2, max_features='sqrt', loss='ls', subsample=0.4),
            {'reg__max_depth': [2,3,4,10,20,50],'reg__min_samples_leaf': [2,3,4,10], 'reg__learning_rate':[0.01, 0.1], 'reg__max_features':['auto', 'sqrt', 'l\
og2']}),
    'ada': (AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),n_estimators=500,learning_rate=0.01),#max_depth alone doesn't work probably               
            {'reg__base_estimator__max_depth': [2,3,4,10], 'reg__base_estimator':[DecisionTreeRegressor(max_depth = 4, max_features='auto'),
                                                                                    DecisionTreeRegressor(max_depth = None, max_features='auto'),
                                                                                    DecisionTreeRegressor(max_depth = 4, max_features='sqrt'),
                                                                                     DecisionTreeRegressor(max_depth = None, max_features='sqrt')]}),
    'svr': (SVR(),
            {'reg__C': [0.01, 0.05, 0.1, 1], 'reg__kernel': ['linear', 'rbf']}),
    'rf': (RandomForestRegressor(),
           {'reg__max_depth': [None, 5, 10, 50]}),
    'brg': (linear_model.BayesianRidge(fit_intercept=True),
            {'reg__alpha_1': [1.e-6, 1.e-5]}),
    'lars': (linear_model.Lars(fit_intercept = True, normalize=False),
             {'reg__n_nonzero_coefs': [5, 10, 50, 500, np.inf]}),
    'ard': (linear_model.ARDRegression(),
            {'reg__alpha_1':[1.e-6, 1.e-5]})}

In [6]:
scaler = preprocessing.StandardScaler()
inner_cv = KFold(n_splits=3, shuffle=True)
outer_cv = KFold(n_splits=3, shuffle=True)

In [32]:
model=AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),n_estimators=500,learning_rate=0.01)
params = {'reg__max_depth': [None,2,5,100],'reg__max_features':['auto','sqrt','log2'],'reg__min_samples_split':[2,3,4],'reg__min_samples_leaf':[2,3,4]}
params = {'reg__base_estimator__max_depth': [None,2,4,10,100],
            'reg__base_estimator__max_features':['auto','sqrt','log2',None],
            'reg__base_estimator__min_samples_split':[2,3,4],
            'reg__base_estimator__min_samples_leaf': [1,2,3,4]}
#params = {'reg__base_estimator':[DecisionTreeRegressor(max_depth = 4, max_features='auto'),
#                                DecisionTreeRegressor(max_depth = None, max_features='auto'),
#                                DecisionTreeRegressor(max_depth = 4, max_features='sqrt'),
#                                DecisionTreeRegressor(max_depth = None, max_features='sqrt')]}
pipeline = Pipeline([('transformer', scaler), ('reg', model)])

In [33]:
reg1 = GridSearchCV(estimator=pipeline, param_grid=params, cv=inner_cv, n_jobs=-1)

In [46]:
#not to run
model=AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=100,
                                                            max_features='log2',
                                                            min_samples_leaf= 3,
                                                            min_samples_split= 3))
pipeline = Pipeline([('transformer', scaler), ('reg', model)])
reg1 = Pipeline([('transformer', scaler), ('reg', model)])
#clf.fit(X,y)
#clf.best_params_

In [41]:
#One with GridSearch
reg1.fit(X,y)
print(reg1.best_params_)
#scores=cross_val_score(reg1, X=X, y=y, scoring='neg_mean_squared_error',cv=outer_cv)
r2scores=cross_val_score(reg1, X=X, y=y, scoring='r2',cv=outer_cv)
#rmse_scores = [np.sqrt(abs(s)) for s in scores]
#print(rmse_scores)
print(r2scores)
print('Cross-validation results:')
#print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
print('Folds: %i, mean r2: %.3f' % (len(r2scores), np.mean(r2scores)))
print(sum(y-reg1.predict(X)))
#print(y-reg1.predict(X))

{'reg__base_estimator__max_depth': 100, 'reg__base_estimator__max_features': 'log2', 'reg__base_estimator__min_samples_leaf': 3, 'reg__base_estimator__min_samples_split': 3}
[0.77268395 0.39346309 0.54305701]
Cross-validation results:
Folds: 3, mean r2: 0.570
101.58333333333334


In [83]:
reg1.fit(X,y)
#print(reg1.best_params_)
#scores=cross_val_score(reg1, X=X, y=y, scoring='neg_mean_squared_error',cv=outer_cv)
r2scores=cross_val_score(reg1, X=X, y=y, scoring='r2',cv=outer_cv)
#rmse_scores = [np.sqrt(abs(s)) for s in scores]
#print(rmse_scores)
print(r2scores)
print('Cross-validation results:')
#print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
print('Folds: %i, mean r2: %.3f' % (len(r2scores), np.mean(r2scores)))

[0.79404276 0.55647594 0.7017167 ]
Cross-validation results:
Folds: 3, mean r2: 0.684


In [45]:
print(reg1.best_params_)

{'reg__base_estimator__max_depth': 100, 'reg__base_estimator__max_features': 'log2', 'reg__base_estimator__min_samples_leaf': 3, 'reg__base_estimator__min_samples_split': 3}


In [86]:
ntdata = spio.loadmat(lattice+'-non-training-data-with-den.mat')
Xnt = ntdata['xntdata']
Xnt=np.delete(Xnt,18,axis=1)
ynt1=reg1.predict(Xnt)

In [84]:
from matminer.figrecipes.plotly.make_plots import PlotlyFig

pf_rf = PlotlyFig(x_title='DFT (MP) C11 (GPa)',
                  y_title='Random Forest C11 (GPa)',
                  plot_title='Random forest regression',
                  plot_mode='offline',
                  margin_left=150,
                  textsize=35,
                  ticksize=30,
                  filename="rf_regression1.html")

# a line to represent a perfect model with 1:1 prediction
xy_line = {'x_col': [0, max(y)],
           'y_col': [0, max(y)],
           'color': 'black',
           'mode': 'lines',
           'legend': None,
           'text': None,
           'size': None}

pf_rf.xy_plot(x_col=y,
              y_col=reg1.predict(X),
              size=3,
              marker_outline_width=0.5,
              #text=df_mp['pretty_formula'],
              add_xy_plot=[xy_line])

In [85]:
import pickle
s = pickle.dump(reg1,open( "reg1_noform.p", "wb" ))

In [199]:
coeff=1
y = data['ydata'][:,coeff]
params=models_and_parameters['lasso'][1]
scaler = preprocessing.StandardScaler()
inner_cv = KFold(n_splits=3, shuffle=True)
outer_cv = KFold(n_splits=3, shuffle=True)

In [200]:
model=linear_model.Lasso()
scaler = preprocessing.StandardScaler()
pipeline = Pipeline([('transformer', scaler), ('reg', model)])
reg2 = GridSearchCV(estimator=pipeline, param_grid=params, cv=inner_cv,n_jobs=-1)
reg2

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('transformer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reg', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'reg__alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [352]:
reg2.fit(X,y)
reg2.best_params_
scores=cross_val_score(reg2, X=X, y=y, scoring='neg_mean_squared_error',cv=outer_cv)
r2scores=cross_val_score(reg2, X=X, y=y, scoring='r2',cv=outer_cv)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
print('Cross-validation results:')
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
print('Folds: %i, mean r2: %.3f' % (len(r2scores), np.mean(r2scores)))

Cross-validation results:
Folds: 3, mean RMSE: 16.915
Folds: 3, mean r2: 0.846


In [353]:
from matminer.figrecipes.plotly.make_plots import PlotlyFig
pf_rf = PlotlyFig(x_title='DFT (MP) C12 (GPa)',
                  y_title='Bayesian Ridge C12 (GPa)',
                  plot_title='Random forest regression',
                  plot_mode='offline',
                  margin_left=150,
                  textsize=35,
                  ticksize=30,
                  filename="br2_regression1.html")

# a line to represent a perfect model with 1:1 prediction
xy_line = {'x_col': [min(y), max(y)],
           'y_col': [min(y), max(y)],
           'color': 'black',
           'mode': 'lines',
           'legend': None,
           'text': None,
           'size': None}


pf_rf.xy_plot(x_col=y,
              y_col=reg2.predict(X),
              size=3,
              marker_outline_width=0.5,
              #text=df_mp['pretty_formula'],
              add_xy_plot=[xy_line])

In [354]:
import pickle
s = pickle.dump(reg2,open( "reg2_noform.p", "wb" ))

In [355]:
coeff=2
y = data['ydata'][:,coeff]
params=models_and_parameters['brg'][1]
scaler = preprocessing.StandardScaler()
inner_cv = KFold(n_splits=3, shuffle=True)
outer_cv = KFold(n_splits=3, shuffle=True)
model=linear_model.BayesianRidge(normalize=False)
scaler = preprocessing.StandardScaler()
pipeline = Pipeline([('transformer', scaler), ('reg', model)])

In [817]:
reg3 = GridSearchCV(estimator=pipeline, param_grid=params, cv=inner_cv,n_jobs=-1)
reg3
reg3.fit(X,y)
reg3.best_params_
scores=cross_val_score(reg3, X=X, y=y, scoring='neg_mean_squared_error',cv=outer_cv)
r2scores=cross_val_score(reg3, X=X, y=y, scoring='r2',cv=outer_cv)
print(r2scores)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
print('Cross-validation results:')
#print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
print('Folds: %i, mean r2: %.3f' % (len(r2scores), np.mean(r2scores)))

[0.72590485 0.78405826 0.6265607 ]
Cross-validation results:
Folds: 3, mean r2: 0.712


In [818]:
from matminer.figrecipes.plotly.make_plots import PlotlyFig
pf_rf = PlotlyFig(x_title='DFT (MP) C44 (GPa)',
                  y_title='Bayesian Ridge C44 (GPa)',
                  plot_title='Random forest regression',
                  plot_mode='offline',
                  margin_left=150,
                  textsize=35,
                  ticksize=30,
                  filename="br3_regression1.html")

# a line to represent a perfect model with 1:1 prediction
xy_line = {'x_col': [min(y), max(y)],
           'y_col': [min(y), max(y)],
           'color': 'black',
           'mode': 'lines',
           'legend': None,
           'text': None,
           'size': None}

pf_rf.xy_plot(x_col=y,
              y_col=reg3.predict(X),
              size=3,
              marker_outline_width=0.5,
              #text=df_mp['pretty_formula'],
              add_xy_plot=[xy_line])

In [819]:
import pickle
s = pickle.dump(reg3,open( "reg3_noform.p", "wb" ))

In [820]:
y1t=reg1.predict(X)
y2t=reg2.predict(X)
y3t=reg3.predict(X)
e1t=(y1t+2*y2t)/3
e2t=(y1t-y2t)
e3t=y3t
from sklearn.metrics import *
yall = data['ydata']
print(mean_squared_error(y1t,yall[:,0]))
print(mean_squared_error(y2t,yall[:,1]))
print(mean_squared_error(y3t,yall[:,2]))
print(mean_squared_error(e1t,(yall[:,0]+2.*yall[:,1])/3.))
print(mean_squared_error(e2t,yall[:,0]-yall[:,1]))
print(mean_squared_error(e3t,yall[:,2]))
print()
print(r2_score(y1t,yall[:,0]))
print(r2_score(y2t,yall[:,1]))
print(r2_score(y3t,yall[:,2]))
print(r2_score(e1t,(yall[:,0]+2.*yall[:,1])/3.))
print(r2_score(e2t,yall[:,0]-yall[:,1]))
print(r2_score(e3t,yall[:,2]))

97.41702297585702
197.05481972699144
158.26740359133254
112.14214679707378
232.65033305792758
158.26740359133254

0.98006455634168
0.854755349177047
0.6882464206315353
0.9405688510174358
0.9200212242959691
0.6882464206315353


In [821]:
y1nt=reg1.predict(Xnt)
y2nt=reg2.predict(Xnt)
y3nt=reg3.predict(Xnt)

In [822]:
ynt=np.column_stack((y1nt,y2nt,y3nt))
ynt.shape
spio.savemat(lattice+'-nt-result',mdict={'coeffsnt':ynt,'coord':ntdata['coord'],'mps':ntdata['mps'],'volrat':ntdata['volrat']})

In [823]:
print(data['volratt'][0])

[0.2581988  0.38035522 0.38049773 0.25820269 0.38049772 0.08406202
 0.38049773 0.25820563 0.25820563 0.2581988  0.2581988  0.08406202
 0.25820563 0.08406202 0.25820563 0.25820563 0.25820563 0.25820563
 0.73524312 0.23805073 0.23356835 0.23244811 0.38049773 0.08406202
 0.25820563 0.08406202 0.15191745 0.15191745 0.05525574 0.25819966
 0.25820364 0.25820563 0.25819916 0.25819966 0.25820563 0.25819966
 0.25820121 0.25820563 0.25820392 0.25819651 0.25820563 0.14101352
 0.25820563 0.25820478 0.25820563 0.25820563 0.15191745 0.25820029
 0.25820336 0.2581988  0.25820563 0.25820563 0.15191745 0.2581988
 0.15191744 0.1579381  0.16420279 0.25820563 0.25820563 0.25820563
 0.2581988  0.25820364 0.25820051 0.25820563 0.25820563 0.25819966
 0.25820336 0.25819368 0.25820165 0.25820563 0.25820321 0.25820392
 0.25820165 0.0538958  0.25819872 0.25820563 0.25820563 0.25820563
 0.25820279 0.25819966 0.15191745 0.2581988  0.25820563 0.25819966
 0.25820336 0.25819966 0.25820078 0.14623936 0.15191735 0.25820

In [824]:
ytr=np.column_stack((reg1.predict(X),reg2.predict(X),reg3.predict(X)))
ytr.shape
spio.savemat(lattice+'-tr-result',mdict={'coeffstr':data['ydata'],'coeffspred':ytr,'coord':data['coordt'],'mps':data['mps'],'volrat':data['volratt']})

In [119]:
import pickle
with open('reg1.p','rb') as f:
    reg1=pickle.load(f)

In [126]:
import pickle
with open('reg3.p','rb') as f:
    r3=pickle.load(f)
r3

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('transformer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reg', BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'reg__alpha_1': [1e-06, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
dd = spio.loadmat(lattice+'-nt-result')
dd['coeffsnt']=dd['coeffsnt'].tolist()
dd['coord']=dd['coord'].tolist()
dd['volrat']=dd['volrat'].tolist()
dd['mps']=dd['mps'].tolist()

In [12]:
import json
with open(lattice+'-nt-result_noform.json','w') as f:
    json.dump(dd,f)

TypeError: Object of type 'bytes' is not JSON serializable

In [11]:
model=linear_model.Lasso()
params=models_and_parameters['lasso'][1]
scaler = preprocessing.StandardScaler()
pipeline = Pipeline([('transformer', scaler), ('reg', model)])
reg1lasso = GridSearchCV(estimator=pipeline, param_grid=params, cv=inner_cv,n_jobs=-1)
reg1lasso

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('transformer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reg', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'reg__alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [294]:
reg1lasso.fit(X,y)
reg1lasso.best_params_
#scores=cross_val_score(reg2, X=X, y=y, scoring='neg_mean_squared_error',cv=outer_cv)
r2scores=cross_val_score(reg1lasso, X=X, y=y, scoring='r2',cv=outer_cv)
#rmse_scores = [np.sqrt(abs(s)) for s in scores]
print('Cross-validation results:')
#print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
print('Folds: %i, mean r2: %.3f' % (len(r2scores), np.mean(r2scores)))

Cross-validation results:
Folds: 3, mean r2: 0.616


reg1lasso

In [298]:
print(reg1lasso.best_estimator_.named_steps['reg'].coef_)
print(reg2.best_estimator_.named_steps['reg'].coef_)
print(reg3.best_estimator_.named_steps['reg'].coef_)

[-73.99316211  46.3843869    4.75194328   3.0581043  -29.7807194
 -83.48320337  -2.59027984 -35.69553963   7.59154831 142.25994489
  27.55151659 -15.72826648 -18.54535543  -5.91218868 -35.05286745
 107.27319597  -6.11099286  50.10285911   0.        ]
[-33.13148875  -5.31709399  -1.58192648  -0.96671281  -1.30384867
   4.46508118  -1.00773901  -9.52158754   6.30148955  10.10122899
  -3.9855769   -0.           0.25734102  10.1495421  -16.2522837
  50.2334811    2.96694845   5.64221635   0.10404665]
[-20.17605836   3.20680575  -1.27026152  -3.1777611   -7.89473839
  -2.74887685   3.96638945 -16.79590484   4.25475666  23.67939975
   7.57692893  -8.84518697   2.59918826   5.0206955  -13.94565021
  11.83438023   3.21882829  10.74947255  11.82631575]


In [296]:
import pickle
with open('reg2_noform.p','rb') as f:
    reg2=pickle.load(f)
with open('reg3_noform.p','rb') as f:
    reg3=pickle.load(f)

In [299]:
coef1=reg1lasso.best_estimator_.named_steps['reg'].coef_
coef2=reg2.best_estimator_.named_steps['reg'].coef_
coef3=reg3.best_estimator_.named_steps['reg'].coef_
print(np.abs(coef1).argsort()[-5:][::-1]+1)
print(np.abs(coef2).argsort()[-5:][::-1]+1)
print(np.abs(coef3).argsort()[-5:][::-1]+1)

[10 16  6  1 18]
[16  1 15 14 10]
[10  1  8 15 16]
