In [12]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
from sklearn import linear_model
import scipy.io as spio
from sklearn.multioutput import MultiOutputRegressor
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor, kernels
import shelve

In [13]:
lattice = 'cubic'
coeff=0
data = spio.loadmat(lattice+'-data-posd-with-den.mat')
X = data['xdata']
y = data['ydata'][:,coeff]

In [14]:
models_and_parameters = {
  'lasso': (linear_model.Lasso(),
              {'reg__alpha': [0.01, 0.1, 0.5, 1.,5.,10.]}),
    'elnet': (linear_model.ElasticNet(),
              {'reg__alpha':[0.01, 0.1, 0.5, 1, 5., 10.], 'reg__l1_ratio':[0.,0.1,0.5,1.,2.1]}),
    'krg': (KernelRidge(),
            {'reg__kernel':['rbf','linear'], 'reg__alpha': [1e0, 0.1, 1e-2, 1e-3], 'reg__gamma': np.logspace(-2, 2, 5)}),
    'gpr': (GaussianProcessRegressor(kernel = kernels.RBF()),
            {'reg__kernel__length_scale':[0.01, 0.1, 1., 2., 10., 100.], 'reg__kernel__length_scale_bounds':[(1e-2,1.),(1e-1,1.),(1e-1,10.),(1.,10.),(1.,100.)\
,(1e-2,1e2)]}),
    'gbr': (GradientBoostingRegressor(learning_rate=0.01, min_samples_split=2, max_features='sqrt', loss='ls', subsample=0.4),
            {'reg__max_depth': [2,3,4,10,20,50],'reg__min_samples_leaf': [2,3,4,10], 'reg__learning_rate':[0.01, 0.1], 'reg__max_features':['auto', 'sqrt', 'l\
og2']}),
    'ada': (AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),n_estimators=500,learning_rate=0.01),#max_depth alone doesn't work probably               
            {'reg__base_estimator__max_depth': [2,3,4,10], 'reg__base_estimator':[DecisionTreeRegressor(max_depth = 4, max_features='auto'),
                                                                                     DecisionTreeRegressor(max_depth = None, max_features='auto'),
                                                                                     DecisionTreeRegressor(max_depth = 4, max_features='sqrt'),
                                                                                     DecisionTreeRegressor(max_depth = None, max_features='sqrt')]}),
    'svr': (SVR(),
            {'reg__C': [0.01, 0.05, 0.1, 1], 'reg__kernel': ['linear', 'rbf']}),
    'rf': (RandomForestRegressor(),
           {'reg__max_depth': [None, 5, 10, 50]}),
    'brg': (linear_model.BayesianRidge(fit_intercept=True),
            {'reg__alpha_1': [1.e-6, 1.e-5]}),
    'lars': (linear_model.Lars(fit_intercept = True, normalize=False),
             {'reg__n_nonzero_coefs': [5, 10, 50, 500, np.inf]}),
    'ard': (linear_model.ARDRegression(),
            {'reg__alpha_1':[1.e-6, 1.e-5]})}

In [15]:
scaler = preprocessing.StandardScaler()

In [16]:
inner_cv = KFold(n_splits=3, shuffle=True)
outer_cv = KFold(n_splits=3, shuffle=True)

In [17]:
model=RandomForestRegressor(n_estimators=1000)
params={'reg__max_depth': [None, 5, 10, 50,100],'reg__max_features':['auto','sqrt','log2'],'reg__min_samples_split':[2,3,4],'reg__min_samples_leaf':[2,3,4]}
pipeline = Pipeline([('transformer', scaler), ('reg', model)])

In [18]:
clf = GridSearchCV(estimator=pipeline, param_grid=params, cv=inner_cv)
clf

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('transformer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reg', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurit...ators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reg__max_depth': [None, 5, 10, 50, 100], 'reg__max_features': ['auto', 'sqrt', 'log2'], 'reg__min_samples_split': [2, 3, 4], 'reg__min_samples_leaf': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [None]:
clf.fit(X,y)
clf.best_params_

In [None]:
clf.fit(X,y)
scores=cross_val_score(clf, X=X, y=y, scoring='neg_mean_squared_error',cv=outer_cv)
r2scores=cross_val_score(clf, X=X, y=y, scoring='r2',cv=outer_cv)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
print('Cross-validation results:')
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
print('Folds: %i, mean RMSE: %.3f' % (len(r2scores), np.mean(r2scores)))

In [None]:
from matminer.figrecipes.plotly.make_plots import PlotlyFig

pf_rf = PlotlyFig(x_title='DFT (MP) bulk modulus (GPa)',
                  y_title='Random forest bulk modulus (GPa)',
                  plot_title='Random forest regression',
                  plot_mode='offline',
                  margin_left=150,
                  textsize=35,
                  ticksize=30,
                  filename="rf_regression.html")

# a line to represent a perfect model with 1:1 prediction
xy_line = {'x_col': [min(y), max(y)],
           'y_col': [min(y), max(y)],
           'color': 'black',
           'mode': 'lines',
           'legend': None,
           'text': None,
           'size': None}


pf_rf.xy_plot(x_col=y,
              y_col=clf.predict(X),
              size=3,
              marker_outline_width=0.5,
              #text=df_mp['pretty_formula'],
              add_xy_plot=[xy_line])