In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Short aliases for the verbose Kaggle column headers.
columns = {
    'spacegroup': 'sg',
    'number_of_total_atoms': 'Natoms',
    'percent_atom_al': 'x_Al',
    'percent_atom_ga': 'x_Ga',
    'percent_atom_in': 'x_In',
    'lattice_vector_1_ang': 'a',
    'lattice_vector_2_ang': 'b',
    'lattice_vector_3_ang': 'c',
    'lattice_angle_alpha_degree': 'alpha',
    'lattice_angle_beta_degree': 'beta',
    'lattice_angle_gamma_degree': 'gamma',
    'formation_energy_ev_natom': 'E',
    'bandgap_energy_ev': 'Eg',
}

# Training rows: tag them and store both targets as log(1 + y).
df_train = (
    pd.read_csv("./input/train.csv")
    .rename(columns=columns)
    .assign(
        dataset="train",
        E=lambda frame: np.log1p(frame["E"]),
        Eg=lambda frame: np.log1p(frame["Eg"]),
    )
)

# Test rows carry no targets; they are only tagged.
df_test = (
    pd.read_csv("./input/test.csv")
    .rename(columns=columns)
    .assign(dataset="test")
)

# One frame for both splits so feature engineering is applied consistently.
df_total = pd.concat([df_train, df_test], ignore_index=True)

tuple(map(len, (df_train, df_test, df_total)))

(2400, 600, 3000)

In [3]:
df_total.head()

Unnamed: 0,E,Eg,Natoms,a,alpha,b,beta,c,dataset,gamma,id,sg,x_Al,x_Ga,x_In
0,0.065788,1.490362,80.0,9.9523,90.0026,8.5513,90.0023,9.1775,train,90.0017,1,33,0.625,0.375,0.0
1,0.222343,1.366347,80.0,6.184,90.0186,6.1838,89.998,23.6287,train,120.0025,2,194,0.625,0.375,0.0
2,0.167293,1.320101,40.0,9.751,90.9688,5.6595,91.1228,13.963,train,30.5185,3,227,0.8125,0.1875,0.0
3,0.196553,1.469992,30.0,5.0036,89.9888,5.0034,90.0119,13.5318,train,120.0017,4,167,0.75,0.0,0.25
4,0.049266,0.866806,80.0,6.6614,89.996,6.6612,90.0006,24.5813,train,119.9893,5,194,0.0,0.625,0.375


In [4]:
df_total.tail()

Unnamed: 0,E,Eg,Natoms,a,alpha,b,beta,c,dataset,gamma,id,sg,x_Al,x_Ga,x_In
2995,,,80.0,24.8145,90.0002,6.3964,104.7733,6.2933,test,90.0001,596,12,0.0,0.5938,0.4062
2996,,,40.0,5.5783,90.0008,9.4849,89.9967,10.1107,test,90.0004,597,33,0.125,0.0,0.875
2997,,,80.0,6.9377,90.0072,6.9372,89.988,25.0641,test,119.9857,598,194,0.0,0.25,0.75
2998,,,40.0,5.1841,90.0041,8.8659,90.0009,9.4956,test,90.0007,599,33,0.625,0.0,0.375
2999,,,80.0,9.4959,90.0029,9.4956,90.0031,9.4956,test,89.9969,600,206,0.375,0.3438,0.2812


In [5]:
#from https://www.kaggle.com/cbartel/random-forest-using-elemental-properties
def get_vol(a, b, c, alpha, beta, gamma):
    """
    Volume of a parallelepiped unit cell from its lattice parameters.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all accepted.

    Args:
        a (float) - lattice vector 1
        b (float) - lattice vector 2
        c (float) - lattice vector 3
        alpha (float) - lattice angle 1 [degrees]
        beta (float) - lattice angle 2 [degrees]
        gamma (float) - lattice angle 3 [degrees]
    Returns:
        volume (float) of the parallelepiped unit cell
    """
    # The angles arrive in degrees (as in the competition files); convert once
    # and reuse the cosines in the triclinic volume formula below.
    cos_alpha, cos_beta, cos_gamma = (
        np.cos(np.radians(angle)) for angle in (alpha, beta, gamma)
    )
    return a*b*c*np.sqrt(1 + 2*cos_alpha*cos_beta*cos_gamma
                           - cos_alpha**2
                           - cos_beta**2
                           - cos_gamma**2)


    
# Unit-cell volume from the six lattice parameters (angles are in degrees).
df_total['vol'] = get_vol(
    a=df_total['a'],
    b=df_total['b'],
    c=df_total['c'],
    alpha=df_total['alpha'],
    beta=df_total['beta'],
    gamma=df_total['gamma'],
)

# Atoms per unit volume, then the share attributed to each cation.
df_total['density'] = df_total['Natoms'].div(df_total['vol'])
df_total['density_Al'] = df_total['density'].mul(df_total['x_Al'])
df_total['density_Ga'] = df_total['density'].mul(df_total['x_Ga'])
df_total['density_In'] = df_total['density'].mul(df_total['x_In'])

# The space group is a label, not a quantity.
df_total['sg'] = df_total['sg'].astype('category')

In [6]:
df_total.head()

Unnamed: 0,E,Eg,Natoms,a,alpha,b,beta,c,dataset,gamma,id,sg,x_Al,x_Ga,x_In,vol,density,density_Al,density_Ga,density_In
0,0.065788,1.490362,80.0,9.9523,90.0026,8.5513,90.0023,9.1775,train,90.0017,1,33,0.625,0.375,0.0,781.052081,0.102426,0.064016,0.03841,0.0
1,0.222343,1.366347,80.0,6.184,90.0186,6.1838,89.998,23.6287,train,120.0025,2,194,0.625,0.375,0.0,782.50011,0.102236,0.063898,0.038339,0.0
2,0.167293,1.320101,40.0,9.751,90.9688,5.6595,91.1228,13.963,train,30.5185,3,227,0.8125,0.1875,0.0,391.227531,0.102242,0.083072,0.01917,0.0
3,0.196553,1.469992,30.0,5.0036,89.9888,5.0034,90.0119,13.5318,train,120.0017,4,167,0.75,0.0,0.25,293.377334,0.102257,0.076693,0.0,0.025564
4,0.049266,0.866806,80.0,6.6614,89.996,6.6612,90.0006,24.5813,train,119.9893,5,194,0.0,0.625,0.375,944.713843,0.084682,0.0,0.052926,0.031756


In [7]:
#Encoding of cat features
# The space-group column `sg` is expanded into one indicator column per value;
# every other column of df_total is carried over unchanged into `enc`.
import sys 
# Makes the local helper package importable; the path is relative to the
# notebook's working directory.
sys.path.append("../kaggle_varie")
# NOTE(review): star import - presumably this is where bin_enc (used only in the
# commented-out alternative below) comes from; confirm which names are needed.
from  varie import *
cols_to_enc=["sg"]

#binary encoder
#enc=bin_enc(df_total,cols_to_enc,verbose=2,copy=True,drop_original=True,ordinal_only=False)
#one-hot encoder
enc=pd.get_dummies(df_total,columns=cols_to_enc)



In [8]:
def grid_search_fct(model, params, df, y_col, n_iter=20, cv=4, drop_col=None, verbose=2,
                    random_state=None):
    """Run one randomized hyper-parameter search per target column.

    Args:
        model: estimator handed to ``RandomizedSearchCV``.
        params (dict): search space, passed as ``param_distributions``.
        df (DataFrame): training data holding features and targets.
        y_col (list of str): target columns; one search is fitted per entry.
        n_iter (int): number of sampled parameter settings per search.
        cv (int): cross-validation setting forwarded to the search.
        drop_col (list of str, optional): extra non-feature columns to exclude
            from the design matrix. ``None`` (the default) excludes nothing more.
        verbose (int): verbosity forwarded to the search.
        random_state (optional): seed forwarded to the search so that sampled
            candidates are reproducible. ``None`` keeps the unseeded behaviour.

    Returns:
        list: the fitted searches, in the same order as ``y_col``.
    """
    # Every target is removed from the features, not only the one being fitted,
    # so no search can see the other target.
    excluded = list(y_col) + list(drop_col or [])
    X_train = df.drop(excluded, axis=1).values

    grids = []
    for target in y_col:
        print(target)
        y_train = df[target].values
        print(X_train.shape, y_train.shape)

        # Scores are negated MSE; take sqrt(-best_score_) to get an RMSE.
        grid = RandomizedSearchCV(model, param_distributions=params, n_iter=n_iter,
                                  cv=cv, verbose=verbose,
                                  scoring="neg_mean_squared_error",
                                  random_state=random_state)
        grid.fit(X_train, y_train)
        grids.append(grid)
    return grids

In [25]:
#randomized hyper-parameter search over every candidate model family (not only random forest)
import scipy
from  sklearn.model_selection import RandomizedSearchCV
from sklearn import *
from catboost import CatBoostRegressor,CatBoostClassifier
from sklearn.svm import SVR
from sklearn.linear_model import  ElasticNet
from sklearn.ensemble import  GradientBoostingRegressor, RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
#from sklearn.kernel_approximation import Nystroem
#from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import Lasso,Ridge,LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from varie import lognuniform
#from varie import loguniform2
%aimport varie
import varie
from scipy.stats import uniform, randint

# Targets, and bookkeeping columns that must never be used as features.
y_col = ["E", "Eg"]
drop_col = ["id", "dataset"]

# Split the encoded frame back into its two original parts.
df_total_train_eval = enc.loc[df_total["dataset"] == "train"]
df_total_test = enc.loc[df_total["dataset"] == "test"]

# Plain arrays for the estimators: everything except targets and bookkeeping.
X_train = df_total_train_eval.drop(y_col + drop_col, axis="columns").values
X_test = df_total_test.drop(y_col + drop_col, axis="columns").values

# Candidate model families: tag -> (estimator, randomized-search space).
# lognuniform(..., size=k) returns k pre-drawn values; the search then picks
# among them, whereas scipy.stats distributions are sampled on demand.
models = {

    'knn':
        (KNeighborsRegressor(),
         {'n_neighbors': scipy.stats.randint(1, 100)}),

    'svr':
        (SVR(verbose=False, kernel='linear'),
         {'C': lognuniform(low=-4, high=4, base=10, size=100),
          'epsilon': lognuniform(low=-2, high=0, base=10, size=100)}),

    'svr_rbf':
        (SVR(verbose=False, kernel='rbf'),
         {'C': lognuniform(low=-2, high=2, base=10, size=100),
          'gamma': lognuniform(low=-2, high=2, base=10, size=100)}),

    # NOTE(review): max_features='auto' is rejected by recent scikit-learn
    # releases - confirm against the pinned version.
    'rf':
        (ensemble.RandomForestRegressor(verbose=False),
         {'max_depth': scipy.stats.randint(1, 100),
          'n_estimators': scipy.stats.randint(1, 400),
          'max_features': ('log2', 'sqrt', 'auto'),
          'min_samples_split': scipy.stats.randint(2, 5),
          'min_samples_leaf': scipy.stats.randint(1, 5)}),

    'cb':
        (CatBoostRegressor(loss_function='RMSE', eval_metric='RMSE', logging_level='Silent'),
         {'depth': scipy.stats.randint(1, 6),
          'iterations': scipy.stats.randint(100, 2000),
          'learning_rate': lognuniform(low=-2, high=-1, base=10, size=100),
          'l2_leaf_reg': scipy.stats.randint(2, 4)}),

    # The (80, 10) architecture is only a placeholder: the search replaces it
    # with a single hidden layer of sampled width.
    'mlp':
        (MLPRegressor((80, 10), early_stopping=False),
         {'hidden_layer_sizes': scipy.stats.randint(1, 100),
          'alpha': lognuniform(low=-5, high=-1, base=10, size=100)}),

    'gb':
        (GradientBoostingRegressor(n_estimators=100),
         {'learning_rate': lognuniform(low=-3, high=-1, base=10, size=100),
          'n_estimators': scipy.stats.randint(1, 300),
          'max_depth': scipy.stats.randint(1, 5),
          'max_features': ('sqrt', 'log2', 'auto')}),

    'lasso':
        (Lasso(),
         {'alpha': lognuniform(low=-6, high=2, base=10, size=100)}),

    'ridge':
        (Ridge(),
         {'alpha': lognuniform(low=-6, high=2, base=10, size=100)}),

    # l1_ratio is a mixing weight and must stay within [0, 1]; the previous
    # upper exponent of 4 produced values up to 1e4.
    'eln':
        (ElasticNet(),
         {'alpha': lognuniform(low=-6, high=4, base=10, size=100),
          'l1_ratio': lognuniform(low=-6, high=0, base=10, size=100)}),

    # 'colsample_bytree' used to appear twice in this dict; only the last entry
    # ever took effect, so the dead first one (which also reached above 1.0)
    # has been removed.
    'xgb':
        (XGBRegressor(),
         {'max_depth': scipy.stats.randint(1, 100),
          'learning_rate': lognuniform(low=-4, high=-0.5, base=10, size=100),
          'n_estimators': scipy.stats.randint(1, 400),
          'min_child_weight': randint(30, 60),
          'colsample_bytree': uniform(0.6, 0.4),
          'reg_lambda': uniform(1, 2),
          'reg_alpha': uniform(1, 2)}),

    'gbm':
        (LGBMRegressor(objective='regression'),
         {'num_leaves': scipy.stats.randint(1, 200),
          'learning_rate': lognuniform(low=-4, high=-0.5, base=10, size=100),
          'n_estimators': scipy.stats.randint(1, 400)}),

    'adb':
        (AdaBoostRegressor(loss="square"),
         {'learning_rate': lognuniform(low=-4, high=-0.1, base=10, size=10),
          'n_estimators': scipy.stats.randint(1, 400)}),

}
 
    

# Keep searches from earlier runs of this cell so that finished model families
# are not fitted again. Only a missing name means "first run"; any other
# error should surface instead of silently resetting the cache.
try:
    results
except NameError:
    results = {}

# One randomized search per target for every family that has no result yet.
for (tag, model) in models.items():
    if tag not in results:
        print(tag)
        results[tag] = grid_search_fct(model[0], model[1], df_total_train_eval, y_col,
                                       n_iter=10, cv=4, drop_col=drop_col, verbose=1)


    
    #grid=RandomizedSearchCV(model[0],param_distributions=params, n_iter=20,cv=4,verbose=2,scoring="neg_mean_squared_error" )

                        
    #grid.fit(X_train,y_train)
    #grids.append(grid)

ERROR:root:Line magic function `%aimport` not found.


In [10]:
import pickle
#pickle.dump(results, open( "results_100iter.pickle", "wb" ))
#results.pop('cb')

In [41]:
# Best hyper-parameters and cross-validated RMSE for every model family.
# best_score_ is a negated MSE, hence the sign flip before the square root.
for tag, grids in results.items():
    print(tag)
    for grid in grids:
        print(grid.best_params_)
    rmse_first = np.sqrt(-grids[0].best_score_)
    rmse_second = np.sqrt(-grids[1].best_score_)
    # mean over both targets, then each target on its own
    print((rmse_first + rmse_second)/2, rmse_first, rmse_second)

knn
{'n_neighbors': 10}
{'n_neighbors': 8}
0.0844665177528 0.0583265454341 0.110606490071
svr
{'epsilon': 0.019614171120463306, 'C': 0.35193348838838973}
{'epsilon': 0.019614171120463306, 'C': 3.5803832611521806}
0.0967069025043 0.0556463496366 0.137767455372
rf
{'max_depth': 44, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 378}
{'max_depth': 92, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 397}
0.062052700501 0.0330543963934 0.0910510046087
cb
{'depth': 5, 'iterations': 1266, 'l2_leaf_reg': 2, 'learning_rate': 0.017966319176400233}
{'depth': 5, 'iterations': 1283, 'l2_leaf_reg': 3, 'learning_rate': 0.022438886398284608}
0.0593270482992 0.0318770563238 0.0867770402745
mlp
{'alpha': 7.4480758285059003e-05, 'hidden_layer_sizes': 74}
{'alpha': 0.00019138373871388064, 'hidden_layer_sizes': 36}
0.181769505703 0.141626186076 0.221912825331
gb
{'learning_rate': 0.083379899825578627, 'max_depth': 3, 'max_featur

In [None]:
#Stacking via mlxtend
# StackingRegressor lives in the mlxtend.regressor sub-package; importing it
# from the package root fails.
from mlxtend.regressor import StackingRegressor

# One random-forest meta learner per target.
rf1 = ensemble.RandomForestRegressor(verbose=False)
rf2 = ensemble.RandomForestRegressor(verbose=False)

# Only the meta learner is tuned. mlxtend exposes its parameters under the
# 'meta-<lowercased class name>__' prefix.
params_meta = {'meta-randomforestregressor__max_depth': scipy.stats.randint(1, 100),
               'meta-randomforestregressor__n_estimators': scipy.stats.randint(1, 400),
               'meta-randomforestregressor__max_features': ('log2', 'sqrt', 'auto'),
               'meta-randomforestregressor__min_samples_split': scipy.stats.randint(2, 5),
               'meta-randomforestregressor__min_samples_leaf': scipy.stats.randint(1, 5)}

# Best tuned estimator of every model family, one list per target
# (index 0 -> first entry of y_col, index 1 -> second).
learners1 = [g[0].best_estimator_ for g in results.values()]
learners2 = [g[1].best_estimator_ for g in results.values()]
learners = [learners1, learners2]

stregr = [StackingRegressor(regressors=learners1, meta_regressor=rf1),
          StackingRegressor(regressors=learners2, meta_regressor=rf2)]

# Randomized search over the meta learner's space, once per target.
results2 = []
for s, y in zip(stregr, y_col):
    print(y)
    y_train = df_total_train_eval[y]
    grid = RandomizedSearchCV(s, param_distributions=params_meta, n_iter=10, cv=5,
                              verbose=10, scoring="neg_mean_squared_error")
    grid.fit(X_train, y_train)
    results2.append(grid)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15 




[CV]  meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15, score=-0.001332, total= 1.7min
[CV] meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.7min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15, score=-0.001181, total= 1.7min
[CV] meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.5min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15, score=-0.001291, total= 1.7min
[CV] meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.2min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15, score=-0.001137, total= 1.7min
[CV] meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  6.9min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=67, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=2, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=15, score=-0.001343, total= 1.7min
[CV] meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.6min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233, score=-0.001341, total= 1.9min
[CV] meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 10.5min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233, score=-0.001168, total= 1.8min
[CV] meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 12.4min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233, score=-0.001325, total= 1.8min
[CV] meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 14.2min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233, score=-0.001165, total= 1.9min
[CV] meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 16.1min remaining:    0.0s


[CV]  meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=auto, meta-randomforestregressor__min_samples_leaf=3, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=233, score=-0.001355, total= 2.2min
[CV] meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=sqrt, meta-randomforestregressor__min_samples_leaf=1, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=225 
[CV]  meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=sqrt, meta-randomforestregressor__min_samples_leaf=1, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_estimators=225, score=-0.001330, total= 1.8min
[CV] meta-randomforestregressor__max_depth=80, meta-randomforestregressor__max_features=sqrt, meta-randomforestregressor__min_samples_leaf=1, meta-randomforestregressor__min_samples_split=4, meta-randomforestregressor__n_esti

[CV]  meta-randomforestregressor__max_depth=73, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=4, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=45, score=-0.001069, total= 2.6min
[CV] meta-randomforestregressor__max_depth=73, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=4, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=45 
[CV]  meta-randomforestregressor__max_depth=73, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=4, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimators=45, score=-0.001268, total= 2.4min
[CV] meta-randomforestregressor__max_depth=73, meta-randomforestregressor__max_features=log2, meta-randomforestregressor__min_samples_leaf=4, meta-randomforestregressor__min_samples_split=3, meta-randomforestregressor__n_estimat

In [None]:
# Scratch cell.
# NOTE(review): get_params is called on the class with the class itself passed
# as `self`, not on an instance - presumably to list tunable parameter names;
# confirm this works, otherwise call it on one of the `stregr` instances.
StackingRegressor.get_params(StackingRegressor)

In [None]:
from mlens.ensemble import SuperLearner
import mlens
from mlens.model_selection import Evaluator
from mlens.metrics import make_scorer
from mlens.metrics import rmse

from mlens.metrics import make_scorer
# RMSE wrapped as a scorer with its sign flipped (greater_is_better=False).
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Best tuned estimator of every model family, one list per target
# (index 0 -> first entry of y_col, index 1 -> second).
learners1 = [searches[0].best_estimator_ for searches in results.values()]
learners2 = [searches[1].best_estimator_ for searches in results.values()]
learners = [learners1, learners2]

from mlens.ensemble import SuperLearner

# One 5-fold SuperLearner per target, with a CatBoost meta learner on top of
# the tuned base learners.
sls = []
for learner, y in zip(learners, y_col):
    print(y)
    y_train = df_total_train_eval[y].values
    print(X_train.shape, y_train.shape)

    sl = SuperLearner(folds=5, verbose=True)

    meta_learner = CatBoostRegressor(
        iterations=1200,
        learning_rate=0.03,
        depth=4,
        loss_function='RMSE',
        eval_metric='RMSE',
        od_type='Iter',
        od_wait=50,
        verbose=False,
    )

    sl.add(learner)
    sl.add_meta(meta_learner)

    # Fit, then report the error on the very data the ensemble was fitted on
    # (an in-sample figure, not a validation score).
    sl.fit(X_train, y_train)
    preds = sl.predict(X_train)
    print(rmse(y_train, preds))
    sls.append(sl)
#    results.append(mlens.metrics.rmse(y_train, ensemble.predict(X_train)),
#                          evl.summary['test_score_mean']['superlearner'],
#                          evl.summary['test_score_std']['superlearner'],
#                          mlens.metrics.rmse(y_test, ensemble.predict(X_test)))

#    print_scores(scores_df, 'mlens')

In [None]:
## from mlens.ensemble import SuperLearner
import mlens
from mlens.model_selection import Evaluator
from mlens.metrics import make_scorer
from mlens.metrics import rmse



# The SuperLearner import at the top of this cell is commented out, so import
# it here to keep the cell runnable on its own.
from mlens.ensemble import SuperLearner
import copy

# Best tuned estimator of every model family, one list per target.
learners1 = [grid[0].best_estimator_ for grid in results.values()]
learners2 = [grid[1].best_estimator_ for grid in results.values()]

# One entry per tried configuration: (depth, iterations, learning_rate, [sl1, sl2]).
grid_sl = []

tries = 3
# Manual random search over the CatBoost meta learner's settings.
for depth, iterations, learning_rate in zip(scipy.stats.randint(1, 5).rvs(tries),
                                            scipy.stats.randint(1000, 2000).rvs(tries),
                                            lognuniform(low=-2, high=-1, base=10, size=tries)):
    print(depth, iterations, learning_rate)

    # Identical, independent meta learners for the two targets.
    meta_learner1 = CatBoostRegressor(iterations=iterations,
                                      learning_rate=learning_rate,
                                      depth=depth,
                                      loss_function='RMSE',
                                      eval_metric='RMSE',
                                      od_type='Iter',
                                      od_wait=50, verbose=False)
    meta_learner2 = copy.deepcopy(meta_learner1)

    # Instantiate the ensembles with 5 folds
    sl1 = SuperLearner(folds=5, verbose=True, scorer=mlens.metrics.rmse)
    sl2 = SuperLearner(folds=5, verbose=True, scorer=mlens.metrics.rmse)

    # Add the base learners and the meta learner
    sl1.add(learners1)
    sl1.add_meta(meta_learner1)
    sl2.add(learners2)
    sl2.add_meta(meta_learner2)

    sls = [sl1, sl2]

    for i, y in enumerate(y_col):
        print(y)
        y_train = df_total_train_eval[y].values

        # Train the ensemble and report its in-sample RMSE
        sls[i].fit(X_train, y_train)
        preds = sls[i].predict(X_train)
        print(rmse(y_train, preds))

    # list.append takes exactly one argument, so the configuration is stored as
    # a tuple - once per configuration, after both targets have been fitted.
    grid_sl.append((depth, iterations, learning_rate, sls))
        
        
    #    results.append(mlens.metrics.rmse(y_train, ensemble.predict(X_train)),
    #                          evl.summary['test_score_mean']['superlearner'],
    #                          evl.summary['test_score_std']['superlearner'],
    #                          mlens.metrics.rmse(y_test, ensemble.predict(X_test)))

    #    print_scores(scores_df, 'mlens')

In [None]:
# Tune the meta learners on top of a fixed layer of base learners: the base
# layer is wrapped in a SuperLearner with model_selection=True and handed to
# the Evaluator as the preprocessing case named 'meta'.
# NOTE(review): base_learners, scorer, meta_learners and param_dicts are only
# assigned in cells further down the notebook, so this cell fails on a fresh
# top-to-bottom run; y_train is whatever the last executed loop left behind.
in_layer = SuperLearner(model_selection=True)
in_layer.add(base_learners)

preprocess = [in_layer]

evl = Evaluator(
    scorer,
    cv=2,
    verbose=5,
)

evl.fit(
    X_train, y_train,
    meta_learners,
    param_dicts,
    preprocessing={'meta': preprocess},
    n_iter=5                           # bump this up to do a larger grid search
)


In [None]:
#write to csv
# Reload the local helper module on every execution so that edits to it are
# picked up without restarting the kernel.
%load_ext autoreload
%aimport varie
%autoreload 2
#I use a different model for E and Eg
# sl1 / sl2 are the ensembles left over from the last iteration of the
# meta-learner search cell.
# NOTE(review): both targets were stored as log1p(y) when the training file was
# loaded - confirm that make_csv2 maps predictions back (np.expm1) before
# writing; fit=False presumably skips refitting the passed models - verify.
varie.make_csv2(df_total_train_eval,pd.DataFrame(),df_total_test,
#         (ensemble.RandomForestRegressor(max_depth= 11, max_features='log2', n_estimators= 55),
#          ensemble.RandomForestRegressor(max_depth= 9, max_features='sqrt', n_estimators= 220)),
            (sl1,sl2),
         y_col,'sl3.csv',drop=drop_col,columns=['id','E','Eg'],
         new_column_names=['id','formation_energy_ev_natom' ,'bandgap_energy_ev'],change_col_names=True,fit=False)

In [None]:
# Refit the current pair of SuperLearners (one per target) and report the
# in-sample RMSE of each.
#
# NOTE(review): the previous version of this cell could not run. It ended in a
# dangling `for ...:` / `return grids` fragment (a SyntaxError at notebook
# level) and launched a RandomizedSearchCV with n_iter, cv and verbose, which
# exist only as parameters of grid_search_fct. That pasted fragment has been
# removed; call grid_search_fct(...) if a search is wanted here.
grids_sl = []  # kept so later references to the name do not fail; stays empty
for i, y in enumerate(y_col):
    print(y)
    y_train = df_total_train_eval[y].values
    print(X_train.shape, y_train.shape)

    sls[i].fit(X_train, y_train)
    preds = sls[i].predict(X_train)
    print(rmse(y_train, preds))

In [None]:
# Scratch cell.
# NOTE(review): every other Evaluator in this notebook receives a scorer as its
# first argument - confirm that constructing one without a scorer is valid.
# y_train and sl are whatever the previously executed loops left bound.
evaluator = Evaluator()
evaluator.fit(X_train, y_train,sl)

In [None]:
models

In [None]:
# Split the registry into the two shapes used by the evaluator cell:
# (name, estimator) pairs and a name -> search-space mapping.
ests = [(name, estimator) for name, (estimator, _) in models.items()]
params = {name: space for name, (_, space) in models.items()}

In [None]:
from mlens.model_selection import Evaluator

from scipy.stats import randint

# Here we name the estimators ourselves
#ests = [('gnb', GaussianNB()), ('knn', KNeighborsClassifier())]

# Now we map parameters to these
# The gnb doesn't have any parameters so we can skip it
#pars = {'n_neighbors': randint(2, 20)}
#params = {'knn': pars}

# One 10-fold mlens Evaluator per target, each drawing 5 parameter settings
# for every estimator in `ests` from its entry in `params`.
# The loop variables (y, y_train, evaluator) stay bound at notebook scope
# afterwards and are picked up by later cells.
evaluators=[]
for i,y in enumerate(y_col):
    print(y)
    y_train=df_total_train_eval[y].values
    print(X_train.shape,y_train.shape)
    evaluator = Evaluator(rmse_scorer, cv=10,  verbose=1)

    
    evaluator.fit(X_train,y_train, ests, params, n_iter=5)
    evaluators.append(evaluator)

In [None]:
from mlens.metrics import make_scorer
# Same scorer as the one built in the SuperLearner cell. Extra keyword
# arguments to make_scorer are forwarded to the metric, and rmse(y, p) takes
# none, so the stray average='micro' has been dropped.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [None]:
# Base layer built from the model registry: (name, estimator) pairs plus the
# matching name -> search-space mapping.
base_learners = [(name, estimator) for name, (estimator, _) in models.items()]
param_dicts_base = {name: space for name, (_, space) in models.items()}
len(param_dicts_base), len(base_learners)

In [None]:
from scipy.stats import uniform, randint
SEED=1
# We consider the following models (or base learners)
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in recent releases - confirm against the pinned version.
gb = XGBRegressor()
ls = Lasso(alpha=1e-6, normalize=True)
el = ElasticNet(alpha=1e-6, normalize=True)
rf = RandomForestRegressor(random_state=SEED)

base_learners = [
    ('ls', ls), ('el', el), ('rf', rf), ('gb', gb)
]

# Put their parameter dictionaries in a dictionary with the
# estimator names as keys.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dicts_base = {
    'ls':
    {'alpha': uniform(1e-6, 1e-5)},
    'el':
    {'alpha': uniform(1e-6, 1e-5),
     'l1_ratio': uniform(0, 1)
    },
    # 'colsample_bytree' used to be listed twice here; only the last entry took
    # effect, so the dead first one (which also reached above 1.0) is removed.
    'gb':
    {'learning_rate': uniform(0.02, 0.04),
     'min_child_weight': randint(30, 60),
     'max_depth': randint(3, 7),
     'subsample': uniform(0.4, 0.2),
     'n_estimators': randint(150, 200),
     'colsample_bytree': uniform(0.6, 0.4),
     'reg_lambda': uniform(1, 2),
     'reg_alpha': uniform(1, 2),
    },
    'rf':
    {'max_depth': randint(2, 5),
     'min_samples_split': randint(5, 20),
     'min_samples_leaf': randint(10, 20),
     'n_estimators': randint(50, 100),
     'max_features': uniform(0.6, 0.3)
    }
}


In [None]:
from sklearn.preprocessing import StandardScaler
from mlens.model_selection import Evaluator
# Every base learner needs a matching search space.
assert len(base_learners) == len(param_dicts_base)
from sklearn.metrics import mean_absolute_error

# Mean absolute error wrapped as a scorer with greater_is_better=False.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

evl = Evaluator(scorer, cv=2, random_state=SEED, verbose=5)

# Each base learner is evaluated under two preprocessing cases:
# standardized inputs ('sc') and untouched inputs ('none').
evl.fit(
    X_train,
    y_train,
    estimators=base_learners,
    param_dicts=param_dicts_base,
    preprocessing={'sc': [StandardScaler()], 'none': []},
    n_iter=2,  # bump this up to do a larger grid search
)




In [None]:
pd.DataFrame(evl.results)

In [None]:
# Candidate meta learners: the XGBoost and ElasticNet instances created for
# the base layer are reused here.
meta_learners = [('gb', gb), ('el', el)]

# Search space per meta learner, keyed by the names used above.
param_dicts = {
    'el': {
        'alpha': uniform(1e-5, 1),
        'l1_ratio': uniform(0, 1),
    },
    'gb': {
        'learning_rate': uniform(0.01, 0.2),
        'subsample': uniform(0.5, 0.5),
        'reg_lambda': uniform(0.1, 1),
        'n_estimators': randint(10, 100),
    },
}


# Put the layers you don't want to tune into an ensemble with model selection turned on
# Just remember to turn it off when you're done!





In [None]:
len(meta_learners), len(param_dicts)

In [None]:
# Evaluate the meta learners with the Evaluator `evl` created in an earlier cell.
# The base layer is still wrapped in a SuperLearner (model_selection=True), but
# with the `preprocessing` argument commented out below, `in_layer` and
# `preprocess` are built and never used: the meta learners are fitted directly
# on X_train.
in_layer = SuperLearner(model_selection=True)
in_layer.add(base_learners)

preprocess = [in_layer]

evl.fit(
    X_train, y_train,
    meta_learners,
    param_dicts,
#    preprocessing={'meta': preprocess},
    n_iter=5                           # bump this up to do a larger grid search
)



In [None]:
pd.DataFrame(evl.results)

In [None]:
def lognuniform(low=0, high=1, size=None, base=np.exp(1)):
    """Draw values that are uniform on a logarithmic scale.

    An exponent is sampled uniformly between ``low`` and ``high`` with NumPy's
    global random state, and ``base`` is raised to it.

    Args:
        low (float): smallest exponent.
        high (float): largest exponent.
        size (int or tuple, optional): shape of the sample; ``None`` draws a
            single value.
        base (float): base of the power; defaults to e.

    Returns:
        ``base ** exponent`` for every drawn exponent.
    """
    exponents = np.random.uniform(low, high, size)
    return np.power(base, exponents)

In [None]:
uniform.rvs?

In [None]:
np.random.uniform?

In [None]:
scipy.stats.uniform?

In [None]:
# Scratch cell: symbolic random variables with SymPy, unrelated to the
# modelling cells.
# NOTE(review): Symbol, Interval and cos are not imported explicitly in this
# notebook - presumably `from sympy import Symbol, Interval, cos` is needed;
# confirm. Also confirm that `Var` is exported by the installed sympy.stats.
from sympy.stats import *
x = Symbol('x')
# Continuous random variable on [0, 1] with density 2*x.
X = ContinuousRV(x, 2*x, Interval(0, 1))

P(X>.5) 

Var(X) # variance

E(2*cos(X)+X**2) # complex expressions are ok too
