In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
columns={'spacegroup' : 'sg',
                            'number_of_total_atoms' : 'Natoms',
                            'percent_atom_al' : 'x_Al',
                            'percent_atom_ga' : 'x_Ga',
                            'percent_atom_in' : 'x_In',
                            'lattice_vector_1_ang' : 'a',
                            'lattice_vector_2_ang' : 'b',
                            'lattice_vector_3_ang' : 'c',
                            'lattice_angle_alpha_degree' : 'alpha',
                            'lattice_angle_beta_degree' : 'beta',
                            'lattice_angle_gamma_degree' : 'gamma',
                            'formation_energy_ev_natom' : 'E',
                            'bandgap_energy_ev' : 'Eg'}
    
    
df_train = pd.read_csv("./input/train.csv").rename(columns=columns)
df_train["dataset"] = "train"
df_train["E"]=np.log1p(df_train["E"])
df_train["Eg"]=np.log1p(df_train["Eg"])
df_test = pd.read_csv("./input/test.csv").rename(columns=columns)
df_test["dataset"] = "test"
df_total = pd.concat([df_train, df_test], ignore_index=True)

len(df_train),len(df_test),len(df_total)

(2400, 600, 3000)

In [3]:
df_total.head()

Unnamed: 0,E,Eg,Natoms,a,alpha,b,beta,c,dataset,gamma,id,sg,x_Al,x_Ga,x_In
0,0.065788,1.490362,80.0,9.9523,90.0026,8.5513,90.0023,9.1775,train,90.0017,1,33,0.625,0.375,0.0
1,0.222343,1.366347,80.0,6.184,90.0186,6.1838,89.998,23.6287,train,120.0025,2,194,0.625,0.375,0.0
2,0.167293,1.320101,40.0,9.751,90.9688,5.6595,91.1228,13.963,train,30.5185,3,227,0.8125,0.1875,0.0
3,0.196553,1.469992,30.0,5.0036,89.9888,5.0034,90.0119,13.5318,train,120.0017,4,167,0.75,0.0,0.25
4,0.049266,0.866806,80.0,6.6614,89.996,6.6612,90.0006,24.5813,train,119.9893,5,194,0.0,0.625,0.375


In [4]:
df_total.tail()

Unnamed: 0,E,Eg,Natoms,a,alpha,b,beta,c,dataset,gamma,id,sg,x_Al,x_Ga,x_In
2995,,,80.0,24.8145,90.0002,6.3964,104.7733,6.2933,test,90.0001,596,12,0.0,0.5938,0.4062
2996,,,40.0,5.5783,90.0008,9.4849,89.9967,10.1107,test,90.0004,597,33,0.125,0.0,0.875
2997,,,80.0,6.9377,90.0072,6.9372,89.988,25.0641,test,119.9857,598,194,0.0,0.25,0.75
2998,,,40.0,5.1841,90.0041,8.8659,90.0009,9.4956,test,90.0007,599,33,0.625,0.0,0.375
2999,,,80.0,9.4959,90.0029,9.4956,90.0031,9.4956,test,89.9969,600,206,0.375,0.3438,0.2812


In [5]:
def get_vol(a, b, c, alpha, beta, gamma):
    """
    Args:
        a (float) - lattice vector 1
        b (float) - lattice vector 2
        c (float) - lattice vector 3
        alpha (float) - lattice angle 1 [radians]
        beta (float) - lattice angle 2 [radians]
        gamma (float) - lattice angle 3 [radians]
    Returns:
        volume (float) of the parallelepiped unit cell
    """
    alpha=alpha*np.pi/180
    beta=beta*np.pi/180
    gamma=gamma*np.pi/180
    return a*b*c*np.sqrt(1 + 2*np.cos(alpha)*np.cos(beta)*np.cos(gamma)
                           - np.cos(alpha)**2
                           - np.cos(beta)**2
                           - np.cos(gamma)**2)


    
# compute the cell volumes 
df_total['vol'] = get_vol(df_total['a'], df_total['b'], df_total['c'],
                          df_total['alpha'], df_total['beta'], df_total['gamma'])
#df_total[['a','b','c','alpha','beta','gamma','vol']].head()
df_total['density']=df_total['Natoms']/df_total["vol"]
df_total['sg']=df_total['sg'].astype('category')

In [6]:
df_total.head()

Unnamed: 0,E,Eg,Natoms,a,alpha,b,beta,c,dataset,gamma,id,sg,x_Al,x_Ga,x_In,vol,density
0,0.065788,1.490362,80.0,9.9523,90.0026,8.5513,90.0023,9.1775,train,90.0017,1,33,0.625,0.375,0.0,781.052081,0.102426
1,0.222343,1.366347,80.0,6.184,90.0186,6.1838,89.998,23.6287,train,120.0025,2,194,0.625,0.375,0.0,782.50011,0.102236
2,0.167293,1.320101,40.0,9.751,90.9688,5.6595,91.1228,13.963,train,30.5185,3,227,0.8125,0.1875,0.0,391.227531,0.102242
3,0.196553,1.469992,30.0,5.0036,89.9888,5.0034,90.0119,13.5318,train,120.0017,4,167,0.75,0.0,0.25,293.377334,0.102257
4,0.049266,0.866806,80.0,6.6614,89.996,6.6612,90.0006,24.5813,train,119.9893,5,194,0.0,0.625,0.375,944.713843,0.084682


In [27]:
import sys 
sys.path.append("../kaggle_varie")
from  varie import *
cols_to_enc=["sg"]

#binary encoder
#enc=bin_enc(df_total,cols_to_enc,verbose=2,copy=True,drop_original=True,ordinal_only=False)
#one-hot encoder
enc=pd.get_dummies(df_total,columns=cols_to_enc)

In [67]:
import scipy
from  sklearn.model_selection import RandomizedSearchCV
from sklearn import *

y_col=["E","Eg"]
drop_col=["id","dataset"]
df_total_train_eval=enc[df_total.dataset=='train']
df_total_test=enc[df_total.dataset=='test']

X_train=df_total_train_eval.drop(y_col+drop_col,axis=1).values
X_test=df_total_test.drop(y_col+drop_col,axis=1).values
grids=[]
for y in y_col:
    print(y)
    y_train=df_total_train_eval[y].values
    y_test=df_total_test[y].values
    print(X_train.shape,y_train.shape)
    
    grid=RandomizedSearchCV(ensemble.RandomForestRegressor(), 
                                        param_distributions=
                        {"max_depth": scipy.stats.randint(1,100), 
                         'n_estimators': scipy.stats.randint(1,300),
                        'max_features':('log2','sqrt','auto')},
                                        n_iter=50,cv=4,verbose=2,scoring="neg_mean_squared_error" )
    grid.fit(X_train,y_train)
    grids.append(grid)

E
(2400, 18) (2400,)
Fitting 4 folds for each of 50 candidates, totalling 200 fits
[CV] n_estimators=223, max_features=log2, max_depth=2 ................
[CV] . n_estimators=223, max_features=log2, max_depth=2, total=   0.3s
[CV] n_estimators=223, max_features=log2, max_depth=2 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] . n_estimators=223, max_features=log2, max_depth=2, total=   0.3s
[CV] n_estimators=223, max_features=log2, max_depth=2 ................
[CV] . n_estimators=223, max_features=log2, max_depth=2, total=   0.2s
[CV] n_estimators=223, max_features=log2, max_depth=2 ................
[CV] . n_estimators=223, max_features=log2, max_depth=2, total=   0.3s
[CV] n_estimators=257, max_features=log2, max_depth=68 ...............
[CV]  n_estimators=257, max_features=log2, max_depth=68, total=   0.8s
[CV] n_estimators=257, max_features=log2, max_depth=68 ...............
[CV]  n_estimators=257, max_features=log2, max_depth=68, total=   0.7s
[CV] n_estimators=257, max_features=log2, max_depth=68 ...............
[CV]  n_estimators=257, max_features=log2, max_depth=68, total=   0.7s
[CV] n_estimators=257, max_features=log2, max_depth=68 ...............
[CV]  n_estimators=257, max_features=log2, max_depth=68, total=   0.7s
[CV] n_estimators=263, max_features=auto, max_depth=80 ...............
[CV]  

[CV]  n_estimators=172, max_features=log2, max_depth=57, total=   0.5s
[CV] n_estimators=185, max_features=log2, max_depth=44 ...............
[CV]  n_estimators=185, max_features=log2, max_depth=44, total=   0.5s
[CV] n_estimators=185, max_features=log2, max_depth=44 ...............
[CV]  n_estimators=185, max_features=log2, max_depth=44, total=   0.5s
[CV] n_estimators=185, max_features=log2, max_depth=44 ...............
[CV]  n_estimators=185, max_features=log2, max_depth=44, total=   0.5s
[CV] n_estimators=185, max_features=log2, max_depth=44 ...............
[CV]  n_estimators=185, max_features=log2, max_depth=44, total=   0.5s
[CV] n_estimators=228, max_features=sqrt, max_depth=53 ...............
[CV]  n_estimators=228, max_features=sqrt, max_depth=53, total=   0.7s
[CV] n_estimators=228, max_features=sqrt, max_depth=53 ...............
[CV]  n_estimators=228, max_features=sqrt, max_depth=53, total=   0.7s
[CV] n_estimators=228, max_features=sqrt, max_depth=53 ...............
[CV]  

[CV]  n_estimators=190, max_features=sqrt, max_depth=73, total=   0.5s
[CV] n_estimators=190, max_features=sqrt, max_depth=73 ...............
[CV]  n_estimators=190, max_features=sqrt, max_depth=73, total=   0.5s
[CV] n_estimators=190, max_features=sqrt, max_depth=73 ...............
[CV]  n_estimators=190, max_features=sqrt, max_depth=73, total=   0.5s
[CV] n_estimators=116, max_features=sqrt, max_depth=23 ...............
[CV]  n_estimators=116, max_features=sqrt, max_depth=23, total=   0.3s
[CV] n_estimators=116, max_features=sqrt, max_depth=23 ...............
[CV]  n_estimators=116, max_features=sqrt, max_depth=23, total=   0.3s
[CV] n_estimators=116, max_features=sqrt, max_depth=23 ...............
[CV]  n_estimators=116, max_features=sqrt, max_depth=23, total=   0.3s
[CV] n_estimators=116, max_features=sqrt, max_depth=23 ...............
[CV]  n_estimators=116, max_features=sqrt, max_depth=23, total=   0.3s
[CV] n_estimators=293, max_features=auto, max_depth=57 ...............
[CV]  

[CV]  n_estimators=281, max_features=log2, max_depth=32, total=   0.8s
[CV] n_estimators=32, max_features=log2, max_depth=46 ................
[CV] . n_estimators=32, max_features=log2, max_depth=46, total=   0.1s
[CV] n_estimators=32, max_features=log2, max_depth=46 ................
[CV] . n_estimators=32, max_features=log2, max_depth=46, total=   0.1s
[CV] n_estimators=32, max_features=log2, max_depth=46 ................
[CV] . n_estimators=32, max_features=log2, max_depth=46, total=   0.1s
[CV] n_estimators=32, max_features=log2, max_depth=46 ................
[CV] . n_estimators=32, max_features=log2, max_depth=46, total=   0.1s
[CV] n_estimators=31, max_features=sqrt, max_depth=94 ................
[CV] . n_estimators=31, max_features=sqrt, max_depth=94, total=   0.1s
[CV] n_estimators=31, max_features=sqrt, max_depth=94 ................
[CV] . n_estimators=31, max_features=sqrt, max_depth=94, total=   0.1s
[CV] n_estimators=31, max_features=sqrt, max_depth=94 ................
[CV] .

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  1.9min finished


[CV] . n_estimators=222, max_features=sqrt, max_depth=9, total=   0.5s
[CV] n_estimators=222, max_features=sqrt, max_depth=9 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] . n_estimators=222, max_features=sqrt, max_depth=9, total=   0.5s
[CV] n_estimators=222, max_features=sqrt, max_depth=9 ................
[CV] . n_estimators=222, max_features=sqrt, max_depth=9, total=   0.5s
[CV] n_estimators=222, max_features=sqrt, max_depth=9 ................
[CV] . n_estimators=222, max_features=sqrt, max_depth=9, total=   0.5s
[CV] n_estimators=218, max_features=sqrt, max_depth=62 ...............
[CV]  n_estimators=218, max_features=sqrt, max_depth=62, total=   0.6s
[CV] n_estimators=218, max_features=sqrt, max_depth=62 ...............
[CV]  n_estimators=218, max_features=sqrt, max_depth=62, total=   0.6s
[CV] n_estimators=218, max_features=sqrt, max_depth=62 ...............
[CV]  n_estimators=218, max_features=sqrt, max_depth=62, total=   0.6s
[CV] n_estimators=218, max_features=sqrt, max_depth=62 ...............
[CV]  n_estimators=218, max_features=sqrt, max_depth=62, total=   0.6s
[CV] n_estimators=261, max_features=log2, max_depth=38 ...............
[CV]  

[CV]  n_estimators=265, max_features=auto, max_depth=72, total=   1.9s
[CV] n_estimators=170, max_features=sqrt, max_depth=12 ...............
[CV]  n_estimators=170, max_features=sqrt, max_depth=12, total=   0.4s
[CV] n_estimators=170, max_features=sqrt, max_depth=12 ...............
[CV]  n_estimators=170, max_features=sqrt, max_depth=12, total=   0.4s
[CV] n_estimators=170, max_features=sqrt, max_depth=12 ...............
[CV]  n_estimators=170, max_features=sqrt, max_depth=12, total=   0.4s
[CV] n_estimators=170, max_features=sqrt, max_depth=12 ...............
[CV]  n_estimators=170, max_features=sqrt, max_depth=12, total=   0.4s
[CV] n_estimators=98, max_features=auto, max_depth=75 ................
[CV] . n_estimators=98, max_features=auto, max_depth=75, total=   0.7s
[CV] n_estimators=98, max_features=auto, max_depth=75 ................
[CV] . n_estimators=98, max_features=auto, max_depth=75, total=   0.7s
[CV] n_estimators=98, max_features=auto, max_depth=75 ................
[CV] .

[CV] . n_estimators=82, max_features=log2, max_depth=62, total=   0.2s
[CV] n_estimators=82, max_features=log2, max_depth=62 ................
[CV] . n_estimators=82, max_features=log2, max_depth=62, total=   0.2s
[CV] n_estimators=82, max_features=log2, max_depth=62 ................
[CV] . n_estimators=82, max_features=log2, max_depth=62, total=   0.2s
[CV] n_estimators=180, max_features=log2, max_depth=36 ...............
[CV]  n_estimators=180, max_features=log2, max_depth=36, total=   0.5s
[CV] n_estimators=180, max_features=log2, max_depth=36 ...............
[CV]  n_estimators=180, max_features=log2, max_depth=36, total=   0.5s
[CV] n_estimators=180, max_features=log2, max_depth=36 ...............
[CV]  n_estimators=180, max_features=log2, max_depth=36, total=   0.5s
[CV] n_estimators=180, max_features=log2, max_depth=36 ...............
[CV]  n_estimators=180, max_features=log2, max_depth=36, total=   0.5s
[CV] n_estimators=197, max_features=log2, max_depth=70 ...............
[CV]  

[CV]  n_estimators=211, max_features=sqrt, max_depth=44, total=   0.6s
[CV] n_estimators=270, max_features=sqrt, max_depth=56 ...............
[CV]  n_estimators=270, max_features=sqrt, max_depth=56, total=   0.8s
[CV] n_estimators=270, max_features=sqrt, max_depth=56 ...............
[CV]  n_estimators=270, max_features=sqrt, max_depth=56, total=   0.8s
[CV] n_estimators=270, max_features=sqrt, max_depth=56 ...............
[CV]  n_estimators=270, max_features=sqrt, max_depth=56, total=   0.7s
[CV] n_estimators=270, max_features=sqrt, max_depth=56 ...............
[CV]  n_estimators=270, max_features=sqrt, max_depth=56, total=   0.8s
[CV] n_estimators=58, max_features=sqrt, max_depth=72 ................
[CV] . n_estimators=58, max_features=sqrt, max_depth=72, total=   0.2s
[CV] n_estimators=58, max_features=sqrt, max_depth=72 ................
[CV] . n_estimators=58, max_features=sqrt, max_depth=72, total=   0.2s
[CV] n_estimators=58, max_features=sqrt, max_depth=72 ................
[CV] .

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  1.8min finished


In [68]:
for grid in grids:
    print(grid.best_params_)

{'n_estimators': 55, 'max_features': 'log2', 'max_depth': 11}
{'n_estimators': 222, 'max_features': 'sqrt', 'max_depth': 9}


In [70]:
(np.sqrt(-grids[0].best_score_)+np.sqrt(-grids[1].best_score_))/2

0.06281204914084831

In [75]:
%load_ext autoreload
%aimport varie
%autoreload 2
#I use a different model for E and Eg
varie.make_csv2(df_total_train_eval,pd.DataFrame(),df_total_test,
         (ensemble.RandomForestRegressor(max_depth= 11, max_features='log2', n_estimators= 55),
          ensemble.RandomForestRegressor(max_depth= 9, max_features='sqrt', n_estimators= 220)),
         y_col,'rf2.csv',drop=drop_col,columns=['id','E','Eg'],
         new_column_names=['id','formation_energy_ev_natom' ,'bandgap_energy_ev'],change_col_names=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
E RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=55, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
shapes: (2400, 18) (2400,)
Eg RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=220, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
shapes: (2400, 18) (2400,)
