In [None]:
import pandas as pd
import numpy as np
import NUTILS as nutils
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import BaggingRegressor

In [None]:
# Read the training and submission tables.
# NOTE(review): both paths are absolute and specific to one machine.
train_csv = '/home/agi/Desktop/NOMAD/ensembling/train_full.csv'
test_csv = '/home/agi/Desktop/NOMAD/ensembling/test_full.csv'
df_t, df_s = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Feature matrices are produced by the project helpers in NUTILS.
X_train = nutils.drop_features(df_t)
X_submit = nutils.drop_features_s(df_s)

# One target Series per predicted property, both taken from the training table.
y_form, y_band = (
    df_t[target]
    for target in ("formation_energy_ev_natom", "bandgap_energy_ev")
)

In [None]:
# Pool of level-0 regressors; it starts empty and later cells add models to it.
base_models = list()

In [None]:
# LightGBM models
# The two configurations are identical except for the number of leaves,
# so the second one is derived from the first.
params1 = dict(
    num_leaves=7,
    objective='regression',
    min_data_in_leaf=18,
    learning_rate=0.04,
    feature_fraction=0.93,
    bagging_fraction=0.93,
    bagging_freq=1,
    metric='l2',
    num_threads=1,
    verbose=0,
)
params2 = {**params1, 'num_leaves': 8}

# Number of boosting rounds used by the first and second LightGBM model.
boost_rounds_form, boost_rounds_band = 230, 721

# Register the two LightGBM regressors in the shared model pool.
# The tree count is passed as `n_estimators`, the scikit-learn wrapper's own
# constructor parameter, instead of the `num_boost_round` alias forwarded
# through **kwargs. This way the estimator does not carry two competing
# settings for the number of boosting rounds.
base_models.append(LGBMRegressor(**params1, n_estimators=boost_rounds_form))
base_models.append(LGBMRegressor(**params2, n_estimators=boost_rounds_band))

In [None]:
# XGBoost models
# First configuration: deeper trees, fewer estimators.
params1 = dict(
    max_depth=6,
    learning_rate=0.05,
    n_estimators=125,
    min_child_weight=20,
    colsample_bytree=0.7,
    colsample_bylevel=0.8,
    reg_lambda=5,
    subsample=0.8,
)

# Second configuration: shallower trees, more estimators; the remaining
# settings are inherited from the first configuration.
params2 = {
    **params1,
    'max_depth': 4,
    'learning_rate': 0.07,
    'n_estimators': 725,
    'colsample_bylevel': 0.9,
}

# Add one XGBoost regressor per configuration, keeping the params1/params2 order.
base_models.extend(XGBRegressor(**cfg) for cfg in (params1, params2))

In [None]:
# CatBoost model; the only setting passed is the logging level.
params = dict(logging_level='Silent')

base_models += [CatBoostRegressor(**params)]
# A second, identical CatBoost model is left disabled:
#base_models.append(CatBoostRegressor(**params))

In [None]:
# k-nearest-neighbour regressors with 4 and 20 neighbours, added in that order.
base_models.extend(KNeighborsRegressor(n_neighbors=k) for k in (4, 20))
# Disabled variant:
#base_models.append(KNeighborsRegressor(n_neighbors=100))

In [None]:
# Gradient-boosting regressors with 20 and 200 estimators, added in that order.
base_models.extend(
    GradientBoostingRegressor(n_estimators=n_trees) for n_trees in (20, 200)
)
# Disabled variant:
#base_models.append(GradientBoostingRegressor(n_estimators=1000))

In [None]:
# Random-forest regressors with 20 and 200 trees, added in that order.
base_models.extend(
    RandomForestRegressor(n_estimators=n_trees) for n_trees in (20, 200)
)
# Disabled variant:
#base_models.append(RandomForestRegressor(n_estimators=1000))

In [None]:
# Support-vector regressor with an RBF kernel.
base_models += [SVR(kernel='rbf')]
# Disabled kernels:
#base_models.append(SVR(kernel='poly'))
#base_models.append(SVR(kernel='sigmoid'))

In [None]:
# Extra-trees regressor with 200 trees.
base_models += [ExtraTreesRegressor(n_estimators=200)]
# Disabled variants:
#base_models.append(ExtraTreesRegressor(n_estimators=500))
#base_models.append(ExtraTreesRegressor(n_estimators=1000))

In [None]:
# Ridge regression with default settings closes the pool of base models.
base_models += [Ridge()]

In [None]:
def _stack(regressors, meta_regressor):
    """Return a StackingRegressor over `regressors` combined by `meta_regressor`."""
    return StackingRegressor(regressors=regressors, meta_regressor=meta_regressor)


def _second_level():
    """Return a stack of the three first-level stacks under a 100-estimator bagging meta-regressor."""
    return _stack(
        [model_pre1, model_pre2, model_pre3],
        BaggingRegressor(n_estimators=100),
    )


# First level: the same pool of base models under three different meta-regressors.
model_pre1 = _stack(base_models, Ridge())
model_pre2 = _stack(base_models, KNeighborsRegressor())
model_pre3 = _stack(base_models, CatBoostRegressor())

# Second level: one identically configured stack per target
# (formation energy and band gap).
model_form = _second_level()
model_band = _second_level()

In [None]:
# Cross-validate the two stacked models with the project's RMSLE metric.
#
# shuffle=True is needed for random_state to have any effect: with the default
# shuffle=False, older scikit-learn releases silently ignore the seed and
# recent ones raise ValueError when a random_state is supplied.
k_fold = KFold(n_splits=5, shuffle=True, random_state=7)
objective = make_scorer(nutils.rmsle, greater_is_better=False)

# Encode the features once and reuse the result for both targets.
X_train_enc = nutils.encode(X_train)

scores_form = cross_val_score(model_form, X_train_enc, y_form, scoring=objective, cv=k_fold)
scores_band = cross_val_score(model_band, X_train_enc, y_band, scoring=objective, cv=k_fold)

# greater_is_better=False makes the scorer return negated values, so the signs
# are flipped back here; dividing by 2 gives the per-fold mean over both targets.
rmse_scores = -scores_form - scores_band
nutils.display_scores(rmse_scores / 2)

In [None]:
# Fit the final models on the full training set and write the submission file.
#
# The models are trained and applied on nutils.encode(...) output, the same
# representation the cross-validation cell scores, so the reported CV error
# describes the pipeline that is actually submitted.
# NOTE(review): assumes nutils.encode is a per-frame transform that yields the
# same columns for the training and submission features - confirm in NUTILS.
X_train_enc = nutils.encode(X_train)
X_submit_enc = nutils.encode(X_submit)
if np.shape(X_train_enc)[1] != np.shape(X_submit_enc)[1]:
    raise ValueError(
        'encoded train and submit features have different column counts: '
        '{} vs {}'.format(np.shape(X_train_enc)[1], np.shape(X_submit_enc)[1])
    )

model_form.fit(X_train_enc, y_form)
print('fitted formation')
model_band.fit(X_train_enc, y_band)
print('fitted band')

submit_pred_form = model_form.predict(X_submit_enc)
print('submi form')
submit_pred_band = model_band.predict(X_submit_enc)
print('submit band')

# Build submission .csv
# reshape(-1, 1) derives the row count from the predictions instead of
# hard-coding the size of the submission set.
submission = np.concatenate(
    (submit_pred_form.reshape(-1, 1), submit_pred_band.reshape(-1, 1)),
    axis=1,
)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])
# Negative predictions are replaced by zero.
submit_df[submit_df < 0] = 0
# Ids are 1-based and cover exactly the predicted rows.
submit_df.insert(0, 'id', range(1, len(submit_df) + 1))

# Save to file
# NOTE(review): absolute, machine-specific output path.
submit_df.to_csv("/home/agi/Desktop/NOMAD/ensembling/subs/bag_1.csv", index=False)