In [None]:
import pandas as pd
import numpy as np
import NUTILS as nutils
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
# Read the labelled training table and the unlabelled table we must predict on.
df_t, df_s = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/agi/Desktop/NOMAD/ensembling/train_full.csv',
        '/home/agi/Desktop/NOMAD/ensembling/test_full.csv',
    )
)

# Feature matrices are produced by the project helpers in NUTILS
# (one helper for the training frame, a separate one for the submission frame).
X_train, X_submit = nutils.drop_features(df_t), nutils.drop_features_s(df_s)

# The two regression targets, taken straight from the training table.
y_form, y_band = (
    df_t[target]
    for target in ("formation_energy_ev_natom", "bandgap_energy_ev")
)

In [None]:
# LightGBM models
# One bagged LightGBM regressor per target (formation energy, band gap),
# each evaluated with 10-fold cross-validation on nutils.rmsle.

# Base settings, used as-is for the formation-energy model.
params1 = {
    'num_leaves': 7,
    'objective': 'regression',
    'min_data_in_leaf': 18,
    'learning_rate': 0.04,
    'feature_fraction': 0.93,
    'bagging_fraction': 0.93,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 1,
    'verbose': 0
}

# The band-gap model differs only in tree size, so derive it instead of
# repeating every key.
params2 = dict(params1, num_leaves=8)

# Number of boosting iterations for each target.
boost_rounds_form = 230
boost_rounds_band = 721

# `n_estimators` is the scikit-learn wrapper's own name for the iteration
# count; `num_boost_round` is only an alias there and makes LightGBM emit an
# alias warning on every one of the many fits triggered below.
lgbm_f = LGBMRegressor(**params1, n_estimators=boost_rounds_form)
lgbm_b = LGBMRegressor(**params2, n_estimators=boost_rounds_band)

# The wrapped model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, but it is
# the first parameter under both names.
# random_state pins the bootstrap samples so scores are repeatable.
lgbm_bag1 = BaggingRegressor(lgbm_f, n_estimators=30, random_state=7)
lgbm_bag2 = BaggingRegressor(lgbm_b, n_estimators=30, random_state=7)

# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if a seed is given without shuffling.
k_fold = KFold(n_splits=10, shuffle=True, random_state=7)

# greater_is_better=False makes the scorer return the negated error, which is
# why the scores are negated again when they are combined below.
objective = make_scorer(nutils.rmsle, greater_is_better=False)

scores_form = cross_val_score(lgbm_bag1, X_train, y_form, scoring=objective, cv=k_fold)
scores_band = cross_val_score(lgbm_bag2, X_train, y_band, scoring=objective, cv=k_fold)

# Per-fold error summed over both targets; halved on display to report the
# mean over the two targets. (The values come from nutils.rmsle despite the
# variable name.)
rmse_scores = -scores_form - scores_band
nutils.display_scores(rmse_scores / 2)

In [None]:
# Refit both bagged LightGBM ensembles on the full training set, predict the
# submission rows, and write the predictions to a CSV file.
lgbm_bag1.fit(X_train, y_form)
print('fitted formation')
lgbm_bag2.fit(X_train, y_band)
print('fitted band')

submit_pred_form = lgbm_bag1.predict(X_submit)
print('submi form')
submit_pred_band = lgbm_bag2.predict(X_submit)
print('submit band')

# Build submission .csv
# reshape(-1, 1) infers the row count from the predictions instead of
# hard-coding 600, so the cell works for a submission set of any size.
submission = np.concatenate(
    (submit_pred_form.reshape(-1, 1), submit_pred_band.reshape(-1, 1)),
    axis=1,
)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])

# Negative predictions are replaced by zero; this is done before the id
# column is added so only the two prediction columns are touched.
submit_df[submit_df < 0] = 0

# Ids are 1-based and sequential, one per predicted row.
submit_df.insert(0, 'id', range(1, len(submit_df) + 1))

# Save to file
submit_df.to_csv("/home/agi/Desktop/NOMAD/ensembling/subs/bag_1.csv", index=False)

In [None]:
# XGBoost models
# One bagged XGBoost regressor per target (formation energy, band gap),
# each evaluated with 10-fold cross-validation on nutils.rmsle.

# Settings for the formation-energy model.
params_f = {
    'max_depth':6,
    'learning_rate':0.05,
    'n_estimators':125,
    'min_child_weight':20,
    'colsample_bytree':0.7,
    'colsample_bylevel':0.8,
    'reg_lambda':5,
    'subsample':0.8
}

# Settings for the band-gap model.
params_b = {
    'max_depth':4,
    'learning_rate':0.07,
    'n_estimators':725,
    'min_child_weight':20,
    'colsample_bytree':0.7,
    'colsample_bylevel':0.9,
    'reg_lambda':5,
    'subsample':0.8
}

xgb_f = XGBRegressor(**params_f)
xgb_b = XGBRegressor(**params_b)

# The wrapped model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, but it is
# the first parameter under both names.
# random_state pins the bootstrap samples so scores are repeatable.
xgb_bag1 = BaggingRegressor(xgb_f, n_estimators=30, random_state=7)
xgb_bag2 = BaggingRegressor(xgb_b, n_estimators=30, random_state=7)

# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if a seed is given without shuffling.
k_fold = KFold(n_splits=10, shuffle=True, random_state=7)

# greater_is_better=False makes the scorer return the negated error, which is
# why the scores are negated again when they are combined below.
objective = make_scorer(nutils.rmsle, greater_is_better=False)

scores_form = cross_val_score(xgb_bag1, X_train, y_form, scoring=objective, cv=k_fold)
scores_band = cross_val_score(xgb_bag2, X_train, y_band, scoring=objective, cv=k_fold)

# Per-fold error summed over both targets; halved on display to report the
# mean over the two targets. (The values come from nutils.rmsle despite the
# variable name.)
rmse_scores = -scores_form - scores_band
nutils.display_scores(rmse_scores / 2)

In [None]:
# Refit both bagged XGBoost ensembles on the full training set, predict the
# submission rows, and write the predictions to a CSV file.
xgb_bag1.fit(X_train, y_form)
print('fitted formation')
xgb_bag2.fit(X_train, y_band)
print('fitted band')

submit_pred_form = xgb_bag1.predict(X_submit)
print('submi form')
submit_pred_band = xgb_bag2.predict(X_submit)
print('submit band')

# Build submission .csv
# reshape(-1, 1) infers the row count from the predictions instead of
# hard-coding 600, so the cell works for a submission set of any size.
submission = np.concatenate(
    (submit_pred_form.reshape(-1, 1), submit_pred_band.reshape(-1, 1)),
    axis=1,
)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])

# Negative predictions are replaced by zero; this is done before the id
# column is added so only the two prediction columns are touched.
submit_df[submit_df < 0] = 0

# Ids are 1-based and sequential, one per predicted row.
submit_df.insert(0, 'id', range(1, len(submit_df) + 1))

# Save to file
submit_df.to_csv("/home/agi/Desktop/NOMAD/ensembling/subs/bag_2.csv", index=False)

In [None]:
# CatBoost models: both targets share one settings dict, which only sets the
# logging level.
params = dict(logging_level='Silent')

# Two independent regressors built from the same settings
# (cb_f / cb_b follow the formation / band-gap naming used for the other models).
cb_f = CatBoostRegressor(**params)
cb_b = CatBoostRegressor(**params)