In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from datetime import datetime 
from skopt import BayesSearchCV
from sklearn.model_selection import KFold

ModuleNotFoundError: No module named 'skopt'

In [None]:
# Utility functions
def load_data(csv_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        csv_path: path (or any other input accepted by ``pd.read_csv``)
            of the CSV data to read.
    Returns:
        pandas.DataFrame with the parsed contents.
    """
    frame = pd.read_csv(csv_path)
    return frame

def rmsle(actual, predicted):
    """Root mean squared logarithmic error between two value sequences.

    Both inputs are converted to flat 1-D float arrays before they are
    compared, so values are always matched by position. Without this,
    two pandas Series with different indices would be aligned by label,
    and an (n, 1) column compared with an (n,) vector would broadcast to
    an (n, n) matrix; both cases silently produce a wrong score.

    Args:
        actual (array-like of n values) - actual values (float)
        predicted (array-like of n values) - predicted values (float)
    Returns:
        root mean square log error (float)
    """
    # Strip any pandas index / extra axis so the subtraction is element-by-position.
    actual_values = np.asarray(actual, dtype=float).ravel()
    predicted_values = np.asarray(predicted, dtype=float).ravel()

    log_difference = np.log1p(actual_values) - np.log1p(predicted_values)
    return np.sqrt(np.mean(np.square(log_difference)))

# Scorer built from `rmsle`; greater_is_better=False declares it a loss (lower is better).
objective = make_scorer(score_func=rmsle, greater_is_better=False)

def drop_features(df_t):
    """Return a copy of ``df_t`` without the identifier, target and spacegroup columns.

    The columns are dropped with the ``columns=`` keyword: passing the axis
    positionally (``df.drop(name, 1)``) is rejected by current pandas releases.

    Args:
        df_t (pandas.DataFrame) - frame containing the columns 'id',
            'formation_energy_ev_natom', 'bandgap_energy_ev' and 'spacegroup'
    Returns:
        new pandas.DataFrame with those four columns removed; ``df_t`` itself
        is left unchanged
    Raises:
        KeyError if any of the four columns is missing from ``df_t``
    """
    non_feature_columns = [
        'id',
        "formation_energy_ev_natom",
        "bandgap_energy_ev",
        "spacegroup",
    ]
    # DataFrame.drop returns a new frame, so no explicit copy is needed first.
    return df_t.drop(columns=non_feature_columns)

def display_scores(scores):
    """Print a short cross-validation report for ``scores``.

    Writes a header line, the scores themselves, and then the values
    returned by ``scores.mean()`` and ``scores.std()``.

    Args:
        scores - object exposing ``mean()`` and ``std()`` methods
            (e.g. a NumPy array of per-fold scores)
    """
    print("CV LB: ")
    print("Scores: ", scores)
    # Each statistic is computed right before it is printed.
    for label, statistic in (("Mean: ", "mean"), ("Standard deviation: ", "std")):
        print(label, getattr(scores, statistic)())

In [None]:
# Create train/test datasets
# NOTE: the absolute path is machine-specific (kept from the original notebook);
# point it at your own copy of the prepared training data.
df = load_data('/home/agi/Desktop/NOMAD/data/train_prepared.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=400)

# Feature matrices: full data, training split, held-out split.
X_full = drop_features(df)
X_train = drop_features(train_df)
X_test  = drop_features(test_df)

# Targets for the two regression problems (formation energy, band gap).
y_form_full = df["formation_energy_ev_natom"]
y_band_full = df["bandgap_energy_ev"]

y_form_train = train_df["formation_energy_ev_natom"]
y_band_train = train_df["bandgap_energy_ev"]

y_form_test  = test_df["formation_energy_ev_natom"]
y_band_test  = test_df["bandgap_energy_ev"]

# Search space for the Bayesian optimisation.
# Each LightGBM setting appears once: 'bagging_fraction' / 'feature_fraction'
# are aliases of 'subsample' / 'colsample_bytree', so listing both only spent
# search dimensions on the same knob.
# min_child_weight spans six orders of magnitude, so it is sampled with a
# log-uniform prior (uniform in log space) instead of uniformly.
params = {
    'max_depth': (3, 10),
    'min_child_weight': (1e-3, 1e+3, 'log-uniform'),
    'n_estimators': (1, 300),
    'colsample_bytree': (1e-1, 1e+0),
    'subsample': (0.4, 1),
    'min_split_gain': (0.1, 10),
}

# Bayesian hyper-parameter search for the formation-energy target.
# NOTE(review): categorical_feature=0 is kept from the original; the band-gap
# search does not set it and it is unclear which column index 0 refers to
# after drop_features - confirm it is intended.
opt_form = BayesSearchCV(
    lgb.LGBMRegressor(
        boosting_type='gbdt',
        objective='regression',
        metric='rmse',
        categorical_feature=0,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this the 'subsample' dimension has no effect.
        subsample_freq=1,
    ),
    params,
    n_iter=10,
    # Optimise the RMSLE scorer defined with make_scorer instead of the
    # estimator's default score. The scorer is a loss, so reported scores
    # are negated RMSLE values (closer to zero is better).
    scoring=objective,
    n_jobs=4,
    random_state=400,  # make the sequence of sampled candidates reproducible
)
# Fit on the training split only, so that X_test stays a genuine hold-out set;
# fitting on X_full would leak the test rows into the model being scored.
opt_form.fit(X_train, y_form_train)

print("val. score: %s" % opt_form.best_score_)
print("test score: %s" % opt_form.score(X_test, y_form_test))
print(opt_form.best_params_)

In [None]:
# Bayesian hyper-parameter search for the band-gap target, over the shared
# `params` search space.
opt_band = BayesSearchCV(
    lgb.LGBMRegressor(
        boosting_type='gbdt',
        objective='regression',
        metric='rmse',
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this the 'subsample' dimension has no effect.
        subsample_freq=1,
    ),
    params,
    n_iter=10,
    # Optimise the RMSLE scorer defined with make_scorer instead of the
    # estimator's default score. The scorer is a loss, so reported scores
    # are negated RMSLE values (closer to zero is better).
    scoring=objective,
    n_jobs=4,
    random_state=400,  # make the sequence of sampled candidates reproducible
)

# Fit on the training split only, so that X_test stays a genuine hold-out set;
# fitting on X_full would leak the test rows into the model being scored.
opt_band.fit(X_train, y_band_train)

print("val. score: %s" % opt_band.best_score_)
print("test score: %s" % opt_band.score(X_test, y_band_test))
print(opt_band.best_params_)

In [None]:
# Display the best hyper-parameter combination found by the `opt_band` search.
opt_band.best_params_

In [None]:
# Display the best hyper-parameter combination found by the `opt_form` search.
opt_form.best_params_