In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from datetime import datetime 

In [2]:
# Utility functions
def load_data(csv_path):
    """
    Read a CSV file into a pandas DataFrame.

    Args:
        csv_path - location of the CSV file, handed unchanged to pandas.read_csv
    Returns:
        DataFrame with the parsed file contents
    """
    frame = pd.read_csv(csv_path)
    return frame

def rmsle(actual, predicted):
    """
    Root mean squared logarithmic error between two equally long inputs.

    Both inputs are converted to plain numpy arrays before subtracting, so the
    values are compared position by position. Without the conversion, two
    pandas Series carrying different indexes would be aligned on their labels
    by the subtraction and the result would be NaN.

    Args:
        actual (1d-array [nx1]) - array of actual values (float)
        predicted (1d-array [nx1]) - array of predicted values (float)
    Returns:
        root mean square log error (float)
    """
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)
    log_diff = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(np.power(log_diff, 2)))

# scikit-learn scorer built from rmsle for use in model-selection helpers.
# greater_is_better=False declares rmsle a loss, so the scorer reports the
# negated error; callers flip the sign back to recover the RMSLE.
objective  = make_scorer(rmsle, greater_is_better=False)

def drop_features(df_t):
    """
    Return a copy of ``df_t`` without the identifier and the two target columns.

    The columns are removed by name through the ``columns=`` keyword. Passing
    the axis positionally (``df.drop('id', 1)``) is rejected by pandas >= 2.0.

    Args:
        df_t (DataFrame) - frame holding 'id', 'formation_energy_ev_natom' and
            'bandgap_energy_ev' alongside the columns to keep
    Returns:
        new DataFrame with the remaining columns; ``df_t`` itself is not modified
    Raises:
        KeyError if any of the three columns is missing from ``df_t``
    """
    non_feature_columns = ["id", "formation_energy_ev_natom", "bandgap_energy_ev"]
    return df_t.drop(columns=non_feature_columns)

def display_scores(scores):
    """
    Print the individual scores followed by their mean and standard deviation.

    Args:
        scores - object exposing .mean() and .std() (e.g. a numpy array of
            per-fold scores)
    """
    # Each statistic is computed right before it is printed, one line per entry.
    report = (
        ("Scores: ", lambda values: values),
        ("Mean: ", lambda values: values.mean()),
        ("Standard deviation: ", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

In [3]:
# Create train/test datasets
df = load_data('/home/agi/Desktop/NOMAD/data/train_prepared.csv')

# Hold out 20% of the rows; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(df, random_state=400, test_size=0.2)

# Feature matrices: the id and both target columns are removed
X_full, X_train, X_test = (drop_features(part) for part in (df, train_df, test_df))

# Target vectors, one pair (formation energy, band gap) per data split
form_col, band_col = "formation_energy_ev_natom", "bandgap_energy_ev"
y_form_full, y_band_full = df[form_col], df[band_col]
y_form_train, y_band_train = train_df[form_col], train_df[band_col]
y_form_test, y_band_test = test_df[form_col], test_df[band_col]

In [4]:
# Configuration
# Hyper-parameters for the formation-energy model.
# NOTE: 'n_estimators' is an argument of the scikit-learn wrapper; it is not
# used by xgb.train, which takes the number of trees from num_boost_round.
params_f = {
    'max_depth':6,
    'learning_rate':0.05,
    'n_estimators':125,
    'min_child_weight':20,
    'colsample_bytree':0.7,
    'colsample_bylevel':0.8,
    'reg_lambda':5,
    'subsample':0.8
}

# Hyper-parameters for the band-gap model (same note about 'n_estimators').
params_b = {
    'max_depth':4,
    'learning_rate':0.07,
    'n_estimators':725,
    'min_child_weight':20,
    'colsample_bytree':0.7,
    'colsample_bylevel':0.9,
    'reg_lambda':5,
    'subsample':0.8
}

num_boost_round = 40  # number of boosting rounds actually trained for each model
esr = 40              # early-stopping patience; defined but not passed to xgb.train below
ev_dict_f = {}
ev_dict_b = {}

# Strip the wrapper-only key before calling the native API, so the boosters
# only receive parameters they understand (recent XGBoost releases warn about
# unused ones). params_f / params_b themselves are left untouched.
wrapper_only_keys = ('n_estimators',)
booster_params_f = {key: value for key, value in params_f.items() if key not in wrapper_only_keys}
booster_params_b = {key: value for key, value in params_b.items() if key not in wrapper_only_keys}

xgb_train_form = xgb.DMatrix(X_train, label=y_form_train)
xgb_train_band = xgb.DMatrix(X_train, label=y_band_train)

xgb_test_form = xgb.DMatrix(X_test, label=y_form_test)
xgb_test_band = xgb.DMatrix(X_test, label=y_band_test)

# Fit on the training split. The evals lists only contain the training
# matrices, so the histories stored in ev_dict_f / ev_dict_b are training
# errors, not hold-out errors.
start = datetime.now()
xgb_model_form = xgb.train(booster_params_f, xgb_train_form, num_boost_round=num_boost_round,
                           evals=[(xgb_train_form, "form")],
                           evals_result=ev_dict_f, verbose_eval=False)
xgb_model_band = xgb.train(booster_params_b, xgb_train_band, num_boost_round=num_boost_round,
                           evals=[(xgb_train_band, "band")],
                           evals_result=ev_dict_b, verbose_eval=False)
stop = datetime.now()

# Predict on the held-out test split
xgb_pred_form = xgb_model_form.predict(xgb_test_form)
xgb_pred_band = xgb_model_band.predict(xgb_test_band)

print('Execution time: ', stop - start)

Execution time:  0:00:00.621468


In [5]:
from sklearn.metrics import mean_squared_error

# Evaluate models
# Last RMSE value recorded in each evals_result history during training
for history, tag in ((ev_dict_f, 'form'), (ev_dict_b, 'band')):
    print(history[tag]['rmse'][-1])

# RMSE of the predictions on the held-out split
holdout = (
    ('form', y_form_test, xgb_pred_form),
    ('band', y_band_test, xgb_pred_band),
)
for tag, truth, guess in holdout:
    print(tag + '-rmse: ', mean_squared_error(truth, guess) ** 0.5)

# Average of the two per-target RMSLE values
f_rmsle = rmsle(y_form_test, xgb_pred_form)
b_rmsle = rmsle(y_band_test, xgb_pred_band)
print('Expected LB: ', (f_rmsle + b_rmsle) / 2)

0.03521
0.185843
form-rmse:  0.0401460657829
band-rmse:  0.218975411284
Expected LB:  0.0593643049893


In [6]:
# KFold cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

# Ten consecutive, unshuffled folds. random_state is deliberately not passed:
# it has no effect while shuffling is off, and scikit-learn >= 0.24 raises a
# ValueError when it is set without shuffle=True.
k_fold = KFold(n_splits=10)
model = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=40,
                     min_child_weight=70, colsample_bytree=0.9)

scores_form = cross_val_score(model, X_full, y_form_full, scoring=objective, cv=k_fold)
scores_band = cross_val_score(model, X_full, y_band_full, scoring=objective, cv=k_fold)

# `objective` wraps rmsle with greater_is_better=False, so the returned scores
# are negated RMSLE values: flip the sign, then average the two targets per fold.
rmse_scores = -scores_form - scores_band
display_scores(rmse_scores / 2)

Scores:  [ 0.05707815  0.06795943  0.06714063  0.05219012  0.05114615  0.0699484
  0.06082621  0.05035566  0.0649717   0.05844032]
Mean:  0.0600056782392
Standard deviation:  0.00694319672234


In [7]:
# Prepare submission data set
s_df = pd.read_csv('/home/agi/Desktop/NOMAD/data/test_prepared.csv')
X_submit  = drop_features(s_df)
xgb_submit = xgb.DMatrix(X_submit)
# Predict
submit_pred_form = xgb_model_form.predict(xgb_submit)
submit_pred_band = xgb_model_band.predict(xgb_submit)

# Build submission .csv
# column_stack takes the row count from the predictions themselves instead of
# a hard-coded 600, giving an (n_rows, 2) array.
submission = np.column_stack((submit_pred_form, submit_pred_band))
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])
# Clamp negative predictions to zero. The mask has to come from submit_df
# itself; masking with the training frame `df` does not test the predictions.
submit_df[submit_df < 0] = 0
print(submit_df.shape)
# Sequential ids starting at 1, one per submission row
submit_df.insert(0, 'id', range(1, len(submit_df) + 1))
submit_df.to_csv("/home/agi/Desktop/NOMAD/submissions/xgb_8.csv", index=False)

(600, 2)
