In [None]:
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [None]:
def rmsle(actual, predicted):
    """Root mean squared logarithmic error between two 1-D sequences.

    Both inputs are converted to NumPy float arrays before any arithmetic so
    that values are paired strictly by position. Without the conversion, two
    pandas Series would be aligned on their index labels by the subtraction,
    which mis-pairs values (or produces NaN) when the indexes differ.

    Args:
        actual (1d array-like) - actual values (float)
        predicted (1d array-like) - predicted values (float), same length
            as ``actual``
    Returns:
        root mean square log error (float)
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    # log1p(x) == log(1 + x); the error is measured on that log scale.
    log_diff = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(np.power(log_diff, 2)))

def display_scores(scores):
    """Print the mean of a set of scores.

    Args:
        scores - object exposing a ``mean()`` method (e.g. a NumPy array
            of per-fold scores)
    """
    average = scores.mean()
    print("Mean: ", average)

In [None]:
def drop_features(df_t):
    """Return a new frame without the id column and the two target columns.

    The input frame is left untouched. Columns are removed with the
    ``columns=`` keyword because passing ``axis`` positionally
    (``df.drop('id', 1)``) is rejected by pandas 2.0 and later.

    Args:
        df_t (pandas.DataFrame) - frame containing the columns ``id``,
            ``formation_energy_ev_natom`` and ``bandgap_energy_ev``
    Returns:
        pandas.DataFrame with those three columns removed
    Raises:
        KeyError if any of the three columns is missing
    """
    non_feature_columns = ["id", "formation_energy_ev_natom", "bandgap_energy_ev"]
    # DataFrame.drop returns a new object, so the caller's frame is not modified.
    return df_t.drop(columns=non_feature_columns)

In [None]:
# Load the prepared table and hold out 20% of the rows (fixed seed for repeatability).
df = pd.read_csv('/home/agi/Desktop/NOMAD/data/train_prepared.csv')
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Targets: formation energy and band gap, taken from each split.
y_form_train, y_band_train = train_df["formation_energy_ev_natom"], train_df["bandgap_energy_ev"]
y_form_test, y_band_test = test_df["formation_energy_ev_natom"], test_df["bandgap_energy_ev"]

# Feature matrices: each split with the id and target columns removed.
X_train, X_test = drop_features(train_df), drop_features(test_df)

# Row and column counts of the training features.
num_train, num_feature = X_train.shape

# Last expression of the cell: preview the first training rows.
X_train.head()

In [None]:
# Training data for the formation-energy target, wrapped for LightGBM's native API.
lgb_train_form = lgb.Dataset(X_train, y_form_train, free_raw_data=False)
# Evaluation data comes from the held-out split; building it from X_train again
# would only re-measure the training error. `reference` ties it to the training Dataset.
lgb_eval_form = lgb.Dataset(X_test, y_form_test, reference=lgb_train_form, free_raw_data=False)

In [None]:
import warnings
# Silences every Python warning raised after this point in the session.
warnings.filterwarnings('ignore')

# LightGBM regressor configured with the scikit-learn wrapper's own parameter
# names. The aliases used previously collide with the wrapper's defaults:
# `num_leaf=5` was passed next to the default `num_leaves=31`, and LightGBM
# keeps the canonical `num_leaves` and ignores the alias, so the 5-leaf setting
# never took effect. `n_estimators` and `min_child_samples` are the wrapper's
# names for `num_boost_round` and `min_data_in_leaf`.
# NOTE(review): `categorical_feature` is normally an argument of fit(), not of
# the constructor -- confirm that column 0 is really treated as categorical.
lgb_estimator = lgb.LGBMRegressor(boosting_type='gbdt',
                                  objective='regression',
                                  n_estimators=5000,
                                  learning_rate=0.1,
                                  max_depth=-1,
                                  num_leaves=5,
                                  subsample=1,
                                  min_split_gain=0.0,
                                  min_child_samples=0,
                                  categorical_feature=0)

In [None]:
from sklearn.model_selection import cross_val_score

# 8-fold cross-validation on the training split. The scorer returns negated
# MSE, so flip the sign before taking the square root to get per-fold RMSE.
scores = cross_val_score(lgb_estimator, X_train, y_form_train, scoring="neg_mean_squared_error", cv=8)
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

# Refit on the whole training split and score the held-out rows.
reg = lgb_estimator.fit(X_train, y_form_train)
y_pred_form = reg.predict(X_test)
print("formation model's RMSE:", mean_squared_error(y_form_test, y_pred_form) ** 0.5)
# Label corrected from "RMLSE": the value printed is the RMSLE computed by rmsle().
print("formation model's RMSLE:", rmsle(y_form_test, y_pred_form))