In [1]:
import os
from datetime import datetime
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split

In [2]:
# Utility functions
def load_data(csv_path):
    """Read a CSV file into a DataFrame.

    Args:
        csv_path - path (or file-like object) handed straight to ``pd.read_csv``
    Returns:
        the parsed contents as a pandas DataFrame
    """
    frame = pd.read_csv(csv_path)
    return frame

def rmsle(actual, predicted):
    """
    Root mean squared logarithmic error between two arrays.

    Both inputs are passed through ``log1p`` before differencing, so the
    error is measured on log(1 + value).

    Args:
        actual (1d-array [nx1]) - array of actual values (float)
        predicted (1d-array [nx1]) - array of predicted values (float)
    Returns:
        root mean square log error (float)
    """
    log_gap = np.log1p(actual) - np.log1p(predicted)
    mean_squared_gap = np.mean(np.power(log_gap, 2))
    return np.sqrt(mean_squared_gap)

# scikit-learn scorer built from rmsle. greater_is_better=False marks it as a
# loss, which is why the CV cell flips the sign of the returned scores.
objective = make_scorer(
    score_func=rmsle,
    greater_is_better=False,
)

def drop_features(df_t):
    """Return a copy of ``df_t`` without the id and target columns.

    The columns are selected with the ``columns=`` keyword: the positional
    ``axis`` argument used previously (``df.drop('id', 1)``) is rejected by
    pandas 2.x. Columns that are absent are skipped, so the same helper works
    for labelled training frames and for unlabelled submission frames.

    Args:
        df_t (DataFrame) - input frame; it is not modified
    Returns:
        new DataFrame holding only the remaining (feature) columns
    """
    non_feature_columns = ["id", "formation_energy_ev_natom", "bandgap_energy_ev"]
    # DataFrame.drop already returns a new object, so no explicit copy is needed.
    return df_t.drop(columns=non_feature_columns, errors="ignore")

def display_scores(scores):
    """Print a short cross-validation report for ``scores``.

    Emits a header line followed by the raw scores, their mean and their
    standard deviation (via ``scores.mean()`` / ``scores.std()``).
    """
    print("CV LB: ")
    # Each value is computed lazily, right before its own line is printed.
    report_rows = (
        ("Scores: ", lambda: scores),
        ("Mean: ", lambda: scores.mean()),
        ("Standard deviation: ", lambda: scores.std()),
    )
    for label, compute in report_rows:
        print(label, compute())

In [3]:
# Create train/test datasets
# The data directory can be overridden through the NOMAD_DATA_DIR environment
# variable; when it is unset the original workstation path is used.
DATA_DIR = Path(os.environ.get('NOMAD_DATA_DIR', '/home/agi/Desktop/NOMAD/data'))

df = load_data(DATA_DIR / 'train_prepared.csv')
# Fixed seed keeps the 80/20 hold-out split identical between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=400)

# Feature matrices: id and both target columns removed.
X_full = drop_features(df)
X_train = drop_features(train_df)
X_test  = drop_features(test_df)

# Targets for the full frame (used by cross validation).
y_form_full = df["formation_energy_ev_natom"]
y_band_full = df["bandgap_energy_ev"]

# Targets for the training split.
y_form_train = train_df["formation_energy_ev_natom"]
y_band_train = train_df["bandgap_energy_ev"]

# Targets for the hold-out split.
y_form_test  = test_df["formation_energy_ev_natom"]
y_band_test  = test_df["bandgap_energy_ev"]

FileNotFoundError: File b'/home/agi/Desktop/NOMAD/data/train_prepared.csv' does not exist

In [None]:
# Configuration
# Hyper-parameters shared by the formation-energy and band-gap regressors.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    max_depth=3,
    min_child_weight=70,
    learning_rate=0.2,
    subsample=1,
    n_estimators=120,
)

# One independent model per target, both built from the same settings.
lgb_model_form = lgb.LGBMRegressor(**params)
lgb_model_band = lgb.LGBMRegressor(**params)

# Fit to training set
# Wall-clock timing covers both fits together.
start = datetime.now()

lgb_model_form = lgb_model_form.fit(X=X_train, y=y_form_train)
lgb_model_band = lgb_model_band.fit(X=X_train, y=y_band_train)

stop = datetime.now()
print('Execution time: ', stop - start)

# Predict on test set
# Same hold-out features go through each fitted model, formation model first.
lgb_pred_form, lgb_pred_band = (
    fitted.predict(X_test) for fitted in (lgb_model_form, lgb_model_band)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Hold-out RMSE per target (square root of the mean squared error).
print('form-rmse: ', mean_squared_error(y_true=y_form_test, y_pred=lgb_pred_form) ** 0.5)
print('band-rmse: ', mean_squared_error(y_true=y_band_test, y_pred=lgb_pred_band) ** 0.5)

# Hold-out RMSLE per target; the reported figure is the average of the two.
f_rmsle, b_rmsle = rmsle(y_form_test, lgb_pred_form), rmsle(y_band_test, lgb_pred_band)
print('Expected LB: ', (f_rmsle + b_rmsle) / 2)

In [None]:
# KFold cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from xgboost import XGBRegressor  # NOTE(review): not used in this cell

# shuffle=True is required for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when a seed is given without it.
k_fold = KFold(n_splits=10, shuffle=True, random_state=7)
model_f = lgb.LGBMRegressor(**params)
model_b = lgb.LGBMRegressor(**params)

# `objective` is a loss scorer (greater_is_better=False), so the per-fold
# values returned here are negated RMSLEs.
scores_form = cross_val_score(model_f, X_full, y_form_full, scoring=objective, cv=k_fold)
scores_band = cross_val_score(model_b, X_full, y_band_full, scoring=objective, cv=k_fold)

# Flip the sign back and average the two targets fold by fold.
rmse_scores = -scores_form - scores_band
display_scores(rmse_scores / 2)

In [None]:
# Prepare submission data set
# Input and output directories can be overridden through environment variables;
# when unset the original workstation paths are used.
DATA_DIR = Path(os.environ.get('NOMAD_DATA_DIR', '/home/agi/Desktop/NOMAD/data'))
SUBMISSION_DIR = Path(os.environ.get('NOMAD_SUBMISSION_DIR', '/home/agi/Desktop/NOMAD/submissions'))

s_df = pd.read_csv(DATA_DIR / 'test_prepared.csv')
X_submit  = drop_features(s_df)

# Predict 
submit_pred_form = lgb_model_form.predict(X_submit)
submit_pred_band = lgb_model_band.predict(X_submit)

# Build submission .csv
# reshape(-1, 1) sizes the columns from the predictions instead of assuming 600 rows.
submission = np.concatenate((submit_pred_form.reshape(-1, 1), submit_pred_band.reshape(-1, 1)), axis=1)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])
# Zero out negative predictions. The mask must come from submit_df itself; the
# previous code built it from the training frame `df`.
submit_df[submit_df < 0] = 0
print(submit_df.shape)
# Sequential 1-based ids, one per predicted row.
submit_df.insert(0, 'id', range(1, len(submit_df) + 1))
submit_df.to_csv(SUBMISSION_DIR / 'lgb_3.csv', index=False)