In [1]:
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [3]:
def rmsle(actual, predicted):
    """
    Args:
        actual (1d-array) - array of actual values (float)
        predicted (1d-array) - array of predicted values (float)
    Returns:
        root mean square log error (float)
    """
    return np.sqrt(np.mean(np.power(np.log1p(actual)-np.log1p(predicted), 2)))

In [4]:
def drop_features(df_t):
    df = df_t.copy()
    df = df.drop('id', 1)
    df = df.drop("formation_energy_ev_natom", 1)
    df = df.drop("bandgap_energy_ev", 1)
    return df

In [5]:
# Create dataset
df = pd.read_csv('/home/agiDesktop/NOMAD/data/train_prepared.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

X_train = drop_features(train_df)
X_test  = drop_features(test_df)

y_form_train = train_df["formation_energy_ev_natom"]
y_band_train = train_df["bandgap_energy_ev"]

y_form_test  = test_df["formation_energy_ev_natom"]
y_band_test  = test_df["bandgap_energy_ev"]

num_train, num_feature = X_train.shape

FileNotFoundError: File b'/home/agiDesktop/NOMAD/data/train_prepared.csv' does not exist

In [None]:
from sklearn.preprocessing import OneHotEncoder
numerical_attrbs = list(X_train)
label_attrbs = ['spacegroup']
spacegroup = X_train["spacegroup"]
encoder = OneHotEncoder()
spacegroup_1hot = encoder.fit_transform(spacegroup.values.reshape(-1,1))

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

attrs = list(X_train)
hot_attrs = ["spacegroup"]

pl1 = Pipeline([
    ('selector', DataFrameSelector(numerical_attrbs)),
    ('scaler', StandardScaler()),
])

pl2 = Pipeline([
    ('selector', DataFrameSelector(label_attrbs)),
    ('one_hot', OneHotEncoder())
])

full_pl = FeatureUnion(transformer_list=[
    ('numerical', pl1),
    ('label',     pl2),
])

X_train = full_pl.fit_transform(X_train)
X_test = full_pl.fit_transform(X_test)
print(X_train[0])

In [None]:
# Create LightGBM dataset
lgb_train_form = lgb.Dataset(X_train, y_form_train, free_raw_data=False)
lgb_train_band = lgb.Dataset(X_train, y_band_train, free_raw_data=False)

lgb_eval_form = lgb.Dataset(X_test, y_form_test, reference=lgb_train_form, free_raw_data=False)
lgb_eval_band = lgb.Dataset(X_test, y_band_test, reference=lgb_train_band, free_raw_data=False)

In [None]:
# Configurations dictionary
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
}

# generate a feature name
feature_name = ['feature_' + str(col) for col in range(31)]

In [None]:
print('Start training...')

# feature_name and categorical_feature
gbm_form = lgb.train(params,
                lgb_train_form,
                valid_sets=lgb_train_form,  # eval training data
                feature_name=feature_name)
                #categorical_feature=[0])

# feature_name and categorical_feature
gbm_band = lgb.train(params,
                lgb_train_band,
                valid_sets=lgb_train_band,  # eval training data
                feature_name=feature_name)
                #categorical_feature=[0])

# check feature name
print('Training Complete')

In [None]:
form_model_loc = '/home/agi/Desktop/NOMAD/models/lgbm_tune101.txt'
band_model_loc = '/home/agi/Desktop/NOMAD/models/lgbm_tune201.txt'

gbm_form.save_model(form_model_loc)
gbm_band.save_model(band_model_loc)

bst_form = lgb.Booster(model_file=form_model_loc)
bst_band = lgb.Booster(model_file=band_model_loc)

In [None]:
y_pred_form = bst_form.predict(X_test)
y_pred_band = bst_band.predict(X_test)

print('formation model\'s RMSE:', mean_squared_error(y_form_test, y_pred_form) ** 0.5)
print('bandgap model\'s RMSE:',   mean_squared_error(y_band_test, y_pred_band) ** 0.5)

In [6]:
y_pred_form = bst_form.predict(X_test)
y_pred_band = bst_band.predict(X_test)

print('Expected LB:', (rmsle(y_form_test, y_pred_form) + rmsle(y_band_test, y_pred_band)) * 0.5)

NameError: name 'bst_form' is not defined

In [7]:
# Create submission
s_df = pd.read_csv('/home/agi/Desktop/NOMAD/data/test_prepared.csv')
X_submit  = drop_features(s_df)
submit_pred_form = bst_form.predict(X_submit)
submit_pred_band = bst_band.predict(X_submit)
submission = np.concatenate((submit_pred_form.reshape(600,1), submit_pred_band.reshape(600,1)), axis=1)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])
submit_df[df < 0] = 0
print(submit_df.shape)
submit_df.insert(0, 'id', range(1, 601))
submit_df.to_csv("/home/agi/Desktop/NOMAD/submissions/lightgbm_best5.csv", index=False)

FileNotFoundError: File b'/home/agi/Desktop/NOMAD/data/test_prepared.csv' does not exist