In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingRegressor
from catboost import Pool, CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

In [None]:
# Utility functions
def load_data(csv_path):
    """
    Read a CSV file into a pandas DataFrame.

    Args:
        csv_path - location of the CSV file; handed unchanged to
            ``pd.read_csv``
    Returns:
        pandas.DataFrame with the parsed contents
    """
    frame = pd.read_csv(csv_path)
    return frame

def rmsle(actual, predicted):
    """
    Root mean squared logarithmic error between actual and predicted values.

    Both inputs are flattened to plain 1-D float arrays first, so values are
    always paired by position. Without this, a column vector (n x 1) against
    a flat vector (n,) would broadcast to an n x n matrix, and two pandas
    Series with different index labels would be aligned by label instead of
    by position - both silently produce a wrong score.

    Args:
        actual (array-like, n values) - actual values (float), each > -1
        predicted (array-like, n values) - predicted values (float), each > -1
    Returns:
        root mean square log error (float)
    """
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    # log1p(x) == log(1 + x); keeps precision for targets close to zero.
    log_diff = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(np.square(log_diff)))

# Cross-validation scorer built on rmsle. RMSLE is an error (lower is
# better), hence greater_is_better=False.
objective = make_scorer(score_func=rmsle, greater_is_better=False)

def drop_features(df_t):
    """
    Return a new frame without the id column and the two target columns.

    Args:
        df_t (pandas.DataFrame) - frame containing the columns 'id',
            'formation_energy_ev_natom' and 'bandgap_energy_ev'
    Returns:
        new pandas.DataFrame holding every other column; ``df_t`` itself is
        not modified
    Raises:
        KeyError - if any of the three columns is missing from ``df_t``
    """
    non_feature_columns = ["id", "formation_energy_ev_natom", "bandgap_energy_ev"]
    # Use the ``columns=`` keyword: passing the axis positionally
    # (``df.drop('id', 1)``) was deprecated in pandas 1.3 and raises a
    # TypeError in pandas 2.x. ``drop`` already returns a new frame, so no
    # explicit copy is needed.
    return df_t.drop(columns=non_feature_columns)

def display_scores(scores):
    """
    Print a short cross-validation report to stdout.

    Args:
        scores - per-fold scores; must provide ``mean()`` and ``std()``
            methods (e.g. a numpy array)
    Returns:
        None
    """
    print("Expected LB")
    # Each value is computed lazily, right before its line is printed.
    report_rows = (
        ("Scores: ", lambda: scores),
        ("Mean: ", lambda: scores.mean()),
        ("Standard deviation: ", lambda: scores.std()),
    )
    for label, compute in report_rows:
        print(label, compute())

In [None]:
# Prepare data sets
# Read the prepared training and submission (test) tables.
df_t = load_data('/home/agi/Desktop/NOMAD/data/train_prepared.csv')
df_s = load_data('/home/agi/Desktop/NOMAD/data/test_prepared.csv')

# Feature matrices: every column except the id and the two targets.
X_train, X_submt = drop_features(df_t), drop_features(df_s)

# Regression targets, taken from the training table.
y_form, y_band = df_t["formation_energy_ev_natom"], df_t["bandgap_energy_ev"]

In [None]:
# 10-fold cross-validation splitter. shuffle=True is needed for random_state
# to have any effect; current scikit-learn raises a ValueError when a seed is
# passed while shuffle is left at its default of False.
k_fold = KFold(n_splits=10, shuffle=True, random_state=7)

# Initializing models
# Hyper-parameters for the LightGBM base learner.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 3,
    'min_child_weight': 70,
    'learning_rate': 0.2,
    'subsample': 1,
    'n_estimators': 120,
}

# Base learners: XGBoost, LightGBM and CatBoost.
m1 = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=40,
                  min_child_weight=70, colsample_bytree=1, subsample=1)
m8 = lgb_model_form = lgb.LGBMRegressor(**params)
m16 = CatBoostRegressor()

# One stack per target, each combining the three different base learners
# through a linear meta-regressor. Stacking the same CatBoost model three
# times (as before) trains three identically configured copies and leaves
# m1 and m8 unused.
# NOTE(review): assumes the intended stack is m1 + m8 + m16 - confirm.
model_form = StackingRegressor(regressors=[m1, m8, m16],
                               meta_regressor=LinearRegression())

model_band = StackingRegressor(regressors=[m1, m8, m16],
                               meta_regressor=LinearRegression())

In [None]:
# Cross-validate the two stacking regressors (one per target)
scores_form = cross_val_score(estimator=model_form, X=X_train, y=y_form,
                              scoring=objective, cv=k_fold)
scores_band = cross_val_score(estimator=model_band, X=X_train, y=y_band,
                              scoring=objective, cv=k_fold)

# Negate the sum of the per-fold scores of both targets, then report half
# of it, i.e. the fold-wise average over the two targets.
rmse_scores = -(scores_form + scores_band)
display_scores(rmse_scores / 2)

# Fit models on the whole training set
model_form.fit(X_train, y_form)
model_band.fit(X_train, y_band)

In [None]:
# Show the averaged cross-validation scores again, reusing the stored
# per-fold results (no re-training).
rmse_scores = -(scores_form + scores_band)
display_scores(rmse_scores / 2)

In [None]:
# Predict both targets for the submission set
submit_pred_form = model_form.predict(X_submt)
submit_pred_band = model_band.predict(X_submt)

# Build submission .csv
# reshape(-1, 1) turns each prediction vector into a column whatever its
# length, instead of assuming exactly 600 rows.
submission = np.concatenate((submit_pred_form.reshape(-1, 1), submit_pred_band.reshape(-1, 1)), axis=1)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])
# Replace negative predictions with 0 (done before the id column is added,
# so only the two target columns are affected).
submit_df[submit_df < 0] = 0
# Take the ids from the table the predictions were made for, so every row
# keeps its own id rather than a hard-coded 1..600 sequence.
submit_df.insert(0, 'id', df_s['id'].values)

# Save to file
submit_df.to_csv("/home/agi/Desktop/NOMAD/submissions/stack_5.csv", index=False)