In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from catboost import Pool, CatBoostRegressor
from datetime import datetime 

In [5]:
# Utility functions
def load_data(csv_path):
    return pd.read_csv(csv_path)

def rmsle(actual, predicted):
    """
    Args:
        actual (1d-array [nx1]) - array of actual values (float)
        predicted (1d-array [nx1]) - array of predicted values (float)
    Returns:
        root mean square log error (float)
    """
    return np.sqrt(np.mean(np.power(np.log1p(actual)-np.log1p(predicted), 2)))

objective  = make_scorer(rmsle, greater_is_better=False)

def drop_features(df_t):
    df = df_t.copy()
    df = df.drop('id', 1)
    df = df.drop("formation_energy_ev_natom", 1)
    df = df.drop("bandgap_energy_ev", 1)
    return df

def drop_features_s(df_t):
    df = df_t.copy()
    df = df.drop('id', 1)
    return df

def display_scores(scores):
    print("Expected LB")
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

In [6]:
# Prepare data sets
df_t = pd.read_csv('/home/agi/Desktop/NOMAD/data/train_full.csv')
df_s = pd.read_csv('/home/agi/Desktop/NOMAD/data/test_full.csv')

X_train = drop_features(df_t)
X_submt = drop_features_s(df_s)
y_form = df_t["formation_energy_ev_natom"]
y_band = df_t["bandgap_energy_ev"]

In [7]:
# Train and evaluate with K-Fold Cross-validation
k_fold = KFold(n_splits=10, random_state=7)
model_form = CatBoostRegressor(logging_level='Silent')
model_band = CatBoostRegressor(logging_level='Silent')

scores_form = cross_val_score(model_form, X_train, y_form, scoring=objective, cv=k_fold)
scores_band = cross_val_score(model_band, X_train, y_band, scoring=objective, cv=k_fold)

rmse_scores = -scores_form - scores_band
display_scores(rmse_scores / 2)

# Fit models
model_form.fit(X_train, y_form, cat_features=[0])
model_band.fit(X_train, y_band, cat_features=[0])

Expected LB
Scores:  [ 0.05412756  0.0640158   0.06537968  0.05092926  0.05197046  0.06719524
  0.05751173  0.05274459  0.06387804  0.05868859]
Mean:  0.0586440950169
Standard deviation:  0.00579036298443


<catboost.core.CatBoostRegressor at 0x7f4ac9ffdb00>

In [8]:
# Predict on the submission set
cat_submit = Pool(X_submt, cat_features=[0])
submit_pred_form = model_form.predict(cat_submit)
submit_pred_band = model_band.predict(cat_submit)

# Build submission .csv
submission = np.concatenate((submit_pred_form.reshape(600,1), submit_pred_band.reshape(600,1)), axis=1)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])
submit_df[submit_df < 0] = 0
submit_df.insert(0, 'id', range(1, 601))

# Save to file
submit_df.to_csv("/home/agi/Desktop/NOMAD/submissions/cat_2.csv", index=False)

In [1]:
# Save model to disk
cat_form_model.save_model('/home/agi/Desktop/NOMAD/models/cat_form_model', format="cbm", export_parameters=None)
cat_band_model.save_model('/home/agi/Desktop/NOMAD/models/cat_band_model', format="cbm", export_parameters=None)

NameError: name 'cat_form_model' is not defined