In [468]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [469]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [470]:
# Utility functions
def load_data(csv_path):
    return pd.read_csv(csv_path)

#def rmsle(y, y0):
#    return tf.sqrt(tf.reduce_mean(tf.pow(tf.log1p(y)-tf.log1p(y0), 2)))
def rmsle(actual, predicted):
    """
    Args:
        actual (1d-array [nx1]) - array of actual values (float)
        predicted (1d-array [nx1]) - array of predicted values (float)
    Returns:
        root mean square log error (float)
    """
    return np.sqrt(np.mean(np.power(np.log1p(actual)-np.log1p(predicted), 2)))

def display_rmsle(y_pred_f, y_pred_b, y_f, y_b):
    rmsle_f = rmsle(y_f, y_pred_f)
    rmsle_b = rmsle(y_b, y_pred_b)
    print ("RMSLE: ", (rmsle_f + rmsle_b) / 2)

objective  = make_scorer(rmsle, greater_is_better=False)

def drop_features(df_t):
    df = df_t.copy()
    df = df.drop('id', 1)
    df = df.drop("formation_energy_ev_natom", 1)
    df = df.drop("bandgap_energy_ev", 1)
    return df

def display_scores(scores):
    print("Expected LB")
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())
    
def encode_scale(df):
    X = df
    encoded_matrix = OneHotEncoder(sparse=False).fit_transform(X['spacegroup'].values.reshape(-1,1))
    X = X.drop("spacegroup", 1)
    encoded_df = pd.DataFrame(data=encoded_matrix, dtype=np.float64)
    X_encoded = pd.concat([X, encoded_df], axis=1).reindex()
    print(X_encoded)
    X_scaled = StandardScaler().fit_transform(X_encoded)
    
    myEncoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    myEncoder.fit(df[columnsToEncode])
    pd.concat([X, pd.DataFrame(myEncoder.transform(df[columnsToEncode]))], axis=1).reindex()
    return X_scaled

def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    
def encode(df):
    numerical_attrbs = list(df)
    del numerical_attrbs[0]

    label_attrbs = ['spacegroup']
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import FeatureUnion
    pl1 = Pipeline([
        ('selector', DataFrameSelector(numerical_attrbs)),
        ('scaler', StandardScaler()),
    ])

    pl2 = Pipeline([
        ('selector', DataFrameSelector(label_attrbs)),
        ('one_hot', OneHotEncoder())
    ])

    full_pl = FeatureUnion(transformer_list=[
        ('numerical', pl1),
        ('label',     pl2),
    ])
    data_prepared = full_pl.fit_transform(df)
    return pd.DataFrame(data_prepared.toarray())

In [471]:
# Prepare data sets
df_t = load_data('/home/agi/Desktop/NOMAD/data/train_prepared.csv')
df_s = load_data('/home/agi/Desktop/NOMAD/data/test_prepared.csv')

train_set, test_set = train_test_split(df_t, test_size=0.2, random_state=42)

X_train = encode(drop_features(train_set))
X_test  = encode(drop_features(test_set))
X_submt = encode(drop_features(df_s))
                       
y_form = train_set["formation_energy_ev_natom"]
y_band = train_set["bandgap_energy_ev"]
                       
y_form_test = test_set["formation_energy_ev_natom"]
y_band_test = test_set["bandgap_energy_ev"]

In [472]:
from functools import partial

n_inputs = 30
n_hidden1 = 300
n_hidden2 = 200
n_hidden3 = 100
n_hidden4 = 100
n_hidden5 = 100
n_hidden6 = 50
n_hidden7 = 50
n_outputs = 1
learning_rate = 0.01

reset_graph()

batch_norm_momentum = 0.9

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    he_init = tf.contrib.layers.variance_scaling_initializer()

    my_batch_norm_layer = partial(
            tf.layers.batch_normalization,
            training=training,
            momentum=batch_norm_momentum)

    my_dense_layer = partial(
            tf.layers.dense,
            kernel_initializer=he_init)

    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    hidden3 = my_dense_layer(bn2, n_hidden3, name="hidden3")
    bn3 = tf.nn.elu(my_batch_norm_layer(hidden3))
    logits_before_bn = my_dense_layer(bn3, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)
    
with tf.name_scope("loss"):
    mse = tf.losses.mean_squared_error(labels=y, predictions=logits)
    loss = tf.sqrt(mse, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("eval"):
    mse_eval = tf.losses.mean_squared_error(y, logits)
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [473]:
n_epochs = 100
batch_size = 1
preds_form = None
preds_band = None
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(X_train.shape[0] // batch_size):
            X_batch, y_batch = (X_train[iteration*batch_size:(iteration+1)*batch_size], 
                y_form[iteration*batch_size:(iteration+1)*batch_size])
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = mse_eval.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = mse_eval.eval(feed_dict={X: X_test, y: y_form})
        print(epoch, "Train MSE:", acc_train, "Test MSE:", acc_test)
    feed_dict_f = {X: X_test}
    preds_form = sess.run(logits, feed_dict_f)
    submit_pred_form = sess.run(logits, {X: X_submt})

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(X_train.shape[0] // batch_size):
            X_batch, y_batch = (X_train[iteration*batch_size:(iteration+1)*batch_size], 
                y_band[iteration*batch_size:(iteration+1)*batch_size])
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = mse_eval.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = mse_eval.eval(feed_dict={X: X_test, y: y_band})
        print(epoch, "Train MSE:", acc_train, "Test MSE:", acc_test)
    feed_dict_b = {X: X_test}
    preds_band = sess.run(logits, feed_dict_b)
    submit_pred_band = sess.run(logits, {X: X_submt})
    #save_path = saver.save(sess, "./my_model_final.ckpt")

0 Train MSE: 0.000108592 Test MSE: 0.0211013
1 Train MSE: 0.000221666 Test MSE: 0.0231636
2 Train MSE: 0.000344633 Test MSE: 0.0180247
3 Train MSE: 3.3325e-05 Test MSE: 0.0199028
4 Train MSE: 6.75643e-05 Test MSE: 0.0237235
5 Train MSE: 3.34294e-05 Test MSE: 0.0261064
6 Train MSE: 0.000496724 Test MSE: 0.0236852
7 Train MSE: 0.000323696 Test MSE: 0.021026
8 Train MSE: 0.00303616 Test MSE: 0.0234125
9 Train MSE: 4.0864e-05 Test MSE: 0.0265094
10 Train MSE: 4.26366e-05 Test MSE: 0.0188494
11 Train MSE: 3.71971e-05 Test MSE: 0.0217199
12 Train MSE: 1.66138e-05 Test MSE: 0.0222932
13 Train MSE: 7.05345e-08 Test MSE: 0.0234318
14 Train MSE: 9.47027e-05 Test MSE: 0.0257284
15 Train MSE: 0.000275343 Test MSE: 0.0227877
16 Train MSE: 5.97976e-06 Test MSE: 0.0206599
17 Train MSE: 4.91789e-05 Test MSE: 0.0204063
18 Train MSE: 1.7469e-06 Test MSE: 0.0198834
19 Train MSE: 9.40403e-05 Test MSE: 0.0214879
20 Train MSE: 3.93691e-05 Test MSE: 0.0205828
21 Train MSE: 8.42047e-05 Test MSE: 0.0243414
22 

85 Train MSE: 0.000335538 Test MSE: 2.02594
86 Train MSE: 0.0151014 Test MSE: 2.0882
87 Train MSE: 0.00786817 Test MSE: 2.06606
88 Train MSE: 0.000564802 Test MSE: 1.82002
89 Train MSE: 0.00933171 Test MSE: 1.9973
90 Train MSE: 0.00059053 Test MSE: 2.04094
91 Train MSE: 0.00134946 Test MSE: 1.85654
92 Train MSE: 0.00160989 Test MSE: 2.03373
93 Train MSE: 0.00575626 Test MSE: 1.99238
94 Train MSE: 0.00794661 Test MSE: 2.00614
95 Train MSE: 0.000109225 Test MSE: 2.06673
96 Train MSE: 0.00401424 Test MSE: 1.86212
97 Train MSE: 0.0107306 Test MSE: 2.04763
98 Train MSE: 0.00348855 Test MSE: 2.01398
99 Train MSE: 0.000555312 Test MSE: 1.91973


In [474]:
display_rmsle(preds_form, preds_band, y_form_test.as_matrix(), y_band_test.as_matrix())

RMSLE:  0.288322903515


In [475]:
# Build submission .csv
submission = np.concatenate((submit_pred_form.reshape(600,1), submit_pred_band.reshape(600,1)), axis=1)
submit_df = pd.DataFrame(submission, columns=['formation_energy_ev_natom', "bandgap_energy_ev"])
submit_df[submit_df < 0] = 0
submit_df.insert(0, 'id', range(1, 601))

# Save to file
submit_df.to_csv("/home/agi/Desktop/NOMAD/submissions/dnn_1.csv", index=False)