In [4]:
import pandas as pd
import numpy as np
import random

from sklearn.svm import SVR 

from modules.data_processing import make_Xy, post_process, RMSLE
from modules.keras_processing import make_model

In [5]:
# Load training features/targets and submission features/ids via the project helper.
X, y, Xsub, id_sub = make_Xy()

# Load the precomputed PCA features, transpose them (presumably so rows index
# samples -- TODO confirm against how cube_PCA.npy is produced), and rescale by
# the global maximum of the array.
pca_features = np.load('data/cube_PCA.npy').T
cube_PCA = pca_features / pca_features.max()

# The first 2400 rows are appended column-wise to the training matrix and the
# last 600 rows to the submission matrix.
n_train_rows, n_sub_rows = 2400, 600
X = np.hstack((X, cube_PCA[:n_train_rows]))
Xsub = np.hstack((Xsub, cube_PCA[-n_sub_rows:]))

In [6]:
# Random search over SVR hyperparameters, scored with 5-fold cross-validation.
# Results are stored per hyperparameter triple in `error0` / `error1` (one dict
# per target column) so the best triple for each target can be picked afterwards.

dimX = X.shape[1]  # number of features; the RBF gamma is divided by this below
N = X.shape[0]     # number of training samples

# Geometric grids (ratio 1.5) for the three SVR hyperparameters.
C_list = [1.0 * 1.5**p for p in range(8,11)]
epsilon_list = [0.1 * 1.5**p for p in range(-8,-3)]
gamma_list = [1.0 * 1.5**p for p in range(0,5)]

# Enumerate every (C, epsilon, gamma) combination exactly once. The grid only
# holds len(C_list) * len(epsilon_list) * len(gamma_list) = 75 distinct triples,
# so the trial budget must be capped at the grid size: sampling with rejection
# until 100 unique triples are seen can never terminate.
param_grid = [(C, epsilon, gamma)
              for C in C_list
              for epsilon in epsilon_list
              for gamma in gamma_list]
max_trials = min(100, len(param_grid))

# Visit the grid in a random but reproducible order, without repetition.
rng = np.random.RandomState(0)
trial_order = rng.permutation(len(param_grid))[:max_trials]

error0 = {}
error1 = {}
current_min  = np.inf

for grid_idx in trial_order:
    t = param_grid[grid_idx]
    C, epsilon, gamma = t

    # Out-of-fold predictions for both targets; every row is filled exactly
    # once because the five interleaved folds below partition range(N).
    pred = np.tile(np.nan, (N,2))
    for iCV in range(5):
        # Fold iCV holds every 5th sample, starting at offset iCV.
        ind_test = np.tile(False, N)
        ind_test[iCV::5] = True

        X_train, y_train = X[~ind_test, :], y[~ind_test, :]
        X_test = X[ind_test, :]

        # Models are fitted on log(1 + y).
        y_train = np.log(1+y_train)

        clf0 = SVR(C=C, epsilon=epsilon, gamma=gamma/float(dimX), kernel='rbf')
        clf0.fit(X_train, y_train[:, 0])

        clf1 = SVR(C=C, epsilon=epsilon, gamma=gamma/float(dimX), kernel='rbf')
        clf1.fit(X_train, y_train[:, 1])

        pred[ind_test, 0] = clf0.predict(X_test)
        pred[ind_test, 1] = clf1.predict(X_test)

    # post process (project helper; presumably maps the log-space predictions
    # back to the original target scale -- TODO confirm)
    pred = post_process(pred)

    error0[t], error1[t] = RMSLE(y, pred)
    # Best achievable average error if each target uses its own best triple.
    current_min = 0.5*(min(error0.values()) + min(error1.values()))

    print("trial", len(error0), " - ", "current min", '{0:.4f}'.format(current_min))

trial 1  -  current min 0.0531
trial 2  -  current min 0.0530
trial 3  -  current min 0.0530
trial 4  -  current min 0.0530
trial 5  -  current min 0.0530
trial 6  -  current min 0.0530
trial 7  -  current min 0.0530
trial 8  -  current min 0.0528
trial 9  -  current min 0.0528
trial 10  -  current min 0.0528
trial 11  -  current min 0.0528
trial 12  -  current min 0.0528
trial 13  -  current min 0.0528
trial 14  -  current min 0.0527
trial 15  -  current min 0.0527
trial 16  -  current min 0.0527
trial 17  -  current min 0.0527
trial 18  -  current min 0.0527
trial 19  -  current min 0.0527
trial 20  -  current min 0.0527
trial 21  -  current min 0.0527
trial 22  -  current min 0.0527
trial 23  -  current min 0.0527
trial 24  -  current min 0.0527
trial 25  -  current min 0.0526
trial 26  -  current min 0.0526


KeyboardInterrupt: 

In [7]:
# Select, independently for each target, the hyperparameter triple with the
# lowest cross-validated error recorded in error0 / error1.
best0 = min(error0, key=error0.get)
best1 = min(error1, key=error1.get)
C0, epsilon0, gamma0 = best0
C1, epsilon1, gamma1 = best1

# Refit one RBF-kernel SVR per target on the full training set, using
# log(1 + y) as the regression target and gamma divided by the feature count.
log_target0 = np.log(1+y[:, 0])
log_target1 = np.log(1+y[:, 1])

clf0 = SVR(C=C0, epsilon=epsilon0, gamma=gamma0/float(dimX), kernel='rbf')
clf0.fit(X, log_target0)

clf1 = SVR(C=C1, epsilon=epsilon1, gamma=gamma1/float(dimX), kernel='rbf')
clf1.fit(X, log_target1)

# Predict the submission samples, then run the project post-processing helper.
# NOTE(review): post_process receives 1-D vectors here but an (N, 2) array in
# the cross-validation cell -- confirm it handles both shapes the same way.
ysub0 = post_process(clf0.predict(Xsub))
ysub1 = post_process(clf1.predict(Xsub))

# Assemble the id column and the two prediction columns side by side and
# write them out as the submission CSV.
frames = [pd.DataFrame(column) for column in (id_sub, ysub0, ysub1)]
submission = pd.concat(frames, axis=1)
submission.columns = ['id','formation_energy_ev_natom', 'bandgap_energy_ev']
submission.to_csv('output/submission.csv', index = False)