In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def load_train_data(filename):
    train = pd.read_csv(filename,encoding='big5')
    
    ### seperate data 
    data = [[] for i in range(18)]
    for r,d in enumerate(train.iloc[:,3:].values):
        data[r%18].append(d)
    data = np.array(data).reshape((18,-1))
    
    return data

def load_test_data(filename):
    test = pd.read_csv(filename,encoding='big5',header = None)
    test_data = [[] for i in range(18)]
    for r,d in enumerate(test.iloc[:,2:].values):
        test_data[r%18].append(d)
    test_data = np.array(test_data).reshape((18,-1))
    
    return test_data

In [2]:
train_data = load_train_data('./data/train.csv')
test_data = load_test_data('./data/test.csv')

print(train_data.shape)
print(test_data.shape)

(18, 5760)
(18, 2160)


In [3]:
def get_train_data(data,hours):
    train_x_18 = []
    train_x_1 = []
    train_y = []

    for i in range(0,data.shape[1]-hours):
        train_x_18.append(data[:,i:i+hours].reshape((-1)))
        train_x_1.append(data[9,i:i+hours].reshape((-1)))
        train_y.append(data[9][i+hours])
        
    train_x_18 = np.array(train_x_18)
    train_x_1 = np.array(train_x_1)
    train_y = np.array(train_y).astype(np.float64)

    # replace NR to 0
    train_x_18[train_x_18 == 'NR'] = 0
    train_x_18 = train_x_18.astype(np.float64)
    train_x_1 = train_x_1.astype(np.float64)
    
    return train_x_18, train_x_1, train_y

def get_test_data(data,hours):
    test_x_18 = []
    test_x_1 = []
    
    for i in range(0,data.shape[1],hours):
        test_x_18.append(data[:,i:i+hours].reshape(-1))
        test_x_1.append(data[9,i:i+hours].reshape(-1))
        
    test_x_18 = np.array(test_x_18)
    test_x_1 = np.array(test_x_1)
    
    test_x_18[test_x_18 == 'NR'] = 0
    test_x_18 = test_x_18.astype(np.float64)
    test_x_1 = test_x_1.astype(np.float64)
    return test_x_18, test_x_1

In [5]:
train_x_18_9, train_x_1_9, train_y_9 = get_train_data(train_data,9)
print(train_x_18_9.shape)
print(train_x_1_9.shape)
print(train_y_9.shape)

train_x_18_5, train_x_1_5, train_y_5 = get_train_data(train_data,5)
print(train_x_18_5.shape)
print(train_x_1_5.shape)
print(train_y_5.shape)

test_x_18_9, test_x_1_9 = get_test_data(test_data,9)
print(test_x_18_9.shape)
print(test_x_1_9.shape)

test_x_18_5, test_x_1_5 = get_test_data(test_data,5)
print(test_x_18_5.shape)
print(test_x_1_5.shape)

(5751, 162)
(5751, 9)
(5751,)
(5755, 90)
(5755, 5)
(5755,)
(240, 162)
(240, 9)
(432, 90)
(432, 5)


In [6]:
# add bias term
train_x_18_9 = np.hstack([train_x_18_9,np.ones((train_x_18_9.shape[0],1))])
train_x_1_9 = np.hstack([train_x_1_9,np.ones((train_x_1_9.shape[0],1))])
test_x_18_9 = np.hstack([test_x_18_9,np.ones((test_x_18_9.shape[0],1))])
test_x_1_9 = np.hstack([test_x_1_9,np.ones((test_x_1_9.shape[0],1))])

train_x_18_5 = np.hstack([train_x_18_5,np.ones((train_x_18_5.shape[0],1))])
train_x_1_5 = np.hstack([train_x_1_5,np.ones((train_x_1_5.shape[0],1))])
test_x_18_5 = np.hstack([test_x_18_5,np.ones((test_x_18_5.shape[0],1))])
test_x_1_5 = np.hstack([test_x_1_5,np.ones((test_x_1_5.shape[0],1))])

In [15]:
def output_result(filename,predict_value):
    id_ = []
    for i in range(predict_value.shape[0]):
        temp = 'id_'+str(i)
        id_.append(temp)
    output = pd.DataFrame(columns=['id','value'])
    output['id'] = id_
    output['value'] = predict_value
    output.to_csv(filename,index = False)
    
    print(output.head())

In [16]:
output_result('output_rf.csv',prediction_test)

     id   value
0  id_0   6.300
1  id_1  17.895
2  id_2  27.135
3  id_3   6.730
4  id_4  28.670


In [53]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

In [54]:
forest = RandomForestRegressor(random_state=10)
forest.fit(train_x_18_9,train_y_9)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [55]:
pre = forest.predict(train_x_18_9)
print(metrics.mean_squared_error(train_y_9,pre)**0.5)

2.849348093723737


In [56]:
param_test1= {'n_estimators':[i for i in range(10,71,10)]}

gsearch1= GridSearchCV(estimator = RandomForestRegressor(),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, gsearch1.best_score_


([mean: -49.34916, std: 8.18040, params: {'n_estimators': 10},
  mean: -47.51384, std: 8.72862, params: {'n_estimators': 20},
  mean: -45.63497, std: 8.51197, params: {'n_estimators': 30},
  mean: -46.29801, std: 8.38311, params: {'n_estimators': 40},
  mean: -45.91122, std: 8.01583, params: {'n_estimators': 50},
  mean: -45.08428, std: 9.02259, params: {'n_estimators': 60},
  mean: -45.21809, std: 8.76195, params: {'n_estimators': 70}],
 {'n_estimators': 60},
 -45.08427906257848)

In [57]:
(gsearch1.best_score_*-1)**0.5

6.714482784442781

In [63]:
param_test2= {'max_depth':[i for i in range(10,20,2)], 'min_samples_split':[i for i in range(2,52,10)]}
gsearch2= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 60),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test2, scoring='neg_mean_squared_error',cv=5)
gsearch2.fit(train_x_18_9,train_y_9)
gsearch2.grid_scores_,gsearch2.best_params_, (gsearch2.best_score_*-1)**0.5

([mean: -50.73787, std: 11.08709, params: {'max_depth': 3, 'min_samples_split': 50},
  mean: -50.51117, std: 10.70753, params: {'max_depth': 3, 'min_samples_split': 70},
  mean: -50.89457, std: 10.94607, params: {'max_depth': 3, 'min_samples_split': 90},
  mean: -50.77795, std: 11.05020, params: {'max_depth': 3, 'min_samples_split': 110},
  mean: -50.49816, std: 10.68308, params: {'max_depth': 3, 'min_samples_split': 130},
  mean: -50.59236, std: 10.91041, params: {'max_depth': 3, 'min_samples_split': 150},
  mean: -50.65559, std: 10.85900, params: {'max_depth': 3, 'min_samples_split': 170},
  mean: -51.60338, std: 11.84658, params: {'max_depth': 3, 'min_samples_split': 190},
  mean: -47.30114, std: 9.61762, params: {'max_depth': 5, 'min_samples_split': 50},
  mean: -47.96165, std: 10.14189, params: {'max_depth': 5, 'min_samples_split': 70},
  mean: -47.54596, std: 9.47000, params: {'max_depth': 5, 'min_samples_split': 90},
  mean: -47.70303, std: 9.97500, params: {'max_depth': 5, 'min

In [None]:
'sha1:9e7a50cd0206:6d63824225cf72432869e379bb68181d9cd7add3'