In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def load_train_data(filename):
    train = pd.read_csv(filename,encoding='big5')
    
    ### seperate data 
    data = [[] for i in range(18)]
    for r,d in enumerate(train.iloc[:,3:].values):
        data[r%18].append(d)
    data = np.array(data).reshape((18,-1))
    
    return data

def load_test_data(filename):
    test = pd.read_csv(filename,encoding='big5',header = None)
    test_data = [[] for i in range(18)]
    for r,d in enumerate(test.iloc[:,2:].values):
        test_data[r%18].append(d)
    test_data = np.array(test_data).reshape((18,-1))
    
    return test_data

In [2]:
train_data = load_train_data('./data/train.csv')
test_data = load_test_data('./data/test.csv')

print(train_data.shape)
print(test_data.shape)

(18, 5760)
(18, 2160)


In [3]:
def get_train_data(data,hours):
    train_x_18 = []
    train_x_1 = []
    train_y = []

    for i in range(0,data.shape[1]-hours):
        train_x_18.append(data[:,i:i+hours].reshape((-1)))
        train_x_1.append(data[9,i:i+hours].reshape((-1)))
        train_y.append(data[9][i+hours])
        
    train_x_18 = np.array(train_x_18)
    train_x_1 = np.array(train_x_1)
    train_y = np.array(train_y).astype(np.float64)

    # replace NR to 0
    train_x_18[train_x_18 == 'NR'] = 0
    train_x_18 = train_x_18.astype(np.float64)
    train_x_1 = train_x_1.astype(np.float64)
    
    return train_x_18, train_x_1, train_y

def get_test_data(data,hours):
    test_x_18 = []
    test_x_1 = []
    
    for i in range(0,data.shape[1],hours):
        test_x_18.append(data[:,i:i+hours].reshape(-1))
        test_x_1.append(data[9,i:i+hours].reshape(-1))
        
    test_x_18 = np.array(test_x_18)
    test_x_1 = np.array(test_x_1)
    
    test_x_18[test_x_18 == 'NR'] = 0
    test_x_18 = test_x_18.astype(np.float64)
    test_x_1 = test_x_1.astype(np.float64)
    return test_x_18, test_x_1

In [4]:
train_x_18_9, train_x_1_9, train_y_9 = get_train_data(train_data,9)
print(train_x_18_9.shape)
print(train_x_1_9.shape)
print(train_y_9.shape)

train_x_18_5, train_x_1_5, train_y_5 = get_train_data(train_data,5)
print(train_x_18_5.shape)
print(train_x_1_5.shape)
print(train_y_5.shape)

test_x_18_9, test_x_1_9 = get_test_data(test_data,9)
print(test_x_18_9.shape)
print(test_x_1_9.shape)

test_x_18_5, test_x_1_5 = get_test_data(test_data,5)
print(test_x_18_5.shape)
print(test_x_1_5.shape)

(5751, 162)
(5751, 9)
(5751,)
(5755, 90)
(5755, 5)
(5755,)
(240, 162)
(240, 9)
(432, 90)
(432, 5)


In [5]:
# add bias term
train_x_18_9 = np.hstack([train_x_18_9,np.ones((train_x_18_9.shape[0],1))])
train_x_1_9 = np.hstack([train_x_1_9,np.ones((train_x_1_9.shape[0],1))])
test_x_18_9 = np.hstack([test_x_18_9,np.ones((test_x_18_9.shape[0],1))])
test_x_1_9 = np.hstack([test_x_1_9,np.ones((test_x_1_9.shape[0],1))])

train_x_18_5 = np.hstack([train_x_18_5,np.ones((train_x_18_5.shape[0],1))])
train_x_1_5 = np.hstack([train_x_1_5,np.ones((train_x_1_5.shape[0],1))])
test_x_18_5 = np.hstack([test_x_18_5,np.ones((test_x_18_5.shape[0],1))])
test_x_1_5 = np.hstack([test_x_1_5,np.ones((test_x_1_5.shape[0],1))])

In [19]:
def output_result(filename,predict_value):
    id_ = []
    for i in range(predict_value.shape[0]):
        temp = 'id_'+str(i)
        id_.append(temp)
    output = pd.DataFrame(columns=['id','value'])
    output['id'] = id_
    output['value'] = predict_value
    output.to_csv(filename,index = False)
    
    print(output.head())

In [39]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

In [9]:
from sklearn.linear_model import LinearRegression

In [34]:
lm = LinearRegression(normalize=True)
lm.fit(train_x_18_9,train_y_9)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [35]:
pre = lm.predict(train_x_18_9)

In [36]:
def rmse(pre,test):
    tmp = 0
    for i,p in enumerate(pre):
        tmp += (p-test[i])**2
    return (tmp/pre.shape[0])**0.5

In [37]:
rmse(pre,train_y_9)

5.7829041840324136

In [38]:
prediction_test = lm.predict(test_x_18_9)
output_result('output_lm.csv',prediction_test)

     id      value
0  id_0   6.779339
1  id_1  18.248901
2  id_2  23.944874
3  id_3   7.731480
4  id_4  27.142087


In [14]:
forest = RandomForestRegressor(random_state=10)
forest.fit(train_x_18_9,train_y_9)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [15]:
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

2.84934809372


In [22]:
param_test1= {'n_estimators':[i for i in range(10,71,10)]}

gsearch1= GridSearchCV(estimator = RandomForestRegressor(),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=3,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, gsearch1.best_score_


([mean: -53.62212, std: 10.37863, params: {'n_estimators': 10},
  mean: -49.76097, std: 9.08392, params: {'n_estimators': 20},
  mean: -49.04939, std: 9.25554, params: {'n_estimators': 30},
  mean: -48.20502, std: 8.40299, params: {'n_estimators': 40},
  mean: -47.89552, std: 8.37526, params: {'n_estimators': 50},
  mean: -47.89158, std: 8.75664, params: {'n_estimators': 60},
  mean: -47.48670, std: 8.36389, params: {'n_estimators': 70}],
 {'n_estimators': 70},
 -47.486699243077524)

In [23]:
(gsearch1.best_score_*-1)**0.5

6.8910593701605505

In [11]:
param_test2= {'max_depth':[i for i in range(10,20,2)], 'min_samples_split':[i for i in range(2,52,10)]}
gsearch2= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 60),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test2, scoring='neg_mean_squared_error',cv=5,n_jobs=3)
gsearch2.fit(train_x_18_9,train_y_9)
gsearch2.grid_scores_,gsearch2.best_params_, (gsearch2.best_score_*-1)**0.5

([mean: -45.49097, std: 8.63872, params: {'max_depth': 10, 'min_samples_split': 2},
  mean: -45.20031, std: 8.53628, params: {'max_depth': 10, 'min_samples_split': 12},
  mean: -45.98404, std: 9.01138, params: {'max_depth': 10, 'min_samples_split': 22},
  mean: -46.28174, std: 9.07085, params: {'max_depth': 10, 'min_samples_split': 32},
  mean: -46.22732, std: 9.11412, params: {'max_depth': 10, 'min_samples_split': 42},
  mean: -45.53640, std: 8.76131, params: {'max_depth': 12, 'min_samples_split': 2},
  mean: -45.53601, std: 9.43556, params: {'max_depth': 12, 'min_samples_split': 12},
  mean: -46.14703, std: 9.52828, params: {'max_depth': 12, 'min_samples_split': 22},
  mean: -45.06799, std: 9.37471, params: {'max_depth': 12, 'min_samples_split': 32},
  mean: -46.54924, std: 8.91659, params: {'max_depth': 12, 'min_samples_split': 42},
  mean: -45.48071, std: 8.59541, params: {'max_depth': 14, 'min_samples_split': 2},
  mean: -45.20763, std: 8.61868, params: {'max_depth': 14, 'min_samp

In [12]:
param_test2= {'max_depth':[i for i in range(20,40,2)], 'min_samples_split':[i for i in range(2,72,15)]}
gsearch2= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 60),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test2, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch2.fit(train_x_18_9,train_y_9)
gsearch2.grid_scores_,gsearch2.best_params_, (gsearch2.best_score_*-1)**0.5

([mean: -45.19147, std: 8.48883, params: {'max_depth': 20, 'min_samples_split': 2},
  mean: -45.11397, std: 8.72600, params: {'max_depth': 20, 'min_samples_split': 17},
  mean: -45.29740, std: 8.88584, params: {'max_depth': 20, 'min_samples_split': 32},
  mean: -45.73170, std: 9.26199, params: {'max_depth': 20, 'min_samples_split': 47},
  mean: -46.00905, std: 9.10949, params: {'max_depth': 20, 'min_samples_split': 62},
  mean: -45.46666, std: 8.85808, params: {'max_depth': 22, 'min_samples_split': 2},
  mean: -45.60173, std: 8.95280, params: {'max_depth': 22, 'min_samples_split': 17},
  mean: -46.21174, std: 9.42386, params: {'max_depth': 22, 'min_samples_split': 32},
  mean: -46.03069, std: 9.02647, params: {'max_depth': 22, 'min_samples_split': 47},
  mean: -45.98206, std: 9.16324, params: {'max_depth': 22, 'min_samples_split': 62},
  mean: -46.39856, std: 9.41831, params: {'max_depth': 24, 'min_samples_split': 2},
  mean: -46.07278, std: 9.36477, params: {'max_depth': 24, 'min_samp

In [15]:
param_test3= {'min_samples_leaf':[i for i in range(2,30,5)], 'min_samples_split':[i for i in range(2,50,5)]}
gsearch3= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 60,max_depth=32),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test3, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch3.fit(train_x_18_9,train_y_9)
gsearch3.grid_scores_,gsearch3.best_params_, (gsearch3.best_score_*-1)**0.5

([mean: -45.09276, std: 9.70218, params: {'min_samples_leaf': 2, 'min_samples_split': 2},
  mean: -45.02056, std: 9.81790, params: {'min_samples_leaf': 2, 'min_samples_split': 7},
  mean: -45.33229, std: 9.80756, params: {'min_samples_leaf': 2, 'min_samples_split': 12},
  mean: -45.38893, std: 9.82930, params: {'min_samples_leaf': 2, 'min_samples_split': 17},
  mean: -45.59281, std: 9.98997, params: {'min_samples_leaf': 2, 'min_samples_split': 22},
  mean: -45.76089, std: 10.27894, params: {'min_samples_leaf': 2, 'min_samples_split': 27},
  mean: -45.83291, std: 10.37861, params: {'min_samples_leaf': 2, 'min_samples_split': 32},
  mean: -45.53646, std: 10.02583, params: {'min_samples_leaf': 2, 'min_samples_split': 37},
  mean: -45.50658, std: 9.95866, params: {'min_samples_leaf': 2, 'min_samples_split': 42},
  mean: -45.42628, std: 10.01900, params: {'min_samples_leaf': 2, 'min_samples_split': 47},
  mean: -44.57749, std: 9.90823, params: {'min_samples_leaf': 7, 'min_samples_split': 2}

In [18]:
param_test1= {'n_estimators':[i for i in range(10,200,10)]}

gsearch1= GridSearchCV(estimator = RandomForestRegressor(max_depth=32,min_samples_leaf=12,min_samples_split=22),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

([mean: -46.09416, std: 10.07758, params: {'n_estimators': 10},
  mean: -45.03027, std: 10.20747, params: {'n_estimators': 20},
  mean: -44.77676, std: 10.16715, params: {'n_estimators': 30},
  mean: -44.67868, std: 10.26459, params: {'n_estimators': 40},
  mean: -44.52704, std: 10.27127, params: {'n_estimators': 50},
  mean: -44.91253, std: 10.54580, params: {'n_estimators': 60},
  mean: -44.73737, std: 10.26844, params: {'n_estimators': 70},
  mean: -44.55666, std: 10.08427, params: {'n_estimators': 80},
  mean: -44.68972, std: 10.18069, params: {'n_estimators': 90},
  mean: -44.71550, std: 10.21134, params: {'n_estimators': 100},
  mean: -44.65201, std: 10.30226, params: {'n_estimators': 110},
  mean: -44.59289, std: 10.30685, params: {'n_estimators': 120},
  mean: -44.70786, std: 10.34852, params: {'n_estimators': 130},
  mean: -44.64227, std: 10.29483, params: {'n_estimators': 140},
  mean: -44.70801, std: 10.31477, params: {'n_estimators': 150},
  mean: -44.62370, std: 10.22136, 

In [16]:
forest = RandomForestRegressor(random_state=10,max_depth=32,min_samples_leaf=12,min_samples_split=22,n_estimators=170)
forest.fit(train_x_18_9,train_y_9)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=32,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=12, min_samples_split=22,
           min_weight_fraction_leaf=0.0, n_estimators=170, n_jobs=1,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [17]:
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

4.82391790408


In [20]:
prediction_test = forest.predict(test_x_18_9)

output_result('output_rf.csv',prediction_test)

In [48]:
gb = GradientBoostingRegressor(random_state=10)
gb.fit(train_x_18_9,train_y_9)
pre = gb.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.30016467069


In [49]:
prediction_test = gb.predict(test_x_18_9)

output_result('output_gb.csv',prediction_test)

     id      value
0  id_0   7.227736
1  id_1  17.916588
2  id_2  26.600544
3  id_3   7.040215
4  id_4  26.761588


In [44]:
gb.get_params

<bound method BaseEstimator.get_params of GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=10,
             subsample=1.0, verbose=0, warm_start=False)>

In [46]:
param_test1= {'n_estimators':[i for i in range(50,200,10)]}

gsearch1= GridSearchCV(estimator = GradientBoostingRegressor(random_state=10,learning_rate=0.1,max_depth=8,max_features='sqrt'),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

([mean: -65.96844, std: 16.52025, params: {'n_estimators': 50},
  mean: -65.45826, std: 16.46920, params: {'n_estimators': 60},
  mean: -64.96189, std: 16.25923, params: {'n_estimators': 70},
  mean: -64.54772, std: 16.15981, params: {'n_estimators': 80},
  mean: -64.23294, std: 16.09882, params: {'n_estimators': 90},
  mean: -64.17024, std: 16.14635, params: {'n_estimators': 100},
  mean: -64.01372, std: 16.15635, params: {'n_estimators': 110},
  mean: -63.88243, std: 16.11680, params: {'n_estimators': 120},
  mean: -63.81257, std: 16.08031, params: {'n_estimators': 130},
  mean: -63.78912, std: 16.10091, params: {'n_estimators': 140},
  mean: -63.71603, std: 16.06029, params: {'n_estimators': 150},
  mean: -63.68459, std: 16.07278, params: {'n_estimators': 160},
  mean: -63.70034, std: 16.08594, params: {'n_estimators': 170},
  mean: -63.64051, std: 16.07425, params: {'n_estimators': 180},
  mean: -63.62173, std: 16.07138, params: {'n_estimators': 190}],
 {'n_estimators': 190},
 7.97