In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def load_train_data(filename):
    train = pd.read_csv(filename,encoding='big5')
    
    ### seperate data 
    data = [[] for i in range(18)]
    for r,d in enumerate(train.iloc[:,3:].values):
        data[r%18].append(d)
    data = np.array(data).reshape((18,-1))
    
    return data

def load_test_data(filename):
    test = pd.read_csv(filename,encoding='big5',header = None)
    test_data = [[] for i in range(18)]
    for r,d in enumerate(test.iloc[:,2:].values):
        test_data[r%18].append(d)
    test_data = np.array(test_data).reshape((18,-1))
    
    return test_data

In [2]:
train_data = load_train_data('./data/train.csv')
test_data = load_test_data('./data/test.csv')

print(train_data.shape)
print(test_data.shape)

(18, 5760)
(18, 2160)


In [3]:
def get_train_data(data,hours):
    train_x_18 = []
    train_x_1 = []
    train_y = []

    for i in range(0,data.shape[1]-hours):
        train_x_18.append(data[:,i:i+hours].reshape((-1)))
        train_x_1.append(data[9,i:i+hours].reshape((-1)))
        train_y.append(data[9][i+hours])
        
    train_x_18 = np.array(train_x_18)
    train_x_1 = np.array(train_x_1)
    train_y = np.array(train_y).astype(np.float64)

    # replace NR to 0
    train_x_18[train_x_18 == 'NR'] = 0
    train_x_18 = train_x_18.astype(np.float64)
    train_x_1 = train_x_1.astype(np.float64)
    
    return train_x_18, train_x_1, train_y

def get_test_data(data,hours):
    test_x_18 = []
    test_x_1 = []
    
    for i in range(0,data.shape[1],hours):
        test_x_18.append(data[:,i:i+hours].reshape(-1))
        test_x_1.append(data[9,i:i+hours].reshape(-1))
        
    test_x_18 = np.array(test_x_18)
    test_x_1 = np.array(test_x_1)
    
    test_x_18[test_x_18 == 'NR'] = 0
    test_x_18 = test_x_18.astype(np.float64)
    test_x_1 = test_x_1.astype(np.float64)
    return test_x_18, test_x_1

In [6]:
train_x_18_9, train_x_1_9, train_y_9 = get_train_data(train_data,9)
print(train_x_18_9.shape)
print(train_x_1_9.shape)
print(train_y_9.shape)

train_x_18_5, train_x_1_5, train_y_5 = get_train_data(train_data,5)
print(train_x_18_5.shape)
print(train_x_1_5.shape)
print(train_y_5.shape)

test_x_18_9, test_x_1_9 = get_test_data(test_data,9)
print(test_x_18_9.shape)
print(test_x_1_9.shape)

test_x_18_5, test_x_1_5 = get_test_data(test_data,5)
print(test_x_18_5.shape)
print(test_x_1_5.shape)

(5751, 162)
(5751, 9)
(5751,)
(5755, 90)
(5755, 5)
(5755,)
(240, 162)
(240, 9)
(432, 90)
(432, 5)


In [7]:
def feature_scaling(train_x,test_x):
    data = np.vstack([train_x,test_x])
    feature_mean = np.mean(data,axis = 0) 
    feature_var = np.var(data,axis = 0)
    data = (data - feature_mean)/feature_var 
    
    return data[:-240,:],data[-240:,:]

In [8]:
train_x_18_9,test_x_18_9 = feature_scaling(train_x_18_9,test_x_18_9)
train_x_18_5,test_x_18_5 = feature_scaling(train_x_18_5,test_x_18_5)
train_x_1_5,test_x_1_5 = feature_scaling(train_x_1_5,test_x_1_5)
train_x_1_5,test_x_1_5 = feature_scaling(train_x_1_5,test_x_1_5)


In [9]:
# add bias term
train_x_18_9 = np.hstack([train_x_18_9,np.ones((train_x_18_9.shape[0],1))])
train_x_1_9 = np.hstack([train_x_1_9,np.ones((train_x_1_9.shape[0],1))])
test_x_18_9 = np.hstack([test_x_18_9,np.ones((test_x_18_9.shape[0],1))])
test_x_1_9 = np.hstack([test_x_1_9,np.ones((test_x_1_9.shape[0],1))])

train_x_18_5 = np.hstack([train_x_18_5,np.ones((train_x_18_5.shape[0],1))])
train_x_1_5 = np.hstack([train_x_1_5,np.ones((train_x_1_5.shape[0],1))])
test_x_18_5 = np.hstack([test_x_18_5,np.ones((test_x_18_5.shape[0],1))])
test_x_1_5 = np.hstack([test_x_1_5,np.ones((test_x_1_5.shape[0],1))])

In [10]:
def output_result(filename,predict_value):
    id_ = []
    for i in range(predict_value.shape[0]):
        temp = 'id_'+str(i)
        id_.append(temp)
    output = pd.DataFrame(columns=['id','value'])
    output['id'] = id_
    output['value'] = predict_value
    output.to_csv(filename,index = False)
    
    print(output.head())

In [11]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn import metrics
from sklearn.grid_search import GridSearchCV



In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
lm = LinearRegression(normalize=True)
lm.fit(train_x_18_9,train_y_9)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [14]:
pre = lm.predict(train_x_18_9)

In [15]:
def rmse(pre,test):
    tmp = 0
    for i,p in enumerate(pre):
        tmp += (p-test[i])**2
    return (tmp/pre.shape[0])**0.5

In [16]:
rmse(pre,train_y_9)

5.782904184032418

In [38]:
prediction_test = lm.predict(test_x_18_9)
output_result('output_lm.csv',prediction_test)

     id      value
0  id_0   6.779339
1  id_1  18.248901
2  id_2  23.944874
3  id_3   7.731480
4  id_4  27.142087


In [17]:
forest = RandomForestRegressor(random_state=10)
forest.fit(train_x_18_9,train_y_9)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [18]:
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

2.84767886334


In [46]:
param_test1= {'n_estimators':[i for i in range(5,95,10)]}

gsearch1= GridSearchCV(estimator = RandomForestRegressor(
                                 min_samples_leaf=1,max_depth=5 ,random_state=10),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5


([mean: -47.84665, std: 8.90858, params: {'n_estimators': 5},
  mean: -47.68945, std: 8.89384, params: {'n_estimators': 15},
  mean: -47.40528, std: 9.28233, params: {'n_estimators': 25},
  mean: -47.72785, std: 9.23768, params: {'n_estimators': 35},
  mean: -47.57056, std: 9.59389, params: {'n_estimators': 45},
  mean: -47.70442, std: 9.69273, params: {'n_estimators': 55},
  mean: -47.68381, std: 9.85721, params: {'n_estimators': 65},
  mean: -47.62470, std: 9.65050, params: {'n_estimators': 75},
  mean: -47.53019, std: 9.65905, params: {'n_estimators': 85}],
 {'n_estimators': 25},
 6.885149226902899)

In [47]:
forest = RandomForestRegressor( min_samples_leaf=1,max_depth=5 ,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

6.10640988278


In [48]:
param_test2= {'max_depth':[i for i in range(3,15,2)], 'min_samples_split':[i for i in range(2,30,5)]}
gsearch2= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test2, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch2.fit(train_x_18_9,train_y_9)
gsearch2.grid_scores_,gsearch2.best_params_, (gsearch2.best_score_*-1)**0.5

([mean: -50.92352, std: 10.92926, params: {'max_depth': 3, 'min_samples_split': 2},
  mean: -50.92352, std: 10.92926, params: {'max_depth': 3, 'min_samples_split': 7},
  mean: -50.72098, std: 10.94123, params: {'max_depth': 3, 'min_samples_split': 12},
  mean: -50.88207, std: 11.07967, params: {'max_depth': 3, 'min_samples_split': 17},
  mean: -50.94356, std: 11.07464, params: {'max_depth': 3, 'min_samples_split': 22},
  mean: -50.96903, std: 10.96290, params: {'max_depth': 3, 'min_samples_split': 27},
  mean: -47.51064, std: 9.54831, params: {'max_depth': 5, 'min_samples_split': 2},
  mean: -47.02096, std: 9.80606, params: {'max_depth': 5, 'min_samples_split': 7},
  mean: -47.19664, std: 10.07551, params: {'max_depth': 5, 'min_samples_split': 12},
  mean: -47.42307, std: 9.52071, params: {'max_depth': 5, 'min_samples_split': 17},
  mean: -48.14559, std: 9.99506, params: {'max_depth': 5, 'min_samples_split': 22},
  mean: -48.30933, std: 10.03986, params: {'max_depth': 5, 'min_samples_s

### max_depth 7 or 13 

In [50]:
forest = RandomForestRegressor( min_samples_leaf=12,max_depth=13,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

4.88669268959


In [51]:
param_test3= {'min_samples_leaf':[i for i in range(2,30,5)], 'min_samples_split':[i for i in range(2,50,5)]}
gsearch3= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25,max_depth=13,random_state=10),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test3, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch3.fit(train_x_18_9,train_y_9)
gsearch3.grid_scores_,gsearch3.best_params_, (gsearch3.best_score_*-1)**0.5

([mean: -45.92241, std: 9.73920, params: {'min_samples_leaf': 2, 'min_samples_split': 2},
  mean: -46.02063, std: 9.78700, params: {'min_samples_leaf': 2, 'min_samples_split': 7},
  mean: -45.69934, std: 9.65107, params: {'min_samples_leaf': 2, 'min_samples_split': 12},
  mean: -45.66958, std: 9.61196, params: {'min_samples_leaf': 2, 'min_samples_split': 17},
  mean: -45.96469, std: 9.72197, params: {'min_samples_leaf': 2, 'min_samples_split': 22},
  mean: -46.04234, std: 9.87372, params: {'min_samples_leaf': 2, 'min_samples_split': 27},
  mean: -45.93636, std: 9.88345, params: {'min_samples_leaf': 2, 'min_samples_split': 32},
  mean: -45.96571, std: 10.04592, params: {'min_samples_leaf': 2, 'min_samples_split': 37},
  mean: -45.99743, std: 9.95187, params: {'min_samples_leaf': 2, 'min_samples_split': 42},
  mean: -46.14430, std: 10.00619, params: {'min_samples_leaf': 2, 'min_samples_split': 47},
  mean: -45.37120, std: 10.24515, params: {'min_samples_leaf': 7, 'min_samples_split': 2},

In [52]:
forest = RandomForestRegressor( min_samples_leaf=17,min_samples_split=37,max_depth=13,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.31517601458


In [53]:
param_test3= {'min_samples_leaf':[i for i in range(2,30,5)], 'min_samples_split':[i for i in range(2,50,5)]}
gsearch3= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25,max_depth=7,random_state=10),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test3, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch3.fit(train_x_18_9,train_y_9)
gsearch3.grid_scores_,gsearch3.best_params_, (gsearch3.best_score_*-1)**0.5

([mean: -46.15374, std: 9.76046, params: {'min_samples_leaf': 2, 'min_samples_split': 2},
  mean: -46.16870, std: 9.74147, params: {'min_samples_leaf': 2, 'min_samples_split': 7},
  mean: -46.11500, std: 9.66898, params: {'min_samples_leaf': 2, 'min_samples_split': 12},
  mean: -46.12658, std: 9.74452, params: {'min_samples_leaf': 2, 'min_samples_split': 17},
  mean: -46.40084, std: 9.92268, params: {'min_samples_leaf': 2, 'min_samples_split': 22},
  mean: -46.56253, std: 10.00374, params: {'min_samples_leaf': 2, 'min_samples_split': 27},
  mean: -46.55712, std: 10.03098, params: {'min_samples_leaf': 2, 'min_samples_split': 32},
  mean: -46.50077, std: 10.12101, params: {'min_samples_leaf': 2, 'min_samples_split': 37},
  mean: -46.49366, std: 10.05149, params: {'min_samples_leaf': 2, 'min_samples_split': 42},
  mean: -46.59662, std: 10.16492, params: {'min_samples_leaf': 2, 'min_samples_split': 47},
  mean: -45.56950, std: 10.21077, params: {'min_samples_leaf': 7, 'min_samples_split': 

In [54]:
forest = RandomForestRegressor( min_samples_leaf=7,min_samples_split=2,max_depth=7,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.48248774161


In [60]:
param_test4 = {'max_features':[i for i in range(100,165,5)]}
gsearch4 = GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25, max_depth=13, min_samples_split=37,
                                  min_samples_leaf=17 ,random_state=10),
    param_grid =param_test4, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch4.fit(train_x_18_9,train_y_9)
gsearch4.grid_scores_,gsearch4.best_params_, (gsearch4.best_score_*-1)**0.5

([mean: -45.12181, std: 10.21084, params: {'max_features': 100},
  mean: -45.14500, std: 10.52334, params: {'max_features': 105},
  mean: -45.09930, std: 10.02480, params: {'max_features': 110},
  mean: -45.00691, std: 10.31957, params: {'max_features': 115},
  mean: -45.03367, std: 10.10604, params: {'max_features': 120},
  mean: -44.78233, std: 10.27031, params: {'max_features': 125},
  mean: -44.57418, std: 10.23770, params: {'max_features': 130},
  mean: -44.96508, std: 10.20858, params: {'max_features': 135},
  mean: -44.84685, std: 10.23778, params: {'max_features': 140},
  mean: -45.31280, std: 10.32514, params: {'max_features': 145},
  mean: -44.76771, std: 10.32003, params: {'max_features': 150},
  mean: -45.16717, std: 10.46535, params: {'max_features': 155},
  mean: -45.17728, std: 10.49743, params: {'max_features': 160}],
 {'max_features': 130},
 6.676389870867963)

In [61]:
forest = RandomForestRegressor( min_samples_leaf=17,min_samples_split=37,max_depth=13,random_state=10,
                               max_features=130,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.32946027076


In [64]:
param_test4 = {'max_features':[i for i in range(100,165,5)]}
gsearch4 = GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25, max_depth=7, min_samples_split=2,
                                  min_samples_leaf=7 ,random_state=10),
    param_grid =param_test4, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch4.fit(train_x_18_9,train_y_9)
gsearch4.grid_scores_,gsearch4.best_params_, (gsearch4.best_score_*-1)**0.5

([mean: -45.05912, std: 9.77442, params: {'max_features': 100},
  mean: -45.25076, std: 9.97022, params: {'max_features': 105},
  mean: -45.45736, std: 10.11188, params: {'max_features': 110},
  mean: -45.35418, std: 10.08760, params: {'max_features': 115},
  mean: -45.25662, std: 9.99203, params: {'max_features': 120},
  mean: -44.95587, std: 9.94133, params: {'max_features': 125},
  mean: -45.12798, std: 10.21202, params: {'max_features': 130},
  mean: -44.48262, std: 9.74630, params: {'max_features': 135},
  mean: -44.56885, std: 9.86416, params: {'max_features': 140},
  mean: -45.70374, std: 10.40121, params: {'max_features': 145},
  mean: -45.54342, std: 10.12771, params: {'max_features': 150},
  mean: -45.16464, std: 10.09801, params: {'max_features': 155},
  mean: -45.62552, std: 10.22684, params: {'max_features': 160}],
 {'max_features': 135},
 6.669529072806274)

In [65]:
forest = RandomForestRegressor( min_samples_leaf=7,min_samples_split=2,max_depth=7,random_state=10,
                               max_features=135,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.49097145517


In [66]:
prediction_test = forest.predict(test_x_18_9)
output_result('output_rf.csv',prediction_test)

     id      value
0  id_0   6.455004
1  id_1  17.939696
2  id_2  26.162440
3  id_3   6.194253
4  id_4  30.100440


In [48]:
gb = GradientBoostingRegressor(random_state=10)
gb.fit(train_x_18_9,train_y_9)
pre = gb.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.30016467069


In [49]:
prediction_test = gb.predict(test_x_18_9)

output_result('output_gb.csv',prediction_test)

     id      value
0  id_0   7.227736
1  id_1  17.916588
2  id_2  26.600544
3  id_3   7.040215
4  id_4  26.761588


In [44]:
gb.get_params

<bound method BaseEstimator.get_params of GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=10,
             subsample=1.0, verbose=0, warm_start=False)>

In [46]:
param_test1= {'n_estimators':[i for i in range(50,200,10)]}

gsearch1= GridSearchCV(estimator = GradientBoostingRegressor(random_state=10,learning_rate=0.1,max_depth=8,max_features='sqrt'),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

([mean: -65.96844, std: 16.52025, params: {'n_estimators': 50},
  mean: -65.45826, std: 16.46920, params: {'n_estimators': 60},
  mean: -64.96189, std: 16.25923, params: {'n_estimators': 70},
  mean: -64.54772, std: 16.15981, params: {'n_estimators': 80},
  mean: -64.23294, std: 16.09882, params: {'n_estimators': 90},
  mean: -64.17024, std: 16.14635, params: {'n_estimators': 100},
  mean: -64.01372, std: 16.15635, params: {'n_estimators': 110},
  mean: -63.88243, std: 16.11680, params: {'n_estimators': 120},
  mean: -63.81257, std: 16.08031, params: {'n_estimators': 130},
  mean: -63.78912, std: 16.10091, params: {'n_estimators': 140},
  mean: -63.71603, std: 16.06029, params: {'n_estimators': 150},
  mean: -63.68459, std: 16.07278, params: {'n_estimators': 160},
  mean: -63.70034, std: 16.08594, params: {'n_estimators': 170},
  mean: -63.64051, std: 16.07425, params: {'n_estimators': 180},
  mean: -63.62173, std: 16.07138, params: {'n_estimators': 190}],
 {'n_estimators': 190},
 7.97