In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def load_train_data(filename):
    train = pd.read_csv(filename,encoding='big5')
    
    ### seperate data 
    data = [[] for i in range(18)]
    for r,d in enumerate(train.iloc[:,3:].values):
        data[r%18].append(d)
    data = np.array(data).reshape((18,-1))
    
    return data

def load_test_data(filename):
    test = pd.read_csv(filename,encoding='big5',header = None)
    test_data = [[] for i in range(18)]
    for r,d in enumerate(test.iloc[:,2:].values):
        test_data[r%18].append(d)
    test_data = np.array(test_data).reshape((18,-1))
    
    return test_data

In [99]:
train_data = load_train_data('./data/train.csv')
test_data = load_test_data('./data/test.csv')

print(train_data.shape)
print(test_data.shape)

(18, 5760)
(18, 2160)


In [100]:
def get_train_data(data,hours):
    train_x_18 = []
    train_x_1 = []
    train_y = []

    for i in range(0,data.shape[1]-hours):
        train_x_18.append(data[:,i:i+hours].reshape((-1)))
        train_x_1.append(data[9,i:i+hours].reshape((-1)))
        train_y.append(data[9][i+hours])
        
    train_x_18 = np.array(train_x_18)
    train_x_1 = np.array(train_x_1)
    train_y = np.array(train_y).astype(np.float64)

    # replace NR to 0
    train_x_18[train_x_18 == 'NR'] = 0
    train_x_18 = train_x_18.astype(np.float64)
    train_x_1 = train_x_1.astype(np.float64)
    
    return train_x_18, train_x_1, train_y

def get_test_data(data,hours):
    test_x_18 = []
    test_x_1 = []
    
    for i in range(0,data.shape[1],hours):
        test_x_18.append(data[:,i:i+hours].reshape(-1))
        test_x_1.append(data[9,i:i+hours].reshape(-1))
        
    test_x_18 = np.array(test_x_18)
    test_x_1 = np.array(test_x_1)
    
    test_x_18[test_x_18 == 'NR'] = 0
    test_x_18 = test_x_18.astype(np.float64)
    test_x_1 = test_x_1.astype(np.float64)
    return test_x_18, test_x_1

In [101]:
train_x_18_9, train_x_1_9, train_y_9 = get_train_data(train_data,9)
print(train_x_18_9.shape)
print(train_x_1_9.shape)
print(train_y_9.shape)

train_x_18_5, train_x_1_5, train_y_5 = get_train_data(train_data,5)
print(train_x_18_5.shape)
print(train_x_1_5.shape)
print(train_y_5.shape)

test_x_18_9, test_x_1_9 = get_test_data(test_data,9)
print(test_x_18_9.shape)
print(test_x_1_9.shape)

test_x_18_5, test_x_1_5 = get_test_data(test_data,5)
print(test_x_18_5.shape)
print(test_x_1_5.shape)

(5751, 162)
(5751, 9)
(5751,)
(5755, 90)
(5755, 5)
(5755,)
(240, 162)
(240, 9)
(432, 90)
(432, 5)


In [102]:
def feature_scaling(train_x,test_x):
    data = np.vstack([train_x,test_x])
    feature_mean = np.mean(data,axis = 0) 
    feature_var = np.var(data,axis = 0)
    data = (data - feature_mean)/feature_var 
    
    return data[:-240,:],data[-240:,:]

In [8]:
train_x_18_9,test_x_18_9 = feature_scaling(train_x_18_9,test_x_18_9)
train_x_18_5,test_x_18_5 = feature_scaling(train_x_18_5,test_x_18_5)
train_x_1_5,test_x_1_5 = feature_scaling(train_x_1_5,test_x_1_5)
train_x_1_5,test_x_1_5 = feature_scaling(train_x_1_5,test_x_1_5)


In [103]:
# add bias term
train_x_18_9 = np.hstack([train_x_18_9,np.ones((train_x_18_9.shape[0],1))])
train_x_1_9 = np.hstack([train_x_1_9,np.ones((train_x_1_9.shape[0],1))])
test_x_18_9 = np.hstack([test_x_18_9,np.ones((test_x_18_9.shape[0],1))])
test_x_1_9 = np.hstack([test_x_1_9,np.ones((test_x_1_9.shape[0],1))])

train_x_18_5 = np.hstack([train_x_18_5,np.ones((train_x_18_5.shape[0],1))])
train_x_1_5 = np.hstack([train_x_1_5,np.ones((train_x_1_5.shape[0],1))])
test_x_18_5 = np.hstack([test_x_18_5,np.ones((test_x_18_5.shape[0],1))])
test_x_1_5 = np.hstack([test_x_1_5,np.ones((test_x_1_5.shape[0],1))])

In [10]:
def output_result(filename,predict_value):
    id_ = []
    for i in range(predict_value.shape[0]):
        temp = 'id_'+str(i)
        id_.append(temp)
    output = pd.DataFrame(columns=['id','value'])
    output['id'] = id_
    output['value'] = predict_value
    output.to_csv(filename,index = False)
    
    print(output.head())

In [11]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn import metrics
from sklearn.grid_search import GridSearchCV



In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
lm = LinearRegression(normalize=True)
lm.fit(train_x_18_9,train_y_9)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [14]:
pre = lm.predict(train_x_18_9)

In [15]:
def rmse(pre,test):
    tmp = 0
    for i,p in enumerate(pre):
        tmp += (p-test[i])**2
    return (tmp/pre.shape[0])**0.5

In [16]:
rmse(pre,train_y_9)

5.782904184032418

In [38]:
prediction_test = lm.predict(test_x_18_9)
output_result('output_lm.csv',prediction_test)

     id      value
0  id_0   6.779339
1  id_1  18.248901
2  id_2  23.944874
3  id_3   7.731480
4  id_4  27.142087


In [17]:
forest = RandomForestRegressor(random_state=10)
forest.fit(train_x_18_9,train_y_9)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [18]:
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

2.84767886334


In [46]:
param_test1= {'n_estimators':[i for i in range(5,95,10)]}

gsearch1= GridSearchCV(estimator = RandomForestRegressor(
                                 min_samples_leaf=1,max_depth=5 ,random_state=10),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5


([mean: -47.84665, std: 8.90858, params: {'n_estimators': 5},
  mean: -47.68945, std: 8.89384, params: {'n_estimators': 15},
  mean: -47.40528, std: 9.28233, params: {'n_estimators': 25},
  mean: -47.72785, std: 9.23768, params: {'n_estimators': 35},
  mean: -47.57056, std: 9.59389, params: {'n_estimators': 45},
  mean: -47.70442, std: 9.69273, params: {'n_estimators': 55},
  mean: -47.68381, std: 9.85721, params: {'n_estimators': 65},
  mean: -47.62470, std: 9.65050, params: {'n_estimators': 75},
  mean: -47.53019, std: 9.65905, params: {'n_estimators': 85}],
 {'n_estimators': 25},
 6.885149226902899)

In [47]:
forest = RandomForestRegressor( min_samples_leaf=1,max_depth=5 ,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

6.10640988278


In [48]:
param_test2= {'max_depth':[i for i in range(3,15,2)], 'min_samples_split':[i for i in range(2,30,5)]}
gsearch2= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test2, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch2.fit(train_x_18_9,train_y_9)
gsearch2.grid_scores_,gsearch2.best_params_, (gsearch2.best_score_*-1)**0.5

([mean: -50.92352, std: 10.92926, params: {'max_depth': 3, 'min_samples_split': 2},
  mean: -50.92352, std: 10.92926, params: {'max_depth': 3, 'min_samples_split': 7},
  mean: -50.72098, std: 10.94123, params: {'max_depth': 3, 'min_samples_split': 12},
  mean: -50.88207, std: 11.07967, params: {'max_depth': 3, 'min_samples_split': 17},
  mean: -50.94356, std: 11.07464, params: {'max_depth': 3, 'min_samples_split': 22},
  mean: -50.96903, std: 10.96290, params: {'max_depth': 3, 'min_samples_split': 27},
  mean: -47.51064, std: 9.54831, params: {'max_depth': 5, 'min_samples_split': 2},
  mean: -47.02096, std: 9.80606, params: {'max_depth': 5, 'min_samples_split': 7},
  mean: -47.19664, std: 10.07551, params: {'max_depth': 5, 'min_samples_split': 12},
  mean: -47.42307, std: 9.52071, params: {'max_depth': 5, 'min_samples_split': 17},
  mean: -48.14559, std: 9.99506, params: {'max_depth': 5, 'min_samples_split': 22},
  mean: -48.30933, std: 10.03986, params: {'max_depth': 5, 'min_samples_s

### max_depth 7 or 13 

In [50]:
forest = RandomForestRegressor( min_samples_leaf=12,max_depth=13,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

4.88669268959


In [51]:
param_test3= {'min_samples_leaf':[i for i in range(2,30,5)], 'min_samples_split':[i for i in range(2,50,5)]}
gsearch3= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25,max_depth=13,random_state=10),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test3, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch3.fit(train_x_18_9,train_y_9)
gsearch3.grid_scores_,gsearch3.best_params_, (gsearch3.best_score_*-1)**0.5

([mean: -45.92241, std: 9.73920, params: {'min_samples_leaf': 2, 'min_samples_split': 2},
  mean: -46.02063, std: 9.78700, params: {'min_samples_leaf': 2, 'min_samples_split': 7},
  mean: -45.69934, std: 9.65107, params: {'min_samples_leaf': 2, 'min_samples_split': 12},
  mean: -45.66958, std: 9.61196, params: {'min_samples_leaf': 2, 'min_samples_split': 17},
  mean: -45.96469, std: 9.72197, params: {'min_samples_leaf': 2, 'min_samples_split': 22},
  mean: -46.04234, std: 9.87372, params: {'min_samples_leaf': 2, 'min_samples_split': 27},
  mean: -45.93636, std: 9.88345, params: {'min_samples_leaf': 2, 'min_samples_split': 32},
  mean: -45.96571, std: 10.04592, params: {'min_samples_leaf': 2, 'min_samples_split': 37},
  mean: -45.99743, std: 9.95187, params: {'min_samples_leaf': 2, 'min_samples_split': 42},
  mean: -46.14430, std: 10.00619, params: {'min_samples_leaf': 2, 'min_samples_split': 47},
  mean: -45.37120, std: 10.24515, params: {'min_samples_leaf': 7, 'min_samples_split': 2},

In [52]:
forest = RandomForestRegressor( min_samples_leaf=17,min_samples_split=37,max_depth=13,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.31517601458


In [53]:
param_test3= {'min_samples_leaf':[i for i in range(2,30,5)], 'min_samples_split':[i for i in range(2,50,5)]}
gsearch3= GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25,max_depth=7,random_state=10),
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid =param_test3, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch3.fit(train_x_18_9,train_y_9)
gsearch3.grid_scores_,gsearch3.best_params_, (gsearch3.best_score_*-1)**0.5

([mean: -46.15374, std: 9.76046, params: {'min_samples_leaf': 2, 'min_samples_split': 2},
  mean: -46.16870, std: 9.74147, params: {'min_samples_leaf': 2, 'min_samples_split': 7},
  mean: -46.11500, std: 9.66898, params: {'min_samples_leaf': 2, 'min_samples_split': 12},
  mean: -46.12658, std: 9.74452, params: {'min_samples_leaf': 2, 'min_samples_split': 17},
  mean: -46.40084, std: 9.92268, params: {'min_samples_leaf': 2, 'min_samples_split': 22},
  mean: -46.56253, std: 10.00374, params: {'min_samples_leaf': 2, 'min_samples_split': 27},
  mean: -46.55712, std: 10.03098, params: {'min_samples_leaf': 2, 'min_samples_split': 32},
  mean: -46.50077, std: 10.12101, params: {'min_samples_leaf': 2, 'min_samples_split': 37},
  mean: -46.49366, std: 10.05149, params: {'min_samples_leaf': 2, 'min_samples_split': 42},
  mean: -46.59662, std: 10.16492, params: {'min_samples_leaf': 2, 'min_samples_split': 47},
  mean: -45.56950, std: 10.21077, params: {'min_samples_leaf': 7, 'min_samples_split': 

In [54]:
forest = RandomForestRegressor( min_samples_leaf=7,min_samples_split=2,max_depth=7,random_state=10,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.48248774161


In [60]:
param_test4 = {'max_features':[i for i in range(100,165,5)]}
gsearch4 = GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25, max_depth=13, min_samples_split=37,
                                  min_samples_leaf=17 ,random_state=10),
    param_grid =param_test4, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch4.fit(train_x_18_9,train_y_9)
gsearch4.grid_scores_,gsearch4.best_params_, (gsearch4.best_score_*-1)**0.5

([mean: -45.12181, std: 10.21084, params: {'max_features': 100},
  mean: -45.14500, std: 10.52334, params: {'max_features': 105},
  mean: -45.09930, std: 10.02480, params: {'max_features': 110},
  mean: -45.00691, std: 10.31957, params: {'max_features': 115},
  mean: -45.03367, std: 10.10604, params: {'max_features': 120},
  mean: -44.78233, std: 10.27031, params: {'max_features': 125},
  mean: -44.57418, std: 10.23770, params: {'max_features': 130},
  mean: -44.96508, std: 10.20858, params: {'max_features': 135},
  mean: -44.84685, std: 10.23778, params: {'max_features': 140},
  mean: -45.31280, std: 10.32514, params: {'max_features': 145},
  mean: -44.76771, std: 10.32003, params: {'max_features': 150},
  mean: -45.16717, std: 10.46535, params: {'max_features': 155},
  mean: -45.17728, std: 10.49743, params: {'max_features': 160}],
 {'max_features': 130},
 6.676389870867963)

In [68]:
forest = RandomForestRegressor( min_samples_leaf=17,min_samples_split=37,max_depth=13,random_state=10,
                               max_features=130,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.32946027076


In [64]:
param_test4 = {'max_features':[i for i in range(100,165,5)]}
gsearch4 = GridSearchCV(estimator = RandomForestRegressor(n_estimators= 25, max_depth=7, min_samples_split=2,
                                  min_samples_leaf=7 ,random_state=10),
    param_grid =param_test4, scoring='neg_mean_squared_error',cv=5,n_jobs=12)
gsearch4.fit(train_x_18_9,train_y_9)
gsearch4.grid_scores_,gsearch4.best_params_, (gsearch4.best_score_*-1)**0.5

([mean: -45.05912, std: 9.77442, params: {'max_features': 100},
  mean: -45.25076, std: 9.97022, params: {'max_features': 105},
  mean: -45.45736, std: 10.11188, params: {'max_features': 110},
  mean: -45.35418, std: 10.08760, params: {'max_features': 115},
  mean: -45.25662, std: 9.99203, params: {'max_features': 120},
  mean: -44.95587, std: 9.94133, params: {'max_features': 125},
  mean: -45.12798, std: 10.21202, params: {'max_features': 130},
  mean: -44.48262, std: 9.74630, params: {'max_features': 135},
  mean: -44.56885, std: 9.86416, params: {'max_features': 140},
  mean: -45.70374, std: 10.40121, params: {'max_features': 145},
  mean: -45.54342, std: 10.12771, params: {'max_features': 150},
  mean: -45.16464, std: 10.09801, params: {'max_features': 155},
  mean: -45.62552, std: 10.22684, params: {'max_features': 160}],
 {'max_features': 135},
 6.669529072806274)

In [65]:
forest = RandomForestRegressor( min_samples_leaf=7,min_samples_split=2,max_depth=7,random_state=10,
                               max_features=135,n_estimators=25)
forest.fit(train_x_18_9,train_y_9)
pre = forest.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.49097145517


In [69]:
prediction_test = forest.predict(test_x_18_9)
output_result('output_rf2.csv',prediction_test)

     id      value
0  id_0   8.026954
1  id_1  16.527186
2  id_2  26.419140
3  id_3   7.292122
4  id_4  27.072096


## GB

In [70]:
param_test1= {'n_estimators':[i for i in range(10,200,10)],'learning_rate':[i for i in range()]}

gsearch1= GridSearchCV(estimator = GradientBoostingRegressor(random_state=10,learning_rate=0.1,max_depth=8),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

([mean: -80.27983, std: 26.76912, params: {'n_estimators': 10},
  mean: -52.91587, std: 14.27279, params: {'n_estimators': 20},
  mean: -48.71730, std: 11.62847, params: {'n_estimators': 30},
  mean: -47.65739, std: 10.94461, params: {'n_estimators': 40},
  mean: -47.31204, std: 10.75784, params: {'n_estimators': 50},
  mean: -47.21175, std: 10.68398, params: {'n_estimators': 60},
  mean: -47.19690, std: 10.65264, params: {'n_estimators': 70},
  mean: -47.17996, std: 10.63890, params: {'n_estimators': 80},
  mean: -47.14821, std: 10.63391, params: {'n_estimators': 90},
  mean: -47.14401, std: 10.63111, params: {'n_estimators': 100},
  mean: -47.13702, std: 10.64897, params: {'n_estimators': 110},
  mean: -47.12599, std: 10.61793, params: {'n_estimators': 120},
  mean: -47.15594, std: 10.60598, params: {'n_estimators': 130},
  mean: -47.14056, std: 10.57585, params: {'n_estimators': 140},
  mean: -47.13521, std: 10.58659, params: {'n_estimators': 150},
  mean: -47.12788, std: 10.57091, 

In [None]:
gb = GradientBoostingRegressor(random_state=10,learning_rate=0.1,max_depth=8)
gb.fit(train_x_18_9,train_y_9)
pre = gb.predict(train_x_18_9)
print(rmse(pre,train_y_9))

### SVM 

### Ridge Regression 

In [88]:
from sklearn.linear_model import Ridge

In [119]:
param_test1 = {'alpha':[i*0.001 for i in range(0,10)]}
gsearch1 = GridSearchCV(Ridge(normalize=True),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

([mean: -36.90976, std: 7.12411, params: {'alpha': 0.0},
  mean: -36.72853, std: 7.10674, params: {'alpha': 0.001},
  mean: -36.68149, std: 7.14301, params: {'alpha': 0.002},
  mean: -36.67953, std: 7.18891, params: {'alpha': 0.003},
  mean: -36.70237, std: 7.23663, params: {'alpha': 0.004},
  mean: -36.74170, std: 7.28396, params: {'alpha': 0.005},
  mean: -36.79299, std: 7.33019, params: {'alpha': 0.006},
  mean: -36.85337, std: 7.37514, params: {'alpha': 0.007},
  mean: -36.92084, std: 7.41878, params: {'alpha': 0.008},
  mean: -36.99394, std: 7.46120, params: {'alpha': 0.009000000000000001}],
 {'alpha': 0.003},
 6.056362712390842)

In [139]:
reg = Ridge(alpha=0.003,normalize=True)
reg.fit(train_x_18_9,train_y_9)
pre = reg.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.79252460812


In [140]:
prediction_test = reg.predict(test_x_18_9)
output_result('output_ridge.csv',prediction_test)

     id      value
0  id_0   6.525927
1  id_1  18.040214
2  id_2  24.224045
3  id_3   7.946928
4  id_4  27.281214


In [109]:
from sklearn.linear_model import Lasso

In [116]:
param_test1 = {'alpha':[i*0.0001 for i in range(1,10)]}
gsearch1 = GridSearchCV(Lasso(normalize=True),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

([mean: -36.31986, std: 7.19528, params: {'alpha': 0.0001},
  mean: -36.13836, std: 7.32235, params: {'alpha': 0.0002},
  mean: -36.07448, std: 7.40493, params: {'alpha': 0.00030000000000000003},
  mean: -36.04563, std: 7.44903, params: {'alpha': 0.0004},
  mean: -36.04503, std: 7.48520, params: {'alpha': 0.0005},
  mean: -36.06282, std: 7.50578, params: {'alpha': 0.0006000000000000001},
  mean: -36.10414, std: 7.52600, params: {'alpha': 0.0007},
  mean: -36.15919, std: 7.54923, params: {'alpha': 0.0008},
  mean: -36.22604, std: 7.57092, params: {'alpha': 0.0009000000000000001}],
 {'alpha': 0.0005},
 6.003751333192469)

In [141]:
reg = Lasso(alpha=0.0005,normalize=True)
reg.fit(train_x_18_9,train_y_9)
pre = reg.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.86286255484


In [142]:
prediction_test = reg.predict(test_x_18_9)
output_result('output_lasso.csv',prediction_test)

     id      value
0  id_0   5.474261
1  id_1  17.033321
2  id_2  23.443692
3  id_3   7.655769
4  id_4  27.706008


In [127]:
from sklearn.linear_model import ElasticNet

In [132]:
param_test1 = {'alpha':[i*0.0001 for i in range(1,10)],'l1_ratio':[i*1 for i in range(1,11)]}
gsearch1 = GridSearchCV(ElasticNet(normalize=True),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,


  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  x = asanyarray(arr - arrmean)
  x = um.multiply(x, x, out=x)


([mean: -36.31986, std: 7.19528, params: {'alpha': 0.0001, 'l1_ratio': 1},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 2},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 3},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 4},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 5},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 6},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 7},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 8},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0001, 'l1_ratio': 9},
  mean: -36.13836, std: 7.32235, params: {'alpha': 0.0002, 'l1_ratio': 1},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0002, 'l1_ratio': 2},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0002, 'l1_ratio': 3},
  mean: -285.46046, std: 108.79919, params: {'alpha': 0.0002, 'l1_rati

In [138]:
reg = ElasticNet(alpha=0.0001,normalize=True,l1_ratio=1)
reg.fit(train_x_18_9,train_y_9)
pre = reg.predict(train_x_18_9)
print(rmse(pre,train_y_9))

5.80009429072


### Ensemble 

In [148]:
rf_pre = pd.read_csv('output_rf1.csv')['value'].values
ridge_pre = pd.read_csv('output_ridge.csv')['value'].values
lasso_pre = pd.read_csv('output_lasso.csv')['value'].values
opt_lin_pre = np.dot(test_x_18_9,op)

In [149]:
ensemble_pre = (rf_pre + ridge_pre + lasso_pre)/3

In [150]:
output_result('output_ensemble.csv',ensemble_pre)

     id      value
0  id_0   6.151731
1  id_1  17.671077
2  id_2  24.610059
3  id_3   7.265650
4  id_4  28.362554


In [151]:
opt_w = np.array([-5.46227603e-03, -1.38475160e-02, -2.69443961e-03, -1.72178991e-02,
       -1.37267864e-03,  1.26807630e-03,  3.49498739e-03, -3.82558612e-03,
        2.45317602e-02,  2.22352553e-02,  3.05717890e-02,  3.89331859e-02,
        2.92968741e-02,  4.59270116e-02,  7.11612298e-02,  6.44754913e-02,
        7.56308685e-02,  1.29809349e-01,  4.78095689e-03, -2.29097397e-02,
        5.51486985e-02, -9.68806233e-02,  1.93669012e-01, -7.50386999e-02,
       -1.57227624e-01,  2.79772280e-01,  7.72973264e-01, -1.51172932e-01,
       -3.14471859e-02, -5.06214364e-03,  7.00874400e-02,  1.73947880e-01,
       -3.70711697e-02, -5.27167268e-02,  1.51031468e-02,  3.08366728e-01,
        3.33776252e-02,  3.02304357e-04,  1.09991200e-01, -2.50667028e-02,
       -4.60466911e-03, -6.41362713e-02,  7.12192054e-02,  7.81103034e-02,
       -1.33377173e-01,  7.95896436e-03, -2.23917625e-02, -2.71589196e-02,
       -2.87100474e-02,  2.64741878e-04, -1.39653775e-03, -6.29413514e-02,
       -4.74672427e-02,  1.62219215e-01,  6.01475146e-03, -1.69156876e-02,
        4.26116769e-03, -2.99950532e-03,  1.43802553e-02, -6.30012851e-03,
       -3.49964929e-02, -2.87755862e-03,  1.16361086e-01,  7.22006764e-03,
        8.43067425e-03, -5.34733925e-03, -2.03541299e-02, -1.50046593e-04,
       -2.41009567e-02, -3.23791607e-02, -6.84718130e-03,  1.01285842e-01,
        1.52591810e-03,  3.25899052e-03, -5.39693281e-03,  1.97644000e-02,
       -1.95997693e-02, -5.14089285e-03,  1.99058472e-02, -1.30003139e-02,
        5.12598068e-02, -2.38137421e-02,  2.27893305e-02,  1.04070443e-01,
       -1.46889478e-01,  2.19447838e-02,  3.44459852e-01, -4.60644331e-01,
        5.66446095e-02,  8.98371339e-01,  2.82931625e-02,  2.38683684e-04,
       -5.57858406e-02, -2.16405533e-03, -4.20852466e-02,  5.39366572e-02,
        6.35790727e-03, -3.32275682e-02, -8.19343521e-02,  3.19074011e-03,
        1.19100933e-02, -2.11667332e-03, -1.39639265e-02, -1.36723707e-02,
       -1.51948713e-03, -1.87024951e-02,  3.78699583e-03,  1.05035778e-02,
       -1.24652316e-01,  2.04311435e-01, -3.24351594e-02, -6.86549358e-02,
       -1.44354563e-02,  3.73961493e-02, -7.32034739e-02,  6.71472526e-02,
        2.09991361e-01,  1.75074253e-02,  1.97364295e-02,  2.76141374e-02,
        1.48967792e-02,  5.91216316e-02,  6.99263143e-02,  5.25539811e-02,
        5.98937370e-02,  1.35010885e-01, -6.48920313e-04,  2.51752532e-03,
       -1.23939758e-03,  1.98101096e-03,  2.33214069e-04,  1.70537463e-03,
       -2.42805120e-03,  1.06900683e-03,  1.75945719e-04, -1.61092202e-03,
       -3.05891324e-04,  1.02912566e-03, -2.17626511e-03,  8.02235798e-04,
        1.68405301e-04, -2.67361169e-04, -2.59778351e-03,  3.97561399e-04,
       -7.17681675e-02, -2.14412479e-02,  2.92491725e-02, -2.24209669e-03,
       -2.06683078e-02, -3.63726895e-02, -7.43710117e-02, -7.83101251e-02,
       -2.94613631e-02, -4.44077036e-02,  5.14938409e-02, -5.47785050e-02,
       -3.81267595e-02,  2.04913613e-02,  1.18038393e-01, -1.98068697e-02,
       -8.89588839e-02,  2.64856900e-02,  3.74417766e-02])