In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def load_train_data(filename):
    train = pd.read_csv(filename,encoding='big5')
    
    ### seperate data 
    data = [[] for i in range(18)]
    for r,d in enumerate(train.iloc[:,3:].values):
        data[r%18].append(d)
    data = np.array(data).reshape((18,-1))
    
    return data

def load_test_data(filename):
    test = pd.read_csv(filename,encoding='big5',header = None)
    test_data = [[] for i in range(18)]
    for r,d in enumerate(test.iloc[:,2:].values):
        test_data[r%18].append(d)
    test_data = np.array(test_data).reshape((18,-1))
    
    return test_data

In [45]:
train_data = load_train_data('./data/train.csv')
test_data = load_test_data('./data/test.csv')

print(train_data.shape)
print(test_data.shape)

(18, 5760)
(18, 2160)


In [46]:
train_data[train_data==['NR']]=0
train_data = train_data.astype(np.float64)
test_data[test_data==['NR']]=0
test_data = test_data.astype(np.float64)

In [47]:
from sklearn.preprocessing import PolynomialFeatures

In [48]:
qua_poly = PolynomialFeatures(degree=2,include_bias=False)
train_data_qu = qua_poly.fit_transform(train_data.reshape((train_data.shape[1],-1)))
test_data_qu = qua_poly.fit_transform(test_data.reshape((test_data.shape[1],-1)))
train_data_qu = train_data_qu.reshape((train_data_qu.shape[1],-1))
test_data_qu = test_data_qu.reshape((test_data_qu.shape[1],-1))
print(train_data_qu.shape)
print(test_data_qu.shape)

(189, 5760)
(189, 2160)


In [49]:
def get_train_data(ori_data,qua_data,hours):
    train_x_18 = []
    train_x_1 = []
    train_y = []

    for i in range(0,qua_data.shape[1]-hours):
        train_x_18.append(qua_data[:,i:i+hours].reshape((-1)))
        
    for i in range(0,ori_data.shape[1]-hours):
        train_x_1.append(ori_data[9,i:i+hours].reshape((-1)))
        train_y.append(ori_data[9][i+hours])
        
    train_x_18 = np.array(train_x_18)
    train_x_1 = np.array(train_x_1)
    train_y = np.array(train_y).astype(np.float64)

    # replace NR to 0
    train_x_18[train_x_18 == 'NR'] = 0
    train_x_18 = train_x_18.astype(np.float64)
    train_x_1 = train_x_1.astype(np.float64)
    
    return train_x_18, train_x_1, train_y

def get_test_data(ori_data,qua_data,hours):
    test_x_18 = []
    test_x_1 = []
    
    for i in range(0,qua_data.shape[1],hours):
        test_x_18.append(qua_data[:,i:i+hours].reshape(-1))

    for i in range(0,ori_data.shape[1],hours):
        test_x_1.append(ori_data[9,i:i+hours].reshape(-1))
        
    test_x_18 = np.array(test_x_18)
    test_x_1 = np.array(test_x_1)
    
    test_x_18[test_x_18 == 'NR'] = 0
    test_x_18 = test_x_18.astype(np.float64)
    test_x_1 = test_x_1.astype(np.float64)
    return test_x_18, test_x_1

In [50]:
train_x_18_9, train_x_1_9, train_y_9 = get_train_data(train_data,train_data_qu,9)
print(train_x_18_9.shape)
print(train_x_1_9.shape)
print(train_y_9.shape)

train_x_18_5, train_x_1_5, train_y_5 = get_train_data(train_data,train_data_qu,5)
print(train_x_18_5.shape)
print(train_x_1_5.shape)
print(train_y_5.shape)

test_x_18_9, test_x_1_9 = get_test_data(test_data,test_data_qu,9)
print(test_x_18_9.shape)
print(test_x_1_9.shape)

test_x_18_5, test_x_1_5 = get_test_data(test_data,test_data_qu,5)
print(test_x_18_5.shape)
print(test_x_1_5.shape)

(5751, 1701)
(5751, 9)
(5751,)
(5755, 945)
(5755, 5)
(5755,)
(240, 1701)
(240, 9)
(432, 945)
(432, 5)




In [51]:
# add bias term
train_x_18_9 = np.hstack([train_x_18_9,np.ones((train_x_18_9.shape[0],1))])
train_x_1_9 = np.hstack([train_x_1_9,np.ones((train_x_1_9.shape[0],1))])
test_x_18_9 = np.hstack([test_x_18_9,np.ones((test_x_18_9.shape[0],1))])
test_x_1_9 = np.hstack([test_x_1_9,np.ones((test_x_1_9.shape[0],1))])

train_x_18_5 = np.hstack([train_x_18_5,np.ones((train_x_18_5.shape[0],1))])
train_x_1_5 = np.hstack([train_x_1_5,np.ones((train_x_1_5.shape[0],1))])
test_x_18_5 = np.hstack([test_x_18_5,np.ones((test_x_18_5.shape[0],1))])
test_x_1_5 = np.hstack([test_x_1_5,np.ones((test_x_1_5.shape[0],1))])

In [52]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression
def rmse(pre,test):
    tmp = 0
    for i,p in enumerate(pre):
        tmp += (p-test[i])**2
    return (tmp/pre.shape[0])**0.5

In [53]:
param_test1 = {'alpha':[i*0.0001 for i in range(0,10)]}
gsearch1 = GridSearchCV(Ridge(normalize=True),
                       param_grid =param_test1, scoring='neg_mean_squared_error',cv=5,n_jobs = 12)
gsearch1.fit(train_x_18_9,train_y_9)
gsearch1.grid_scores_,gsearch1.best_params_, (gsearch1.best_score_*-1)**0.5

([mean: -41756.48823, std: 76084.96460, params: {'alpha': 0.0},
  mean: -41733.15858, std: 76043.44042, params: {'alpha': 0.0001},
  mean: -41709.10300, std: 76000.38363, params: {'alpha': 0.0002},
  mean: -41684.37224, std: 75955.89954, params: {'alpha': 0.00030000000000000003},
  mean: -41659.01463, std: 75910.08828, params: {'alpha': 0.0004},
  mean: -41633.07590, std: 75863.04462, params: {'alpha': 0.0005},
  mean: -41606.59923, std: 75814.85795, params: {'alpha': 0.0006000000000000001},
  mean: -41579.62528, std: 75765.61238, params: {'alpha': 0.0007},
  mean: -41552.19223, std: 75715.38691, params: {'alpha': 0.0008},
  mean: -41524.33590, std: 75664.25565, params: {'alpha': 0.0009000000000000001}],
 {'alpha': 0.0009000000000000001},
 203.77520924584934)

In [55]:
reg = Ridge(alpha=0.0)
reg.fit(train_x_18_9,train_y_9)
pre = reg.predict(train_x_18_9)
print(rmse(pre,train_y_9))

11.838562998


In [56]:
train_x_18_9

array([[  14.  ,   14.  ,   14.  , ...,    3.8 ,    4.4 ,    1.  ],
       [  14.  ,   14.  ,   13.  , ...,    4.4 ,    4.4 ,    1.  ],
       [  14.  ,   13.  ,   12.  , ...,    4.4 ,    3.  ,    1.  ],
       ..., 
       [ 450.  ,  450.  ,  425.  , ...,    2.88,    3.2 ,    1.  ],
       [ 450.  ,  425.  ,  425.  , ...,    3.2 ,    3.24,    1.  ],
       [ 425.  ,  425.  ,  425.  , ...,    3.24,    3.6 ,    1.  ]])