In [16]:
import numpy as np
import numpy.ma as ma
from numpy import linalg as LA
import csv

def _read_numeric_table(file_name, skip_header, label_cols):
    """Load a big5-encoded CSV and return its numeric part as a float array.

    Args:
        file_name: Path of the CSV file to read.
        skip_header: When true, the first row is discarded.
        label_cols: Number of leading (non-numeric) columns to discard.

    Returns:
        A 2-D float ndarray in which every 'NR' cell has been replaced by 0.
    """
    # The context manager guarantees the handle is closed; newline='' is the
    # mode the csv module documents for files passed to csv.reader.
    with open(file_name, 'r', encoding='big5', newline='') as data_file:
        # Blank lines come back from csv.reader as empty lists and would make
        # the table ragged, so they are dropped.
        rows = [row for row in csv.reader(data_file) if row]

    table = np.asarray(rows)
    if skip_header:
        table = table[1:]
    table = table[:, label_cols:]
    table[table == 'NR'] = '0'  # replace NR with 0
    return table.astype(float)


def process_data(file_name, train_or_test):
    """Read a train or test CSV and regroup it into per-hour feature rows.

    The file is treated as consecutive blocks of 18 rows (one row per
    measured quantity) whose columns are hours. Trailing rows that do not
    fill a complete 18-row block are ignored.

    Args:
        file_name: Path of the big5-encoded CSV file.
        train_or_test: 'train' or 'test'.
            * 'train': the first row (header) and the first three columns
              (date, location, entry name) are dropped.
            * 'test': no header row is dropped and the first two columns are
              dropped (presumably a sample id and the entry name -- confirm
              against the data file).

    Returns:
        For 'train', an array of shape (blocks * hours, 18): one row per
        hour, holding the 18 measurements of that hour, blocks in file order.
        For 'test', an array of shape (blocks, hours * 18): one row per
        block, hour after hour, each hour contributing its 18 measurements.

    Raises:
        ValueError: If train_or_test is neither 'train' nor 'test'.
    """
    rows_per_block = 18

    if train_or_test == 'train':
        raw_data = _read_numeric_table(file_name, skip_header=True, label_cols=3)
    elif train_or_test == 'test':
        raw_data = _read_numeric_table(file_name, skip_header=False, label_cols=2)
    else:
        raise ValueError(
            "train_or_test must be 'train' or 'test', got %r" % (train_or_test,)
        )

    n_blocks = raw_data.shape[0] // rows_per_block
    n_hours = raw_data.shape[1]

    # (block, measurement, hour) -> (block, hour, measurement), so that the
    # 18 measurements of a single hour end up contiguous.
    blocks = raw_data[:n_blocks * rows_per_block].reshape(
        n_blocks, rows_per_block, n_hours
    )
    hour_major = blocks.transpose(0, 2, 1)

    if train_or_test == 'train':
        # every hour is a data row
        return hour_major.reshape(n_blocks * n_hours, rows_per_block)
    # every block is a data row: its hours flattened one after another
    return hour_major.reshape(n_blocks, n_hours * rows_per_block)


def choose_data(data, i):
    """Build one training sample ending just before row ``i``.

    The nine consecutive rows ``data[i-9] .. data[i-1]`` are flattened row by
    row into a single feature vector, and the label is ``data[i][9]`` (the
    PM2.5 value of row ``i``).

    Note: ``i`` is expected to be at least 9; the caller in this file draws
    it from ``[9, len(data))``.

    Returns:
        (x, y): the flattened feature vector (a copy) and the scalar label.
    """
    history = slice(i - 9, i)
    features = data[history, :].flatten()
    label = data[i, 9]
    return features, label
    
def _sample_windows(X_data, count, window=9, target_col=9):
    """Draw ``count`` random (features, target) pairs from ``X_data``.

    For each randomly picked row n in ``[window, len(X_data))`` the features
    are rows ``n-window .. n-1`` flattened row by row, and the target is
    ``X_data[n, target_col]`` -- the same sample layout choose_data builds,
    gathered for the whole batch at once.
    """
    picks = np.random.randint(window, X_data.shape[0], size=count)
    # Row indices n-window .. n-1 for every pick, shape (count, window).
    history = picks[:, None] + np.arange(-window, 0)
    features = X_data[history].reshape(count, -1)
    targets = X_data[picks, target_col]
    return features, targets


def regression(X_data, lr = 0.001, batch_size = 300, run = 100000, momentum_rate = 0.3):
    """Fit a linear model with mini-batch SGD, momentum and Adagrad.

    Each sample uses 9 consecutive rows of ``X_data`` (flattened) as features
    and column 9 of the following row as target.

    Args:
        X_data: 2-D array, one row per time step. Must have more than 9 rows
            and more than 9 columns.
        lr: Base learning rate of the Adagrad update.
        batch_size: Number of random samples drawn per iteration.
        run: Number of iterations.
        momentum_rate: Weight of the previous gradient in the moving average
            ``g = momentum_rate * g + (1 - momentum_rate) * gradient``.

    Returns:
        (W, b): weight vector of length ``9 * X_data.shape[1]`` and scalar bias.

    Raises:
        ValueError: If ``X_data`` is not 2-D or is too small to build a sample.

    Side effects:
        Every 10000 iterations (including iteration 0) prints the RMSE over
        50000 samples drawn at random from ``X_data`` itself. These come from
        the training data, so this is a training-error estimate rather than a
        held-out validation score.
    """
    X_data = np.asarray(X_data, dtype=float)
    if X_data.ndim != 2 or X_data.shape[0] <= 9 or X_data.shape[1] <= 9:
        raise ValueError(
            "X_data must be 2-D with more than 9 rows and more than 9 columns, "
            "got shape %r" % (X_data.shape,)
        )

    n_weights = 9 * X_data.shape[1]
    # Keeps the Adagrad step finite when an accumulator is still exactly zero
    # (e.g. a feature that is all zeros in the first batch); without it the
    # update is 0/0 = NaN and the NaN then spreads to every prediction.
    eps = 1e-8

    W = np.random.rand(n_weights)  # initial random W
    b = np.random.rand()

    # Adagrad accumulators (running sums of squared gradients).
    w_lr = np.zeros(n_weights)
    b_lr = 0
    # Momentum-smoothed gradients.
    dW = np.zeros(n_weights)
    db = 0

    # batch SGD
    for i in range(run):
        # randomly pick batch_size samples
        x, target = _sample_windows(X_data, batch_size)

        # residual of the current model on this batch, shape (batch_size,)
        error = target - (np.dot(x, W) + b)

        # Gradient of the squared error. dW is summed over the batch while db
        # is averaged; Adagrad normalises each parameter by its own gradient
        # history, so that constant factor does not change the step.
        dW = momentum_rate * dW + (1 - momentum_rate) * (-2 * np.matmul(error, x))
        db = momentum_rate * db + (1 - momentum_rate) * (-2 * np.mean(error))

        # adagrad
        b_lr = b_lr + db ** 2
        w_lr = w_lr + dW ** 2
        W = W - lr / (np.sqrt(w_lr) + eps) * dW
        b = b - lr / (np.sqrt(b_lr) + eps) * db

        # periodic progress report: RMSE on 50000 random samples
        if i % 10000 == 0:
            x, target = _sample_windows(X_data, 50000)
            error = target - (np.matmul(x, W) + b)
            loss = np.mean(np.square(error))
            print(np.sqrt(loss))

    return W, b
        
def output_test(test_X, W, b):
    """Predict every row of ``test_X`` and write the results to 'ans.csv'.

    The prediction for a row ``x`` is ``np.dot(W, x) + b``. The output file
    (in the current working directory, overwritten if present) starts with the
    header line 'id, value' followed by one 'id_<index>,<prediction>' line per
    row, indices starting at 0.
    """
    predictions = [np.dot(W, sample) + b for sample in test_X]

    lines = ['id, value\n']
    lines.extend(
        'id_' + str(index) + ',' + str(value) + '\n'
        for index, value in enumerate(predictions)
    )

    with open('ans.csv', 'w') as out_file:
        out_file.writelines(lines)
    
# Driver: load both data sets, fit the linear model on the training rows,
# then write the predictions for the test samples to ans.csv.
data = process_data('train.csv', train_or_test='train')
test_x = process_data('test_X.csv', train_or_test='test')

W, b = regression(data, lr=1, batch_size=150, run=50000)

output_test(test_x, W, b)

2655.2755680956516
6.024929097538227
6.026626776725642
6.020821352808039
5.862807444222179
