# Preprocessing
将 data/train.csv 文件作为数据源，并把 RAINFALL 中的特殊值 NR 设置为 0。输入数据从第四列（索引 3）开始取，以便跳过前三列的日期、字段名称等非数值信息。最后转换成 numpy 数组以便后续使用。

In [None]:
# Read the Big5-encoded training CSV and keep only the columns from index 3
# onward (the first three columns are dropped).
data = pd.read_csv('./data/train.csv', encoding='big5').iloc[:, 3:]
# Every cell holding the literal string 'NR' is overwritten with 0.
nr_cells = data == 'NR'
data[nr_cells] = 0
# Later cells slice this table positionally, so hand it over as a NumPy array.
input_data = data.to_numpy()

# Extract Features
将数据按月份划分：每个月取 20 天 × 24 小时，共 480 个连续小时。以连续九小时的全部 18 项数据作为输入 x，紧接着的第十个小时中索引为 9 的那一项作为输出 y，每个月可得到 471 组样本。

In [None]:
# Regroup the flat table into one (18, 480) array per month: each day
# contributes 18 consecutive rows of 24 hourly columns, and the 20 days of a
# month are laid side by side along the column axis (20 * 24 = 480).
month_data = {}
for month in range(12):
    hours_of_month = np.empty([18, 480])
    for day_idx in range(20):
        row_start = (month * 20 + day_idx) * 18
        col_start = day_idx * 24
        hours_of_month[:, col_start:col_start + 24] = input_data[row_start:row_start + 18, :]
    month_data[month] = hours_of_month

# Slide a 9-hour window over every month. A window needs 9 input hours plus
# one target hour, so valid start offsets are 0..470 (471 samples per month).
# This is the same set of offsets as day * 24 + hour with the last day's
# hours above 14 skipped.
x = np.empty([12 * 471, 18 * 9], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)
for month in range(12):
    series = month_data[month]
    for start in range(471):
        sample = month * 471 + start
        # Input: all 18 rows over 9 consecutive hours, flattened row-major.
        x[sample, :] = series[:, start:start + 9].reshape(1, -1)
        # Target: row index 9 at the hour right after the window
        # (presumably the quantity being predicted -- confirm against the dataset).
        y[sample, 0] = series[9, start + 9]

# Normalize
将输入数据标准化（z-score）：每个特征减去其均值后再除以标准差；标准差为 0 的特征保持不变。

In [None]:
# Standardise every feature column of x in place: subtract the column mean and
# divide by the column standard deviation. mean_x and std_x are kept because
# the same statistics are applied to the test features later.
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
# Columns whose standard deviation is 0 are left untouched to avoid dividing
# by zero. One masked, broadcast expression replaces the per-element loop.
varying = std_x != 0
x[:, varying] = (x[:, varying] - mean_x[varying]) / std_x[varying]

# Training
此次训练采用多元线性模型。数据集只包含特征变量，因此需要额外添加一列常数 1 作为偏置项。每次迭代先用当前权重计算预测值，与 y 比较得到损失值（均方根误差），再计算梯度，并用 Adagrad 调整每个权重的步长来更新权重，从而得到新的预测方程。

In [None]:
# Fit a linear model with Adagrad.
# size counts the 18 * 9 features plus one bias weight.
size = 18 * 9 + 1
# Prepend a column of ones so the first weight acts as the bias term.
bias_column = np.ones([12 * 471, 1])
x_set = np.concatenate((bias_column, x), axis=1).astype(float)
w_set = np.zeros([size, 1])
learning_rate = 100
iter_time = 1000
# Running sum of squared gradients, one entry per weight.
adagrad = np.zeros([size, 1])
# Small constant that keeps the square root below away from zero.
eps = 0.000000001
for t in range(iter_time):
    pre_y = np.dot(x_set, w_set)
    residual = pre_y - y
    # Root-mean-square error for the current weights; it is recomputed every
    # iteration but not used by the update itself.
    loss = np.sqrt(np.sum(np.power(residual, 2)) / (12 * 471))
    # Gradient of the summed squared error with respect to the weights.
    gradient = 2 * np.dot(x_set.T, residual)
    adagrad += gradient ** 2
    # Each weight's step is scaled down by its accumulated gradient magnitude.
    step = learning_rate * gradient / np.sqrt(adagrad + eps)
    w_set = w_set - step

# Save
保存训练后的预测模型数据。

In [None]:
# Persist the learned weights together with the normalisation statistics, in
# the same order as before: weights, then means, then standard deviations.
for filename, array in (
    ('weight.npy', w_set),
    ('mean.npy', mean_x),
    ('std.npy', std_x),
):
    np.save(filename, array)

# Testing
加载测试集，提取特征值，并进行与训练时相同的标准化。

In [None]:
# Load the headerless, Big5-encoded test CSV named by the first command-line
# argument, drop its first two columns and replace 'NR' cells with 0.
inputFileName = sys.argv[1]
testData = pd.read_csv(inputFileName, header=None, encoding='big5')
testData = testData.iloc[:, 2:]
testData[testData == 'NR'] = 0
test_data = testData.to_numpy()
# Each of the 240 test samples is 18 consecutive rows, flattened row-major
# into 18 * 9 features.
test_x = np.empty([240, 18 * 9], dtype=float)
for i in range(240):
    test_x[i, :] = test_data[18 * i : 18 * (i + 1), :].reshape(1, -1)
# Apply the training-set mean and standard deviation. Columns whose training
# standard deviation is 0 are left untouched. One masked, broadcast expression
# replaces the per-element loop.
varying = std_x != 0
test_x[:, varying] = (test_x[:, varying] - mean_x[varying]) / std_x[varying]
# Prepend the bias column of ones, then predict with the trained weights.
test_x = np.concatenate((np.ones([240, 1]), test_x), axis=1).astype(float)
ans_y = np.dot(test_x, w_set)

# Ans
保存测试答案。

In [None]:
# Write the 240 predictions to the CSV named by the second command-line
# argument: a header row, then one 'id_<n>' row per prediction.
outputFileName = sys.argv[2]
with open(outputFileName, mode='w', newline='') as ans_file:
    csv_writer = csv.writer(ans_file)
    header = ['id', 'value']
    csv_writer.writerow(header)
    # The generator is consumed inside the with-block, so the file is already
    # open (and the header written) before any prediction is read.
    csv_writer.writerows([f'id_{row}', ans_y[row][0]] for row in range(240))