# Preprocess train data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training CSV and drop the two columns that are not used as model input.
# NOTE(review): '日期' / '測站' are presumably the date and station columns — confirm.
# `.values` is used instead of `DataFrame.as_matrix()`, which was removed in
# pandas 1.0; `.values` yields the same ndarray on both old and new pandas.
RAW_DATA = pd.read_csv('data/train.csv', encoding='big5').drop(['日期', '測站'], axis=1).values

# The first remaining column is used to count how many distinct measurements
# (features) are recorded; presumably it holds the measurement name of each row.
FEATURE_NUM = len(np.unique(RAW_DATA[:, 0]))
# Number of consecutive columns fed to the model as one sample.
DATA_SET_LENGTH = 9

# Drop that first column so only the reading columns remain.
RAW_DATA = RAW_DATA[:, 1:]

# Every day contributes FEATURE_NUM consecutive rows; integer division keeps
# DAYS an int so it can be used directly as a loop bound or an index.
DAYS = RAW_DATA.shape[0] // FEATURE_NUM
# Number of consecutive days grouped into one month bucket by later cells.
END_OF_MONTH = 20

In [3]:
# Group the per-day blocks of RAW_DATA into month buckets. Every day is a
# (FEATURE_NUM, columns-per-day) block; the days of one month are laid side by
# side along the column axis so that a month becomes one continuous series.
NUM_MONTHS = 12
# Row of each day block that is taken as the PM2.5 series.
# NOTE(review): assumes PM2.5 is the 10th measurement in every day block — confirm.
PM25_ROW = 9

# Each bucket starts with an empty (FEATURE_NUM, 0) block so that a month with
# no days still yields an array of the right height.
day_blocks = [[np.empty((FEATURE_NUM, 0))] for _ in range(NUM_MONTHS)]
for day in range(int(DAYS)):
    day_blocks[day // END_OF_MONTH].append(RAW_DATA[FEATURE_NUM * day : FEATURE_NUM * (day + 1), :])

# One concatenation per month (instead of one per day) builds the same arrays
# without repeatedly copying the growing month.
MONTHS = [np.concatenate(blocks, axis=1) for blocks in day_blocks]

# Replace the 'NR' marker (found in the rainfall readings) with the string '0'
# so that every cell can later be converted to float.
for month in MONTHS:
    month[month == 'NR'] = '0'

# Stack the PM2.5 row of every month: one row per month.
MONTHS_PM = np.vstack([month[PM25_ROW, :] for month in MONTHS])

# Train the model

In [4]:
# Linear regression trained with full-batch gradient descent.
#   prediction = bias + sum(weight_1 * window)
# where `window` is a (FEATURE_NUM, DATA_SET_LENGTH) slice of one month and the
# target is the PM2.5 reading of the column immediately after the window.

# Initialization of all the params
gradient_descent_base = 0
gradient_descent_weight_1 = np.zeros([FEATURE_NUM, DATA_SET_LENGTH])
gradient_descent_bias = 0

weight_1 = np.zeros([FEATURE_NUM, DATA_SET_LENGTH])
bias = 0

learning_rate = 1e-10
epoch = 100

# The month arrays hold strings; convert them to float once instead of on
# every window of every epoch.
months_float = [month.astype(float) for month in MONTHS]
targets_float = MONTHS_PM.astype(float)

for _ in range(epoch):
    # Gradients are accumulated over the whole data set, then applied once.
    gradient_descent_weight_1 = np.zeros([FEATURE_NUM, DATA_SET_LENGTH])
    gradient_descent_base = gradient_descent_bias = 0

    for number_of_month, month in enumerate(months_float):
        # Only start positions that leave room for a full window plus its
        # target column are valid. Bounding the loop explicitly replaces a
        # blanket try/except, which hid shape errors and let a 1-column tail
        # window broadcast against the weights as if it were a real sample.
        for number_of_data in range(month.shape[1] - DATA_SET_LENGTH):
            window = month[:, number_of_data:number_of_data + DATA_SET_LENGTH]
            # The label is the reading right after the window, not the first
            # reading inside it (which would leak the answer into the input).
            target = targets_float[number_of_month][number_of_data + DATA_SET_LENGTH]

            # residual term: 2 * (y - (b + w*x))
            gradient_descent_base = 2 * (target - (bias + (weight_1 * window).sum()))
            # d(loss)/dw = -2 * (y - (b + w*x)) * x
            gradient_descent_weight_1 += gradient_descent_base * -window
            # d(loss)/db = -2 * (y - (b + w*x)); the sign must match the
            # weight gradient, otherwise the bias moves away from the minimum.
            gradient_descent_bias -= gradient_descent_base

    weight_1 -= learning_rate * gradient_descent_weight_1
    bias -= learning_rate * gradient_descent_bias

# Preprocess test data

In [5]:
# Load the test CSV (it has no header row) and drop column 1, which is not
# used by the model.
RAW_TEST = pd.read_csv('data/test_X.csv', encoding='big5', header=None).drop([1], axis=1)

# Replace the 'NR' marker with the string '0' so every reading can be
# converted to float later.
RAW_TEST[RAW_TEST == 'NR'] = '0'

# Column 0 carries the sample ID of each row; all remaining columns are readings.
# `.values` is used instead of `DataFrame.as_matrix()`, which was removed in
# pandas 1.0; `.values` yields the same ndarray on both old and new pandas.
test_data_ID = RAW_TEST[[0]].values
test_data = RAW_TEST.iloc[:, RAW_TEST.columns != 0].values

In [6]:
# Recover the distinct test IDs in order of first appearance. np.unique returns
# the IDs sorted, so instead take the index of each ID's first occurrence and
# sort those indices to restore the original file order.
id_column = test_data_ID[:, 0]
idx = np.unique(test_data_ID, return_index=True)[1]
order_id = id_column[np.sort(idx)]

In [7]:
# Predict PM2.5 for every test sample. Each sample occupies FEATURE_NUM
# consecutive rows of test_data; the prediction is the weighted sum of that
# block plus the bias.
# The sample count is derived from FEATURE_NUM (not a hard-coded 18) so it
# always agrees with the slicing below.
num_test_samples = test_data.shape[0] // FEATURE_NUM
# Convert the rows that are actually used to float once, not once per sample.
test_features = test_data[:num_test_samples * FEATURE_NUM].astype(float)

result = [
    (weight_1 * test_features[FEATURE_NUM * sample : FEATURE_NUM * (sample + 1), :]).sum() + bias
    for sample in range(num_test_samples)
]

Save the result and output it as the sampleSubmission.csv file
=========

In [8]:
# Lay out the IDs and predictions as two labelled rows, then transpose so that
# every test sample becomes one (id, value) row of the output file.
submission_rows = pd.DataFrame(data=[order_id, result], index=["id", "value"])
submission = submission_rows.T
submission.to_csv("sampleSubmission.csv", index=False)