In [4]:
import pandas as pd
import numpy as np

def dataProcess(df):
    x_list, y_list = [], []
    # df替换指定元素，将空数据填充为0
    df = df.replace(['NR'], [0.0])
    # astype() 转换array中元素数据类型
    array = np.array(df).astype(float)
    # 将数据集拆分为多个数据帧
    for i in range(0, 4320, 18):
        for j in range(24-9):
            mat = array[i:i+18, j:j+9]
            label = array[i+9, j+9] # 第10行是PM2.5
            x_list.append(mat)
            y_list.append(label)
    x = np.array(x_list)
    y = np.array(y_list)
    
    return x, y, array

def validate(x_val, y_val, weights, bias):
    loss = 0
    for i in range(400):
        loss += abs(y_val[i] - weights.dot(x_val[i].reshape(-1,1)) - bias)
    return loss / 400

In [5]:
def train1(x_train, y_train, x_val,y_val,epoch):
    bias = 0 # 偏置值初始化
    weights = np.ones(9) # 权重初始化
    learning_rate = 1 # 初始学习率
    reg_rate = 0.001 # 正则项系数
    bg2_sum = 0 # 用于存放偏置值的梯度平方和
    wg2_sum = np.zeros(9) # 用于存放权重的梯度平方和
    x_val = x_val[:,9,:]
    for i in range(epoch):
        b_g = 0
        w_g = np.zeros(9)
        # 在所有数据上计算Loss_label的梯度
        for j in range(3200):
            y_pred = weights.dot(x_train[j, 9, :]) - bias
            b_g += (y_train[j] - y_pred) * (-1)
            for k in range(9):
                w_g[k] += (y_train[j] - y_pred) * (-x_train[j, 9, k]) 
        # 求平均    
        b_g /= 3200
        w_g /= 3200
        #  加上Loss_regularization在w上的梯度
        for m in range(9):
            w_g[m] += reg_rate * weights[m]
        
        # adagrad
        bg2_sum += b_g**2
        wg2_sum += w_g**2
        # 更新权重和偏置
        bias -= learning_rate/bg2_sum**0.5 * b_g
        weights -= learning_rate/wg2_sum**0.5 * w_g
        if i%10==0:
            loss = validate(x_val, y_val, weights, bias)
            print('The loss on val data in epoch %s is:%d'%(str(i),loss))
    return weights, bias
def main1():
    df = pd.read_csv('train.csv', usecols=range(3,27))
    x, y, _ = dataProcess(df)
    x_train, y_train = x[0:3200], y[0:3200]
    x_val, y_val = x[3200:3600], y[3200:3600]
    epoch = 200 

    w, b = train1(x_train, y_train, x_val,y_val,epoch)

In [6]:
main1()

The loss on val data in epoch 0 is:21
The loss on val data in epoch 10 is:8
The loss on val data in epoch 20 is:7
The loss on val data in epoch 30 is:7
The loss on val data in epoch 40 is:6
The loss on val data in epoch 50 is:6
The loss on val data in epoch 60 is:6
The loss on val data in epoch 70 is:6
The loss on val data in epoch 80 is:6
The loss on val data in epoch 90 is:6
The loss on val data in epoch 100 is:6
The loss on val data in epoch 110 is:6
The loss on val data in epoch 120 is:5
The loss on val data in epoch 130 is:5
The loss on val data in epoch 140 is:5
The loss on val data in epoch 150 is:5
The loss on val data in epoch 160 is:5
The loss on val data in epoch 170 is:5
The loss on val data in epoch 180 is:5
The loss on val data in epoch 190 is:5


In [7]:
def train2(x_train, y_train, x_val,y_val,epoch):
    bias = 0 # 偏置值初始化
    weights = np.ones((1,162)) # 权重初始化
    learning_rate = 1 # 初始学习率
    reg_rate = 0.001 # 正则项系数
    bg2_sum = 0 # 用于存放偏置值的梯度平方和
    wg2_sum = np.zeros((1,162)) # 用于存放权重的梯度平方和

    for i in range(epoch):
        b_g = 0
        w_g = np.zeros((1,162))
        # 在所有数据上计算Loss_label的梯度
        for j in range(3200):
            data = x_train[j].reshape(-1,1)
            y_pred = weights.dot(data) - bias
            b_g += (y_train[j] - y_pred) * (-1)
            for k in range(162):
                w_g[0,k] += (y_train[j] - y_pred) * (-data[k,0]) 
        # 求平均    
        b_g /= 3200
        w_g /= 3200
        #  加上Loss_regularization在w上的梯度
        for m in range(162):
            w_g[0,m] += reg_rate * weights[0,m]
        
        # adagrad
        bg2_sum += b_g**2
        wg2_sum += w_g**2
        # 更新权重和偏置
        bias -= learning_rate/bg2_sum**0.5 * b_g
        weights -= learning_rate/wg2_sum**0.5 * w_g
        if i%10==0:
            loss = validate(x_val, y_val, weights, bias)
            print('The loss on val data in epoch %s is:%d'%(str(i),loss))
    return weights, bias

def main2():
        df = pd.read_csv('train.csv', usecols=range(3,27))
        x, y, _ = dataProcess(df)
        x_train, y_train = x[0:3200], y[0:3200]
        x_train
        x_val, y_val = x[3200:3600], y[3200:3600]
        epoch = 200 

        w, b = train2(x_train, y_train, x_val,y_val,epoch) 
        

In [8]:
main2()

The loss on val data in epoch 0 is:21
The loss on val data in epoch 10 is:9
The loss on val data in epoch 20 is:8
The loss on val data in epoch 30 is:8
The loss on val data in epoch 40 is:8
The loss on val data in epoch 50 is:8
The loss on val data in epoch 60 is:8
The loss on val data in epoch 70 is:8
The loss on val data in epoch 80 is:8
The loss on val data in epoch 90 is:8
The loss on val data in epoch 100 is:8
The loss on val data in epoch 110 is:8
The loss on val data in epoch 120 is:8
The loss on val data in epoch 130 is:8
The loss on val data in epoch 140 is:7
The loss on val data in epoch 150 is:7
The loss on val data in epoch 160 is:7
The loss on val data in epoch 170 is:7
The loss on val data in epoch 180 is:7
The loss on val data in epoch 190 is:7


In [9]:
def train_pca(x_train, y_train, x_val,y_val,k,epoch):
    bias = 0 # 偏置值初始化
    weights = np.ones((1,k)) # 权重初始化
    learning_rate = 1 # 初始学习率
    reg_rate = 0.001 # 正则项系数
    bg2_sum = 0 # 用于存放偏置值的梯度平方和
    wg2_sum = np.zeros((1,k)) # 用于存放权重的梯度平方和

    for i in range(epoch):
        b_g = 0
        w_g = np.zeros((1,k))
        # 在所有数据上计算Loss_label的梯度
        for j in range(3200):
            data = x_train[j].reshape(-1,1)
            y_pred = weights.dot(data) - bias
            b_g += (y_train[j] - y_pred) * (-1)
            for x in range(k):
                w_g[0,x] += (y_train[j] - y_pred) * (-data[x,0]) 
        # 求平均    
        b_g /= 3200
        w_g /= 3200
        #  加上Loss_regularization在w上的梯度
        for m in range(k):
            w_g[0,m] += reg_rate * weights[0,m]
        
        # adagrad
        bg2_sum += b_g**2
        wg2_sum += w_g**2
        # 更新权重和偏置
        bias -= learning_rate/bg2_sum**0.5 * b_g
        weights -= learning_rate/wg2_sum**0.5 * w_g
        if i%10==0:
            loss = validate(x_val, y_val, weights, bias)
            print('The loss on val data in epoch %s is:%d'%(str(i),loss))
    return weights, bias
def main_pca(k,epoch):
    df = pd.read_csv('train.csv', usecols=range(3,27))
    x, y, _ = dataProcess(df)
    x_train, y_train = x[0:3200], y[0:3200]
    x_pca = x_train
    x_pca = x_pca.reshape(3200,162)
    
    mean = np.sum(x_pca,0)
    std = np.std(x_pca,0)
    x_pca = (x_pca-mean)/std
    
    c = np.dot(x_pca.T,x_pca)/3200
    s,u = np.linalg.eig(c)
    z = np.dot(x.reshape(3600,162),u[:,:k])
    x_train, y_train = z[0:3200], y[0:3200]
    x_val, y_val = z[3200:3600], y[3200:3600]
    
    w, b = train_pca(x_train, y_train, x_val,y_val,k,epoch) 

In [11]:
main_pca(20,1000)

The loss on val data in epoch 0 is:1502
The loss on val data in epoch 10 is:42
The loss on val data in epoch 20 is:37
The loss on val data in epoch 30 is:36
The loss on val data in epoch 40 is:34
The loss on val data in epoch 50 is:32
The loss on val data in epoch 60 is:31
The loss on val data in epoch 70 is:30
The loss on val data in epoch 80 is:29
The loss on val data in epoch 90 is:28
The loss on val data in epoch 100 is:27
The loss on val data in epoch 110 is:26
The loss on val data in epoch 120 is:25
The loss on val data in epoch 130 is:24
The loss on val data in epoch 140 is:24
The loss on val data in epoch 150 is:23
The loss on val data in epoch 160 is:22
The loss on val data in epoch 170 is:22
The loss on val data in epoch 180 is:21
The loss on val data in epoch 190 is:21
The loss on val data in epoch 200 is:20
The loss on val data in epoch 210 is:20
The loss on val data in epoch 220 is:20
The loss on val data in epoch 230 is:19
The loss on val data in epoch 240 is:19
The loss 