In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 设置参数, display: 展示, max_rows: 最大行数
pd.set_option('display.max_rows', 10)

beijing_pm25_dataframe = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv')
beijing_pm25_dataframe
# 注意， 时间序列的预测中， 不能先进行排序， 因为事件列不能排序
# 可以在聚合出内容后在进行排序

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,43820,2014,12,31,19,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43820,43821,2014,12,31,20,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43821,43822,2014,12,31,21,10.0,-22,-3.0,1034.0,NW,242.70,0,0
43822,43823,2014,12,31,22,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [0]:
# 这个函数是用来对数据进行z-score标准化的， 普通线性回归的时候不要用哦
def z_score_normalise(df):
  return (df - df.mean()) / df.std()

# min max 归一化，回归的时候不要用哦
def min_max_normalise(df):
  return (df - df.min()) / (df.max() - df.min())

预测北京的PM2.5的值

In [0]:
from sklearn.preprocessing import LabelBinarizer
# 计划是用前九天的数据进行预测今天的数据， 所以需要对数据进行处理
def construct_data(df):
  
  # 首先是将 categracal 数据转换成独热编码
  np_wind = LabelBinarizer().fit_transform(df.cbwd)
  df.loc[:, 'one_hot_cbwd1'] = np_wind[:, 0].copy()
  df.loc[:, 'one_hot_cbwd2'] = np_wind[:, 1].copy()
  df.loc[:, 'one_hot_cbwd3'] = np_wind[:, 2].copy()
  df.loc[:, 'one_hot_cbwd4'] = np_wind[:, 3].copy()
  df.drop(['cbwd'], axis=1, inplace=True)
  
  # 再来是将 9 小时的数据加在之前的数据的后端
  for i in range(1, 10):
    df['pm2.5_{}'.format(i)] = df.shift(i)['pm2.5']
    df['DEWP_{}'.format(i)] = df.shift(i)['DEWP']
    df['TEMP_{}'.format(i)] = df.shift(i)['TEMP']
    df['PRES_{}'.format(i)] = df.shift(i)['PRES']
    df['one_hot_cbwd1_{}'.format(i)] = df.shift(i)['one_hot_cbwd1']
    df['one_hot_cbwd2_{}'.format(i)] = df.shift(i)['one_hot_cbwd2']
    df['one_hot_cbwd3_{}'.format(i)] = df.shift(i)['one_hot_cbwd3']
    df['one_hot_cbwd4_{}'.format(i)] = df.shift(i)['one_hot_cbwd4']
    df['lws_{}'.format(i)] = df.shift(i)['Iws']
    df['ls_{}'.format(i)] = df.shift(i)['Is']
    df['lr_{}'.format(i)] = df.shift(i)['Ir']
  # 将所有的 NaN 的值删除掉
  df.dropna(inplace=True)

In [4]:
construct_data(beijing_pm25_dataframe)
beijing_pm25_dataframe = beijing_pm25_dataframe.reindex(np.random.permutation(beijing_pm25_dataframe.index))
beijing_pm25_dataframe = beijing_pm25_dataframe.iloc[:, 5:]
print(beijing_pm25_dataframe)
print(beijing_pm25_dataframe.dtypes)
#beijing_pm25_dataframe = z_score_normalise(beijing_pm25_dataframe)

       pm2.5  DEWP  TEMP    PRES  ...  one_hot_cbwd4_9   lws_9  ls_9  lr_9
22979   59.0    19  27.0  1007.0  ...              1.0    1.78   0.0   0.0
13316   23.0    15  29.0  1004.0  ...              1.0    3.13   0.0   0.0
29317  259.0    14  29.0  1014.0  ...              0.0    0.89   0.0   0.0
34572   35.0   -22   5.0  1019.0  ...              0.0  161.37   0.0   0.0
28460  259.0     2   6.0  1015.0  ...              0.0   79.59   0.0   0.0
...      ...   ...   ...     ...  ...              ...     ...   ...   ...
7389    91.0     2   7.0  1022.0  ...              1.0    1.79   0.0   0.0
41135   95.0    16  21.0  1015.0  ...              0.0   10.73   0.0   0.0
16157   23.0     3  12.0  1026.0  ...              0.0    1.79   0.0   1.0
2039    97.0   -10   4.0  1019.0  ...              0.0   22.80   0.0   0.0
15281   14.0    -1  23.0  1017.0  ...              0.0   46.49   0.0   0.0

[40061 rows x 110 columns]
pm2.5              float64
DEWP                 int64
TEMP              

In [0]:
# 将特征和标签分离的函数
def split_features_and_targets(df):
  features = df.iloc[:, 6:]
  targets = df['pm2.5']
  return features, targets

In [0]:
def Compute_Loss(feature, target, w, ridge_lambda):
  single_error = target - np.matmul(feature, w)
  loss = np.matmul(single_error.T, single_error) + ridge_lambda * np.matmul(w.T, w)
  return loss

def z_score_normalise_column(a):
  if np.std(a) < 0.00000001:
    return a
  b = (a - np.mean(a)) / np.std(a)
  return b

def np_z_score_normalise(feature):
  return np.apply_along_axis(z_score_normalise_column, 0, feature)

# Ridge 代表的是岭回归
# lambda 越大, 参数就会越小
# iteration 代表的是梯度下降走的步数
def Ridge_and_Gradient_Descent(feature, target, ridge_lambda, learning_rate, iteration):
  x_0 = np.ones([feature.shape[0], 1])
  feature = np.concatenate((x_0, feature), axis=1)
  # feature = np_z_score_normalise(feature)
  print(feature)
  
  target = target.values.reshape(len(target), 1)
  w = np.zeros((feature.shape[1], 1))
  
  for i in range(iteration):
    w_grad = np.matmul(np.matmul(feature.T, feature), w) + ridge_lambda * w - np.matmul(feature.T, target)
    w = w - learning_rate * w_grad
    loss = Compute_Loss(feature, target, w, ridge_lambda)
    if i % 20 == 1:
      print('Current Loss: ', loss)
  return w, loss

In [13]:
feature, target = split_features_and_targets(beijing_pm25_dataframe)
Ridge_and_Gradient_Descent(feature, target, 1, 0.00000000000001, 1000000)

[[ 1.    0.    1.   ...  1.78  0.    0.  ]
 [ 1.    0.    0.   ...  3.13  0.    0.  ]
 [ 1.    0.    0.   ...  0.89  0.    0.  ]
 ...
 [ 1.    1.    0.   ...  1.79  0.    1.  ]
 [ 1.    0.    0.   ... 22.8   0.    0.  ]
 [ 1.    0.    0.   ... 46.49  0.    0.  ]]
Current Loss:  [[7.22429423e+08]]
Current Loss:  [[6.67802627e+08]]
Current Loss:  [[6.2078711e+08]]
Current Loss:  [[5.80317259e+08]]
Current Loss:  [[5.45476661e+08]]
Current Loss:  [[5.15477205e+08]]
Current Loss:  [[4.89641127e+08]]


KeyboardInterrupt: ignored

In [0]:
def Ridge_and_AdaGrad(feature, target, ridge_lambda, learning_rate, iteration):
  x_0 = np.ones([feature.shape[0], 1])
  feature = np.concatenate((x_0, feature), axis=1)
  
  target = target.values.reshape(len(target), 1)
  w = np.zeros((feature.shape[1], 1))
  
  learning_rate_w = np.zeros((feature.shape[1], 1))
  
  for i in range(iteration):
    w_grad = np.matmul(np.matmul(feature.T, feature), w) + ridge_lambda * w - np.matmul(feature.T, target)
#     w_grad = np.matmul(np.matmul(feature.T, feature), w) - np.matmul(feature.T, target)
    learning_rate_w = learning_rate_w + w_grad ** 2
    w = w - learning_rate * w_grad / np.sqrt(learning_rate_w)
    loss = Compute_Loss(feature, target, w, ridge_lambda)
#     if i % 20 == 1:
#       print('Current Loss: ', loss)
#       print('Current w:', w)
  return w, loss

In [26]:
feature, target = split_features_and_targets(beijing_pm25_dataframe)
Ridge_and_AdaGrad(feature, target, 1, 100, 1000000)

Current Loss:  [[7.28309467e+08]]
Current w: [[4.50600112e-07]
 [1.86609228e-07]
 [3.87708312e-07]
 [2.31477529e-07]
 [5.64099992e-07]
 [7.28445954e-07]
 [1.37330245e-06]
 [2.05459665e-06]
 [3.82470731e-07]
 [4.50213165e-07]
 [3.91894019e-07]
 [2.05022403e-07]
 [5.87307994e-07]
 [7.55577958e-07]
 [1.15365566e-07]
 [6.66517252e-07]
 [1.68355058e-07]
 [1.30999700e-06]
 [1.96800127e-06]
 [3.84518046e-07]
 [4.50229791e-07]
 [3.88017000e-07]
 [1.86072512e-07]
 [6.07146774e-07]
 [7.76931984e-07]
 [1.16813538e-07]
 [6.86197197e-07]
 [1.59930323e-07]
 [1.25071477e-06]
 [1.86571079e-06]
 [3.88576709e-07]
 [4.50248436e-07]
 [3.88267239e-07]
 [1.72541760e-07]
 [6.29682361e-07]
 [7.75879130e-07]
 [1.19853098e-07]
 [7.08816657e-07]
 [1.54807793e-07]
 [1.19605473e-06]
 [1.75655042e-06]
 [3.93894908e-07]
 [4.50269752e-07]
 [3.79757367e-07]
 [1.63762408e-07]
 [6.51713734e-07]
 [7.68014260e-07]
 [1.23927066e-07]
 [7.22033136e-07]
 [1.52748626e-07]
 [1.14451188e-06]
 [1.64904273e-06]
 [4.00093782e-07]
 

KeyboardInterrupt: ignored

In [0]:
 [[4.50600112e-07]
 [1.86609228e-07]
 [3.87708283e-07]
 [2.31477529e-07]
 [5.64099977e-07]
 [7.28445940e-07]
  
  
  [[4.50600112e-07]
 [1.86609228e-07]
 [3.87708312e-07]
 [2.31477529e-07]
 [5.64099992e-07]
 [7.28445954e-07]
 [1.37330245e-06]
 [2.05459665e-06]
 [3.82470731e-07]
 [4.50213165e-07]
 [3.91894019e-07]
 [2.05022403e-07]
 [5.87307994e-07]
 [7.55577958e-07]
 [1.15365566e-07]
 [6.66517252e-07]
 [1.68355058e-07]
 [1.30999700e-06]
 [1.96800127e-06]
 [3.84518046e-07]
 [4.50229791e-07]
 [3.88017000e-07]
 [1.86072512e-07]
 [6.07146774e-07]
 [7.76931984e-07]
 [1.16813538e-07]
 [6.86197197e-07]
 [1.59930323e-07]
 [1.25071477e-06]
 [1.86571079e-06]
 [3.88576709e-07]
 [4.50248436e-07]
 [3.88267239e-07]
 [1.72541760e-07]
 [6.29682361e-07]
 [7.75879130e-07]
 [1.19853098e-07]
 [7.08816657e-07]
 [1.54807793e-07]
 [1.19605473e-06]
 [1.75655042e-06]
 [3.93894908e-07]
 [4.50269752e-07]
 [3.79757367e-07]
 [1.63762408e-07]
 [6.51713734e-07]
 [7.68014260e-07]
 [1.23927066e-07]
 [7.22033136e-07]
 [1.52748626e-07]
 [1.14451188e-06]
 [1.64904273e-06]
 [4.00093782e-07]
 [4.50297321e-07]
 [3.76700285e-07]
 [1.58927392e-07]
 [6.69413410e-07]
 [7.52092546e-07]
 [1.28788017e-07]
 [7.25921851e-07]
 [1.49310921e-07]
 [1.09738757e-06]
 [1.54222032e-06]
 [4.06221105e-07]
 [4.50333630e-07]
 [3.70127594e-07]
 [1.58463223e-07]
 [6.83068592e-07]
 [7.31977224e-07]
 [1.33815732e-07]
 [7.25574054e-07]
 [1.46045977e-07]
 [1.05598514e-06]
 [1.44112894e-06]
 [4.11884812e-07]
 [4.50379687e-07]
 [3.58862621e-07]
 [1.60179155e-07]
 [6.92978645e-07]
 [7.15373417e-07]
 [1.39119848e-07]
 [7.27536047e-07]
 [1.44845814e-07]
 [1.01945746e-06]
 [1.35007268e-06]
 [4.16308239e-07]
 [4.50436588e-07]
 [3.52703978e-07]
 [1.63657660e-07]
 [6.95151357e-07]
 [7.02721039e-07]
 [1.44207604e-07]
 [7.24727300e-07]
 [1.44348675e-07]
 [9.87746802e-07]
 [1.27078854e-06]
 [4.18994816e-07]
 [4.50502526e-07]
 [3.54163333e-07]
 [1.70654630e-07]
 [6.88410239e-07]
 [6.89514110e-07]
 [1.49541819e-07]
 [7.19593700e-07]
 [1.42838942e-07]]
  
  
  
 [1.37330245e-06]
 [2.05459665e-06]
 [3.82470731e-07]
 [4.50213165e-07]
 [3.91893991e-07]
 [2.05022403e-07]
 [5.87307994e-07]
 [7.55577943e-07]
 [1.15365566e-07]
 [6.66517209e-07]
 [1.68355044e-07]
 [1.30999700e-06]
 [1.96800127e-06]
 [3.84518046e-07]
 [4.50229791e-07]
 [3.88017000e-07]
 [1.86072512e-07]
 [6.07146774e-07]
 [7.76931955e-07]
 [1.16813538e-07]
 [6.86197154e-07]
 [1.59930323e-07]
 [1.25071477e-06]
 [1.86571079e-06]
 [3.88576709e-07]
 [4.50248436e-07]
 [3.88267225e-07]
 [1.72541746e-07]
 [6.29682347e-07]
 [7.75879130e-07]
 [1.19853098e-07]
 [7.08816586e-07]
 [1.54807793e-07]
 [1.19605473e-06]
 [1.75655042e-06]
 [3.93894908e-07]
 [4.50269766e-07]
 [3.79757367e-07]
 [1.63762394e-07]
 [6.51713719e-07]
 [7.68014260e-07]
 [1.23927066e-07]
 [7.22033064e-07]
 [1.52748612e-07]
 [1.14451188e-06]
 [1.64904273e-06]
 [4.00093796e-07]
 [4.50297321e-07]
 [3.76700271e-07]
 [1.58927378e-07]
 [6.69413382e-07]
 [7.52092546e-07]
 [1.28788017e-07]
 [7.25921780e-07]
 [1.49310921e-07]
 [1.09738755e-06]
 [1.54222033e-06]
 [4.06221105e-07]
 [4.50333630e-07]
 [3.70127594e-07]
 [1.58463209e-07]
 [6.83068592e-07]
 [7.31977224e-07]
 [1.33815746e-07]
 [7.25573997e-07]
 [1.46045977e-07]
 [1.05598514e-06]
 [1.44112893e-06]
 [4.11884812e-07]
 [4.50379687e-07]
 [3.58862621e-07]
 [1.60179155e-07]
 [6.92978631e-07]
 [7.15373389e-07]
 [1.39119834e-07]
 [7.27535976e-07]
 [1.44845814e-07]
 [1.01945746e-06]
 [1.35007268e-06]
 [4.16308239e-07]
 [4.50436559e-07]
 [3.52703964e-07]
 [1.63657660e-07]
 [6.95151357e-07]
 [7.02721039e-07]
 [1.44207604e-07]
 [7.24727229e-07]
 [1.44348675e-07]
 [9.87746802e-07]
 [1.27078852e-06]
 [4.18994816e-07]
 [4.50502526e-07]
 [3.54163319e-07]
 [1.70654630e-07]
 [6.88410225e-07]
 [6.89514081e-07]
 [1.49541819e-07]
 [7.19593629e-07]
 [1.42838928e-07]]