<a href="https://colab.research.google.com/github/ZackWongAPO/Colab/blob/main/LinearRegressionAir_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#加载库

In [13]:
# 库的加载

# general
import io

# data processing
import numpy
import pandas
from sklearn.model_selection import train_test_split

# machine learning support
import keras

# data visualization
import plotly.express as px
import plotly.subplots as sb
import plotly.graph_objects as go
import seaborn

from google.colab import files
from keras.callbacks import EarlyStopping
from keras.callbacks import LambdaCallback

#加载空气污染数据集

In [14]:
uploaded = files.upload()

Saving AirPollutionDefault.xlsx to AirPollutionDefault (1).xlsx


#数据预处理
>数据预处理和特征工程详见pandas.ipynb，此处不再赘述，重点研究如何抽取训练集与测试集

>>由于数据与季节，年份有很大的关系，因此抽取数据分三种方式，这三个代码选择其一执行

In [20]:
# 日期正常排列，不作任何处理
df = pandas.read_excel('/content/AirPollutionDefault.xlsx')
print(df.head())
label = df['AQI']
print(label.head())
feature = df[['Date', 'PM25', 'PM10', 'SO2', 'CO', 'NO2', 'O3_8h']]
print(feature.head())

# X的意思是特征值，y的意思是标签值
feature_train, feature_test, AQI_train, AQI_test = train_test_split(feature, label, test_size=0.15, random_state=123)
features_train = feature_train.values
features_test = feature_test.values
AQIs_train = AQI_test.values
AQIs_test = AQI_test.values
print(type(feature_train))
print(type(features_train))
print(feature_train.head())
print(feature_test.head())
print(AQI_train.head())
print(AQI_test.head())

   Date  year  month  day  AQI   PM25  PM10   SO2   CO  NO2  O3_8h
0     0  2013     12    2  145  111.0   163  89.0  1.7   85   42.0
1     1  2013     12    3  356  306.0   299  98.0  1.9  127   21.0
2     2  2013     12    4  314  264.0   230  98.0  1.5  144   24.0
3     3  2013     12    5  170  129.0   162  59.0  1.3   80   36.0
4     4  2013     12    6   55   39.0    61  45.0  0.9   76   36.0
0    145
1    356
2    314
3    170
4     55
Name: AQI, dtype: int64
   Date   PM25  PM10   SO2   CO  NO2  O3_8h
0     0  111.0   163  89.0  1.7   85   42.0
1     1  306.0   299  98.0  1.9  127   21.0
2     2  264.0   230  98.0  1.5  144   24.0
3     3  129.0   162  59.0  1.3   80   36.0
4     4   39.0    61  45.0  0.9   76   36.0
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
      Date  PM25  PM10   SO2   CO  NO2  O3_8h
1318  1318  33.0    62   6.0  0.7   15  155.0
1366  1366   8.0    35   8.0  0.3   19   84.0
91      91  58.0    90  38.0  0.9   44   92.0
2091  2091  12.0   

In [None]:
# 删除臭氧特征值，进行数据归一化后训练
df = pandas.read_xls('/content/AirPollution01.xlsx')
print(df.head())

In [None]:
# 进一步拆分年月日，每周内按照随机数抽选训练集与测试集
df = pandas.read_xls('/content/AirPollution01Final.xlsx')
print(df.head())

#编写画图函数

In [None]:
# 定义画图函数

def make_plots(df, feature_names, label_name, model_output, sample_size=200):

  random_sample = df.sample(n=sample_size).copy()
  random_sample.reset_index()
  weights, bias, epochs, rmse = model_output

  is_2d_plot = len(feature_names) == 1
  model_plot_type = "scatter" if is_2d_plot else "surface"
  fig = make_subplots(rows=1, cols=2,
                      subplot_titles=("Loss Curve", "Model Plot"),
                      specs=[[{"type": "scatter"}, {"type": model_plot_type}]])

  plot_data(random_sample, feature_names, label_name, fig)
  plot_model(random_sample, feature_names, weights, bias, fig)
  plot_loss_curve(epochs, rmse, fig)

  fig.show()
  return

def plot_loss_curve(epochs, rmse, fig):
  curve = px.line(x=epochs, y=rmse)
  curve.update_traces(line_color='#ff0000', line_width=3)

  fig.append_trace(curve.data[0], row=1, col=1)
  fig.update_xaxes(title_text="Epoch", row=1, col=1)
  fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1, range=[rmse.min()*0.8, rmse.max()])

  return

def plot_data(df, features, label, fig):
  if len(features) == 1:
    scatter = px.scatter(df, x=features[0], y=label)
  else:
    scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)

  fig.append_trace(scatter.data[0], row=1, col=2)
  if len(features) == 1:
    fig.update_xaxes(title_text=features[0], row=1, col=2)
    fig.update_yaxes(title_text=label, row=1, col=2)
  else:
    fig.update_layout(scene1=dict(xaxis_title=features[0], yaxis_title=features[1], zaxis_title=label))

  return

def plot_model(df, features, weights, bias, fig):
  df['FARE_PREDICTED'] = bias[0]

  for index, feature in enumerate(features):
    df['FARE_PREDICTED'] = df['FARE_PREDICTED'] + weights[index][0] * df[feature]

  if len(features) == 1:
    model = px.line(df, x=features[0], y='FARE_PREDICTED')
    model.update_traces(line_color='#ff0000', line_width=3)
  else:
    z_name, y_name = "FARE_PREDICTED", features[1]
    z = [df[z_name].min(), (df[z_name].max() - df[z_name].min()) / 2, df[z_name].max()]
    y = [df[y_name].min(), (df[y_name].max() - df[y_name].min()) / 2, df[y_name].max()]
    x = []
    for i in range(len(y)):
      x.append((z[i] - weights[1][0] * y[i] - bias[0]) / weights[0][0])

    plane=pd.DataFrame({'x':x, 'y':y, 'z':[z] * 3})

    light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
    model = go.Figure(data=go.Surface(x=plane['x'], y=plane['y'], z=plane['z'],
                                      colorscale=light_yellow))

  fig.add_trace(model.data[0], row=1, col=2)

  return

def model_info(feature_names, label_name, model_output):
  weights = model_output[0]
  bias = model_output[1]

  nl = "\n"
  header = "-" * 80
  banner = header + nl + "|" + "MODEL INFO".center(78) + "|" + nl + header

  info = ""
  equation = label_name + " = "

  for index, feature in enumerate(feature_names):
    info = info + "Weight for feature[{}]: {:.3f}\n".format(feature, weights[index][0])
    equation = equation + "{:.3f} * {} + ".format(weights[index][0], feature)

  info = info + "Bias: {:.3f}\n".format(bias[0])
  equation = equation + "{:.3f}\n".format(bias[0])

  return banner + nl + info + nl + equation

print("SUCCESS: defining plotting functions complete.")

#编写训练函数
>包括机器学习模型的选择，深度学习神经元个数与层数的设计，模型的编译，训练超参数的输入等等

In [18]:
# 封装各机器学习细节至总函数

# 建立简单线性模型
def build_linear_model(my_learning_rate, num_features):
  # 大多数模型都是序列模型，无脑建模就可以
  model = keras.models.Sequential()

  # 添加全连接层
  # unit表示神经元个数，只添加一层就只有一个输出层，一个神经元意味着只有一条直线
  # input_shape 是输入数据的形状，对于大多数神经网络的输入层而言都是一维向量
  model.add(keras.layers.Dense(units=1,input_shape=(num_features,)))

  # 编译模型，从而加速训练速度

  # optimizer：优化器，用于控制梯度裁剪。必选项。在此处修改流程图中的三种优化器方法
  # 方法一：keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)（随机梯度下降法）
  # 方法二：keras.optimizers.SGD(lr=0.01, momentum=0.5, decay=0.0, nesterov=True)（带动量版本的SGD）
  # 方法三：keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)(理论最优)
  # lr: float >= 0. 学习率。
  # momentum : float>= 0. 动量值参数，用于加速 SGD 在相关方向上前进，并抑制震荡。
  # decay: float >= 0. 每次参数更新后学习率衰减值，可有效防止模型不收敛。
  # nesterov: boolean. 是否使用 Nesterov 动量。

  # loss：损失函数（或称目标函数、优化评分函数）。必选项
  # metrics：评价函数用于评估当前训练模型的性能。
  # 当模型编译后（compile），评价函数应该作为 metrics 的参数来输入。
  # 评价函数和损失函数相似，只不过评价函数的结果不会用于训练过程中，只应用于测试集。
  model.compile(optimizer=keras.optimizers.SGD(learning_rate=my_learning_rate),
                loss=keras.losses.MeanSquaredError,  # 选择差值平方作为Loss函数
                metrics=[keras.metrics.RootMeanSquaredError()])

  return model


def train_model(model, features, label, epochs, batch_size):

  # # 定义回调函数，当loss值不再明显变化的时候会提前结束训练
  # early_stopping = EarlyStopping(monitor='val_loss', patience=2)

  # 开始训练模型
  # batch_size：将一整个数据集随机分成多个小块(batch)，可以有效增加模型的收敛速度
  # epochs：纪元数，相当于刷新权重值矩阵的次数
  # x：输入特征值训练集，格式为Numpy数组
  # y：输入标签值训练集，格式为Numpy数组
  # 返回的是一个history对象，其属性值为连续epoch纪元值所对应的Loss值和评价值

  history = model.fit( x=features,
              y=label,
              batch_size=batch_size,
              epochs=epochs,
              # callbacks=[early_stopping]
              )

  # 得到最终训练结果的权重和偏差
  trained_weight = model.get_weights()[0]
  trained_bias = model.get_weights()[1]

  # 返回纪元值列表
  epochs = history.epoch
  print(type(epochs))

  # 将每个纪元值所对应的Loss值和评价值转化为pandas表格，用于作图
  hist = pandas.DataFrame(history.history)

  print(hist.head())
  rmse = hist["root_mean_squared_error"]

  return trained_weight, trained_bias, epochs, rmse

#编写验证函数
>因为训练数据都是通过归一化的，所以预测时的输入应该也是归一化模型

调用上述函数

In [19]:
linear_model = build_linear_model(0.01)
trained_weight, trained_bias, epochs, rmse = train_model(model = linear_model,
                                features = features_train,
                                label = AQIs_train,
                                epochs = 200,
                                batch_size = 200)

TypeError: build_linear_model() missing 1 required positional argument: 'num_features'

In [10]:
print(trained_weight)
print(trained_bias)

[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
[nan]
