In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the raw data set and convert the categorical columns to numeric codes.
data = pd.read_csv(r"./data_raw.csv")
data["main_focus"] = data["main_focus"].astype(int)

# Map each distinct "region" value to an ordinal code. Categories that were not
# seen during fitting are encoded as NaN when the encoder is reused later.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
# fit_transform already fits the encoder, so a separate fit() call beforehand
# would only repeat the same work on the same column.
data["region"] = enc.fit_transform(data[["region"]])

In [None]:
# Task 1:
# Instantiate every machine-learning model used in this project: decision tree,
# linear regression, ridge regression, Lasso regression, SVM regression (SVR),
# KNN regression, random forest, AdaBoost, gradient boosting (GBRT), bagging
# and ExtraTrees (extremely randomized trees) regression.

# Seed shared by the randomized estimators so that a fresh
# "Restart Kernel -> Run All" reproduces the same metrics and plots.
RANDOM_STATE = 42

tree_model = DecisionTreeRegressor(random_state=RANDOM_STATE)
# The linear models, SVR and KNN are left on their defaults without a seed.
linear_model = LinearRegression()
ridge_model = Ridge()
lasso_model = Lasso()
svr_model = SVR()
knn_model = KNeighborsRegressor()
rf_model = RandomForestRegressor(random_state=RANDOM_STATE)
adbst_model = AdaBoostRegressor(random_state=RANDOM_STATE)
gbr_model = GradientBoostingRegressor(random_state=RANDOM_STATE)
bag_model = BaggingRegressor(random_state=RANDOM_STATE)
ext_model = ExtraTreesRegressor(random_state=RANDOM_STATE)

In [None]:
# Task 2:
# Define the noise threshold as 3 * std and drop the rows that exceed it
# (three-sigma rule on "sales_record").
std = data["sales_record"].std()
noise_value = 3*std
# The threshold applies to the distance from the mean. Comparing the raw value
# against 3 * std would ignore where the distribution is centred and can
# discard ordinary rows whenever the mean is large relative to the spread.
sales_deviation = (data["sales_record"] - data["sales_record"].mean()).abs()
# NOTE: `data` is reassigned from itself, so re-running this cell filters again
# using statistics recomputed on the already-filtered rows.
data = data[sales_deviation < noise_value]

In [None]:
# Task 3:
# Basic summary statistics of the target column: maximum, minimum, mean and
# the range (maximum minus minimum).
max_value = data["sales_record"].max()
min_value = data["sales_record"].min()
mean_value = data["sales_record"].mean()
# Range of the target: subtract the minimum (not the mean) from the maximum.
max_sub_min = max_value - min_value

In [None]:
# 任务 4：
# 定义 MSE，MAE，RMSE，R2
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2

def RMSE(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    The value is the square root of `MSE(y_true, y_pred)`.
    """
    squared_error = MSE(y_true, y_pred)
    return np.sqrt(squared_error)

In [None]:
# 任务 5：
# 封装以上功能，成为一个还能够进行：遍历以上所有模型进行训练，且能够将这些模
# 型的效果图输出，且能够将这些模型的评价指标参数打印的函数。且实现数据集 y 预
# 测值与 y 实际值进行对比的效果。
def model_compare(train_X, train_y, test_X, test_y, models=None):
    """Fit several regressors, print their test metrics and plot their predictions.

    Each model is fitted on the training split, evaluated on the test split
    (MAE, MSE, RMSE, R2), and drawn in its own figure against the true target.

    Parameters
    ----------
    train_X, train_y : training features and target, passed to `model.fit`.
    test_X, test_y : held-out features and target used for prediction/scoring.
    models : optional iterable of estimators exposing `fit` and `predict`.
        Defaults to the eleven module-level models. The estimators are fitted
        in place.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by model name) with the columns
        "mae", "mse", "rmse" and "r2".
    """
    if models is None:
        models = [tree_model, linear_model, ridge_model, lasso_model, svr_model,
                  knn_model, rf_model, adbst_model, gbr_model, bag_model, ext_model]

    predictions = {}
    metric_rows = []
    for model in models:
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)
        mae = MAE(test_y, pred_y)
        mse = MSE(test_y, pred_y)
        rmse = RMSE(test_y, pred_y)
        r2 = R2(test_y, pred_y)

        # Key results by class name, adding a numeric suffix when the same
        # class appears more than once so earlier results are not overwritten.
        base_name = model.__class__.__name__
        model_name = base_name
        suffix = 2
        while model_name in predictions:
            model_name = f"{base_name}_{suffix}"
            suffix += 1

        predictions[model_name] = pred_y
        metric_rows.append({"model": model_name, "mae": mae, "mse": mse, "rmse": rmse, "r2": r2})
        print(model_name, f"mae = {mae}, mse = {mse}, rmse = {rmse}, r2 = {r2}")

    # Plotting: one figure per model, true values in black behind the prediction.
    colors = ["red", "lightcoral", "tomato", "sandybrown", "yellowgreen",
              "c", "seagreen", "lightskyblue", "orchid", "pink", "slategrey", "hotpink"]

    x = range(len(test_y))
    # Start at 1 so the first model uses the second palette entry; the modulo
    # keeps the lookup valid when more models than colours are supplied.
    for idx, (model_name, pred_y) in enumerate(predictions.items(), start=1):
        fig, ax = plt.subplots()
        ax.plot(x, test_y, color="black", label="y_true", alpha=0.5)
        ax.plot(x, pred_y, color=colors[idx % len(colors)], label=model_name, alpha=1)
        ax.legend()  # show the legend
        ax.set_xlabel('sample')
        ax.set_ylabel('sales_record')
        plt.show()
        plt.close(fig)  # release the figure so repeated calls do not accumulate them

    # Explicit columns keep the frame well-formed even when `models` is empty.
    metrics = pd.DataFrame(metric_rows, columns=["model", "mae", "mse", "rmse", "r2"])
    return metrics.set_index("model")

In [None]:
# Task 7:
# Split the data into a training set and a test set (70% / 30%).
from sklearn.model_selection import train_test_split
y = data["sales_record"]
X = data.drop("sales_record", axis=1)
train_x, test_x, train_y, test_y = train_test_split(
    X, y, shuffle=True, test_size=0.3, random_state=42
)

In [None]:
# Task 6:
# Scale the numeric features. The scaler is fitted on part of the data (the
# training split) and the fitted statistics are then used to transform both
# splits (min-max normalization; StandardScaler can be substituted for
# standardization).
from sklearn.preprocessing import MinMaxScaler, StandardScaler
num_col = ['undesirable_event', 'feedback', 'positive_feedback', 'competitive', 'spread_rate', 'transfer_rate', 'matain_rate']

# Fit on the training features only, so the test split is transformed with the
# minima/maxima learned from the training split.
minmax_scaler = MinMaxScaler().fit(train_x[num_col])
num_train_x = pd.DataFrame(minmax_scaler.transform(train_x[num_col]), columns=num_col, index=train_x.index)
num_test_x = pd.DataFrame(minmax_scaler.transform(test_x[num_col]), columns=num_col, index=test_x.index)

# train_x / test_x come out of train_test_split as selections of another frame;
# take explicit copies before writing columns so the assignment is unambiguous
# and does not raise SettingWithCopyWarning.
train_x = train_x.copy()
test_x = test_x.copy()
train_x[num_col] = num_train_x
test_x[num_col] = num_test_x

In [None]:
# Task 7:
# Train every model on the training split and compare them on the test split.
model_compare(train_X=train_x, train_y=train_y, test_X=test_x, test_y=test_y)

In [None]:
# Task 8:
# Use statistical analysis to decide in which direction to move the
# hyperparameters, and improve model performance.
# NOTE(review): both variants are scored on the test split, so the reported
# gain is measured on the same data the hyperparameters are compared on.

def _fit_and_score(model, fit_X, fit_y, eval_X, eval_y):
    """Fit `model`, predict on `eval_X`, and return (model, predictions, R2, MAE)."""
    model.fit(fit_X, fit_y)
    predictions = model.predict(eval_X)
    return model, predictions, R2(eval_y, predictions), MAE(eval_y, predictions)


# Tune RandomForestRegressor: default settings versus tuned settings.
forest_reg_ori, test_y_pred_ori, r2_ori, mae_ori = _fit_and_score(
    RandomForestRegressor(random_state=1), train_x, train_y, test_x, test_y
)
forest_reg, test_y_pred, r2, mae = _fit_and_score(
    RandomForestRegressor(max_depth=8, n_estimators=500, random_state=1),
    train_x, train_y, test_x, test_y,
)
print(f"调参数前r2: {r2_ori}, 调参数后r2: {r2}, 调参数前mae: {mae_ori}, 调参数后mae: {mae}")

# Tune ExtraTreesRegressor: default settings versus tuned settings.
# The metric and prediction names are reused, so after this point they hold
# the ExtraTrees results.
ext_reg_ori, test_y_pred_ori, r2_ori, mae_ori = _fit_and_score(
    ExtraTreesRegressor(random_state=1), train_x, train_y, test_x, test_y
)
ext_reg, test_y_pred, r2, mae = _fit_and_score(
    ExtraTreesRegressor(max_depth=10, n_estimators=100, random_state=1),
    train_x, train_y, test_x, test_y,
)
print(f"调参数前r2: {r2_ori}, 调参数后r2: {r2}, 调参数前mae: {mae_ori}, 调参数后mae: {mae}")



In [None]:
# Task 9:
# Train on the full data set and save the trained model.
import pickle

# The target is "sales_record", the column the rest of the notebook predicts.
# Every other column, including the encoded "region", is a feature.
data_x = data.drop(columns=["sales_record"])
data_y = data["sales_record"].values
# Fixed seed so the saved artifact is reproducible across runs.
model = RandomForestRegressor(random_state=1)
model.fit(data_x, data_y)

# The save path can be changed to wherever the model should be stored.
save_path = r"model.pkl"
with open(save_path, "wb") as ot_path:
    pickle.dump(model, ot_path)