<a href="https://colab.research.google.com/github/ghwlsro/multipleLinearRegressionStudy/blob/master/multi_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the salary dataset (a CSV hosted in the project's GitHub repository)
import pandas as pd
url = "https://raw.githubusercontent.com/ghwlsro/multipleLinearRegressionStudy/master/Salary_Data2.csv"
data = pd.read_csv(url)  # parse the remote CSV into a DataFrame

In [None]:
# Encode EducationLevel as integer codes 0, 1, 2.
# NOTE(review): the keys must match the labels in the CSV exactly; Series.map
# turns any label that is not a key into NaN.
education_level_codes = {"高中以下": 0, "大學": 1, "碩士以上": 2}
data["EducationLevel"] = data["EducationLevel"].map(education_level_codes)

In [None]:
# One-hot encode City (one multi-valued attribute becomes several orthogonal numeric attributes)
from sklearn.preprocessing import OneHotEncoder
oneHotEncoder = OneHotEncoder()
# Fit and encode in one call; the encoder returns a sparse matrix, so densify it.
city_encoded = oneHotEncoder.fit_transform(data[["City"]]).toarray()
# NOTE(review): assumes the encoder produces exactly three columns, in the order A, B, C - confirm against the data.
data[["CityA", "CityB", "CityC"]] = city_encoded
# Drop the original text column together with the CityC indicator column.
data = data.drop(columns=["City", "CityC"])

In [None]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
x = data[["YearsExperience", "EducationLevel", "CityA", "CityB"]]
y = data[["Salary"]]
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=87)
# Work with plain NumPy arrays from here on.
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
# The targets come out as single-column 2-D arrays; flatten them to 1-D so they
# line up element-wise with the 1-D predictions computed later.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

In [None]:
# Feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the scaling statistics from the training set only and scale it in the same call...
x_train = scaler.fit_transform(x_train)
# ...then reuse those training statistics for the test set.
x_test = scaler.transform(x_test)

In [None]:
# Find w and b, where y_pred = w1*x["YearsExperience"] + w2*x["EducationLevel"] + w3*x["CityA"] + w4*x["CityB"] + b
import numpy as np
# Arbitrary starting parameters, used only to try the prediction formula.
w = np.array([1, 2, 3, 4])
b = 1
# Weighted sum of each row's features plus the bias.
y_pred = np.sum(x_train * w, axis=1) + b
# NOTE: only a cell's last expression is echoed, so this shape is evaluated but not shown.
y_pred.shape
len(y_train)

28

In [None]:
def compute_cost(x, y, w, b):
  """Return the mean squared error of the linear model (x * w).sum(axis=1) + b.

  Args:
    x: 2-D array with one row per sample and one column per feature.
    y: 1-D array of targets, one per row of x.
    w: weights, one per column of x.
    b: bias added to every prediction.

  Returns:
    The mean of the squared differences between y and the predictions.
  """
  predictions = (x * w).sum(axis=1) + b
  residuals = y - predictions
  return (residuals ** 2).mean()

In [None]:
# Gradient descent
import numpy as np

def gradient_decent(x, y, w_init, b_init, learning_rate, run_iter, print_iter):
  """Fit the linear model y_pred = (x * w).sum(axis=1) + b by batch gradient descent.

  Each step moves w and b against mean(error * x) and mean(error), where
  error = y_pred - y. That is the gradient of half the mean squared error;
  the cost that is printed and recorded is the value returned by compute_cost.

  Args:
    x: 2-D NumPy array, one row per sample and one column per feature.
    y: 1-D NumPy array of targets, one per row of x.
    w_init: initial weights, one per column of x (any number of features).
    b_init: initial bias.
    learning_rate: step size applied to both gradients.
    run_iter: number of update steps to perform.
    print_iter: print progress and record a history entry every print_iter
      iterations, starting with iteration 0. A value of 0 or None disables
      both printing and history recording.

  Returns:
    A tuple (w, b, w_hist, b_hist, c_hist): the final weights and bias, and
    the weights, biases and costs captured at each logged iteration.
  """
  # Work on a float array so integer or list initial weights behave the same.
  w = np.asarray(w_init, dtype=float)
  b = b_init
  w_hist = []
  b_hist = []
  c_hist = []
  for i in range(run_iter):
    # Prediction error of every sample under the current parameters.
    error = (x * w).sum(axis=1) + b - y
    # Gradient along b.
    b_gradient = error.mean()
    # Gradient along every weight at once: scale each row of x by that row's
    # error and average down the rows. This works for any number of feature
    # columns instead of assuming exactly four.
    w_gradient = (x * error[:, np.newaxis]).mean(axis=0)
    # Step against the gradient. A new array is created each time, so the
    # arrays stored in w_hist are never modified afterwards.
    w = w - w_gradient * learning_rate
    b = b - b_gradient * learning_rate

    # Cost after the update.
    cost = compute_cost(x, y, w, b)

    # Guard against print_iter == 0 (or None), which would otherwise raise on the modulo.
    if print_iter and i % print_iter == 0:
      formatted_w = [f"{val: .2e}" for val in w]
      print(f"iteration {i: 6d}: w={formatted_w} b={b:.2e} cost={cost:.2e}")
      w_hist.append(w)
      b_hist.append(b)
      c_hist.append(cost)

  # Final parameters plus the recorded history.
  return w, b, w_hist, b_hist, c_hist

In [None]:
# Try out gradient_decent() on the training set
w_init = np.array([1, 2, 3, 4])
b_init = 1
learning_rate = 1e-3
run_iter = 100_000
print_iter = 5_000

# Keep the final parameters plus the history recorded at every logged iteration.
w_final, b_final, w_hist, b_hist, c_hist = gradient_decent(
  x=x_train,
  y=y_train,
  w_init=w_init,
  b_init=b_init,
  learning_rate=learning_rate,
  run_iter=run_iter,
  print_iter=print_iter,
)

iteration      0: w=[' 1.01e+00', ' 2.01e+00', ' 2.99e+00', ' 4.00e+00'] b=1.05e+00 cost=2.72e+03
iteration   5000: w=[' 2.95e+00', ' 1.32e+01', '-2.41e+00', '-2.64e+00'] b=5.06e+01 cost=2.65e+01
iteration  10000: w=[' 3.75e+00', ' 1.39e+01', '-1.79e+00', '-3.52e+00'] b=5.09e+01 cost=2.53e+01
iteration  15000: w=[' 4.01e+00', ' 1.41e+01', '-1.56e+00', '-3.71e+00'] b=5.09e+01 cost=2.52e+01
iteration  20000: w=[' 4.09e+00', ' 1.41e+01', '-1.49e+00', '-3.76e+00'] b=5.09e+01 cost=2.52e+01
iteration  25000: w=[' 4.11e+00', ' 1.41e+01', '-1.47e+00', '-3.78e+00'] b=5.09e+01 cost=2.52e+01
iteration  30000: w=[' 4.12e+00', ' 1.41e+01', '-1.46e+00', '-3.78e+00'] b=5.09e+01 cost=2.52e+01
iteration  35000: w=[' 4.12e+00', ' 1.41e+01', '-1.46e+00', '-3.78e+00'] b=5.09e+01 cost=2.52e+01
iteration  40000: w=[' 4.12e+00', ' 1.41e+01', '-1.46e+00', '-3.78e+00'] b=5.09e+01 cost=2.52e+01
iteration  45000: w=[' 4.12e+00', ' 1.41e+01', '-1.46e+00', '-3.78e+00'] b=5.09e+01 cost=2.52e+01
iteration  50000: w=

In [None]:
# Evaluate the fitted parameters on the test set
y_pred = np.sum(x_test * w_final, axis=1) + b_final
# NOTE: this is not the cell's last expression, so the comparison table is built but not displayed.
pd.DataFrame({"y_pred": y_pred, "y_test": y_test})
# The cell's output: the cost on the test set.
compute_cost(x_test, y_test, w_final, b_final)

18.116060544193736

In [None]:
# Use the model
# Example input: 5.3 years of experience, EducationLevel 2 (master's degree or above), city A
# Build the feature row in the training column order and apply the scaler fitted on the training data
x = scaler.transform(np.array([[5.3, 2, 1, 0]]))
y_pred = np.sum(x * w_final, axis=1) + b_final
y_pred

array([65.53661553])