In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib   
from sympy import diff
from sympy import symbols
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus']=False

# 回归数据集  
> - 下载地址：https://www.kaggle.com/datasets/altavish/boston-housing-dataset  
> - 这是一个关于波士顿房价及其相关属性描述的数据集，包含506个观测值和14个变量，可用来做线性回归分析。数据集中存在缺失值，本文直接删除含有缺失值的观测，删除后剩余394个观测值。  
> - 数据描述如下：  
> > - CRIM：犯罪率  
> > - ZN：占地面积超过25000平方英尺的住宅用地所占比例  
> > - INDUS：非零售商业用地占比  
> > - CHAS：是否临Charles河（1则临河，0则不临河）  
> > - NOX：氮氧化物浓度  
> > - RM：房屋房间数  
> > - AGE：1940年以前建成的自住房屋所占比例  
> > - DIS：和就业中心的距离  
> > - RAD：到达放射状高速公路的便利程度指数  
> > - TAX：税率  
> > - PTRATIO：学生人数比老师人数  
> > - B：城镇黑人比例计算的统计值  
> > - LSTAT：低收入人群比例  
> > - MEDV：房价中位数  
> - **其中MEDV为输出变量，其余为输入变量**

In [2]:
# Read the housing CSV from the working directory and preview the first five rows.
filename = 'HousingData.csv'
data = pd.read_csv(filepath_or_buffer=filename)
data.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [3]:
# Print per-column dtypes and non-null counts to spot which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      486 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    486 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [4]:
# Missing-value handling: discard every row that has at least one NaN.
# Rebind the name rather than mutating with inplace=True, so the statement
# reads as "new frame from old frame" and stays safe to re-run.
data = data.dropna()
# Confirm that every remaining column is fully populated.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 394 entries, 0 to 504
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     394 non-null    float64
 1   ZN       394 non-null    float64
 2   INDUS    394 non-null    float64
 3   CHAS     394 non-null    float64
 4   NOX      394 non-null    float64
 5   RM       394 non-null    float64
 6   AGE      394 non-null    float64
 7   DIS      394 non-null    float64
 8   RAD      394 non-null    int64  
 9   TAX      394 non-null    int64  
 10  PTRATIO  394 non-null    float64
 11  B        394 non-null    float64
 12  LSTAT    394 non-null    float64
 13  MEDV     394 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 46.2 KB


In [5]:
# Design matrix: every column except the last is a predictor, and
# statsmodels adds a constant column so the model has an intercept.
X = sm.add_constant(data.iloc[:, :-1].to_numpy())

# m = number of rows, n = number of columns of X (the constant column included).
m, n = X.shape

# Response: the last column, reshaped into an (m, 1) column vector.
Y = data.iloc[:, -1].to_numpy().reshape(m, 1)

## 正规方程

In [6]:
# Normal equation: beta satisfies (X^T X) beta = X^T Y.
# Solve that linear system directly instead of forming the explicit inverse
# of X^T X; it yields the same least-squares estimate with less rounding error.
np.set_printoptions(precision=4, suppress=True)
beta = np.linalg.solve(X.T @ X, X.T @ Y)
print('回归系数为:', beta)

回归系数为: [[ 32.6801]
 [ -0.0976]
 [  0.0489]
 [  0.0304]
 [  2.7694]
 [-17.969 ]
 [  4.2833]
 [ -0.013 ]
 [ -1.4585]
 [  0.2859]
 [ -0.0131]
 [ -0.9146]
 [  0.0097]
 [ -0.4237]]


## statsmodels库函数（OLS）

In [7]:
# Fit the same regression with the statsmodels OLS routine (a third-party
# library, not a Python built-in) and show its summary table.
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.767
Model:,OLS,Adj. R-squared:,0.759
Method:,Least Squares,F-statistic:,96.29
Date:,"Thu, 31 Oct 2024",Prob (F-statistic):,1.75e-111
Time:,20:55:37,Log-Likelihood:,-1143.4
No. Observations:,394,AIC:,2315.0
Df Residuals:,380,BIC:,2370.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,32.6801,5.681,5.752,0.000,21.509,43.851
x1,-0.0976,0.032,-3.007,0.003,-0.161,-0.034
x2,0.0489,0.014,3.397,0.001,0.021,0.077
x3,0.0304,0.066,0.461,0.645,-0.099,0.160
x4,2.7694,0.925,2.993,0.003,0.950,4.588
x5,-17.9690,4.243,-4.235,0.000,-26.311,-9.627
x6,4.2833,0.471,9.100,0.000,3.358,5.209
x7,-0.0130,0.014,-0.898,0.370,-0.041,0.015
x8,-1.4585,0.211,-6.912,0.000,-1.873,-1.044

0,1,2,3
Omnibus:,161.243,Durbin-Watson:,1.247
Prob(Omnibus):,0.0,Jarque-Bera (JB):,904.814
Skew:,1.657,Prob(JB):,3.3299999999999995e-197
Kurtosis:,9.643,Cond. No.,15700.0


## 梯度下降法

In [8]:
def H(theta, x):
    """Linear hypothesis: return the matrix product of ``theta`` and ``x``.

    Both arguments are handed to ``np.matmul`` unchanged, so the shape of the
    result follows NumPy's matmul rules (two 1-D inputs give a scalar).
    """
    return np.matmul(theta, x)

In [9]:
# Batch gradient descent for the least-squares cost
#     J(theta) = 1/(2m) * sum_i (H(theta, x_i) - y_i)^2
# Uses X, Y, m, n and H defined in earlier cells.
tol = 1e-6   # stop once the largest coefficient change in one step is <= tol
N = 5e6      # upper bound on the number of iterations
error = 1    # starts above tol so the loop body runs at least once

alpha = 0.000005   # step size applied to the averaged gradient
k = 0              # iterations performed so far
kk = 0             # number of cost values written into J
log_every = 10000  # the cost is recorded once every log_every iterations

# Initial guess, stored as floats rather than as an integer array.
theta_old = np.array([32, 0, 0, 0, 3, -17, 4, 0, -1, 0, 0, -1, 0, 0], dtype=float)
theta_new = np.zeros((n, ))
# Only the first kk entries of J are ever filled in; the remainder stays 0.
J = np.zeros((int(N), ))

while error>tol and k<N:
    # Residuals of all m samples at once: H(theta, X.T) has shape (m,).
    residuals = H(theta_old, X.T) - Y[:, 0]
    # Sum over samples of residual_i * x_i, i.e. m times the gradient of J.
    summation = X.T @ residuals
    theta_new = theta_old - alpha*(1.0/m)*summation

    if k % log_every == 0:
        # Cost at theta_old: sum the squared residuals first and divide by 2m
        # exactly once (the division must not happen per sample).
        J[kk] = np.sum(residuals**2) / (2*m)
        kk = kk + 1

    k = k + 1
    # Convergence measure: largest absolute change of any coefficient.
    error = np.max(np.abs(theta_new - theta_old))
    theta_old = theta_new

print('回归系数为', theta_new)
print(f'迭代次数为：{k}')

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
回归系数为 [ 32.005   -0.0938   0.0461   0.0375   2.9979 -16.9961   4.1253  -0.0065
  -1.2699   0.2842  -0.0129  -0.9189   0.0099  -0.4384]
迭代次数为：156149
