In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

data = pd.read_csv('D:/intro/DaSE_intro/11.15/bike.csv', header = 0) #以第一行为标题
print(data)

         id  city  hour  is_workday  weather  temp_1  temp_2  wind    y
0         1     0    22           1        2     3.0     0.7     0   15
1         2     0    10           1        1    21.0    24.9     3   48
2         3     0     0           1        1    25.3    27.4     0   21
3         4     0     7           0        1    15.7    16.2     0   11
4         5     1    10           1        1    21.1    25.0     2   39
...     ...   ...   ...         ...      ...     ...     ...   ...  ...
9995   9996     0     4           0        2     8.3     7.3     0    2
9996   9997     1     5           0        1    22.3    22.2     0    1
9997   9998     1     0           0        1     9.6     9.7     0   11
9998   9999     0    18           0        2    27.4    29.7     1  105
9999  10000     0     9           1        2     3.2    -2.1     2   48

[10000 rows x 9 columns]


In [2]:
data = data.drop('id', axis = 1)
print(data)

      city  hour  is_workday  weather  temp_1  temp_2  wind    y
0        0    22           1        2     3.0     0.7     0   15
1        0    10           1        1    21.0    24.9     3   48
2        0     0           1        1    25.3    27.4     0   21
3        0     7           0        1    15.7    16.2     0   11
4        1    10           1        1    21.1    25.0     2   39
...    ...   ...         ...      ...     ...     ...   ...  ...
9995     0     4           0        2     8.3     7.3     0    2
9996     1     5           0        1    22.3    22.2     0    1
9997     1     0           0        1     9.6     9.7     0   11
9998     0    18           0        2    27.4    29.7     1  105
9999     0     9           1        2     3.2    -2.1     2   48

[10000 rows x 8 columns]


In [3]:
# 筛选出上海市的所有数据，然后剔除city列
shanghai = data.loc[data['city'] == 1]
shanghai = shanghai.drop('city', axis = 1)
print(shanghai)

      hour  is_workday  weather  temp_1  temp_2  wind    y
4       10           1        1    21.1    25.0     2   39
5        0           1        1    20.4    18.2     0   12
9        4           1        3    17.4    18.0     3    2
10       0           1        1    14.9    15.3     2    6
11       8           0        1    25.0    28.1     0   25
...    ...         ...      ...     ...     ...   ...  ...
9990    23           1        2    19.2    19.9     1   44
9991    19           1        1    25.1    26.2     2  124
9993     5           1        3    13.7    14.1     2    1
9996     5           0        1    22.3    22.2     0    1
9997     0           0        1     9.6     9.7     0   11

[4998 rows x 7 columns]


In [4]:
# 将hour列中原来6点-18点统一为1；19点-次日5点统一为0
shanghai.loc[shanghai['hour'] >= 19, 'hour'] = 0
shanghai.loc[shanghai['hour'] <= 5, 'hour'] = 0
shanghai.loc[shanghai['hour'] != 0, 'hour'] = 1
print(shanghai)

      hour  is_workday  weather  temp_1  temp_2  wind    y
4        1           1        1    21.1    25.0     2   39
5        0           1        1    20.4    18.2     0   12
9        0           1        3    17.4    18.0     3    2
10       0           1        1    14.9    15.3     2    6
11       1           0        1    25.0    28.1     0   25
...    ...         ...      ...     ...     ...   ...  ...
9990     0           1        2    19.2    19.9     1   44
9991     0           1        1    25.1    26.2     2  124
9993     0           1        3    13.7    14.1     2    1
9996     0           0        1    22.3    22.2     0    1
9997     0           0        1     9.6     9.7     0   11

[4998 rows x 7 columns]


In [5]:
# y列为单车租用数量，是我们的预测目标（标签），请将该列提取出来，并转换为一个numpy列向量，将原先的y列剔除
bike = shanghai['y'].to_numpy() # 转化成一个numpy数组
bike = bike.reshape(len(bike), 1)
shanghai = shanghai.drop('y', axis = 1)
print(bike)

[[39]
 [12]
 [ 2]
 ...
 [ 1]
 [ 1]
 [11]]


In [6]:
# 请将DataFrame对象转换为Numpy数组，方便后续操作
shanghai = np.array(shanghai)
print(shanghai)

[[ 1.   1.   1.  21.1 25.   2. ]
 [ 0.   1.   1.  20.4 18.2  0. ]
 [ 0.   1.   3.  17.4 18.   3. ]
 ...
 [ 0.   1.   3.  13.7 14.1  2. ]
 [ 0.   0.   1.  22.3 22.2  0. ]
 [ 0.   0.   1.   9.6  9.7  0. ]]


In [7]:
# 按照训练集与测试集8:2的比例将原始数据集划分
# shuffle参数用于“洗牌”，打乱原始的数据行排列顺序
x_train, x_test, y_train, y_test = train_test_split(shanghai, bike, test_size=0.2, shuffle=True)
print('训练集数据量：',len(x_train), len(y_train))
print('测试集数据量：',len(x_test), len(y_test))

训练集数据量： 3998 3998
测试集数据量： 1000 1000


In [8]:
# 分别对训练集数据、训练集标签、测试集数据和测试集标签进行归一化
scale = MinMaxScaler() # 归一化模型
x_train = scale.fit_transform(x_train)
x_test = scale.fit_transform(x_test)
y_train = scale.fit_transform(y_train)
y_test = scale.fit_transform(y_test)

In [9]:
# 构建一个线性回归模型（多元一次函数），然后利用训练集训练模型
model = LinearRegression()
model.fit(x_train, y_train)
print(model.coef_) # 系数
print(model.intercept_) # 截距

[[ 0.1647886   0.00160362 -0.08069044  0.25552505  0.1517082   0.03036236]]
[-0.06253923]


In [10]:
# 模型测试：利用测试集对训练好的模型进行评估
test = model.predict(x_test)
#print(test)

In [11]:
# 模型评估：请使用均方根误差（RMSE）作为评估指标，并输出RMSE值
mse = mean_squared_error(y_test, test)
rmse = math.sqrt(mse)
print(rmse)

0.17213183506123064
