In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
house_df = pd.read_csv(r"./model_data/houses_all_set.csv", sep=",")

In [3]:
house_df.head()

Unnamed: 0,id,region,city,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,popular,unit_price,area,total_price
0,1,昆明市教工二幼儿园宿舍,K,2,2,SE NW,0,0,6,9,19.0,0.3632,1.3,50.0,
1,2,新亚洲体育城星宇园,K,3,2,SE,3,0,0,0,19.0,0.0,1.3446,119.0,160.0
2,3,禧瑞都,K,3,2,SE,2,0,0,0,19.0,0.0,1.1021,147.0,162.0
3,4,金色交响家园,K,4,2,SE,2,0,1,1,15.0,0.0667,1.4477,105.0,152.0
4,5,金碧阳光商住楼,K,3,2,S N,0,0,1,3,19.0,0.0842,1.8196,129.15,235.0


### train_test_split

In [4]:
# 以total_price是否为空，将数据分为两部分
cond_idx1 = house_df.total_price.notnull()  # total_price NOT NULL
cond_idx2 = house_df.total_price.isnull()   # total_price IS NULL

tp_notnull = house_df[cond_idx1]
tp_isnull  = house_df[cond_idx2]

In [5]:
# 将 tp_notnull 和 tp_isnull 写入文件
tp_notnull.to_csv(r"./model_data/house_train.csv", index=False)
tp_isnull.to_csv(r"./model_data/house_test.csv", index=False)

In [6]:
# DataFrame to ndarray 
X_train, y_train = np.array(tp_notnull.loc[:, ['unit_price','area']]), np.array(tp_notnull['total_price'])
X_test, y_test = np.array(tp_isnull.loc[:, ['unit_price','area']]), np.array(tp_isnull['total_price'])

In [7]:
X_train.shape

(8834, 2)

In [8]:
X_test[:5]

array([[ 1.3   , 50.    ],
       [ 1.0134, 55.26  ],
       [ 0.9767, 93.18  ],
       [ 2.4586, 38.64  ],
       [ 2.    , 40.    ]])

In [9]:
y_train.shape

(8834,)

In [10]:
y_test[:5]

array([nan, nan, nan, nan, nan])

### 使用多项式回归构建 total_price 的二维特征

In [11]:
from sklearn.preprocessing import PolynomialFeatures

poly_1 = PolynomialFeatures(degree=2)
poly_1.fit(X_train)
# 构建训练集 X_train 的二维特征矩阵
X_train2 = poly_1.transform(X_train)

poly_2 = PolynomialFeatures(degree=2)
poly_2.fit(X_test)
# 构建测试集 X_test 的二维特征矩阵
X_test2 = poly_2.transform(X_test)

In [12]:
X_train2[:5]

array([[1.00000000e+00, 1.34460000e+00, 1.19000000e+02, 1.80794916e+00,
        1.60007400e+02, 1.41610000e+04],
       [1.00000000e+00, 1.10210000e+00, 1.47000000e+02, 1.21462441e+00,
        1.62008700e+02, 2.16090000e+04],
       [1.00000000e+00, 1.44770000e+00, 1.05000000e+02, 2.09583529e+00,
        1.52008500e+02, 1.10250000e+04],
       [1.00000000e+00, 1.81960000e+00, 1.29150000e+02, 3.31094416e+00,
        2.35001340e+02, 1.66797225e+04],
       [1.00000000e+00, 1.27230000e+00, 9.43200000e+01, 1.61874729e+00,
        1.20003336e+02, 8.89626240e+03]])

In [13]:
X_test2[:5]

array([[1.00000000e+00, 1.30000000e+00, 5.00000000e+01, 1.69000000e+00,
        6.50000000e+01, 2.50000000e+03],
       [1.00000000e+00, 1.01340000e+00, 5.52600000e+01, 1.02697956e+00,
        5.60004840e+01, 3.05366760e+03],
       [1.00000000e+00, 9.76700000e-01, 9.31800000e+01, 9.53942890e-01,
        9.10089060e+01, 8.68251240e+03],
       [1.00000000e+00, 2.45860000e+00, 3.86400000e+01, 6.04471396e+00,
        9.50003040e+01, 1.49304960e+03],
       [1.00000000e+00, 2.00000000e+00, 4.00000000e+01, 4.00000000e+00,
        8.00000000e+01, 1.60000000e+03]])

### 线性回归预测total_price 缺失值

In [14]:
from sklearn.linear_model import LinearRegression

# Create an instance
lin_reg = LinearRegression()
# fit
lin_reg.fit(X_train2, y_train)
# predict
y_predict = lin_reg.predict(X_test2)

In [15]:
y_predict[:10]

array([64.99732753, 55.99757776, 91.00437235, 94.99817919, 79.99778854,
       96.00169512, 67.99968297, 67.99852122, 94.99604105, 89.00197127])

In [16]:
# 将预测结果转化为 pd.Series，并对每个值四舍五入保留2位小数
tprice_pred = pd.DataFrame(data=y_predict, columns=["total_price"], dtype='float64').apply(lambda x: round(x, 2))

In [17]:
tprice_pred.head()

Unnamed: 0,total_price
700,95.0
701,97.0
702,96.0
703,80.0
704,95.0


In [18]:
# ## total_price 的真实值
y_true_df = pd.read_csv(r"./model_data/tprice_missing.csv", names=['total_price'])

y_true = np.array(y_true_df, dtype='float64')
y_true[:10]

array([[65.  ],
       [56.  ],
       [91.01],
       [95.  ],
       [80.  ],
       [96.01],
       [68.  ],
       [68.  ],
       [95.  ],
       [89.  ]])

In [19]:
# 将total_price的预测结果写入文件
# tprice_pred.to_csv(r"./model_data/predicts/tprice_miss_pred.csv", index=False)

In [20]:
# 重新读入 'train' and 'test' 数据集
train2 = pd.read_csv(r"./model_data/house_train.csv")
test2 = pd.read_csv(r"./model_data/house_test.csv")

In [21]:
train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8834 entries, 0 to 8833
Data columns (total 15 columns):
id               8834 non-null int64
region           8834 non-null object
city             8834 non-null object
rooms            8834 non-null int64
halls            8834 non-null int64
towards          8834 non-null object
decoration       8834 non-null int64
have_elevator    8834 non-null int64
visited          8834 non-null int64
attention        8834 non-null int64
publishday       8834 non-null float64
popular          8834 non-null float64
unit_price       8834 non-null float64
area             8834 non-null float64
total_price      8834 non-null float64
dtypes: float64(5), int64(7), object(3)
memory usage: 1.0+ MB


In [22]:
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 15 columns):
id               705 non-null int64
region           705 non-null object
city             705 non-null object
rooms            705 non-null int64
halls            705 non-null int64
towards          705 non-null object
decoration       705 non-null int64
have_elevator    705 non-null int64
visited          705 non-null int64
attention        705 non-null int64
publishday       705 non-null float64
popular          705 non-null float64
unit_price       705 non-null float64
area             705 non-null float64
total_price      0 non-null float64
dtypes: float64(5), int64(7), object(3)
memory usage: 82.7+ KB


In [23]:
test2.tail()

Unnamed: 0,id,region,city,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,popular,unit_price,area,total_price
700,8894,芦潮港农场西区新村,S,3,2,SE,2,0,2,44,19.2,0.7604,1.3725,69.22,
701,9104,果园小区,S,1,1,S,1,0,5,9,15.0,0.4133,2.8572,33.95,
702,9110,松云银座,S,1,1,E,2,1,3,7,31.0,0.1355,2.0875,45.99,
703,9157,万达广场(松江),S,1,1,N,0,1,9,2,31.0,0.2226,1.9089,41.91,
704,9331,解放新村,S,1,1,S,2,0,2,55,19.2,0.9323,2.6761,35.5,


In [24]:
# 将测试集 house_test 的 'total_price' 列重新赋值为预测的新数据
test2['total_price'] = tprice_pred['total_price']

In [25]:
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 15 columns):
id               705 non-null int64
region           705 non-null object
city             705 non-null object
rooms            705 non-null int64
halls            705 non-null int64
towards          705 non-null object
decoration       705 non-null int64
have_elevator    705 non-null int64
visited          705 non-null int64
attention        705 non-null int64
publishday       705 non-null float64
popular          705 non-null float64
unit_price       705 non-null float64
area             705 non-null float64
total_price      705 non-null float64
dtypes: float64(5), int64(7), object(3)
memory usage: 82.7+ KB


In [26]:
test2.sample(10)

Unnamed: 0,id,region,city,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,popular,unit_price,area,total_price
269,1572,保利城二期,C,2,2,S,0,1,6,45,18.5,0.9568,1.2338,72.95,90.0
337,1878,保利城三期,C,2,2,W,1,1,21,56,18.5,1.7027,1.1112,74.7,83.0
34,159,和谐世纪,K,1,1,SE,0,0,0,1,19.0,0.0158,1.0,55.0,55.0
334,1862,石人南路23号,C,2,1,S,0,0,19,99,18.5,2.3243,1.0546,61.64,65.0
404,2189,格凌兰三期,C,2,2,NE,0,1,12,55,18.5,1.3459,1.1882,71.54,85.0
236,1310,前锋小区,C,2,1,E,0,0,3,99,18.5,1.7189,1.1176,67.11,75.0
247,1391,长城宜苑,C,2,1,SE,0,1,11,1,17.0,0.4706,1.4569,48.05,70.0
22,104,建工新城锦绣园,K,2,2,S,0,0,1,0,26.0,0.0269,1.0682,88.0,94.0
254,1506,新风路39号,C,2,1,SE,2,0,0,22,18.5,0.3568,1.3536,67.23,91.0
150,790,抚琴东南路1号,C,2,1,S,2,0,10,138,18.5,2.6162,1.1685,55.63,65.0


In [27]:
# 将训练集 train2 和测试集 test2 整合为一个总的数据集
house_pred_all = pd.concat([train2, test2], axis=0)

In [28]:
house_pred_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9539 entries, 0 to 704
Data columns (total 15 columns):
id               9539 non-null int64
region           9539 non-null object
city             9539 non-null object
rooms            9539 non-null int64
halls            9539 non-null int64
towards          9539 non-null object
decoration       9539 non-null int64
have_elevator    9539 non-null int64
visited          9539 non-null int64
attention        9539 non-null int64
publishday       9539 non-null float64
popular          9539 non-null float64
unit_price       9539 non-null float64
area             9539 non-null float64
total_price      9539 non-null float64
dtypes: float64(5), int64(7), object(3)
memory usage: 1.2+ MB


In [29]:
# 将预测后的全部数据写入文件
# house_pred_all.to_csv(r"./model_data/predicts/houses_pred_all.csv", index=False)