In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
# Read the Ames housing CSV (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer="./AmesHousing.csv")
# Preview the last five rows; the bare expression is rendered as the cell output.
df.tail(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
2925,2926,923275080,80,RL,37.0,7937,Pave,,IR1,Lvl,...,0,,GdPrv,,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,,8885,Pave,,IR1,Low,...,0,,MnPrv,,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,62.0,10441,Pave,,Reg,Lvl,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,77.0,10010,Pave,,Reg,Lvl,...,0,,,,0,4,2006,WD,Normal,170000
2929,2930,924151050,60,RL,74.0,9627,Pave,,Reg,Lvl,...,0,,,,0,11,2006,WD,Normal,188000


In [None]:
# Print each column's dtype and non-null count to see which columns have missing values.
df.info()

In [5]:
# (number of rows, number of columns) of the raw frame.
df.shape

(2930, 82)

1. Tiền xử lý dữ liệu

Do thuật toán hồi quy chỉ làm việc với dữ liệu dạng số nên ta chuyển các biến có kiểu phân loại (categorical) thành các biến giả (“dummy”).

In [6]:
# One-hot encode the object/category-typed columns; numeric columns pass through as-is.
# NOTE(review): the identifier-like columns 'Order' and 'PID' remain in the frame and
# are later used as model features -- confirm that this is intended.
df1 = pd.get_dummies(data=df)
# Preview the first five encoded rows as the cell output.
df1.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_Abnorml,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,1,526301100,20,141.0,31770,6,5,1960,1960,112.0,...,0,0,0,1,0,0,0,0,1,0
1,2,526350040,20,80.0,11622,5,6,1961,1961,0.0,...,0,0,0,1,0,0,0,0,1,0
2,3,526351010,20,81.0,14267,6,6,1958,1958,108.0,...,0,0,0,1,0,0,0,0,1,0
3,4,526353030,20,93.0,11160,7,5,1968,1968,0.0,...,0,0,0,1,0,0,0,0,1,0
4,5,527105010,60,74.0,13830,5,5,1997,1998,0.0,...,0,0,0,1,0,0,0,0,1,0


Để xử lý dữ liệu bị thiếu, ta thay các giá trị thiếu bằng giá trị trung vị của cột tương ứng. Ta định nghĩa hàm impute_median() để thực hiện việc này.

In [7]:
def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median.

    Parameters
    ----------
    series : pandas.Series
        Numeric column that may contain NaN entries.

    Returns
    -------
    pandas.Series
        New series of the same length; ``series`` itself is not modified.
    """
    column_median = series.median()  # NaN entries are skipped by median()
    filled = series.fillna(column_median)
    return filled
# Numeric columns whose missing values are filled with the column median.
MEDIAN_IMPUTED_COLUMNS = [
    'Lot Frontage',
    'Mas Vnr Area',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    'Total Bsmt SF',
    'Bsmt Full Bath',
    'Bsmt Half Bath',
    'Garage Cars',
    'Garage Area',
]

# Apply the same median imputation to every listed column.
# NOTE(review): the medians are computed on the full data set, i.e. before the
# train/test split, so test rows influence the imputed values -- confirm this
# is acceptable for the evaluation.
for column_name in MEDIAN_IMPUTED_COLUMNS:
    df1[column_name] = impute_median(df1[column_name])

# List the columns that still contain NaN after the median imputation above
# (the bare expression is shown as the cell output).
df1.columns[df1.isna().any()].tolist()

['Garage Yr Blt']

Ta sẽ loại bỏ cột này (không xét đến khi xây dựng mô hình), đồng thời lưu dữ liệu đã tiền xử lý vào DataFrame df2 để tính toán trong các bước tiếp theo.

In [9]:
#Drop this column
df2=df1.drop('Garage Yr Blt',axis=1)
df2.head()

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_Abnorml,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,1,526301100,20,141.0,31770,6,5,1960,1960,112.0,...,0,0,0,1,0,0,0,0,1,0
1,2,526350040,20,80.0,11622,5,6,1961,1961,0.0,...,0,0,0,1,0,0,0,0,1,0
2,3,526351010,20,81.0,14267,6,6,1958,1958,108.0,...,0,0,0,1,0,0,0,0,1,0
3,4,526353030,20,93.0,11160,7,5,1968,1968,0.0,...,0,0,0,1,0,0,0,0,1,0
4,5,527105010,60,74.0,13830,5,5,1997,1998,0.0,...,0,0,0,1,0,0,0,0,1,0


2. Xử lý dữ liệu


Mảng y chứa giá bán của các ngôi nhà (cột SalePrice) và chính là đầu ra của mô hình. Mảng X chứa tất cả các cột còn lại (ngoại trừ cột SalePrice).

In [13]:
# Feature matrix: every column of the cleaned frame except the target.
X = df2.drop(columns=['SalePrice']).values

# Target vector: the sale price of each house.
y = df2['SalePrice'].values

# Print the shape of X ...
print(X.shape)
# ... and display the shape of y as the cell's result.
y.shape

(2930, 305)


(2930,)

In [14]:
# Turn y into a single-column 2-D array; -1 lets NumPy infer the row count.
y = np.reshape(y, (-1, 1))
y.shape

(2930, 1)

3. Chia dữ liệu

Chia dữ liệu thành 2 phần: dữ liệu huấn luyện (train data) và dữ liệu kiểm tra (test data). Tham số test_size=0.3 cho biết 30% dữ liệu được dùng để kiểm tra.

In [15]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

4. Huấn luyện mô hình

In [16]:
# Instantiate a linear regression estimator with default settings.
LR = LinearRegression()

# Learn the intercept and coefficients from the training split.
LR.fit(X=X_train, y=y_train)

In [17]:
# Report the fitted parameters: a header line, then one line per parameter group.
print("Mô hình hồi quy tuyến tính đã được huấn luyện, có các tham số:")
for label, value in (("Intercept =", LR.intercept_), ("Coefficients:", LR.coef_)):
    print(label, value)

Mô hình hồi quy tuyến tính đã được huấn luyện, có các tham số:
Intercept = [-2991353.05284685]
Coefficients: [[ 2.91525289e+00  5.09552521e-06 -9.58085749e+01  5.58719525e+01
   6.87156440e-01  6.29529927e+03  6.03226927e+03  3.69597140e+02
   3.10492569e+01  2.05862641e+01  3.90754730e+02  3.83869658e+02
   3.70544828e+02 -3.47313693e+02  1.77627562e+01  3.15233345e+01
  -1.60771497e+01  3.32089150e+01  5.19265397e+02 -2.50368977e+02
   2.35546845e+03  1.48057693e+03 -3.46583919e+03 -1.32271765e+04
   4.72073460e+01  5.54546021e+03  5.57766490e+03  7.29141674e+00
   1.43182197e+01  7.31505862e-01  4.17224255e+00  1.98061668e+01
   4.79586128e+01 -9.15212109e+01 -8.95695064e-01  8.43802293e+01
   1.02514117e+03  1.45276675e+04 -2.41747544e+04 -3.88227782e+03
   3.81612162e+04 -2.73078347e+03 -7.21031420e+03 -1.46907538e+04
  -1.15437447e+04  1.15437447e+04 -8.79446809e+02 -1.50055627e+03
   1.54886438e+02  4.91066294e+03 -5.15884909e+03  9.32997059e+01
  -8.33247815e+02  5.90117868e+03

5. Sử dụng mô hình

In [18]:
# Predict sale prices for the held-out test features with the fitted regressor.
y_prediction = LR.predict(X_test)

6. Đánh giá mô hình

In [19]:
# Coefficient of determination on the held-out test set.
score = r2_score(y_test, y_prediction)
# Mean squared error, and its square root expressed in the target's units.
mse = mean_squared_error(y_test, y_prediction)
rmse = np.sqrt(mse)

print('R2-score is ', score)
print('Mean_sqrd_error is==', mse)
print('Root_mean_squared error of is==', rmse)

R2-score is  0.8955304226952455
Mean_sqrd_error is== 734361363.6168921
Root_mean_squared error of is== 27099.102634900886
