ref : https://twinsynergy.co.th/70-machine-learning-datasets-project-ideas/

dataset : https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html 

4. The Boston Housing Dataset

เป็นชุดข้อมูลเกี่ยวกับบ้านหลังต่าง ๆ ในเมืองบอสตัน คนนิยมนำไปใช้ในการทำ pattern recognition ภายในชุดข้อมูลประกอบไปด้วย อัตราการเกิดอาชญากรรม ภาษี จำนวนห้อง ฯลฯ มีทั้งหมด 506 แถว แต่ละแถวมี 14 ตัวแปร เราสามารถใช้ชุดข้อมูลนี้เพื่อทำนายราคาบ้านได้

Data Science Project Idea: ทำนายราคาที่อยู่อาศัยของบ้านหลังใหม่โดยใช้ linear regression ซึ่ง linear regression ใช้ในการทำนายค่าของอินพุตที่ไม่รู้จัก เมื่อข้อมูลมีความสัมพันธ์เชิงเส้นระหว่างตัวแปรอินพุตและเอาต์พุต

Variables

There are 14 attributes in each case of the dataset. They are:

CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million) 

RM - average number of rooms per dwelling


AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - % lower status of the population

MEDV - Median value of owner-occupied homes in $1000's

## Import Libraries

In [None]:
# Import fundamental libraries
import pandas as pd
import matplotlib.pyplot as plt
# linear regression
from sklearn.linear_model import LinearRegression
# for splitting the data into train & test sets
from sklearn.model_selection import train_test_split
# for showing OLS regression result 
import statsmodels.api as sm
# for calculating RMSE
from sklearn.metrics import mean_squared_error
import math

## Import dataset

In [None]:
# Load the Boston Housing data from a CSV file; the relative path is resolved
# against the current working directory.
df = pd.read_csv('Boston_Housing.csv')
# A bare expression on the last line of a cell makes the notebook display it.
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,0
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,0
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,0
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,0


## Preprocessing

In [None]:
# Check for missing values: df.info() prints each column's non-null count and
# dtype, so a count lower than the number of entries reveals missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CRIM       506 non-null    float64
 1   ZN         506 non-null    float64
 2   INDUS      506 non-null    float64
 3   CHAS       506 non-null    int64  
 4   NOX        506 non-null    float64
 5   RM         506 non-null    float64
 6   AGE        506 non-null    float64
 7   DIS        506 non-null    float64
 8   RAD        506 non-null    int64  
 9   TAX        506 non-null    int64  
 10  PTRATIO    506 non-null    float64
 11  B          506 non-null    float64
 12  LSTAT      506 non-null    float64
 13  MEDV       506 non-null    float64
 14  CAT. MEDV  506 non-null    int64  
dtypes: float64(11), int64(4)
memory usage: 59.4 KB


There are no missing values: every column has 506 non-null entries.

## Preprocessing --> select features for fitting the model

In [None]:
# Build the feature table by removing two columns:
#   - MEDV, because it is the variable we want to predict;
#   - CAT. MEDV, because it is not needed for this model.
df2 = df.drop(columns=['MEDV', 'CAT. MEDV'])
# The prediction target y is the MEDV column of the original frame.
y = df['MEDV']
df2

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


In [None]:
# The regression is run on numerical variables only, so the nominal
# variables CHAS and RAD are removed from the candidate features.
X = df2.drop(columns=['CHAS', 'RAD'])
X

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.0900,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.573,6.593,69.1,2.4786,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.573,6.120,76.7,2.2875,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.573,6.976,91.0,2.1675,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.573,6.794,89.3,2.3889,273,21.0,393.45,6.48


In [None]:
# Fit an OLS model with statsmodels to check the p-value of each attribute.
# A p-value close to 0 means the attribute is more significant for the model.
# NOTE(review): X has no constant column, so this fit has no intercept (the
# summary reports an "uncentered" R-squared), whereas the sklearn
# LinearRegression fitted later uses fit_intercept=True. Consider
# sm.add_constant(X) here -- TODO confirm the selected features still hold.
est = sm.OLS(y, X)
est2 = est.fit()
print(est2.summary())

                                 OLS Regression Results                                
Dep. Variable:                   MEDV   R-squared (uncentered):                   0.958
Model:                            OLS   Adj. R-squared (uncentered):              0.957
Method:                 Least Squares   F-statistic:                              1018.
Date:                Tue, 12 Oct 2021   Prob (F-statistic):                        0.00
Time:                        06:05:06   Log-Likelihood:                         -1533.1
No. Observations:                 506   AIC:                                      3088.
Df Residuals:                     495   BIC:                                      3135.
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

We keep only the attributes whose p-value is close to 0, so we choose ZN, RM, DIS, PTRATIO, B and LSTAT.

In [None]:
# Keep only ZN, RM, DIS, PTRATIO, B and LSTAT by dropping the nominal columns
# (CHAS, RAD) together with the other attributes that were not selected.
dropped_columns = ['CHAS', 'RAD', 'CRIM', 'INDUS', 'NOX', 'AGE', 'TAX']
X = df2.drop(columns=dropped_columns)
X

Unnamed: 0,ZN,RM,DIS,PTRATIO,B,LSTAT
0,18.0,6.575,4.0900,15.3,396.90,4.98
1,0.0,6.421,4.9671,17.8,396.90,9.14
2,0.0,7.185,4.9671,17.8,392.83,4.03
3,0.0,6.998,6.0622,18.7,394.63,2.94
4,0.0,7.147,6.0622,18.7,396.90,5.33
...,...,...,...,...,...,...
501,0.0,6.593,2.4786,21.0,391.99,9.67
502,0.0,6.120,2.2875,21.0,396.90,9.08
503,0.0,6.976,2.1675,21.0,396.90,5.64
504,0.0,6.794,2.3889,21.0,393.45,6.48


## Model

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create a linear regression model with sklearn's default settings.
model = LinearRegression()
# Fit on the training split. fit() is the cell's last expression, so the
# notebook echoes the fitted estimator.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Predict MEDV for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Check performance on the test split: RMSE is the square root of the mean
# squared error between the true and the predicted values.
MSE = mean_squared_error(y_test, y_pred)
RMSE = math.sqrt(MSE)
# A single print with a newline separator emits the heading, a blank line and
# then the value.
print("Root Mean Square Error:\n", RMSE, sep="\n")

Root Mean Square Error:

5.444903134324248
