In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [3]:
# Read the housing data from disk, then show a quick overview:
# the first rows, the (rows, columns) shape, and the NaN count per column.
data = pd.read_csv("data/real_estate_data.csv")
first_rows = data.head()
print(first_rows)
print("Shape: ", data.shape)
missing_per_column = data.isna().sum()
print("Null Check: ", missing_per_column)

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

   LSTAT  MEDV  
0   4.98  24.0  
1   9.14  21.6  
2   4.03  34.7  
3   2.94  33.4  
4    NaN  36.2  
Shape:  (506, 13)
Null Check:  CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
LSTAT      20
MEDV        0
dtype: int64


In [4]:
# Discard every row that contains at least one missing value.
# Rebinding the name (rather than inplace=True) keeps the cell idempotent and
# avoids mutating the frame object that earlier cells already displayed.
data = data.dropna()
print("Shape: ", data.shape)

Shape:  (394, 13)


In [5]:
# Separate the target column (MEDV) from the remaining feature columns.
target_column = "MEDV"
Y = data[target_column]
X = data.drop(target_column, axis=1)
print(X.head())
print(Y.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
5  0.02985   0.0   2.18   0.0  0.458  6.430  58.7  6.0622    3  222     18.7   

   LSTAT  
0   4.98  
1   9.14  
2   4.03  
3   2.94  
5   5.21  
0    24.0
1    21.6
2    34.7
3    33.4
5    28.7
Name: MEDV, dtype: float64


In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=1)
# "squared_error" is the function used to measure split error (Mean Squared Error).
# It is the current name of the criterion formerly spelled "mse"; the old
# spelling was deprecated in scikit-learn 1.0 and removed in 1.2.
# random_state pins the feature permutation done at each split so that
# re-running the cell yields the same tree.
regression_tree = DecisionTreeRegressor(criterion="squared_error", random_state=1)
regression_tree.fit(X_train, Y_train)
# score() reports the coefficient of determination (R^2) on the held-out rows.
r_squared = regression_tree.score(X_test, Y_test)
print(r_squared)

0.8238838930549399




We can also compute the mean absolute error on the test set, i.e. how far off, on average, our predictions of the median home value are.


In [7]:
# Average absolute gap between predicted and actual median home value on the
# test set. The mean is multiplied by 1000 to print it as a dollar amount
# (assumes MEDV is recorded in thousands of dollars -- TODO confirm against
# the dataset description).
prediction = regression_tree.predict(X_test)
absolute_errors = (prediction - Y_test).abs()
print("$", absolute_errors.mean() * 1000)

$ 2886.0759493670885


In [9]:
# Trying a different split criterion: Mean Absolute Error.
# "absolute_error" is the current name of the criterion formerly spelled
# "mae"; the old spelling was deprecated in scikit-learn 1.0 and removed in 1.2.
# random_state pins the feature permutation done at each split so that
# re-running the cell yields the same tree.
regression_tree = DecisionTreeRegressor(criterion="absolute_error", random_state=1)
regression_tree.fit(X_train, Y_train)
# score() reports the coefficient of determination (R^2) on the held-out rows.
r_squared = regression_tree.score(X_test, Y_test)
print(r_squared)
# Mean absolute prediction error on the test set, scaled by 1000 to print it
# as a dollar amount (assumes MEDV is in thousands of dollars -- TODO confirm).
prediction = regression_tree.predict(X_test)
print("$",(prediction - Y_test).abs().mean()*1000)

0.8726504266144106
$ 2641.772151898734


