**Sample Decision Tree**

In [1]:
# load dataset
import pandas as pd
df = pd.read_csv('../../../data/Housing.csv')

In [2]:
# show first lines
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# list columns
df.columns.to_list()

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'MedHouseVal']

In [4]:
# column types
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal'],
      dtype='object')

In [5]:
# describe data
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [6]:
# missing values
df.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [7]:
# Select prediction Target (price)
y = df.price
y

AttributeError: 'DataFrame' object has no attribute 'price'

In [52]:
# choosing features to predict target
housing_features = ['bedrooms', 'bathrooms', 'sqft_living']
X = df[housing_features]
X


Unnamed: 0,bedrooms,bathrooms,sqft_living
0,2,1.00,1180
1,3,2.25,2570
2,2,1.00,770
3,4,3.00,1960
4,3,2.00,1680
...,...,...,...
21608,3,2.50,1530
21609,4,2.50,2310
21610,2,0.75,1020
21611,3,2.50,1600


In [53]:
# describe features
X.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living
count,21613.0,21613.0,21613.0
mean,3.370795,2.114757,2079.899736
std,0.930105,0.770163,918.440897
min,0.0,0.0,290.0
25%,3.0,1.75,1427.0
50%,3.0,2.25,1910.0
75%,4.0,2.5,2550.0
max,33.0,8.0,13540.0


In [54]:
# BUILDING THE MODEL
from sklearn.tree import DecisionTreeRegressor
housing_model = DecisionTreeRegressor(random_state=1)
housing_model.fit(X, y)

In [55]:
# Make predictions
X_sample = X.head(3)
y_pred = housing_model.predict(X_sample)
y_true = df.head(3).price
# Print results
print("Items to predict:")
print(X_sample)
print("Predictions:")
print(y_pred)
print("Real values:")
print(y_true.values)
print("Absolute errors:")
print((y_pred - y_true).values)

Items to predict:
   bedrooms  bathrooms  sqft_living
0         2       1.00         1180
1         3       2.25         2570
2         2       1.00          770
Predictions:
[354150.         503666.66666667 281901.62162162]
Real values:
[231300. 538000. 180000.]
Absolute errors:
[122850.         -34333.33333333 101901.62162162]


In [56]:
# Validate model using MAE (Mean Absolute Error)
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
print(f"The mean absolute error is {mae} in percentage {mape*100:.2f} %")

The mean absolute error is 86361.65165165164 in percentage 38.70 %


In [57]:
# divide data in train and validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
housing_model_train_test = DecisionTreeRegressor()
housing_model_train_test.fit(X=X_train, y=y_train)

In [59]:
# Predict items across validation items
y_pred_X_test = housing_model_train_test.predict(X_test)
print(f"Predictions over training data: {y_pred_X_test}")
# MAE
print(f"MAE is {mean_absolute_error(y_test, y_pred_X_test)} in percentage {mean_absolute_percentage_error(y_test, y_pred_X_test)*100:.2f} %")

Predictions over training data: [324375.         600000.         350000.         ... 896600.
 334572.22222222 412401.        ]
MAE is 191305.1793444333 in percentage 38.33 %
