In [64]:
import pandas as pd
from pandas_datareader import data
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [66]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [68]:
df = pd.DataFrame(housing.data)
df.columns = housing.feature_names
df['MedHouseValue'] = housing.target
housing.DESCR

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n:Number of Instances: 20640\n\n:Number of Attributes: 8 numeric, predictive attributes and the target\n\n:Attribute Information:\n    - MedInc        median income in block group\n    - HouseAge      median house age in block group\n    - AveRooms      average number of rooms per household\n    - AveBedrms     average number of bedrooms per household\n    - Population    block group population\n    - AveOccup      average number of household members\n    - Latitude      block group latitude\n    - Longitude     block group longitude\n\n:Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000).\n\nThis dataset was derived from the 1990 U.S

In [70]:
df.head

<bound method NDFrame.head of        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Lo

In [72]:
x = df.iloc[:, 0:8]
y = df.iloc[:,8]

In [74]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [80]:
dtr = DecisionTreeRegressor(criterion = 'squared_error', max_depth=5)
dtr.fit(x_train, y_train)
y_pred = dtr.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print("r2_score: ", r2_score(y_test, y_pred))

Mean Squared Error: 0.5245
r2_score:  0.5997321244428706


# Hyperparameter Tuning

In [91]:
param_grid = {
    'max_depth':[2,4,8,10,20,None],
    'max_features':[0.25,0.5,1.0],
    'min_samples_split':[2, 5, 10]
}

In [93]:
reg = GridSearchCV(DecisionTreeRegressor(), param_grid = param_grid)

In [95]:
reg.fit(x_train, y_train)

In [96]:
y_pred = reg.predict(x_test)
print(reg.best_params_)
r2_score(y_test, y_pred)

{'max_depth': 10, 'max_features': 1.0, 'min_samples_split': 10}


0.6795020516109198

# Feature Importance

In [100]:
for importance, name in sorted(zip(dtr.feature_importances_, x_train.columns),reverse=True):
  print (name, importance)

MedInc 0.7712117162048163
AveOccup 0.1284067461489551
HouseAge 0.0416208799360782
AveRooms 0.031260721268004193
Latitude 0.022049480286783153
Population 0.0024849982871779763
Longitude 0.002096950201375669
AveBedrms 0.0008685076668094034
