In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import fetch_california_housing
california_df=fetch_california_housing()

In [3]:
df=pd.DataFrame(california_df.data,columns=california_df.feature_names)
df['Target']=california_df.target
## Taking Sample Data
df=df.sample(frac=0.25)

In [4]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
15463,4.0,4.0,5.268041,1.073883,1164.0,2.0,33.16,-117.23,2.281
16281,1.8814,40.0,4.418898,1.07874,2268.0,3.571654,37.94,-121.28,0.577
5040,2.2396,44.0,4.670968,1.012903,861.0,2.777419,33.98,-118.32,1.086
10597,4.2083,14.0,4.825737,0.970509,874.0,2.343164,33.69,-117.8,2.51
6078,5.5496,29.0,6.330049,0.960591,2205.0,3.62069,34.09,-117.86,2.182


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5160 entries, 15463 to 371
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      5160 non-null   float64
 1   HouseAge    5160 non-null   float64
 2   AveRooms    5160 non-null   float64
 3   AveBedrms   5160 non-null   float64
 4   Population  5160 non-null   float64
 5   AveOccup    5160 non-null   float64
 6   Latitude    5160 non-null   float64
 7   Longitude   5160 non-null   float64
 8   Target      5160 non-null   float64
dtypes: float64(9)
memory usage: 403.1 KB


In [6]:
df.shape

(5160, 9)

In [7]:
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
count,5160.0,5160.0,5160.0,5160.0,5160.0,5160.0,5160.0,5160.0,5160.0
mean,3.834942,28.528682,5.395297,1.093571,1422.70407,3.073272,35.612004,-119.55756,2.046493
std,1.85801,12.515776,2.017238,0.360423,1191.60009,8.407166,2.123101,2.001261,1.133525
min,0.4999,1.0,1.130435,0.5,8.0,1.089268,32.56,-124.3,0.14999
25%,2.55185,18.0,4.429031,1.005861,791.0,2.427543,33.93,-121.78,1.213
50%,3.5238,29.0,5.194769,1.047528,1156.0,2.821158,34.26,-118.49,1.784
75%,4.705925,37.0,6.020765,1.09952,1724.0,3.289445,37.69,-117.99,2.6035
max,15.0001,52.0,52.848214,11.410714,35682.0,599.714286,41.95,-114.55,5.00001


In [9]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

In [12]:
from sklearn.tree import DecisionTreeRegressor
regressor=DecisionTreeRegressor()

In [13]:
regressor.fit(X_train,y_train)

In [14]:
y_pred = regressor.predict(X_test)

In [15]:
y_pred

array([2.68   , 1.375  , 1.297  , ..., 1.554  , 5.00001, 1.922  ])

In [16]:
from sklearn.metrics import r2_score,classification_report,confusion_matrix


In [17]:
print(r2_score(y_pred,y_test))

0.48607099159097855


HyperParameter Tuning

In [18]:
parameter = {
    'criterion' : ['squared_error','friedman_mse','absolute_error','poisson'],
    'splitter':['best','random'],
    'max_depth':[1,2,3,4,5,6,7,8,10,11,12],
    'max_features':['auto', 'sqrt', 'log2']
}


In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(DecisionTreeRegressor(),param_grid = parameter,cv=5,scoring='neg_mean_squared_error')


In [21]:
clf.fit(X_train,y_train)

In [22]:
clf.best_params_

{'criterion': 'poisson',
 'max_depth': 7,
 'max_features': 'auto',
 'splitter': 'best'}

In [25]:
regressor = DecisionTreeRegressor(criterion='poisson',max_depth=7,max_features='auto',splitter='best')

In [26]:
#y_pred=c.predict(X_test)
regressor.fit(X_train,y_train)

In [27]:
y_pred = regressor.predict(X_test)

In [28]:
r2_score(y_pred,y_test)

0.48298295189447493