In [16]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [4]:
boston = load_boston()
df = pd.DataFrame(boston.data)

In [5]:
df.columns = housing.feature_names
df['MEDV'] = housing.target

NameError: name 'df' is not defined

In [7]:
rt = DecisionTreeRegressor(criterion = 'mse', max_depth=5)

In [8]:
rt.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

# Hyperparameter Tuning

In [11]:
param_grid = {
    'max_depth':[2,4,8,10,None],
    'criterion':['mse','mae'],
    'max_features':[0.25,0.5,1.0],
    'min_samples_split':[0.25,0.5,1.0]
}

In [12]:
reg = GridSearchCV(DecisionTreeRegressor(),param_grid=param_grid)

In [13]:
reg.best_score_

0.6452352174104019

In [14]:
reg.best_params_

{'criterion': 'mse',
 'max_depth': None,
 'max_features': 0.5,
 'min_samples_split': 0.25}

# Feature Importance

In [17]:
for importance, name in sorted(zip(rt.feature_importances_, X_train.columns),reverse=True):
  print (name, importance)

RM 0.6344993240692652
LSTAT 0.19426427075925173
CRIM 0.07395590730917082
DIS 0.06744514557703153
B 0.011905660139828182
AGE 0.006176126174365511
PTRATIO 0.004391097507128497
NOX 0.0035610403857026535
INDUS 0.002627468726682041
RAD 0.0011739593515739223
ZN 0.0
TAX 0.0
CHAS 0.0


In [None]:
# when Doing cross validation do we not need to do fit? And predict and train split by 
#   ourselves, does cross validation do this for us?

# ⏺ Bilkul sahi samjha aapne! Cross validation automatically sab kuch handle karta hai:

#   Cross validation internally ye sab karta hai:

#   1. Data splitting - Automatically K folds mein divide karta hai
#   2. Training - Har iteration mein model ko train karta hai
#   3. Prediction - Test fold pe predictions banata hai
#   4. Scoring - Score calculate karta hai
#   5. Repeat - Ye process K times repeat karta hai

#   Example - Behind the scenes kya hota hai:

#   cv_scores = cross_val_score(model, X, y, cv=5)

#   # Internally ye ho raha hai:
#   Fold 1: Train on [2,3,4,5], Test on [1] → fit() → predict() → score
#   Fold 2: Train on [1,3,4,5], Test on [2] → fit() → predict() → score
#   Fold 3: Train on [1,2,4,5], Test on [3] → fit() → predict() → score
#   Fold 4: Train on [1,2,3,5], Test on [4] → fit() → predict() → score
#   Fold 5: Train on [1,2,3,4], Test on [5] → fit() → predict() → score

#   Important points:

#   - Aapko manually train_test_split() karne ki zarurat NAHI hai
#   - Aapko manually fit() karne ki zarurat NAHI hai
#   - Aapko manually predict() karne ki zarurat NAHI hai
#   - Cross validation ye sab automatically karta hai aur sirf scores return karta hai

#   Lekin agar aapko final model chahiye:
#   - Cross validation sirf evaluation ke liye hai
#   - Final deployment ke liye aapko separately full data pe model train karna padega
#   - Ya phir cross_val_predict() use kar sakte hain predictions ke liye

# Cross Validation perform karte hain
from sklearn.model_selection import cross_val_score

# K-Fold Cross Validation with K=5
# Same model parameters use karte hain jo upar use kiye the
rt_cv = DecisionTreeRegressor(criterion='squared_error', max_depth=5)

# Cross validation scores calculate karo (5 folds)
cv_scores = cross_val_score(rt_cv, X, y, cv=5, scoring='r2')

print("Cross Validation Results:")
print("-" * 40)
print(f"Individual fold scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
print(f"\nSingle train-test split score: 0.8834")
print(f"Cross validation average: {cv_scores.mean():.4f}")

# Agar CV score aur single split score mein bahut difference hai, 
# toh model ki stability questionable hai

Cross Validation Results:
----------------------------------------
Individual fold scores: [0.32603463 0.52499182 0.60290742 0.31954034 0.53355785]
Mean CV Score: 0.4614
Standard Deviation: 0.1164

Single train-test split score: 0.8834
Cross validation average: 0.4614


In [20]:
# Different types of Cross Validation
from sklearn.model_selection import cross_validate, KFold, ShuffleSplit

# 1. Detailed metrics ke saath cross validation
cv_results = cross_validate(rt_cv, X, y, cv=5, 
                           scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
                           return_train_score=True)

print("Detailed Cross Validation Metrics:")
print("-" * 50)
print(f"R² Test Score: {cv_results['test_r2'].mean():.4f} (+/- {cv_results['test_r2'].std():.4f})")
print(f"R² Train Score: {cv_results['train_r2'].mean():.4f} (+/- {cv_results['train_r2'].std():.4f})")
print(f"MSE: {-cv_results['test_neg_mean_squared_error'].mean():.4f}")
print(f"MAE: {-cv_results['test_neg_mean_absolute_error'].mean():.4f}")

# 2. Custom K-Fold with shuffle
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores_10fold = cross_val_score(rt_cv, X, y, cv=kfold, scoring='r2')
print(f"\n10-Fold CV Score: {cv_scores_10fold.mean():.4f} (+/- {cv_scores_10fold.std():.4f})")

# 3. ShuffleSplit CV (Random train-test splits)
shuffle_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
cv_scores_shuffle = cross_val_score(rt_cv, X, y, cv=shuffle_split, scoring='r2')
print(f"ShuffleSplit CV Score: {cv_scores_shuffle.mean():.4f} (+/- {cv_scores_shuffle.std():.4f})")

In [21]:
# Cross Validation with best parameters from GridSearchCV
best_model = DecisionTreeRegressor(criterion='mse', max_depth=None, 
                                  max_features=0.5, min_samples_split=0.25)

cv_scores_best = cross_val_score(best_model, X, y, cv=5, scoring='r2')

print("Best Model (from GridSearchCV) Cross Validation:")
print("-" * 50)
print(f"CV Scores: {cv_scores_best}")
print(f"Mean: {cv_scores_best.mean():.4f}")
print(f"Std: {cv_scores_best.std():.4f}")

# Compare with original model
print(f"\nComparison:")
print(f"Original model (max_depth=5): {cv_scores.mean():.4f}")
print(f"Best model (GridSearchCV params): {cv_scores_best.mean():.4f}")
print(f"Improvement: {(cv_scores_best.mean() - cv_scores.mean()):.4f}")