In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('BostonHousing.csv')

df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


- CRIM: Per capita crime rate by town
- ZN: Proportion of residential land zoned for lots over 25,000 sq. ft
- INDUS: Proportion of non-retail business acres per town
- CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: Nitric oxide concentration (parts per 10 million)
- RM: Average number of rooms per dwelling
- AGE: Proportion of owner-occupied units built prior to 1940
- DIS: Weighted distances to five Boston employment centers
- RAD: Index of accessibility to radial highways
- TAX: Full-value property tax rate per $10,000
- PTRATIO: Pupil-teacher ratio by town
- B: 1000(Bk — 0.63)², where Bk is the proportion of people of African American descent by town
- LSTAT: Percentage of lower status of the population
- MEDV: Median value of owner-occupied homes in $1000s

In [7]:
# baseline model

from sklearn.model_selection import train_test_split

# Define your features and target variable
X = df.drop('medv', axis=1)  # Features
y = df['medv']  # Target variable

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# !pip install xgboost

In [9]:
# using XGBoost Baseline

from xgboost import XGBRegressor

xgb = XGBRegressor()

xgb.fit(X_train, y_train)

In [17]:
from sklearn.metrics import r2_score,mean_squared_error

In [23]:
y_pred_test = xgb.predict(X_test)
y_pred_train = xgb.predict(X_train)

print('r2 score', r2_score(y_test, y_pred_test))
print('r2 score', r2_score(y_train, y_pred_train))


print('mse', mean_squared_error(y_test, y_pred_test))

print('mse', mean_squared_error(y_train, y_pred_train))

r2 score 0.9051969100301765
r2 score 0.9999975544760755
mse 6.952269242775621
mse 0.00021245098747811353


In [31]:
from sklearn.model_selection import GridSearchCV

# Defining the parameter grid
param_grid = {
    'estimator':[100,50,200],
    'max_depth': [3, 6, 9],
    'eta': [0.01, 0.1, 0.3],  # Also known as 'learning_rate'
    'subsample': [0.5, 0.75, 1.0],
    

# Initializing the XGBoost regressor
xgb_reg = XGBRegressor(objective='reg:squarederror', verbosity=0)

# Setting up the grid search with 3-fold cross-validation
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Fitting the grid search model
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best score:", best_score)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters: {'colsample_bytree': 1.0, 'estimator': 100, 'eta': 0.1, 'max_depth': 3, 'subsample': 0.75}
Best score: -11.93307260507165
