In [None]:
from urllib.request import urlretrieve

# Source URL of the housing dataset and the local file it is saved to.
housing_link = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
housing_output_path = 'housing.csv'

# Download with the standard library instead of shelling out to `wget`:
# `wget` is not installed on every platform, and a failed shell command does
# not raise, so later cells would fail with a confusing error instead.
# Like `wget -O`, this overwrites any existing file at the output path.
urlretrieve(housing_link, housing_output_path)

--2023-10-21 10:25:53--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2023-10-21 10:25:53 (142 MB/s) - ‘housing.csv’ saved [1423529/1423529]



*Preparing the Dataset*

In [None]:
import pandas as pd

# Read the downloaded CSV into a DataFrame and preview the first five rows.
housing = pd.read_csv(filepath_or_buffer=housing_output_path)
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
import numpy as np

# Keep only the rows whose ocean proximity is '<1H OCEAN' or 'INLAND'.
mask = housing['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])

# Build the prepared frame as a single chain of non-mutating steps.
# The original filtered with `housing[mask]` and then modified the result
# in place (`fillna(inplace=True)` plus a column assignment), which operates
# on a slice of the loaded frame and is the classic trigger for pandas'
# SettingWithCopyWarning. Each step below returns a new DataFrame instead.
#
# NOTE: this cell still rebinds `housing`, so running it twice applies the
# log transform twice; re-run the loading cell first if you need to repeat it.
housing = (
    housing
    .loc[mask]
    # Fill missing values with 0.
    .fillna(0)
    # Apply the log transformation to the target column.
    .assign(median_house_value=lambda df: np.log(df['median_house_value']))
)

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,12.973863,<1H OCEAN
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,12.287653,<1H OCEAN
859,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,12.41957,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,12.554967,<1H OCEAN
861,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,12.287192,<1H OCEAN


In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: first hold out 20% of the rows as the test set, then take
# 25% of the remaining 80% (i.e. 20% of the whole) as the validation set.
# Both splits use the same fixed seed so the partition is reproducible.
df_train_full, df_test = train_test_split(housing, test_size=0.2, random_state=1)
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=1)

In [None]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

# Full-train features are kept as a plain DataFrame here (they are not
# passed through the vectorizer in this cell).
X_train_full = df_train_full.drop(columns='median_house_value')
y_train_full = df_train_full['median_house_value'].to_numpy()

# Target vectors for each split.
y_train = df_train['median_house_value'].to_numpy()
y_valid = df_valid['median_house_value'].to_numpy()
y_test = df_test['median_house_value'].to_numpy()

# Feature matrices: drop the target, turn every row into a dict, vectorize.
# The vectorizer is fitted on the training split only and then reused as-is
# for the validation and test splits.
X_train = dv.fit_transform(
    df_train.drop(columns='median_house_value').to_dict('records')
)
X_valid = dv.transform(
    df_valid.drop(columns='median_house_value').to_dict('records')
)
X_test = dv.transform(
    df_test.drop(columns='median_house_value').to_dict('records')
)

**Question 1**

In [None]:
from sklearn.tree import DecisionTreeRegressor, export_text

# A depth-1 tree makes a single split; printing its rules shows which
# feature that split uses.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)

tree_rules = export_text(dt, feature_names=dv.feature_names_)
print(tree_rules)

|--- ocean_proximity=INLAND <= 0.50
|   |--- value: [12.30]
|--- ocean_proximity=INLAND >  0.50
|   |--- value: [11.61]



**Question 2**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline random forest: 10 trees, fixed seed, all available cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Validation RMSE. The `squared=False` argument of `mean_squared_error` is
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the square root
# of the MSE explicitly; this works on every scikit-learn version.
y_pred = rf.predict(X_valid)
rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred)))
round(rmse, 3)

0.245

**Question 3**

In [None]:
# Sweep the number of trees from 10 to 200 (inclusive) in steps of 10 and
# record the validation RMSE, rounded to 3 decimals, for each setting.
n_estimators = range(10, 200 + 1, 10)
errors = list()

for n in n_estimators:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_valid)
    # `mean_squared_error(..., squared=False)` is deprecated in scikit-learn
    # 1.4 and removed in 1.6; compute RMSE as sqrt(MSE) so the cell keeps
    # working across versions.
    error = round(float(np.sqrt(mean_squared_error(y_valid, y_pred))), 3)

    errors.append((n, error))

df_errors = pd.DataFrame(errors, columns=['n_estimators', 'RMSE'])
df_errors

Unnamed: 0,n_estimators,RMSE
0,10,0.245
1,20,0.239
2,30,0.237
3,40,0.236
4,50,0.236
5,60,0.235
6,70,0.235
7,80,0.235
8,90,0.235
9,100,0.234


**Question 4**

In [None]:
# Grid over tree depth and number of trees; collect the (unrounded)
# validation RMSE for every combination and list the results best-first.
n_estimators = range(10, 200 + 1, 10)
max_depths = [10, 15, 20, 25]
errors = list()

for max_depth in max_depths:
    for n in n_estimators:
        rf = RandomForestRegressor(
            n_estimators=n, max_depth=max_depth,
            random_state=1, n_jobs=-1
        )
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_valid)
        # `squared=False` is deprecated in scikit-learn 1.4 and removed in
        # 1.6, so derive RMSE from the MSE explicitly.
        error = float(np.sqrt(mean_squared_error(y_valid, y_pred)))

        errors.append((max_depth, n, error))

df_errors = pd.DataFrame(errors, columns=['max_depth', 'n_estimators', 'RMSE'])
df_errors.sort_values(by='RMSE')

Unnamed: 0,max_depth,n_estimators,RMSE
56,20,170,0.233310
55,20,160,0.233369
76,25,170,0.233406
75,25,160,0.233408
57,20,180,0.233506
...,...,...,...
4,10,50,0.245289
7,10,80,0.245379
2,10,30,0.246079
1,10,20,0.247320


**Question 5**

In [None]:
# Fit a 10-tree forest capped at depth 20, then rank the vectorizer's
# features by the importance the fitted model assigns to them.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Pair every importance with its feature name (both follow the vectorizer's
# column order) and show the most important features first.
df_feature_importances = pd.DataFrame({
    'feature_importance': rf.feature_importances_,
    'feature_name': dv.feature_names_,
})
df_feature_importances.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance,feature_name
6,0.36628,ocean_proximity=INLAND
4,0.33532,median_income
2,0.101279,latitude
3,0.086834,longitude
1,0.030134,housing_median_age
7,0.027851,population
9,0.021493,total_rooms
8,0.015465,total_bedrooms
0,0.015343,households
5,0.0,ocean_proximity=<1H OCEAN


**Question 6**

In [None]:
!pip install xgboost



In [None]:
import xgboost as xgb

# Wrap the training and validation splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

# (DMatrix, name) pairs whose metrics are reported during training.
watchlist = list(zip((dtrain, dvalid), ('train', 'validation')))

In [None]:
# XGBoost run with learning rate eta = 0.3 for 100 boosting rounds;
# train/validation RMSE is logged for every round via the watchlist.
xgb_num_rounds = 100
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

model_eta3 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=xgb_num_rounds,
    evals=watchlist,
)

[0]	train-rmse:0.44351	validation-rmse:0.44250
[1]	train-rmse:0.36600	validation-rmse:0.36917
[2]	train-rmse:0.31556	validation-rmse:0.32383
[3]	train-rmse:0.28541	validation-rmse:0.29965
[4]	train-rmse:0.26573	validation-rmse:0.28418
[5]	train-rmse:0.25338	validation-rmse:0.27463
[6]	train-rmse:0.24157	validation-rmse:0.26770
[7]	train-rmse:0.23486	validation-rmse:0.26423
[8]	train-rmse:0.22668	validation-rmse:0.25820
[9]	train-rmse:0.21996	validation-rmse:0.25444
[10]	train-rmse:0.21445	validation-rmse:0.25179
[11]	train-rmse:0.21155	validation-rmse:0.25054
[12]	train-rmse:0.20729	validation-rmse:0.24791
[13]	train-rmse:0.20318	validation-rmse:0.24628
[14]	train-rmse:0.20135	validation-rmse:0.24630
[15]	train-rmse:0.19858	validation-rmse:0.24523
[16]	train-rmse:0.19580	validation-rmse:0.24445
[17]	train-rmse:0.19333	validation-rmse:0.24395
[18]	train-rmse:0.19107	validation-rmse:0.24284
[19]	train-rmse:0.18735	validation-rmse:0.24127
[20]	train-rmse:0.18524	validation-rmse:0.23978
[2

In [None]:
# Same setup as the eta = 0.3 run, but with learning rate eta = 0.1, so the
# two validation RMSE curves can be compared.
xgb_num_rounds = 100
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

model_eta1 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=xgb_num_rounds,
    evals=watchlist,
)

[0]	train-rmse:0.52449	validation-rmse:0.52045
[1]	train-rmse:0.48736	validation-rmse:0.48443
[2]	train-rmse:0.45433	validation-rmse:0.45293
[3]	train-rmse:0.42534	validation-rmse:0.42550
[4]	train-rmse:0.39987	validation-rmse:0.40144
[5]	train-rmse:0.37823	validation-rmse:0.38152
[6]	train-rmse:0.35887	validation-rmse:0.36393
[7]	train-rmse:0.34177	validation-rmse:0.34838
[8]	train-rmse:0.32701	validation-rmse:0.33492
[9]	train-rmse:0.31412	validation-rmse:0.32333
[10]	train-rmse:0.30326	validation-rmse:0.31427
[11]	train-rmse:0.29356	validation-rmse:0.30615
[12]	train-rmse:0.28520	validation-rmse:0.29922
[13]	train-rmse:0.27760	validation-rmse:0.29269
[14]	train-rmse:0.27116	validation-rmse:0.28797
[15]	train-rmse:0.26539	validation-rmse:0.28380
[16]	train-rmse:0.26013	validation-rmse:0.27970
[17]	train-rmse:0.25583	validation-rmse:0.27662
[18]	train-rmse:0.25203	validation-rmse:0.27388
[19]	train-rmse:0.24818	validation-rmse:0.27124
[20]	train-rmse:0.24512	validation-rmse:0.26882
[2