The goal of this homework is to create a regression model for predicting housing prices (column 'median_house_value').

# Import Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Import Dataset

In [2]:
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Prepare Data

In [3]:
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
df.fillna(0, inplace=True)
df['median_house_value'] = np.log1p(df['median_house_value'].values)

# Split the Data

In [6]:
seed = 1

train_df, temp_df = train_test_split(df, test_size=0.4, random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.2, random_state=seed)

# Encode the Data

In [7]:
# Separate the target variable from the features
X_train = train_df.drop('median_house_value', axis=1)
y_train = train_df['median_house_value']

X_val = val_df.drop('median_house_value', axis=1)
y_val = val_df['median_house_value']

X_test = test_df.drop('median_house_value', axis=1)
y_test = test_df['median_house_value']

# Use DictVectorizer to turn the dataframes into matrices
vectorizer = DictVectorizer(sparse=True)

X_train = vectorizer.fit_transform(X_train.to_dict(orient='records'))
X_val = vectorizer.transform(X_val.to_dict(orient='records'))
X_test = vectorizer.transform(X_test.to_dict(orient='records'))

# Question 1

In [9]:
# Membuat model Decision Tree Regressor dengan max_depth=1
model = DecisionTreeRegressor(max_depth=1)
# Melatih model dengan data pelatihan
model.fit(X_train, y_train)
# Menentukan fitur yang digunakan untuk memisahkan data
splitting_feature_index = model.tree_.feature[0]
# Mengambil nama fitur berdasarkan indeksnya
splitting_feature_name = vectorizer.get_feature_names_out()[splitting_feature_index]
# Menampilkan fitur yang digunakan untuk pemisahan data
print("The feature used for splitting the data is:", splitting_feature_name)

The feature used for splitting the data is: ocean_proximity=<1H OCEAN


# Question 2

In [11]:
# Create a Random Forest Regressor with the specified parameters
rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
# Train the model on the training data
rf_model.fit(X_train, y_train)
# Make predictions on the validation data
y_val_pred = rf_model.predict(X_val)
# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE on validation data:", rmse)

RMSE on validation data: 0.23879635458921267


# Question 3

In [12]:
# Define a range of n_estimators values to experiment with
n_estimators_values = range(10, 201, 10)

# Initialize an empty list to store RMSE values
rmse_values = []

for n_estimators in n_estimators_values:
    # Create a Random Forest Regressor with the specified parameters
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)

    # Train the model on the training data
    rf_model.fit(X_train, y_train)

    # Make predictions on the validation data
    y_val_pred = rf_model.predict(X_val)

    # Calculate the RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    rmse_values.append(rmse)
    print(f"n_estimators={n_estimators}: RMSE={rmse}")

# Find the index of the minimum RMSE value
best_n_estimators_index = np.argmin(rmse_values)

# Determine the best value of n_estimators
best_n_estimators = n_estimators_values[best_n_estimators_index]
print(f"The best value of n_estimators is {best_n_estimators} with RMSE={rmse_values[best_n_estimators_index]}")

n_estimators=10: RMSE=0.23879635458921267
n_estimators=20: RMSE=0.22998532813957734
n_estimators=30: RMSE=0.22731461203509837
n_estimators=40: RMSE=0.22664496313479962
n_estimators=50: RMSE=0.22552369309511094
n_estimators=60: RMSE=0.2252821217573416
n_estimators=70: RMSE=0.22504562183085158
n_estimators=80: RMSE=0.2247365597616832
n_estimators=90: RMSE=0.224847020503133
n_estimators=100: RMSE=0.2244994467707608
n_estimators=110: RMSE=0.22431745023033342
n_estimators=120: RMSE=0.22422840311713363
n_estimators=130: RMSE=0.2240898677367095
n_estimators=140: RMSE=0.22400076726350868
n_estimators=150: RMSE=0.22383498377970573
n_estimators=160: RMSE=0.22378489930483017
n_estimators=170: RMSE=0.2237478751873627
n_estimators=180: RMSE=0.22385252288931085
n_estimators=190: RMSE=0.22385029061215883
n_estimators=200: RMSE=0.22382483135925066
The best value of n_estimators is 170 with RMSE=0.2237478751873627


# Question 4

In [13]:
# Define a range of max_depth and n_estimators values to experiment with
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

# Initialize variables to track the best max_depth and corresponding RMSE
best_max_depth = None
best_rmse = float('inf')

for max_depth in max_depth_values:
    for n_estimators in n_estimators_values:
        # Create a Random Forest Regressor with the specified parameters
        rf_model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, random_state=1, n_jobs=-1)

        # Train the model on the training data
        rf_model.fit(X_train, y_train)

        # Make predictions on the validation data
        y_val_pred = rf_model.predict(X_val)

        # Calculate the RMSE
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

        # Check if this combination has a lower RMSE
        if rmse < best_rmse:
            best_max_depth = max_depth
            best_rmse = rmse

        print(f"max_depth={max_depth}, n_estimators={n_estimators}: RMSE={rmse}")

print(f"The best max_depth is {best_max_depth} with RMSE={best_rmse}")

max_depth=10, n_estimators=10: RMSE=0.24353784467765577
max_depth=10, n_estimators=20: RMSE=0.23966748097519688
max_depth=10, n_estimators=30: RMSE=0.2375512511121298
max_depth=10, n_estimators=40: RMSE=0.23707554234312792
max_depth=10, n_estimators=50: RMSE=0.23648491199262234
max_depth=10, n_estimators=60: RMSE=0.23658956920821567
max_depth=10, n_estimators=70: RMSE=0.2365559006312253
max_depth=10, n_estimators=80: RMSE=0.23589904741506867
max_depth=10, n_estimators=90: RMSE=0.23575071677743767
max_depth=10, n_estimators=100: RMSE=0.23563261988507686
max_depth=10, n_estimators=110: RMSE=0.23547186347115812
max_depth=10, n_estimators=120: RMSE=0.23542439491917747
max_depth=10, n_estimators=130: RMSE=0.23533663804200974
max_depth=10, n_estimators=140: RMSE=0.23516990186394524
max_depth=10, n_estimators=150: RMSE=0.2350825414807882
max_depth=10, n_estimators=160: RMSE=0.23509025390112293
max_depth=10, n_estimators=170: RMSE=0.23502640247757595
max_depth=10, n_estimators=180: RMSE=0.2351

# Question 5

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor with the specified parameters
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Get feature importances from the model
feature_importances = rf_model.feature_importances_

# Create a dictionary to map feature names to their importance scores
feature_importance_dict = dict(zip(vectorizer.get_feature_names_out(), feature_importances))

# Find the most important feature
most_important_feature = max(feature_importance_dict, key=feature_importance_dict.get)

print("The most important feature is:", most_important_feature)


The most important feature is: median_income


# Question 6

In [15]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Create a watchlist
watchlist = [(dval, 'eval'), (dtrain, 'train')]

# Define XGBoost parameters
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
    'eval_metric': 'rmse'  # RMSE as the evaluation metric
}

# Train the model with eta=0.3 for 100 rounds
num_round = 100
model_eta_0_3 = xgb.train(xgb_params, dtrain, num_round, watchlist)

# Make predictions on the validation data
y_val_pred_eta_0_3 = model_eta_0_3.predict(dval)

# Calculate RMSE for eta=0.3
rmse_eta_0_3 = np.sqrt(mean_squared_error(y_val, y_val_pred_eta_0_3))
print("RMSE with eta=0.3:", rmse_eta_0_3)

# Now change eta to 0.1
xgb_params['eta'] = 0.1

# Train the model with eta=0.1 for 100 rounds
model_eta_0_1 = xgb.train(xgb_params, dtrain, num_round, watchlist)

# Make predictions on the validation data
y_val_pred_eta_0_1 = model_eta_0_1.predict(dval)

# Calculate RMSE for eta=0.1
rmse_eta_0_1 = np.sqrt(mean_squared_error(y_val, y_val_pred_eta_0_1))
print("RMSE with eta=0.1:", rmse_eta_0_1)

# Compare RMSE scores and identify the best eta
if rmse_eta_0_3 < rmse_eta_0_1:
    best_eta = 0.3
elif rmse_eta_0_1 < rmse_eta_0_3:
    best_eta = 0.1
else:
    best_eta = "Both give equal value"

print("The best eta is:", best_eta)


[0]	eval-rmse:0.44610	train-rmse:0.44258
[1]	eval-rmse:0.36961	train-rmse:0.36286
[2]	eval-rmse:0.32324	train-rmse:0.31398
[3]	eval-rmse:0.29743	train-rmse:0.28441
[4]	eval-rmse:0.28038	train-rmse:0.26506
[5]	eval-rmse:0.26897	train-rmse:0.25176
[6]	eval-rmse:0.26247	train-rmse:0.24301
[7]	eval-rmse:0.25451	train-rmse:0.23305
[8]	eval-rmse:0.25210	train-rmse:0.22820
[9]	eval-rmse:0.24829	train-rmse:0.22173
[10]	eval-rmse:0.24518	train-rmse:0.21631
[11]	eval-rmse:0.24139	train-rmse:0.21105
[12]	eval-rmse:0.24018	train-rmse:0.20836
[13]	eval-rmse:0.23937	train-rmse:0.20530
[14]	eval-rmse:0.23834	train-rmse:0.20205
[15]	eval-rmse:0.23682	train-rmse:0.19798
[16]	eval-rmse:0.23527	train-rmse:0.19560
[17]	eval-rmse:0.23447	train-rmse:0.19403
[18]	eval-rmse:0.23252	train-rmse:0.19098


  if is_sparse(data):


[19]	eval-rmse:0.23227	train-rmse:0.18807
[20]	eval-rmse:0.23130	train-rmse:0.18518
[21]	eval-rmse:0.23031	train-rmse:0.18301
[22]	eval-rmse:0.22961	train-rmse:0.18188
[23]	eval-rmse:0.22913	train-rmse:0.17963
[24]	eval-rmse:0.22799	train-rmse:0.17695
[25]	eval-rmse:0.22737	train-rmse:0.17484
[26]	eval-rmse:0.22726	train-rmse:0.17346
[27]	eval-rmse:0.22714	train-rmse:0.17165
[28]	eval-rmse:0.22600	train-rmse:0.16958
[29]	eval-rmse:0.22597	train-rmse:0.16847
[30]	eval-rmse:0.22599	train-rmse:0.16752
[31]	eval-rmse:0.22617	train-rmse:0.16602
[32]	eval-rmse:0.22573	train-rmse:0.16455
[33]	eval-rmse:0.22535	train-rmse:0.16360
[34]	eval-rmse:0.22502	train-rmse:0.16264
[35]	eval-rmse:0.22507	train-rmse:0.16119
[36]	eval-rmse:0.22513	train-rmse:0.15962
[37]	eval-rmse:0.22436	train-rmse:0.15760
[38]	eval-rmse:0.22419	train-rmse:0.15662
[39]	eval-rmse:0.22389	train-rmse:0.15556
[40]	eval-rmse:0.22388	train-rmse:0.15444
[41]	eval-rmse:0.22380	train-rmse:0.15352
[42]	eval-rmse:0.22310	train-rmse:



[16]	eval-rmse:0.27638	train-rmse:0.25953
[17]	eval-rmse:0.27239	train-rmse:0.25480
[18]	eval-rmse:0.26947	train-rmse:0.25100
[19]	eval-rmse:0.26703	train-rmse:0.24753
[20]	eval-rmse:0.26404	train-rmse:0.24383
[21]	eval-rmse:0.26181	train-rmse:0.24085
[22]	eval-rmse:0.25930	train-rmse:0.23777
[23]	eval-rmse:0.25717	train-rmse:0.23509
[24]	eval-rmse:0.25502	train-rmse:0.23243
[25]	eval-rmse:0.25327	train-rmse:0.22987
[26]	eval-rmse:0.25178	train-rmse:0.22780
[27]	eval-rmse:0.25035	train-rmse:0.22574
[28]	eval-rmse:0.24892	train-rmse:0.22370
[29]	eval-rmse:0.24733	train-rmse:0.22139
[30]	eval-rmse:0.24580	train-rmse:0.21939
[31]	eval-rmse:0.24515	train-rmse:0.21809
[32]	eval-rmse:0.24393	train-rmse:0.21623
[33]	eval-rmse:0.24282	train-rmse:0.21432
[34]	eval-rmse:0.24193	train-rmse:0.21271
[35]	eval-rmse:0.24086	train-rmse:0.21113
[36]	eval-rmse:0.24030	train-rmse:0.20975
[37]	eval-rmse:0.23942	train-rmse:0.20828
[38]	eval-rmse:0.23876	train-rmse:0.20698
[39]	eval-rmse:0.23831	train-rmse: