In [1]:
import numpy as np
import pandas as pd
import os
from six.moves import urllib
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import (
    RandomizedSearchCV,
    GridSearchCV,
    StratifiedShuffleSplit,
    train_test_split,
)
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint

In [45]:
housing = pd.read_csv('datasets\\raw\\housing.csv')

In [46]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [47]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [48]:
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [52]:
housing_sample = housing.sample(10)
housing_sample

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
11950,-117.44,33.94,30.0,2992.0,516.0,1521.0,507.0,3.9128,126900.0,INLAND,3
14917,-117.04,32.62,27.0,1710.0,282.0,1089.0,297.0,4.6793,151900.0,NEAR OCEAN,4
16654,-120.65,35.27,52.0,2254.0,642.0,1237.0,590.0,2.6208,227100.0,NEAR OCEAN,2
3817,-118.49,34.2,35.0,1109.0,206.0,515.0,202.0,5.2118,215800.0,<1H OCEAN,4
9737,-121.76,36.77,27.0,1608.0,503.0,2031.0,498.0,2.3384,121000.0,<1H OCEAN,2
8980,-118.41,34.0,46.0,105.0,20.0,69.0,19.0,3.9643,275000.0,<1H OCEAN,3
4671,-118.29,34.05,11.0,677.0,370.0,1143.0,341.0,2.3864,350000.0,<1H OCEAN,2
16764,-122.49,37.7,36.0,1946.0,340.0,828.0,313.0,5.2811,287700.0,NEAR OCEAN,4
2162,-119.82,36.78,36.0,1370.0,289.0,812.0,282.0,2.6127,69600.0,INLAND,2
13951,-117.13,34.24,17.0,2828.0,506.0,673.0,274.0,5.2563,144100.0,INLAND,4


In [54]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit(housing[['income_cat']])
ohe.transform(housing_sample[['income_cat']]).toarray()

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [57]:
ohe.get_feature_names_out()

array(['income_cat_1', 'income_cat_2', 'income_cat_3', 'income_cat_4',
       'income_cat_5'], dtype=object)

In [5]:
# Define the imputer for numerical columns
num_imputer = SimpleImputer(strategy='median')

# Define the imputer for categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')

In [36]:
"""
Splits the housing DataFrame into training and test sets while preserving the distribution of the income_cat column.
income_cat is a categorical variable that we want to maintain the same proportion in both training and test sets.
"""
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [39]:
housing['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [40]:
housing[housing['ocean_proximity']=='ISLAND']

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
8314,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,450000.0,ISLAND,2
8315,-118.33,33.34,52.0,2359.0,591.0,1100.0,431.0,2.8333,414700.0,ISLAND,2
8316,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,300000.0,ISLAND,3
8317,-118.32,33.34,52.0,996.0,264.0,341.0,160.0,2.7361,450000.0,ISLAND,2
8318,-118.48,33.43,29.0,716.0,214.0,422.0,173.0,2.6042,287500.0,ISLAND,2


In [38]:
strat_train_set.drop('income_cat',axis=1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,82700.0,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,268500.0,<1H OCEAN
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,90400.0,INLAND
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,140400.0,<1H OCEAN
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,258100.0,<1H OCEAN


In [45]:
def preprocess(set_):
    set_ = set_.drop("income_cat", axis=1)

    set_["rooms_per_household"] = set_["total_rooms"] / set_["households"]
    set_["bedrooms_per_room"] = set_["total_bedrooms"] / set_["total_rooms"]
    set_["population_per_household"] = set_["population"] / set_["households"]

    # Separate features into numerical and categorical columns
    num_cols = set_.drop(['median_house_value'],axis=1).select_dtypes(include=['number']).columns
    cat_cols = set_.drop(['median_house_value'],axis=1).select_dtypes(include=['object']).columns

    # Apply the imputers
    set_num = num_imputer.fit_transform(set_[num_cols])
    set_cat = cat_imputer.fit_transform(set_[cat_cols])

    # # Convert the results back to DataFrames
    set_num = pd.DataFrame(set_num, columns=num_cols)
    set_cat = pd.DataFrame(set_cat, columns=cat_cols)

    # # Combine the numerical and categorical columns back into a single DataFrame
    set_ = pd.concat([set_num, set_cat], axis=1)

    set_ = pd.concat([set_,pd.get_dummies(set_['ocean_proximity'], drop_first=True).astype('int')],axis=1)
    set_ = set_.drop(['ocean_proximity'],axis=1)
    return set_

In [47]:
preprocess(strat_test_set)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,6.715625,0.156972,2.275000,0,0,0,0
1,-120.42,34.89,24.0,2020.0,307.0,855.0,283.0,5.0099,7.137809,0.151980,3.021201,0,0,0,0
2,-118.45,34.25,36.0,1453.0,270.0,808.0,275.0,4.3839,5.283636,0.185822,2.938182,0,0,0,0
3,-118.10,33.91,35.0,1653.0,325.0,1072.0,301.0,3.2708,5.491694,0.196612,3.561462,0,0,0,0
4,-117.07,32.77,38.0,3779.0,614.0,1495.0,614.0,4.3529,6.154723,0.162477,2.434853,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4123,-117.29,33.72,19.0,2248.0,427.0,1207.0,368.0,2.8170,6.108696,0.189947,3.279891,0,0,0,0
4124,-118.24,33.99,33.0,885.0,294.0,1270.0,282.0,2.1615,3.138298,0.332203,4.503546,0,0,0,0
4125,-119.72,34.44,43.0,1781.0,342.0,663.0,358.0,4.7000,4.974860,0.192027,1.851955,0,0,0,0
4126,-117.91,33.63,30.0,2071.0,412.0,1081.0,412.0,4.9125,5.026699,0.198938,2.623786,0,0,0,0


In [8]:
train = pd.read_csv('datasets\\processed\\train.csv')
test = pd.read_csv('datasets\\processed\\test.csv')

In [11]:
x_train = train.drop('median_house_value',axis=1)
y_train = train[['median_house_value']]

x_test = test.drop('median_house_value',axis=1)
y_test = test[['median_house_value']]

In [25]:
param_distribs = {
    "n_estimators": randint(low=1, high=200),
    "max_features": randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)

rnd_search = RandomizedSearchCV(
    forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
)
rnd_search.fit(x_train, y_train)
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

49175.920081996424 {'max_features': 7, 'n_estimators': 180}
50979.31309987477 {'max_features': 5, 'n_estimators': 15}
50615.31290608045 {'max_features': 3, 'n_estimators': 72}
50356.50620069724 {'max_features': 5, 'n_estimators': 21}
49356.66118397995 {'max_features': 7, 'n_estimators': 122}
50590.24171016077 {'max_features': 3, 'n_estimators': 75}
50439.661883382774 {'max_features': 3, 'n_estimators': 88}
49498.226752242146 {'max_features': 5, 'n_estimators': 100}
50262.550516472504 {'max_features': 3, 'n_estimators': 150}
63055.85950859079 {'max_features': 5, 'n_estimators': 2}


In [41]:
params_with_lowest_rmse = sorted(dict(zip(np.sqrt(-(cvres['mean_test_score'])),cvres['params'])).items(), key=lambda x:x[0])[0]

In [45]:
randomForestRegressor = RandomForestRegressor(**params_with_lowest_rmse[1],random_state=42)
randomForestRegressor.fit(x_train, y_train)
pred_train = randomForestRegressor.predict(x_train)
pred_test = randomForestRegressor.predict(x_test)
print(f'Training set RMSE {np.sqrt(mean_squared_error(y_train, pred_train))}')
print(f'Testing set RMSE {np.sqrt(mean_squared_error(y_test, pred_test))}')

  return fit_method(estimator, *args, **kwargs)


Training set RMSE 17987.974824645295
Testing set RMSE 46742.90264020781


In [93]:
import pickle
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [48]:
import pickle
# Later on, load the model from the pickle file
with open('models\\random_forest_regressor.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Use the loaded model to make predictions
loaded_model.predict(x_test)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


array([490271.39444444, 213950.02222222, 207889.44444444, ...,
       367127.33888889, 256300.02222222, 225921.66666667])

Unnamed: 0,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,False,False,True,False
1,False,False,True,False
2,False,False,True,False
3,False,False,True,False
4,False,False,True,False
...,...,...,...,...
20635,True,False,False,False
20636,True,False,False,False
20637,True,False,False,False
20638,True,False,False,False


In [None]:
imputer = SimpleImputer(strategy="median")

housing_num = housing.drop("ocean_proximity", axis=1)

imputer.fit(housing_num)
X = imputer.transform(housing_num)