In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Raw string literal: the Windows path contains backslash sequences (\P, \m,
# \d, \h) that a normal string literal treats as invalid escape sequences,
# which triggers a warning on modern Python versions. The path value itself
# is unchanged.
# NOTE(review): this is an absolute, machine-specific path - consider making
# the data directory configurable.
csv_path = r"D:\Projects\ml-experiments\datasets\housing\housing.csv"

data = pd.read_csv(csv_path)

# Separate the predictors from the target column (median_house_value).
X = data.drop("median_house_value", axis=1)
y = data["median_house_value"].copy()

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Transform the training set with scaling, imputing and one-hot encoding for the non-numeric feature (ocean proximity):

In [8]:
# Binarizer that will be fitted on the categorical ocean_proximity column.
binarizer = LabelBinarizer()

# Numeric predictors only: every column except ocean_proximity.
X_train_num = X_train.drop(["ocean_proximity"], axis="columns")

The next cell returns a 2-dimensional NumPy array (ndarray) of shape (16512, 5): 16512 binary rows of length 5, each with a single 1 (and 0s elsewhere) marking that row's class:

In [9]:
# Fit the binarizer on the ocean_proximity column and encode it in one step.
ocean_proximity_1_hot = binarizer.fit_transform(X_train["ocean_proximity"])

Transform the numeric predictors using a pipeline composed of an imputer and a scaler. A MinMaxScaler instantiated with default values scales each feature to the range 0 to 1; the range can be controlled with the feature_range parameter (a tuple):

In [10]:
# Numeric preprocessing: fill missing values with the median strategy, then
# rescale with a default-configured MinMaxScaler.
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)
X_train_tr = pipeline.fit_transform(X_train_num)

X_train_tr is now a 2-dimensional ndarray of shape (16512, 8). It needs to be merged with the binarized labels into an array of shape (16512, 13); the ocean proximity one-hot matrix is a 2-dimensional ndarray of shape (16512, 5):

In [11]:
# Join the scaled numeric features with the one-hot ocean_proximity columns.
# Only the leading numeric columns of X_train_tr are kept before joining, so
# re-running this cell rebuilds the same matrix instead of appending the
# one-hot columns a second time.
n_numeric_features = X_train_num.shape[1]
X_train_tr = np.concatenate(
    (X_train_tr[:, :n_numeric_features], ocean_proximity_1_hot),
    axis=1,
)

Linear Regression:

In [12]:
# Linear-regression model trained on the fully preprocessed training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train_tr, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

Test with first 5 instances from the training set

In [13]:
# Spot-check: compare predictions against the true labels for the first five
# training rows.
some_data, some_labels = X_train_tr[:5], y_train.iloc[:5]

print(f"Linear Regression Predictions: {lin_reg.predict(some_data)}")
print(f"Labels:{list(some_labels)}")

Linear Regression Predictions: [189408. 290656. 250944. 147648. 165632.]
Labels:[103000, 382100, 172600, 93400, 96500]


Measure the error using RMSE function:

In [14]:
# Training-set RMSE: the square root of the mean squared error between the
# true labels and the model's predictions on the same rows.
predictions = lin_reg.predict(X_train_tr)
lin_mse = mean_squared_error(y_true=y_train, y_pred=predictions)
lin_rmse = np.sqrt(lin_mse)

print("Linear Regression RMSE for the whole training set is:", lin_rmse)

Linear Regression RMSE for the whole training set is: 68437.636165114


An RMSE of 68437 is not great at all! (It basically means a typical prediction error of $68,437.) The model is underfitting (it is too simple).

Decision Tree Regression:

In [15]:
# Fixed seed so the fitted tree is reproducible: per the scikit-learn docs,
# features are randomly permuted at each split, so an unseeded tree can
# differ between runs when several splits are equally good.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train_tr, y_train)

print("Decision Tree Predictions:", tree_reg.predict(some_data))
print("Labels:", list(some_labels))

# Keep the tree's metrics under their own names instead of overwriting the
# linear-regression results (lin_mse / lin_rmse).
tree_predictions = tree_reg.predict(X_train_tr)
tree_mse = mean_squared_error(y_train, tree_predictions)
tree_rmse = np.sqrt(tree_mse)

print("Decision Tree RMSE for the whole training set is:", tree_rmse)

Decision Tree Predictions: [103000. 382100. 172600.  93400.  96500.]
Labels: [103000, 382100, 172600, 93400, 96500]
Decision Tree RMSE for the whole training set is: 0.0


The Decision Tree looks perfect (0 RMSE), but it's not! It only performs perfectly on the training set it learned from and won't generalize well to new data (overfitting), as demonstrated below.

Perform K-fold cross-validation of the Decision Tree model to measure the RMSE. The algorithm splits the training set into K parts (folds) and runs K training + prediction iterations, each time holding out a different fold for evaluation and training on the other K-1. The result is K scores. The main benefit of the approach is clean predictions: on each iteration the predictor gets to predict on a subset of instances that it never saw during training.

In [16]:
# 10-fold cross-validation. The scorer returns negated MSE values, so the
# sign is flipped before taking the square root to obtain per-fold RMSE.
scores = cross_val_score(
    estimator=tree_reg,
    X=X_train_tr,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(-scores)

print("Decision Tree Scores:", rmse_scores)
print("Decision Tree Mean:", np.mean(rmse_scores))
print("Decision Tree Standard deviation:", np.std(rmse_scores))

Decision Tree Scores: [66395.02770474 68926.75670235 68203.67758163 70318.76715507
 69184.25903717 67892.25147943 62180.54067926 69044.54017491
 68616.18231523 68288.55396736]
Decision Tree Mean: 67905.05567971314
Decision Tree Standard deviation: 2135.3461008052263


Train and test a Random Forest Regressor (it works by training multiple Decision Trees on different subsets of the training set and averaging their predictions):

In [19]:
# Random forests are stochastic; a fixed random_state makes the fitted model
# and the numbers printed below reproducible across runs.
forest_reg = RandomForestRegressor(n_estimators=15, random_state=42)
forest_reg.fit(X_train_tr, y_train)

print("Random Forest Predictions:", forest_reg.predict(some_data))
print("Labels:", list(some_labels))

# Keep the forest's metrics under their own names instead of overwriting the
# linear-regression results (lin_mse / lin_rmse).
forest_predictions = forest_reg.predict(X_train_tr)
forest_mse = mean_squared_error(y_train, forest_predictions)
forest_rmse = np.sqrt(forest_mse)

print("Random Forest RMSE for the whole trainig set is:", forest_rmse)

Random Forest Predictions: [104560.         366453.33333333 163853.33333333  92146.66666667
  91200.        ]
Labels: [103000, 382100, 172600, 93400, 96500]
Random Forest RMSE for the whole trainig set is: 19977.87442216315
