# Modeling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the cleaned housing table (path is relative to the notebook's directory).
clean_data_path = '../data/processed/clean_kelowna_housing_data.csv'
df = pd.read_csv(clean_data_path)

In [None]:
# Flag price outliers separately within every 'Type' group using the 1.5 * IQR rule.
grouped = df.groupby('Type')
price_by_type = grouped["price"]

# transform() broadcasts each group's quartile back onto that group's rows,
# so q1 / q3 line up with df's index and can be compared row by row.
q1 = price_by_type.transform(lambda prices: prices.quantile(0.25))
q3 = price_by_type.transform(lambda prices: prices.quantile(0.75))
iqr = q3 - q1

# Row-wise fences: 1.5 * IQR below the first / above the third quartile.
whisker = 1.5 * iqr
lower_bound = q1 - whisker
upper_bound = q3 + whisker

# A row is an outlier when its price falls outside its own group's fences.
too_high = df["price"] > upper_bound
too_low = df["price"] < lower_bound
outliers = df.loc[too_high | too_low]

# Display every flagged row.
outliers

In [None]:
# Remove the flagged outlier rows.
# errors='ignore' keeps this cell re-runnable: on a second execution the labels
# in `outliers.index` are already gone from `df`, which would otherwise raise KeyError.
df = df.drop(index=outliers.index, errors='ignore')

In [None]:
# Drop the free-text identifier columns, then one-hot encode whatever
# categorical columns remain so the tree models receive numeric input only.
# A list (not a set) is used for `columns` so the argument has a defined order.
dropped_columns = ['name', 'address', 'city']
data = pd.get_dummies(df.drop(columns=dropped_columns))
data

In [None]:
# Features: every column except the target. Selecting by name (instead of
# `iloc[:, 1:]`) does not depend on 'price' being the first column, so the
# label can never leak into X if the column order of the CSV changes.
X = data.drop(columns='price')
# Target: the listing price.
y = data.loc[:, 'price']
# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np

# Base estimator; the fixed seed keeps the search and the final forest reproducible.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid limiting tree size (pre-pruning).
param_grid = {
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 5-fold grid search over the grid above.
# GridSearchCV uses refit=True by default, so `best_estimator_` has already
# been refitted on all of (X_train, y_train) with the winning parameters;
# fitting it a second time would only rebuild the same seeded forest.
grid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_rf_reg = grid_search.best_estimator_

# Report the winning hyperparameters.
print("Best hyperparameters: ", grid_search.best_params_)

# Predictions on the held-out test set (y_pred) and on the training set (y_pred1);
# the training-set numbers are printed next to the test ones to expose overfitting.
y_pred = best_rf_reg.predict(X_test)
y_pred1 = best_rf_reg.predict(X_train)

# Evaluate the model (lines suffixed with "1" are training-set metrics).
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("Mean squared error1: %.2f" % mean_squared_error(y_train, y_pred1))
print('R2 score: %.2f' % r2_score(y_test, y_pred))
print('Mean absolute error: %.2f' % mean_absolute_error(y_test, y_pred))
print('Mean absolute error1: %.2f' % mean_absolute_error(y_train, y_pred1))

# Plot the feature importances, most important first.
importances = best_rf_reg.feature_importances_
indices = np.argsort(importances)[::-1]
n_features = X_train.shape[1]
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(range(n_features), importances[indices])
ax.set_xticks(range(n_features))
ax.set_xticklabels(X_train.columns[indices], rotation=90)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Random forest with 100 trees capped at depth 10.
# The seed matches the one used for the grid-searched forest so the
# cross-validation and test scores are reproducible between runs.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# 5-fold cross-validation on the training split; for a regressor the default
# score is the estimator's own .score(), i.e. R^2.
cv_scores = cross_val_score(rf, X_train, y_train, cv=5)

# Summarise the fold scores.
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)
print("Mean CV score:", mean_cv_score)
print("CV score standard deviation:", std_cv_score)

# Fit on the whole training split, then compare train vs. test R^2.
rf.fit(X_train, y_train)
train_score = rf.score(X_train, y_train)
test_score = rf.score(X_test, y_test)

print("Training score:", train_score)
print("Test score:", test_score)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# NOTE(review): this repeats the cross-validation experiment of the preceding
# cell line for line; consider deleting one of the two copies.

# Depth-10, 100-tree random forest; seeded so repeated runs give identical scores.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Score the model with 5-fold cross-validation on the training split
# (default regressor score, i.e. R^2).
cv_scores = cross_val_score(rf, X_train, y_train, cv=5)

# Mean and spread of the per-fold scores.
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print("Mean CV score:", mean_cv_score)
print("CV score standard deviation:", std_cv_score)

# Train on the full training split.
rf.fit(X_train, y_train)

# R^2 on the data the model has seen vs. the held-out test data.
train_score = rf.score(X_train, y_train)
test_score = rf.score(X_test, y_test)

print("Training score:", train_score)
print("Test score:", test_score)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Number of trees in the forest.
n_trees = 100

# Random forest with minimal cost-complexity pruning.
# random_state makes the forest (and therefore every metric and plot below)
# reproducible, consistent with the seeded grid-search model.
# NOTE(review): ccp_alpha is compared against impurity decreases measured in
# squared target units; with a price target 0.01 probably prunes next to
# nothing -- confirm the intended magnitude.
rf = RandomForestRegressor(n_estimators=n_trees, ccp_alpha=0.01, random_state=42)

# Fit on the training split and predict the held-out test split.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Test-set error metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean squared error: {:.4f}'.format(mse))
print('R^2 score: {:.4f}'.format(r2))

# Forest-level importances plus their spread across the individual trees
# (the spread is drawn as error bars below).
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Names of the ten most important features.
top_features = indices[:10]
print('Top 10 most important features:', X_train.columns[top_features])

# Bar chart of all features, sorted by decreasing importance.
n_features = X_train.shape[1]
fig_imp, ax_imp = plt.subplots()
ax_imp.set_title("Feature importances")
ax_imp.bar(range(n_features), importances[indices],
           color="r", yerr=std[indices], align="center")
ax_imp.set_xticks(range(n_features))
ax_imp.set_xticklabels(X_train.columns[indices], rotation='vertical')
ax_imp.set_xlim([-1, n_features])
plt.show()

# Predicted vs. actual prices; the dashed diagonal marks perfect predictions.
fig_fit, ax_fit = plt.subplots()
ax_fit.scatter(y_test, y_pred, alpha=0.5)
ax_fit.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title('Random Forest Regression')
plt.show()

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score, _scorer
import matplotlib.pyplot as plt

# Hyperparameters for the LightGBM model.
# Per the LightGBM tuning docs a depth-d tree has at most 2**d leaves, so with
# max_depth=3 the num_leaves=31 setting is never the binding limit.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'max_depth': 3,
    'num_leaves': 31,
    'learning_rate': 0.15,
    'feature_fraction': 0.9
}

# Carve the early-stopping validation set out of the *training* data.
# Using (X_test, y_test) for early stopping would tune the number of boosting
# rounds on the test set and make the test metrics below optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

# LightGBM datasets for fitting and for monitoring / early stopping.
lgb_train = lgb.Dataset(X_fit, y_fit)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# Train for at most 500 rounds.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from lgb.train() in LightGBM 4.0.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=[lgb_train, lgb_val],
                callbacks=[lgb.early_stopping(stopping_rounds=50),
                           lgb.log_evaluation(period=50)])

# Predict the untouched test split with the best iteration found by early stopping.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Test-set error metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean squared error: {:.4f}'.format(mse))
print('R^2 score: {:.4f}'.format(r2))

# Feature importances (top 10). Shown immediately so the scatter plot below
# gets its own figure instead of being drawn onto the importance axes.
lgb.plot_importance(gbm, max_num_features=10)
plt.show()

# Predicted vs. actual prices; the dashed diagonal marks perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('GBM Light Regression')
plt.show()

In [None]:
# Redraw the predicted-vs-actual scatter for the current `y_pred` on a figure of its own.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed diagonal = perfect prediction, spanning the observed price range.
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'k--', lw=4)

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('GBM Light Regression')
plt.show()

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

# Hyperparameters for the LightGBM model: a slower learning rate than the
# previous LightGBM cell (0.05 vs 0.15) and a minimum of 10 samples per leaf.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'max_depth': 3,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'min_child_samples': 10
}

# Carve the early-stopping validation set out of the *training* data.
# Using (X_test, y_test) for early stopping would tune the number of boosting
# rounds on the test set and make the test metrics below optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

# LightGBM datasets for fitting and for monitoring / early stopping.
lgb_train = lgb.Dataset(X_fit, y_fit)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# Per-round metric history, filled in by the record_evaluation callback.
# (A Booster returned by lgb.train() has no `evals_result_` attribute; that
# attribute only exists on the scikit-learn wrapper classes.)
evals_result = {}

# Train for at most 10000 rounds; early stopping decides the real number.
# Early stopping / logging are callbacks because the `early_stopping_rounds=`
# and `verbose_eval=` keyword arguments were removed from lgb.train() in
# LightGBM 4.0. valid_names pins the keys used in `evals_result`.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=[lgb_train, lgb_val],
                valid_names=['training', 'validation'],
                callbacks=[lgb.early_stopping(stopping_rounds=50),
                           lgb.log_evaluation(period=50),
                           lgb.record_evaluation(evals_result)])

# Predict the untouched test split with the best iteration found by early stopping.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Test-set error metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('Mean squared error: {:.4f}'.format(mse))
print('R^2 score: {:.4f}'.format(r2))
print('Mean absolute error: {:.4f}'.format(mae))

# Feature importances (top 10). Shown immediately so the scatter plot below
# gets its own figure instead of being drawn onto the importance axes.
lgb.plot_importance(gbm, max_num_features=10)
plt.show()

# Predicted vs. actual prices; the dashed diagonal marks perfect predictions.
fig_fit, ax_fit = plt.subplots()
ax_fit.scatter(y_test, y_pred, alpha=0.5)
ax_fit.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title('GBM Light Regression')
plt.show()

# Training and validation MSE per boosting round.
# LightGBM stores the metric under its canonical name rather than the 'mse'
# alias given in `params`, so the key is read from the recorded results
# (only one metric is configured).
metric_name = next(iter(evals_result['training']))
train_mse = evals_result['training'][metric_name]
val_mse = evals_result['validation'][metric_name]
x_axis = np.arange(0, len(train_mse))
fig_curve, ax_curve = plt.subplots()
ax_curve.plot(x_axis, train_mse, label='Training MSE')
ax_curve.plot(x_axis, val_mse, label='Validation MSE')
ax_curve.legend()
ax_curve.set_xlabel('Boosting round')
ax_curve.set_ylabel('MSE')
ax_curve.set_title('Training and Validation MSE over Boosting Rounds')
plt.show()