In [1]:
# Importing the required libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import norm
from scipy.stats import skew, norm

## Loading the processed train and test datasets

In [2]:
# Read the preprocessed train/test CSVs in one pass.
# NOTE(review): these are absolute Windows paths; edit them to match where the
# files live on your machine.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (r"E:\processed_train.csv", r"E:\processed_test.csv")
)

## Separating Target and Predictor Variables

In [3]:
# Response variable: the SalePrice column of the training frame.
target = train.loc[:, "SalePrice"]

In [4]:
# Sanity check: the target should be 1-D with one value per training row.
target.shape

(1460,)

In [5]:
# Feature matrix: every training column except the response.
predictor = train.drop("SalePrice", axis="columns")

In [6]:
# Sanity check: same number of rows as `target`, one column fewer than `train`.
predictor.shape

(1460, 295)

In [7]:
# Basic Regressors from Sklearn.
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.metrics import r2_score

In [8]:
# Advanced Regressors.
import shap
from xgboost import XGBRegressor
from catboost import Pool
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingRegressor

In [9]:
import time

## Creating RMSE Metric and Setting K Folds for Cross Validation

In [10]:
def cv_rmse(model, X=None, y=None, cv=None):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    Parameters
    ----------
    model : estimator
        Regressor to evaluate; handed to ``cross_val_score`` as-is.
    X : array-like, optional
        Feature matrix. Defaults to the notebook-level ``predictor``.
    y : array-like, optional
        Target values. Defaults to the notebook-level ``target``.
    cv : int or cross-validation splitter, optional
        Splitting strategy. Defaults to the notebook-level ``kf``.

    Returns
    -------
    numpy.ndarray
        Root mean squared error for each fold.
    """
    # Fall back to the notebook globals so existing ``cv_rmse(model)`` calls
    # keep working; the globals are looked up at call time, not definition time.
    X = predictor if X is None else X
    y = target if y is None else y
    cv = kf if cv is None else cv

    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    # The scorer reports *negative* MSE (greater is better), so flip the sign
    # before taking the square root.
    return np.sqrt(-neg_mse)

In [11]:
# Shared 10-fold splitter (shuffled, fixed seed); cv_rmse reads it as `kf`.
kf = KFold(10, shuffle=True, random_state=42)

# Empty result containers — presumably meant to collect per-model CV means and
# standard deviations; nothing in the visible cells fills them yet.
cv_scores = []
cv_std = []

## Gradient Boosting Regressor

In [12]:
# Hyperparameter Tuning

In [None]:
# Time the whole search so readers know what a re-run costs.
t0 = time.time()

# Seeded so repeated runs of the search rank the candidates identically.
gbr = GradientBoostingRegressor(random_state=42)

# Search space: 4 * 5 * 3 * 3 * 3 = 540 candidates, i.e. 1620 fits with cv=3.
# Float values of min_samples_split / min_samples_leaf are fractions of the
# training samples.
gbr_params = {'n_estimators':[100,500,1000,5000],
          'max_depth':[3,4,5,6,7],
          'learning_rate':[1,0.1,0.01],
          # Must be the float 1.0, not the int 1: scikit-learn requires an
          # integer min_samples_split to be >= 2, so the int 1 makes every
          # candidate that uses it fail to fit.
          'min_samples_split':[0.1,0.5,1.0],
          'min_samples_leaf':[0.1,0.25,0.5]}

# No `scoring` is given, so candidates are ranked by the estimator's default
# score (R^2 for regressors), not by the RMSE helper defined earlier.
gbr_model = GridSearchCV(gbr,
                         param_grid=gbr_params,
                         cv=3,
                         n_jobs=-1)

gbr_model.fit(predictor,target)

t1 = time.time() - t0
print('Elapsed time:',t1,'s')

In [None]:
# Report the winning configuration found by the gradient-boosting grid search.
print('Results from Grid Search:')
for _heading, _attr in (
    ("\nBest score across all searched params:\n", "best_score_"),
    ("\nBest parameters across all searched params:\n", "best_params_"),
):
    print(_heading, getattr(gbr_model, _attr))

In [13]:
# Optimizing the model and fitting again.

In [None]:
# Refit a single gradient-boosting model on the full training data with fixed
# hyperparameters (all of them are values present in the search grid).
# random_state is pinned so that re-running the cell reproduces the same model.
gbr_opt = GradientBoostingRegressor(n_estimators=5000,
                                  max_depth=3,
                                  learning_rate=0.01,
                                  min_samples_split=0.1,
                                  min_samples_leaf=0.1,
                                  random_state=42)

gbr_opt.fit(predictor,target)

In [14]:
# Evaluating model's performance.

In [None]:
# Predict on the same rows gbr_opt was fitted on, so every metric and plot
# derived from target_pred is an in-sample (training) figure, not a
# held-out estimate.
target_pred = gbr_opt.predict(predictor)

In [None]:
# Mean of the squared differences between predictions and the true target.
MSE = np.mean(np.square(target_pred - target))

# Report both error magnitude (MSE) and explained variance (R2).
print(f"Mean Squared Error = {MSE}\n")
print(f"R2 Score: {r2_score(target, target_pred)}")

In [15]:
# Visualizing the model.

In [None]:
# Actual vs. predicted prices; points close to the diagonal are well predicted.
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(x=target, y=target_pred, color="r", ax=ax)
ax.set_xlabel("Prices")
ax.set_ylabel("Predicted prices")
ax.set_title("Prices vs. Predicted Prices")

In [None]:
# Residuals (prediction minus truth) against the predicted value.
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(x=target_pred, y=target_pred - target, color="r", ax=ax)
ax.set_title("Residual Plot")
ax.set_xlabel("Predicted values")
ax.set_ylabel("Residuals")

## XGBoost Regressor

In [17]:
# Hyperparameter Tuning

In [None]:
# Base estimator for the search. It is seeded because subsample < 1 draws a
# random subset of rows for each tree, which would otherwise make the search
# results vary between runs.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Search space: 5 * 3 * 2 * 3 = 90 candidates.
xgb_params = {"n_estimators": [500, 750, 1000, 1500, 2000],
              "learning_rate": [0.01, 0.02, 0.05],
              "max_depth": [6, 8],
              "subsample": [0.3, 0.5, 0.7]}

# GridSearchCV takes the estimator instance and the parameter grid directly.
# The original line wrapped both in another XGBRegressor(...) call and left a
# parenthesis unclosed, so the cell could not even be parsed.
xgb_model = GridSearchCV(xgb, param_grid=xgb_params)

xgb_model.fit(predictor, target)

In [None]:
# Report the winning configuration found by the XGBoost grid search.
print('Results from Grid Search:')
for _heading, _attr in (
    ("\nBest score across all searched params:\n", "best_score_"),
    ("\nBest parameters across all searched params:\n", "best_params_"),
):
    print(_heading, getattr(xgb_model, _attr))

In [18]:
# Fitting the model again.

In [None]:
# Refit one XGBoost model on the full training data with fixed hyperparameters
# (all of them are values present in the search grid).
# - objective is stated explicitly so this model matches the estimator that
#   was tuned in the grid search.
# - random_state is pinned because subsample=0.7 samples rows at random for
#   each tree; without a seed, re-running the cell gives a different model.
model = XGBRegressor(objective='reg:squarederror',
                     n_estimators=1500,
                     learning_rate=0.02,
                     max_depth=6,
                     subsample=0.7,
                     random_state=42)
model.fit(predictor, target)

In [19]:
# Evaluating model's performance.

In [None]:
# Replaces the gradient-boosting predictions previously stored in target_pred
# with the XGBoost model's predictions. These are again made on the same rows
# the model was fitted on, so the figures below are in-sample.
target_pred = model.predict(predictor)

In [None]:
# Mean of the squared differences between predictions and the true target.
MSE = np.mean(np.square(target_pred - target))

# Report both error magnitude (MSE) and explained variance (R2).
print(f"Mean Squared Error = {MSE}\n")
print(f"R2 Score: {r2_score(target, target_pred)}")

In [20]:
# Visualizing the model.

In [None]:
# Actual vs. predicted prices; points close to the diagonal are well predicted.
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(x=target, y=target_pred, color="r", ax=ax)
ax.set_xlabel("Prices")
ax.set_ylabel("Predicted prices")
ax.set_title("Prices vs. Predicted Prices")

In [None]:
# Residuals (prediction minus truth) against the predicted value.
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(x=target_pred, y=target_pred - target, color="r", ax=ax)
ax.set_title("Residual Plot")
ax.set_xlabel("Predicted values")
ax.set_ylabel("Residuals")