In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import itertools
import time
from scipy.stats import iqr

# Step 1: Load Data

In [None]:
# Read the Kaggle "House Prices" train and test splits.
df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# The test split has no target column; add a placeholder 'SalePrice' of 0 so
# both splits share one schema and can be preprocessed together.
if 'SalePrice' not in df_test:
    df_test['SalePrice'] = 0

# Stack the splits (train rows first, then test rows) and index the result by 'Id'.
df = pd.concat([df_train, df_test]).set_index('Id')

# Preview the combined dataset.
df.head()

# Step 2: Handle Missing Values

In [None]:
# Keep only the columns that contain at least one missing value
# (column order of the combined frame is preserved).
df_mv = df.loc[:, df.isnull().any()]

# Heatmap of the null mask: one row per record, one column per affected feature.
sns.heatmap(df_mv.isnull())
plt.show()

In [None]:
# Categorical (object-dtype) columns of the combined frame.
df_objects = df.select_dtypes(include=['object'])

# Categorical columns with more than 1100 missing values are dropped outright,
# both from the main frame and from the categorical subset.
object_na_counts = df_objects.isna().sum()
sparse_object_cols = object_na_counts[object_na_counts > 1100].index
df = df.drop(columns=sparse_object_cols)
df_objects = df_objects.drop(columns=sparse_object_cols)

# Mark the remaining gaps with the placeholder category 'MV', then one-hot encode.
df_objects = df_objects.fillna('MV')
df_objects_encoded = pd.get_dummies(df_objects)

# Drop the dummy columns that represent the 'MV' placeholder.
# get_dummies names each indicator '<column>_<value>', so match the exact
# '_MV' suffix: a plain substring test ('MV' in name) could also discard a
# legitimate dummy whose column or category name merely contains "MV".
mv_columns = [name for name in df_objects_encoded.columns if name.endswith('_MV')]
df_objects_encoded = df_objects_encoded.drop(columns=mv_columns)
for name in mv_columns:
    print(name)

# Combine the original dataframe with the encoded categorical columns.
new_df = pd.concat([df, df_objects_encoded], axis=1)


# Step 3: Handle Missing Values in Numerical Columns

#### Impute the missing values in the numerical columns

In [None]:
# The categorical features are now represented by their dummy columns, so the
# original object-typed columns are removed from the combined frame.
new_df = new_df.drop(columns=df.select_dtypes(include=['object']).columns)

# Overview: number of missing values per column, for columns that still have any.
new_df.isna().sum().loc[lambda counts: counts > 0]

In [None]:
# Impute missing values in numerical columns:
# the most frequent value for ordinal features, the rounded mean for continuous ones.
Mode_columns = ['GarageCars', 'GarageYrBlt', 'BsmtFullBath', 'BsmtHalfBath']
Mean_columns = ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                'TotalBsmtSF', 'GarageArea']

# Build one {column: fill value} mapping and apply it in a single fillna call;
# columns that are not in the mapping are left untouched.
fill_values = {col: new_df[col].mode()[0] for col in Mode_columns}
fill_values.update({col: np.round(new_df[col].mean()) for col in Mean_columns})
new_df = new_df.fillna(value=fill_values)

# Check: any column listed here still contains missing values.
new_df.isna().sum().loc[lambda counts: counts > 0]

# Step 4: Data Splitting

In [None]:
# The combined frame was built with the train rows first, so a positional cut
# at len(df_train) recovers the two original splits.
n_train_rows = len(df_train)
train_data = new_df.iloc[:n_train_rows]

# The test rows only carried a placeholder target; remove it again.
test_data = new_df.iloc[n_train_rows:].drop(columns='SalePrice')

# Step 5: Target Variable Analysis (SalePrice)

In [None]:
# Distribution of the target over the training rows (histogram + KDE).
ax = sns.histplot(data=train_data, x="SalePrice", color="red", kde=True, bins=30)
ax.set_title("Sale Price Histogram")
ax.set_xlabel("Sale Price")
ax.set_ylabel("Count")
plt.show()

# Summary statistics of SalePrice.
print(train_data["SalePrice"].describe())

In [None]:
# Apply a Box-Cox transformation to SalePrice to normalize its distribution.
# PowerTransformer is already imported in the notebook's import cell.
y_train = train_data['SalePrice']
boxcox = PowerTransformer(method='box-cox')

# fit_transform() fits the transformer and transforms in one step, so a
# separate fit() call beforehand would only repeat the same fitting work.
# The transformer expects a 2-D input, hence the reshape to a single column.
trans_y_train = boxcox.fit_transform(y_train.values.reshape(-1, 1))

In [None]:
# Distribution of SalePrice after the Box-Cox transformation.
ax = sns.histplot(trans_y_train, color="red", kde=True, bins=30)
ax.set_title("Sale Price Histogram")
ax.set_xlabel("Sale Price")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Persist both preprocessed splits, including their index, for further analysis.
for frame, filename in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(filename, index=True)

# Step 6: Split Data into Training and Validation Sets

In [None]:
# Hold out 30% of the labelled rows for validation. The held-out part is
# called `test` in this notebook; it is not the Kaggle test set (`test_data`).
train, test = train_test_split(train_data, test_size=0.3, random_state=724)

for split in (train, test):
    print(split.shape)

In [None]:
# Feature matrix and target vector for the training split.
X_train = train.drop(columns=['SalePrice'])
y_train = train['SalePrice']

# Box-Cox transformation of the training target.
# PowerTransformer is already imported in the notebook's import cell, and
# fit_transform() fits and transforms in one step, so neither a second import
# nor a separate fit() call is needed. The transformer expects a 2-D input,
# hence the reshape to a single column.
boxcox = PowerTransformer(method='box-cox')
trans_y_train = boxcox.fit_transform(y_train.values.reshape(-1, 1))

In [None]:
# Feature matrix of the validation split (the target column is removed).
X_test = test.drop(columns='SalePrice')
X_test.head()

# Step 7: Train Random Forest Model

In [None]:
# Baseline Random Forest regressor with hand-picked hyperparameters.
rf_reg = RandomForestRegressor(
    n_estimators=2000,
    max_features=X_train.shape[1],  # an int equal to the number of feature columns
    criterion='squared_error',
    max_depth=20,
    min_samples_leaf=4,
    ccp_alpha=0,
    random_state=724,
)

# Train on the Box-Cox transformed target, flattened to one dimension.
model_1 = rf_reg.fit(X_train, trans_y_train.ravel())

In [None]:
# Feature importances of the fitted forest, expressed as percentages.
importance = pd.DataFrame({'importance': model_1.feature_importances_ * 100},
                          index=X_train.columns)

# Plot only the features whose importance exceeds 1%, smallest at the bottom.
filtered_importance = importance[importance['importance'] > 1]
filtered_importance.sort_values(by='importance', ascending=True).plot(kind='barh', color='r')
plt.title('Variable Importance')
# feature_importances_ is scikit-learn's impurity-based importance (normalised
# total reduction of the split criterion), not a permutation-style
# "MSE increase", so the axis is labelled accordingly.
plt.xlabel('Importance (% of total impurity reduction)')
plt.show()

In [None]:
# Hyperparameter tuning using cross-validation for Random Forest
# --------------------------------------------------------------
# Candidate values for each hyperparameter.
n_estimators = [2000, 2100]
max_features = ['sqrt', 'log2', None]  # If None or 1.0, then max_features = n_features
max_depth = [20, 25]
min_samples_leaf = [4, 5, 6]

# Every combination of the candidate values, one grid row per combination.
rf_param_combos = list(itertools.product(n_estimators, max_features, max_depth, min_samples_leaf))

# The 1-based row labels are derived from the number of combinations, so
# editing any of the candidate lists above cannot cause an index-length mismatch.
params_grid = pd.DataFrame(data=rf_param_combos,
                           index=range(1, len(rf_param_combos) + 1),
                           columns=['n_estimators',
                                    'max_features',
                                    'max_depth',
                                    'min_samples_leaf'])
params_grid

In [None]:
# Perform 5-fold cross-validation for every row of params_grid.
start_time = time.time()
cv_errors = np.zeros(shape=len(params_grid))  # mean CV score per grid row

# The estimators expect a 1-D target. trans_y_train is an (n, 1) column, so it
# is flattened here, consistent with the fit() calls elsewhere in the notebook,
# which also avoids scikit-learn's column-vector warning in every fold.
y_cv = trans_y_train.reshape(-1)

# itertuples() yields each grid row with its fields addressable by column name
# and with each column's own value type (ints stay ints, None stays None).
# `i` is the 0-based position of the row and ends at the last grid index.
for i, params in enumerate(params_grid.itertuples(index=False)):
    rf_reg = RandomForestRegressor(n_estimators=params.n_estimators,
                                   max_features=params.max_features,
                                   criterion='squared_error',
                                   max_depth=params.max_depth,
                                   min_samples_leaf=params.min_samples_leaf,
                                   ccp_alpha=0,
                                   random_state=42)
    scores = cross_val_score(estimator=rf_reg,
                             X=X_train,
                             y=y_cv,
                             scoring='neg_root_mean_squared_error',
                             cv=5, n_jobs=-1)
    # Scores are negated RMSE values: larger (closer to zero) is better.
    cv_errors[i] = scores.mean()
end_time = time.time()
print('The Processing time is: ', end_time - start_time, 'seconds')

cv_errors

In [None]:
# cv_errors holds negated RMSE values, so its largest entry marks the best grid row.
best_params = params_grid.iloc[cv_errors.argmax()]
print(f'Best Random Forest parameters: {best_params}')

In [None]:
# Retrain the Random Forest with the best cross-validated grid row.
best_rf_row = np.argmax(cv_errors)  # position of the highest (negated-RMSE) score
rf_reg = RandomForestRegressor(
    n_estimators=params_grid.iloc[best_rf_row, 0],
    max_features=params_grid.iloc[best_rf_row, 1],
    criterion='squared_error',
    max_depth=params_grid.iloc[best_rf_row, 2],
    min_samples_leaf=params_grid.iloc[best_rf_row, 3],
    ccp_alpha=0,
    random_state=42,
)
model_1 = rf_reg.fit(X_train, trans_y_train.ravel())

# Predict on the validation features, then undo the Box-Cox transformation so
# the predictions are back on the original SalePrice scale.
pred_rf_transformed = model_1.predict(X_test).reshape(-1, 1)
pred_rf = pd.Series(boxcox.inverse_transform(pred_rf_transformed).ravel(),
                    index=test.index)
pred_rf

In [None]:
# Absolute error of the Random Forest predictions on the validation split.
abs_err_rf = (test['SalePrice'] - pred_rf).abs()

# Summary of the absolute errors: mean, median, sd, IQR, min, max.
# (`iqr` comes from the scipy.stats import in the notebook's import cell.)
rf_error_summary = {
    'Mean of AbsErrors': abs_err_rf.mean(),
    'Median of AbsErrors': abs_err_rf.median(),
    'SD of AbsErrors': abs_err_rf.std(),
    'IQR of AbsErrors': iqr(abs_err_rf),
    'Min of AbsErrors': abs_err_rf.min(),
    'Max of AbsErrors': abs_err_rf.max(),
}
models_comp = pd.DataFrame(rf_error_summary, index=['Random Forest'])
models_comp

In [None]:
# Actual vs. predicted SalePrice on the validation split (Random Forest).
actual_prices = test['SalePrice']
plt.scatter(x=actual_prices, y=pred_rf, c='black', alpha=0.7)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction - Random Forest')

# 45-degree reference line: points on it are predicted exactly.
xp = np.linspace(actual_prices.min(), actual_prices.max(), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

# Step 8: Gradient Boosting Regressor

In [None]:
# Baseline Gradient Boosting regressor (not tuned).
boosting_reg = GradientBoostingRegressor(
    learning_rate=0.05,   # learning rate
    n_estimators=300,     # total number of trees to fit
    subsample=0.7,        # fraction of samples used per tree; < 1 gives stochastic GB
    max_depth=5,          # maximum depth of each tree
    min_samples_leaf=16,  # minimum number of observations in a leaf node
    random_state=42,
)

# Train on the Box-Cox transformed target, flattened to one dimension.
model_2 = boosting_reg.fit(X_train, trans_y_train.ravel())

In [None]:
# Hyperparameter tuning using cross-validation for Gradient Boosting
# ------------------------------------------------------------------
# Candidate values for each hyperparameter.
learning_rate = [0.04, 0.05, 0.06]
n_estimators = [250, 300, 350]
subsample = [0.6, 0.7, 0.9]
max_depth = [3, 4, 5]
min_samples_leaf = [4, 8, 16]

# Every combination of the candidate values, one grid row per combination.
gb_param_combos = list(itertools.product(learning_rate, n_estimators, subsample,
                                         max_depth, min_samples_leaf))

# The 1-based row labels are derived from the number of combinations, so
# editing any of the candidate lists above cannot cause an index-length mismatch.
params_grid = pd.DataFrame(data=gb_param_combos,
                           index=range(1, len(gb_param_combos) + 1),
                           columns=['learning_rate',
                                    'n_estimators',
                                    'subsample',
                                    'max_depth',
                                    'min_samples_leaf'])
params_grid

In [None]:
# K-fold (5-fold) cross-validation for every row of params_grid.
start_time = time.time()
cv_errors = np.zeros(shape=len(params_grid))  # mean CV score per grid row

# The estimators expect a 1-D target. trans_y_train is an (n, 1) column, so it
# is flattened here, consistent with the fit() calls elsewhere in the notebook,
# which also avoids scikit-learn's column-vector warning in every fold.
y_cv = trans_y_train.reshape(-1)

# itertuples() yields each grid row with its fields addressable by column name
# and with each column's own value type; taking a whole row with .iloc[i]
# instead would upcast the integer parameters to float in this mixed grid.
# `i` is the 0-based position of the row and ends at the last grid index.
for i, params in enumerate(params_grid.itertuples(index=False)):
    gb_reg = GradientBoostingRegressor(learning_rate=params.learning_rate,
                                       n_estimators=params.n_estimators,
                                       subsample=params.subsample,
                                       max_depth=params.max_depth,
                                       min_samples_leaf=params.min_samples_leaf,
                                       random_state=42)
    scores = cross_val_score(estimator=gb_reg,
                             X=X_train,
                             y=y_cv,
                             scoring='neg_root_mean_squared_error',
                             cv=5, n_jobs=-1)
    # Scores are negated RMSE values: larger (closer to zero) is better.
    cv_errors[i] = scores.mean()
end_time = time.time()
print('The Processing time is: ', end_time - start_time, 'seconds')

cv_errors

In [None]:
# Find the best Gradient Boosting hyperparameters.
# cv_errors holds negated RMSE values, so the largest entry marks the best grid row.
best_params = params_grid.iloc[np.argmax(cv_errors), :]
# This grid belongs to the Gradient Boosting search, so the message names that model.
print(f'Best Gradient Boosting parameters: {best_params}')

In [None]:
# Retrain model 2 with the best cross-validated hyperparameters.
# Every parameter is read from the same best grid row. In particular,
# min_samples_leaf must not use the leftover loop index `i` from the search
# loop, which always points at the last grid row rather than the best one.
best_gb_row = np.argmax(cv_errors)  # position of the highest (negated-RMSE) score
boosting_reg = GradientBoostingRegressor(
    learning_rate=params_grid.iloc[best_gb_row, 0],
    n_estimators=params_grid.iloc[best_gb_row, 1],
    subsample=params_grid.iloc[best_gb_row, 2],
    max_depth=params_grid.iloc[best_gb_row, 3],
    min_samples_leaf=params_grid.iloc[best_gb_row, 4],
    random_state=42)

# Train on the Box-Cox transformed target, flattened to one dimension.
model_2 = boosting_reg.fit(X_train, trans_y_train.reshape(-1))

In [None]:
# Predict on the validation features with model 2, then undo the Box-Cox
# transformation so the predictions are back on the original SalePrice scale.
pred_gbr_transformed = model_2.predict(X_test).reshape(-1, 1)
pred_gbr = pd.Series(boxcox.inverse_transform(pred_gbr_transformed).ravel(),
                     index=test.index)
pred_gbr

In [None]:
# Absolute error of the Gradient Boosting predictions on the validation split.
abs_err_gbr = (test['SalePrice'] - pred_gbr).abs()

# Summary of the absolute errors: mean, median, sd, IQR, min, max.
# (`iqr` comes from the scipy.stats import in the notebook's import cell.)
gbr_error_summary = pd.DataFrame(
    {
        'Mean of AbsErrors': abs_err_gbr.mean(),
        'Median of AbsErrors': abs_err_gbr.median(),
        'SD of AbsErrors': abs_err_gbr.std(),
        'IQR of AbsErrors': iqr(abs_err_gbr),
        'Min of AbsErrors': abs_err_gbr.min(),
        'Max of AbsErrors': abs_err_gbr.max(),
    },
    index=['GB Regressor'],
)

# Append the new row below the existing model comparison table.
models_comp = pd.concat([models_comp, gbr_error_summary])
models_comp

In [None]:
# Actual vs. predicted SalePrice on the validation split (Gradient Boosting).
actual_prices = test['SalePrice']
plt.scatter(x=actual_prices, y=pred_gbr, c='black', alpha=0.7)
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.title('Actual vs. Prediction - Gradient Boost Regressor')

# 45-degree reference line: points on it are predicted exactly.
xp = np.linspace(actual_prices.min(), actual_prices.max(), 100)
plt.plot(xp, xp, c='red', linewidth=3)
plt.show()

# Final Step: Predict on the test set and create the submission file

In [None]:
# Predict on the Kaggle test rows with model 2, then undo the Box-Cox
# transformation so the predictions are on the original SalePrice scale.
pred_transformed = model_2.predict(test_data).reshape(-1, 1)
pred = pd.Series(boxcox.inverse_transform(pred_transformed).ravel(),
                 index=test_data.index)

# Two-column submission frame: the row Id and its predicted SalePrice.
final = pd.DataFrame({'Id': pred.index, 'SalePrice': pred.tolist()})

# Write the DataFrame to a CSV file without the index.
final.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame (columns 'Id' and 'SalePrice') as the cell output.
final