# 🏡 House Price Prediction – Ames, Iowa
### 📘 Kaggle: Advanced Regression Techniques

## 1️⃣ Business Problem
A homebuyer wants to estimate the price of their dream house but does not know how each feature influences its value.  
In this project, we aim to understand the factors affecting house prices and build a predictive model capable of estimating a home's sale price accurately.

---

## 2️⃣ Dataset Story
This project uses the Kaggle competition dataset **"House Prices: Advanced Regression Techniques."**

- 📊 Train Set: 1,460 houses  
- 📊 Test Set: 1,459 houses  
- 🧩 Features: 79 variables describing structural, locational, and quality attributes  
- 🎯 Target: `SalePrice`

The dataset includes:  
- Lot size and shape  
- House quality and overall condition  
- Living areas (basement, first floor, second floor)  
- Garage features  
- Year built and remodeled  
- Neighborhood characteristics  
- External materials and more

---




# 🔧 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler



# Silence noisy warning categories so the notebook output stays readable.
for _ignored_category in (FutureWarning, ConvergenceWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)


In [None]:
# Notebook-wide pandas display settings: show every column, do not wrap wide
# frames, and render floats with three decimals.
# ('display.max_rows' is deliberately left at the pandas default.)
def _three_decimals(x):
    """Format a float with exactly three decimal places."""
    return '%.3f' % x


for _option, _value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.float_format', _three_decimals),
):
    pd.set_option(_option, _value)

# 📥 2. Loading the Dataset

In [None]:
# Read both Kaggle splits, then stack them (train rows first, fresh 0..n-1
# index) so the preprocessing below is applied to both at once.
_CSV_PATHS = (
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
)
train, test = (pd.read_csv(_csv_path) for _csv_path in _CSV_PATHS)

df = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# (rows, columns) of the training split.
train.shape 

In [None]:
# Preview the first rows of the test split.
test.head()

In [None]:
# (rows, columns) of the test split.
test.shape

# 🔍 3. Exploratory Data Analysis (EDA)


In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    Sections are printed in this order: shape, dtypes, the first and last
    ``head`` rows, per-column null counts, and a transposed ``describe``
    table at the 0/5/50/95/99/100 percent quantiles.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of rows shown in the head and tail sections.
    """
    report_quantiles = [0, 0.05, 0.50, 0.95, 0.99, 1]
    # Each section is computed lazily, right before it is printed.
    sections = (
        ('Shape', lambda: dataframe.shape),
        ('Types', lambda: dataframe.dtypes),
        ('Head', lambda: dataframe.head(head)),
        ('Tail', lambda: dataframe.tail(head)),
        ('NA', lambda: dataframe.isnull().sum()),
        ('Quantiles', lambda: dataframe.describe(report_quantiles).T),
    )
    for title, build in sections:
        print(f'##################### {title} #####################')
        print(build())

# Overview of the combined train + test frame.
check_df(df)

# 🔢 4. Classifying Variables: Numerical vs Categorical

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical and numerical groups.

    A column is classified from its dtype and its number of distinct values:

    * object dtype with more than ``car_th`` distinct values -> ``cat_but_car``
    * any other object dtype column                          -> ``cat_cols``
    * non-object dtype with fewer than ``cat_th`` values     -> ``num_but_cat``
      (these are also appended to ``cat_cols``)
    * any other non-object dtype column                      -> ``num_cols``

    A short size report is printed before returning.

    Args:
        dataframe: The pandas DataFrame whose columns are classified.
        cat_th: Distinct-value count below which a numeric column is treated
            as categorical.
        car_th: Distinct-value count above which an object column is treated
            as high-cardinality.

    Returns:
        tuple: ``(cat_cols, num_cols, cat_but_car, num_but_cat)`` as lists of
        column names, each in the frame's column order (``cat_cols`` lists
        the object columns first, then the numeric-but-categorical ones).
    """
    object_low_card, cat_but_car = [], []
    num_but_cat, num_cols = [], []

    for column in dataframe.columns:
        n_unique = dataframe[column].nunique()
        if dataframe[column].dtypes == "O":
            bucket = cat_but_car if n_unique > car_th else object_low_card
        else:
            bucket = num_but_cat if n_unique < cat_th else num_cols
        bucket.append(column)

    cat_cols = object_low_card + num_but_cat

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    for label, group in (
        ('cat_cols', cat_cols),
        ('num_cols', num_cols),
        ('cat_but_car', cat_but_car),
        ('num_but_cat', num_but_cat),
    ):
        print(f'{label}: {len(group)}')

    return cat_cols, num_cols, cat_but_car, num_but_cat

# Classify the columns of the combined frame with the default thresholds.
cat_cols, num_cols, cat_but_car,  num_but_cat = grab_col_names(df)

In [None]:
# Columns treated as categorical.
cat_cols

In [None]:
# Columns treated as numerical.
num_cols

In [None]:
# Object columns with more distinct values than the cardinality threshold.
cat_but_car

In [None]:
# Numeric columns with few distinct values, handled as categorical.
num_but_cat

# 🔠 5. Analysis of Categorical Variables

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts and percentage share of each category.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the categorical column to summarise.
        plot: When True, also draw a seaborn count plot of the column.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            'Ratio': 100 * counts / len(dataframe)})
    print(summary)
    print('##########################################')
    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show(block=True)

# Summarise and plot every categorical column of the combined frame.
for col in cat_cols:
    cat_summary(df, col, plot=True)

# 🧮 6. Understanding Numerical Features

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of a numeric column at fixed quantiles.

    Args:
        dataframe: The pandas DataFrame holding the column.
        numerical_col: Name of the numeric column to describe.
        plot: When True, also draw a 20-bin histogram of the column.
    """
    quantile_levels = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50,
                       0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    values = dataframe[numerical_col]
    print(values.describe(quantile_levels).T)

    if not plot:
        return
    values.hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

# Describe and plot every numerical column of the combined frame.
for col in num_cols:
    num_summary(df, col, plot=True)

# 📌 7. Analysis of Categorical Variables by Target

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col, plot=False):
    """Print the mean of ``target`` for each value of ``categorical_col``.

    Args:
        dataframe: The pandas DataFrame holding both columns.
        target: Name of the target column that is averaged.
        categorical_col: Name of the column used for grouping.
        plot: When True, also draw a seaborn bar plot of target by category.
    """
    target_means = dataframe.groupby(categorical_col)[target].mean()
    print(pd.DataFrame({'TARGET_MEAN': target_means}), end='\n\n\n')
    if not plot:
        return
    sns.barplot(x=categorical_col, y=target, data=dataframe)
    plt.show(block=True)

# Mean SalePrice per category, with a bar plot, for every categorical column.
for col in cat_cols:
    target_summary_with_cat(df, 'SalePrice', col, plot=True)


# 📌 8. Analysis of Numerical Variables by Target

In [None]:
def target_summary_with_num(dataframe, target, numerical_col, plot=False):
    """Print the mean of ``numerical_col`` for each distinct value of ``target``.

    Args:
        dataframe: The pandas DataFrame holding both columns.
        target: Name of the column used for grouping.
        numerical_col: Name of the numeric column that is averaged.
        plot: When True, also draw a seaborn bar plot of the column by target.
    """
    means_by_target = dataframe.groupby(target)[numerical_col].mean()
    print(pd.DataFrame({numerical_col + '_mean': means_by_target}), end='\n\n\n')
    if not plot:
        return
    sns.barplot(x=target, y=numerical_col, data=dataframe)
    plt.show(block=True)

# Mean SalePrice for each distinct value of every numerical column.
# NOTE(review): this calls target_summary_with_cat (grouping by the numeric
# column), not the target_summary_with_num helper defined in this section —
# confirm which one is intended.
for col in num_cols:
    target_summary_with_cat(df, 'SalePrice', col, plot=False)

# 📈 9. Correlation Heatmap & Analysis

In [None]:
def high_correlated_cols(dataframe, plot=False, corr_th=0.70):
    """List numeric columns that are strongly correlated with an earlier column.

    Args:
        dataframe: The pandas DataFrame to analyse; non-numeric columns are
            ignored.
        plot: When True, also draw a heatmap of the signed correlation matrix.
        corr_th: Absolute-correlation threshold above which a column is
            reported.

    Returns:
        list: Names of columns whose absolute correlation with at least one
        column positioned before them exceeds ``corr_th``.
    """
    numeric_frame = dataframe.select_dtypes(include=[np.number])
    corr = numeric_frame.corr()
    abs_corr = corr.abs()

    # Keep only the strictly-upper triangle so every pair is looked at once.
    keep_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(keep_upper)

    drop_list = []
    for column in upper.columns:
        if (upper[column] > corr_th).any():
            drop_list.append(column)

    if plot:
        plt.figure(figsize=(12, 12))
        sns.heatmap(corr, cmap="RdBu", annot=False)
        plt.show()

    return drop_list


# Draw the correlation heatmap and list the highly correlated columns.
high_correlated_cols(df, plot=True)

In [None]:
# Correlation of every numerical column with SalePrice, strongest positive
# first, shown as percentages.
df_num = df[num_cols]

_saleprice_corr = df_num.corr()['SalePrice'].drop('SalePrice', axis=0)
corr_features = pd.DataFrame(_saleprice_corr.sort_values(ascending=False))

corr_features.apply(lambda column: round(column, 3) * 100).head(50)

In [None]:
def high_correlated_cols(dataframe, head=10):
    """Return the most strongly correlated pairs of numeric columns.

    Args:
        dataframe: The pandas DataFrame to analyse; non-numeric columns are
            ignored.
        head: Number of column pairs to return.

    Returns:
        pandas.Series: Absolute correlations indexed by ``(column, column)``
        pairs, sorted from strongest to weakest and cut to ``head`` entries.
    """
    numeric_frame = dataframe.select_dtypes(include=[np.number])
    abs_corr = numeric_frame.corr().abs()

    # The strictly-upper triangle holds each unordered pair exactly once.
    keep_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pair_corr = abs_corr.where(keep_upper).stack()

    ranked_pairs = pair_corr.sort_values(ascending=False)
    return ranked_pairs.head(head)

# The 20 most strongly correlated column pairs.
high_correlated_cols(df, 20)

In [None]:
# Distribution of the raw target.
df["SalePrice"].hist(bins=100)
plt.show(block=True)

In [None]:
# Distribution of the target after a log1p transform.
np.log1p(df['SalePrice']).hist(bins=50)
plt.show(block=True)

In [None]:
def quantile_summary(df):
    """Tabulate tail quantiles and spreads of the int64/float64 columns.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        pandas.DataFrame: One row per numeric column with the columns
        ``Q01``, ``Q05``, ``Median``, ``Q95``, ``Q99``, ``IQR`` and
        ``Range99``, sorted by ``Range99`` in descending order. Note that
        ``IQR`` here is ``Q95 - Q05`` and ``Range99`` is ``Q99 - Q01``.
    """
    levels = [0.01, 0.05, 0.50, 0.95, 0.99]
    labels = ["Q01", "Q05", "Median", "Q95", "Q99"]

    numeric_part = df.select_dtypes(include=["int64", "float64"])
    summary = numeric_part.quantile(levels).T
    summary.columns = labels
    summary = summary.assign(
        IQR=summary["Q95"] - summary["Q05"],
        Range99=summary["Q99"] - summary["Q01"],
    )
    return summary.sort_values("Range99", ascending=False)
# Quantile spread of every numeric column in the combined frame.
quantile_summary(df)

# 🛠️ 10. Feature Engineering

In this section, we will apply several feature engineering steps to enhance the quality and predictive power of the dataset.  
These steps help the model better understand patterns and relationships within the data.

---

## 🧪 10.1 Feature Extraction  
New features will be created from existing variables to strengthen the model’s learning capability.  
This may include transformations, ratios, categorization, or domain-driven feature creation.



---

## 🔍 10.2 Missing Values Detection  
We identify and handle missing values to prevent biases and errors during model training.

---

## 🚨 10.3 Outlier Detection  
Outliers can negatively affect model performance.  
We will detect and treat outliers in numerical variables using appropriate statistical methods.

# 🧪 10.1 Feature Extraction

In [None]:
# ------------------------------------------------------------------
# Floor features (floor areas)
# ------------------------------------------------------------------
_second_floor_sf = df["2ndFlrSF"]
df["TotalFlrSF"] = df["1stFlrSF"] + _second_floor_sf
df["HasSecondFloor"] = (_second_floor_sf > 0).astype(int)

# ------------------------------------------------------------------
# Basement features
# ------------------------------------------------------------------
_finished_bsmt_sf = df["BsmtFinSF1"] + df["BsmtFinSF2"]
# The +1 keeps the denominator non-zero when the basement area is 0.
df["BsmtFinishedRatio"] = _finished_bsmt_sf / (df["TotalBsmtSF"] + 1)
df["BasementFinished"] = (_finished_bsmt_sf > 0).astype(int)
df["HasBasement"] = (df["TotalBsmtSF"] > 0).astype(int)

# ------------------------------------------------------------------
# Garage features
# ------------------------------------------------------------------
_garage_area = df["GarageArea"]
df["HasGarage"] = (_garage_area > 0).astype(int)
df["GarageCapacityQuality"] = df["GarageCars"] * _garage_area
# Ordinal score for the garage finish; a missing finish scores 0.
finish_map = {"Fin": 3, "RFn": 2, "Unf": 1, np.nan: 0}
df["GarageFinishScore"] = df["GarageFinish"].map(finish_map)
df["GarageFinishArea"] = _garage_area * df["GarageFinishScore"]

# ------------------------------------------------------------------
# Porch features
# ------------------------------------------------------------------
_porch_cols = ["OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]
_porch_total = df[_porch_cols[0]]
for _porch_col in _porch_cols[1:]:
    _porch_total = _porch_total + df[_porch_col]
df["TotalPorchSF"] = _porch_total
df["HasPorch"] = (df["TotalPorchSF"] > 0).astype(int)
# Number of porch types present on the house.
df["PorchCount"] = sum((df[_porch_col] > 0).astype(int) for _porch_col in _porch_cols)

# ------------------------------------------------------------------
# Miscellaneous features
# ------------------------------------------------------------------
df["HasMiscVal"] = (df["MiscVal"] > 0).astype(int)
df["HasPool"] = (df["PoolArea"] > 0).astype(int)

# ------------------------------------------------------------------
# Age and remodel features
# ------------------------------------------------------------------
_year_built, _year_remodeled, _year_sold = df["YearBuilt"], df["YearRemodAdd"], df["YrSold"]
df["RemodelAge"] = _year_remodeled - _year_built
df["AgeSinceRemodel"] = _year_sold - _year_remodeled
df["AgeSinceBuilt"] = _year_sold - _year_built

# ------------------------------------------------------------------
# Quality-based features
# ------------------------------------------------------------------
_overall_quality = df["OverallQual"]
df["QualTotalSF"] = _overall_quality * df["TotRmsAbvGrd"]
df["QualLivingRatio"] = _overall_quality / (df["GrLivArea"] + 1)
df["QualMinusCond"] = _overall_quality - df["OverallCond"]


# Raw columns removed after feature extraction.
drop_list = [
    "1stFlrSF", "2ndFlrSF", "BsmtFinSF1", "BsmtFinSF2", "GarageArea",
    "GarageYrBlt", "GarageCars", "MiscVal", "MasVnrArea", "WoodDeckSF",
    "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
]
df.drop(drop_list, axis=1, inplace=True)

In [None]:
# Check the frame after feature extraction.
df.head()

In [None]:
# Distinct values of MSZoning (including NaN, if present).
df["MSZoning"].unique()

# 🔍 10.2 Missing Values Detection  

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values per column.

    Only columns with at least one missing value are listed, sorted from the
    most to the least affected.

    Args:
        dataframe: The pandas DataFrame to inspect.
        na_name: When True, return the names of the affected columns.

    Returns:
        list | None: The affected column names (in frame order) when
        ``na_name`` is True, otherwise ``None``.
    """
    null_counts = dataframe.isnull().sum()
    na_columns = [col for col in dataframe.columns if null_counts[col] > 0]

    affected = dataframe[na_columns].isnull().sum()
    n_miss = affected.sort_values(ascending=False)
    ratio = (affected / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

# Missing-value report before any imputation.
missing_values_table(df)

In [None]:
# In these columns a missing value is treated as "feature not present" and is
# replaced by the explicit category "No" instead of being imputed.
no_cols = ["Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu",
           "GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature" , "MasVnrType"]

# Assign the filled columns back to the frame. The previous per-column
# `df[col].fillna("No", inplace=True)` works on an intermediate Series, which
# pandas deprecates and which leaves `df` untouched under Copy-on-Write.
df[no_cols] = df[no_cols].fillna("No")

missing_values_table(df)

In [None]:
def quick_missing_imp(data, num_method="median", cat_length=20, target="SalePrice"):
    """Impute missing values column by column and report before/after counts.

    Object columns with at most ``cat_length`` distinct values (NaN counts as
    a value) are filled with their mode. Non-object columns are filled with
    their mean or median, depending on ``num_method``; any other value of
    ``num_method`` leaves numeric columns untouched. The ``target`` column is
    restored to its original values afterwards, so its missing entries stay
    missing.

    Args:
        data: The pandas DataFrame to impute.
        num_method: ``"mean"`` or ``"median"`` for numeric columns.
        cat_length: Largest distinct-value count for which an object column
            is mode-imputed.
        target: Name of the column that must not be imputed.

    Returns:
        pandas.DataFrame: The imputed frame.
    """
    variables_with_na = [name for name in data.columns if data[name].isnull().sum() > 0]

    # Keep the untouched target so it can be put back after imputation.
    temp_target = data[target]

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n\n")

    def _fill_categorical(column):
        if column.dtype == "O" and len(column.unique()) <= cat_length:
            return column.fillna(column.mode()[0])
        return column

    data = data.apply(_fill_categorical, axis=0)

    statistics = {
        "mean": lambda column: column.mean(),
        "median": lambda column: column.median(),
    }
    statistic = statistics.get(num_method)
    if statistic is not None:
        data = data.apply(
            lambda column: column.fillna(statistic(column)) if column.dtype != "O" else column,
            axis=0,
        )

    data[target] = temp_target

    print("# AFTER \n Imputation method is 'MODE' for categorical variables!")
    print(f" Imputation method is '{num_method.upper()}' for numeric variables! \n")
    print(data[variables_with_na].isnull().sum(), "\n\n")

    return data

# Mode-impute object columns with at most 17 distinct values and
# median-impute numeric columns; SalePrice is left untouched.
df = quick_missing_imp(df, num_method="median", cat_length=17)

# 🚨 10.3 Outlier Detection  

In [None]:
# Re-classify the columns now that engineered features exist.
cat_cols, num_cols, cat_but_car,  num_but_cat = grab_col_names(df)
# Filled by check_outlier with the names of columns found to contain outliers.
true_outliers = []
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    """Compute lower and upper outlier limits for a column.

    The limits lie 1.5 inter-quantile ranges beyond the ``q1`` and ``q3``
    quantiles of the column.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the numeric column.
        q1: Lower quantile level.
        q3: Upper quantile level.

    Returns:
        tuple: ``(low_limit, up_limit)``.
    """
    lower_q, upper_q = (dataframe[col_name].quantile(level) for level in (q1, q3))
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` has values outside its outlier limits.

    When outliers are found, the column name is also appended to the
    module-level ``true_outliers`` list.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the numeric column to check.

    Returns:
        bool: True if at least one value lies beyond the limits returned by
        ``outlier_thresholds``.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    # Test the outlier mask itself rather than the truthiness of the
    # filtered rows, which depends on unrelated columns.
    has_outlier = bool(((values > up_limit) | (values < low_limit)).any())
    if has_outlier:
        # Record the column that was actually checked, not whatever the
        # caller's loop variable happens to be called.
        true_outliers.append(col_name)
    return has_outlier

# Flag outliers in every numerical column except the target.
for col in num_cols:
    if col != "SalePrice":
        print(col, ':', check_outlier(df, col))

def plot_outlier_columns(df, true_cols):
    """Draw a box plot and a histogram with KDE for each listed column.

    Each column gets its own figure with the two plots side by side.

    Args:
        df: The pandas DataFrame holding the columns.
        true_cols: Iterable of column names to plot.
    """
    for col in true_cols:
        plt.figure(figsize=(16,5))

        # Left panel: box plot.
        plt.subplot(1, 2, 1)
        sns.boxplot(x=df[col], color="orange")
        # The titles use a real em dash; the previous literal was mis-encoded.
        plt.title(f"Boxplot — {col}", fontsize=14)
        plt.xlabel(col)

        # Right panel: histogram with a KDE overlay.
        plt.subplot(1, 2, 2)
        sns.histplot(df[col], kde=True, bins=30, color="skyblue")
        plt.title(f"Histogram — {col}", fontsize=14)
        plt.xlabel(col)
        plt.ylabel("Frequency")

        plt.tight_layout()
        plt.show()

# Visualise every column that was flagged as containing outliers.
plot_outlier_columns(df ,true_outliers )

In [None]:
# NOTE(review): this shortlist is not referenced by the visible code; the
# loop below caps every numerical feature. Confirm whether only these
# columns were meant to be winsorized.
winsor_cols = ["LotFrontage", "GrLivArea", "TotalBsmtSF", "TotalFlrSF"]


def replace_with_thresholds(dataframe, variable):
    """Cap ``variable`` in ``dataframe`` at its outlier limits, in place.

    Values below the lower limit are raised to it and values above the upper
    limit are lowered to it; missing values are left as they are.

    Args:
        dataframe: The pandas DataFrame to modify.
        variable: Name of the numeric column to cap.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() upcasts integer columns when a fractional limit is applied,
    # unlike a .loc assignment of a float into an int column, which pandas
    # deprecates.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)


for col in num_cols:
    # Leave the target untouched, as the outlier checks do.
    if col != "SalePrice":
        replace_with_thresholds(df, col)

# Log-transform the lot area.
df["LotArea_Log"] = np.log1p(df["LotArea"])

# For these columns add a presence flag (> 0) and a log1p version.
heavy_skew_cols = ["ScreenPorch", "PoolArea", "TotalPorchSF", "LowQualFinSF"]

for skewed_col in heavy_skew_cols:
    skewed_values = df[skewed_col]
    df[f"{skewed_col}_Binary"] = (skewed_values > 0).astype(int)
    df[f"{skewed_col}_Log"] = np.log1p(skewed_values)




In [None]:
# Re-run the outlier check after capping.
# Note: check_outlier appends to true_outliers again for any column that is
# still flagged.
for col in num_cols:
    if col != "SalePrice":
        print(col, ':', check_outlier(df, col))

# 🔤 11. Rare Analysis

In [None]:
def rare_analyser(dataframe, target, cat_cols):
    """Print frequency, share and mean target for every category.

    For each column the number of categories is printed first, followed by a
    table with ``COUNT``, ``RATIO`` (share of all rows) and ``TARGET_MEAN``.

    Args:
        dataframe: The pandas DataFrame holding the columns.
        target: Name of the target column that is averaged per category.
        cat_cols: Iterable of categorical column names to analyse.
    """
    n_rows = len(dataframe)
    for col in cat_cols:
        counts = dataframe[col].value_counts()
        print(col, ':', len(counts))
        table = pd.DataFrame({'COUNT': counts,
                              'RATIO': counts / n_rows,
                              'TARGET_MEAN': dataframe.groupby(col)[target].mean()})
        print(table, end='\n\n\n')

# Category frequencies and mean SalePrice for every categorical column.
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
def rare_encoder(dataframe, rare_perc):
    """Return a copy in which infrequent categories are merged into 'Rare'.

    Only object-dtype columns are considered. In each of them, every label
    whose share of the rows is below ``rare_perc`` is replaced by the string
    ``'Rare'``. The input frame is not modified.

    Args:
        dataframe: The pandas DataFrame to encode.
        rare_perc: Share of rows (0-1) below which a label counts as rare.

    Returns:
        pandas.DataFrame: The encoded copy.
    """
    temp_df = dataframe.copy()
    n_rows = len(temp_df)

    for var in temp_df.columns:
        if temp_df[var].dtypes != 'O':
            continue
        shares = temp_df[var].value_counts() / n_rows
        rare_labels = shares[shares < rare_perc].index
        if len(rare_labels) == 0:
            # Nothing rare here; leave the column exactly as it is.
            continue
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), 'Rare', temp_df[var])

    return temp_df

# rare_encoder returns an encoded copy, so rebind `df`; otherwise the rare
# categories are never merged for the encoding and modelling steps below.
df = rare_encoder(df, 0.01)
df

In [None]:
# Refresh the column groups before encoding.
cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(df)

# 🔤 12. Encoding

In [None]:
def binary_cols(dataframe):
    """Return the names of non-numeric columns with at most two distinct values.

    A column is skipped when its dtype is ``int64`` or ``float64``.

    Args:
        dataframe: The pandas DataFrame to scan.

    Returns:
        list: Matching column names, in frame order.
    """
    numeric_dtypes = ['int64', 'float64']
    selected = []
    for column in dataframe.columns:
        if dataframe[column].dtype in numeric_dtypes:
            continue
        if dataframe[column].nunique() <= 2:
            selected.append(column)
    return selected

def label_encoder(dataframe, binary_col):
    """Replace ``binary_col`` with integer codes from a fresh LabelEncoder.

    The column is overwritten in ``dataframe`` itself.

    Args:
        dataframe: The pandas DataFrame to modify.
        binary_col: Name of the column to encode.

    Returns:
        pandas.DataFrame: The same ``dataframe`` object, for chaining.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

# NOTE: this rebinds the name `binary_cols` from the helper function to its
# result (a list of column names); the helper cannot be called again without
# re-running its definition.
binary_cols = binary_cols(df)



In [None]:
# Label-encode every two-valued non-numeric column.
for col in binary_cols:
    df = label_encoder(df, col)

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """One-hot encode the given columns with ``pandas.get_dummies``.

    Args:
        dataframe: The pandas DataFrame to encode.
        categorical_cols: Names of the columns to expand into dummies.
        drop_first: When True, drop the first dummy of each column.

    Returns:
        pandas.DataFrame: A new frame with the listed columns replaced by
        their dummy columns.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

# Expand every categorical column into dummies, dropping the first level of
# each one.
df = one_hot_encoder(df, cat_cols, drop_first=True)

df.head()


In [None]:
# Replace spaces in column names with underscores.
df.columns = df.columns.str.replace(" ", "_")


In [None]:
# Drop the Neighborhood column instead of encoding it.
df.drop("Neighborhood", axis=1, inplace=True)


# 🤖 13. Modeling

In [None]:
# Rows with a known SalePrice are used for training; the remaining rows are
# kept aside as test_df.
_has_target = df['SalePrice'].notnull()
train_df = df[_has_target].copy()
test_df = df[~_has_target].copy()

# The models are fitted on the log1p-transformed price.
train_df["SalePrice_Log"] = np.log1p(train_df["SalePrice"])

y = train_df["SalePrice_Log"]
X = train_df.drop(columns=["Id", "SalePrice", "SalePrice_Log"])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=17
)


In [None]:
# Candidate regressors, all with default hyperparameters (SVR is currently
# disabled).
models = [
    ("LR", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("ElasticNet", ElasticNet()),
    ("KNN", KNeighborsRegressor()),
    ("CART", DecisionTreeRegressor()),
    ("RF", RandomForestRegressor()),
    ("GBM", GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor(objective="reg:squarederror")),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

rmse_scores = []
execution_times = []

for name, regressor in models:
    start_time = time.time()

    # Fit on the hold-out training part; the fitted estimator stays
    # available after the loop.
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Score: mean of the per-fold RMSE from 5-fold cross-validation on the
    # full training data (log scale).
    fold_mse = -cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = np.mean(np.sqrt(fold_mse))
    rmse_scores.append(rmse)

    # The timing covers the hold-out fit, the prediction and the CV run.
    execution_time = time.time() - start_time
    execution_times.append(execution_time)

    print(f"RMSE: {round(rmse, 4)} ({name})")
    print(f"Execution Time: {round(execution_time, 2)} seconds\n")

# Bar chart of the cross-validated RMSE per model; LR is left out of the plot.
_scored_models = [
    (model_name, score)
    for (model_name, _), score in zip(models, rmse_scores)
    if model_name != 'LR'
]
filtered_scores = [score for _, score in _scored_models]

plt.figure(figsize=(12, 8))
plt.bar([model_name for model_name, _ in _scored_models], filtered_scores)
plt.xlabel("Model")
plt.ylabel("RMSE")
plt.title("Model Performance (RMSE)")
plt.show()

# Plot execution times
plt.figure(figsize=(12, 8))
plt.bar([name for name, _ in models], execution_times)
# Models are on the x-axis and seconds on the y-axis (the labels were swapped).
plt.xlabel("Model")
plt.ylabel("Execution Time (seconds)")
plt.title("Execution Times for Different Models")
plt.show()

In [None]:
# `regressor` is the estimator left over from the last iteration of the
# comparison loop.
y_pred_test = regressor.predict(X_test)

# Undo the log1p transform to get back to prices.
y_pred_test_original_scale, y_test_original_scale = np.expm1(y_pred_test), np.expm1(y_test)

# Hold-out RMSE on the original price scale.
_squared_errors = (y_pred_test_original_scale - y_test_original_scale) ** 2
rmse_original_scale = np.sqrt(np.mean(_squared_errors))

print(f"RMSE in original scale: {round(rmse_original_scale, 4)}")

In [None]:

# Rebuild the training data from `df` and redo the same seeded 80/20 split
# for hyperparameter tuning.
_labelled_rows = df['SalePrice'].notnull()
train_df = df[_labelled_rows].copy()
test_df = df[~_labelled_rows].copy()

y = np.log1p(train_df['SalePrice'])
X = train_df.drop(columns=["Id", "SalePrice"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)



# Boosting models that are tuned with a grid search.
models = [
    ("GBM", GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor(objective="reg:squarederror")),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

# GBM, XGBoost and LightGBM share one grid; CatBoost uses its own parameter
# names for the same values.
_shared_grid = {"n_estimators": [100, 200], "max_depth": [5, 7], "learning_rate": [0.01, 0.1]}
param_grids = {
    "GBM": dict(_shared_grid),
    "XGBoost": dict(_shared_grid),
    "LightGBM": dict(_shared_grid),
    "CatBoost": {"iterations": [100, 200], "depth": [5, 7], "learning_rate": [0.01, 0.1]},
}

rmse_values = []
execution_times = []
model_names = []

# `best_model` must end up as the tuned model with the lowest hold-out RMSE,
# not simply the last model in the list, because it is used as the final
# model afterwards.
best_model = None
best_rmse = float("inf")

for name, regressor in models:
    print(f"Hyperparameter Tuning for {name}:")

    # 5-fold grid search on the training part, using the estimator's default
    # scorer.
    start_time = time.time()
    grid_search = GridSearchCV(
        regressor,
        param_grid=param_grids[name],
        cv=5,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    end_time = time.time()

    execution_time = end_time - start_time
    tuned_model = grid_search.best_estimator_

    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Execution Time: {execution_time:.4f} seconds")

    # RMSE in original scale
    y_pred = tuned_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))

    print(f"RMSE: {rmse:.4f}")
    print("---------------")

    rmse_values.append(rmse)
    execution_times.append(execution_time)
    model_names.append(name)

    if rmse < best_rmse:
        best_rmse = rmse
        best_model = tuned_model


# Hold-out RMSE (original price scale) of each tuned model.
plt.figure(figsize=(10, 6))
plt.bar(model_names, rmse_values, color='steelblue')
plt.title('Top 4 Model Performance (RMSE)')
plt.xlabel('Model')
plt.ylabel('RMSE')
plt.xticks(rotation=45)
plt.show()


# Grid-search time of each tuned model.
plt.figure(figsize=(10, 6))
plt.bar(model_names, execution_times, color='darkorange')
plt.xlabel('Model')
plt.ylabel('Execution Time (seconds)')
# The title uses a real en dash; the previous literal was mis-encoded.
plt.title('Execution Times – Top 4 Models')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Final Prediction Model
# Uses whatever estimator `best_model` refers to after the grid-search cell.
final_model = best_model

# Make predictions on the test set using the final model
# (X_test is the hold-out part of the training data; expm1 undoes the log1p
# transform applied to the target.)
y_final_pred = final_model.predict(X_test)
final_y_pred = np.expm1(y_final_pred)
final_y_test = np.expm1(y_test)

In [None]:
# Predicted vs. true prices on the hold-out set, plus the error
# (true minus predicted) for every house.
results = pd.DataFrame(
    {'Predicted Price': final_y_pred, 'True Price': final_y_test}
).assign(Difference=lambda table: table['True Price'] - table['Predicted Price'])

print(results)

In [None]:
from sklearn.metrics import r2_score

# R2 of the final model on the hold-out set, on the original price scale.
r2_original = r2_score(final_y_test, final_y_pred)
print("R2 Score (Original Scale):", round(r2_original, 4))

In [None]:
# Line plot comparing predicted and true prices on the hold-out set.
results = pd.DataFrame({'Predicted Price': final_y_pred, 'True Price': final_y_test})

sns.lineplot(data=results)
plt.title('Predicted Prices vs. True Prices')
plt.xlabel('Sample')
plt.ylabel('Price')
plt.show()

In [None]:
def plot_importance(model, features, num=50, save=False):
    """Plot the largest feature importances of a fitted model.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose ``columns`` label the importances, in the
            same order.
        num: Maximum number of features shown.
        save: When True, also write the figure to ``importances.png``.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once the figure has been shown, savefig can write
    # an empty canvas.
    if save:
        plt.savefig('importances.png')
    plt.show(block=True)

# Feature importances of the final model, labelled with the columns of X.
plot_importance(final_model, X)

In [None]:
# Back-transform predictions and targets to prices.
final_y_pred, final_y_test = np.expm1(y_final_pred), np.expm1(y_test)

# Kaggle Metric: RMSE(log(pred), log(actual))
_log_true = np.log1p(final_y_test)
_log_pred = np.log1p(final_y_pred)
kaggle_rmse = np.sqrt(mean_squared_error(_log_true, _log_pred))

print("Kaggle Metric RMSE (log scale):", round(kaggle_rmse, 5))