<a href="https://www.kaggle.com/code/begumarici/houseprice-outlier-detection-feature-engineering?scriptVersionId=191819511" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")

# Loading the Data and Data Overview

In [None]:
# Load the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Summary statistics of the training set's numeric columns.
train.describe()

In [None]:
# Column dtypes and non-null counts of the training set.
train.info()

# Handling Outliers

In [None]:
def plot_outliers(df, column):
    """Plot ``df[column]`` as a box plot and a KDE-overlaid histogram.

    Both panels are drawn side by side on a new 10x5 figure: the box plot
    on the left, the histogram on the right.
    """
    panels = (
        ('Box Plot', lambda values: sns.boxplot(values)),
        ('Histogram', lambda values: sns.histplot(values, kde=True)),
    )

    plt.figure(figsize=(10, 5))
    for position, (label, draw) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        draw(df[column])
        plt.title(f'{column} - {label}')

In [None]:
# Visualise the distribution of the two area features.
for area_col in ('GrLivArea', 'TotalBsmtSF'):
    plot_outliers(train, area_col)

In [None]:
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile. Values exactly on a fence are kept; rows
    whose value is missing fail the range check and are dropped. ``df``
    itself is not modified.
    """
    first_quartile, third_quartile = df[column].quantile([0.25, 0.75])
    whisker = 1.5 * (third_quartile - first_quartile)
    inside_fences = df[column].between(
        first_quartile - whisker,
        third_quartile + whisker,
    )
    return df[inside_fences]

# Drop IQR outliers one feature at a time; the second pass computes its
# quartiles on the rows that survived the first.
for area_col in ('GrLivArea', 'TotalBsmtSF'):
    train = remove_outliers(train, area_col)

# Filling missing values

In [None]:
# Per-column count and fraction of missing values in train, largest first.
null_mask = train.isnull()
missing_data = null_mask.sum().sort_values(ascending=False)
missing_percentage = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_info = pd.concat(
    [missing_data, missing_percentage],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_info.head(20)

In [None]:
# Partition the training columns by dtype: object columns are treated as
# categorical, all remaining columns as numerical.
categorical_feats = train.select_dtypes(include='object').columns
numerical_feats = train.select_dtypes(exclude='object').columns

In [None]:
# Same dtype-based partition for the test columns: object columns are
# treated as categorical, all remaining columns as numerical.
categorical_feats_test = test.select_dtypes(include='object').columns
numerical_feats_test = test.select_dtypes(exclude='object').columns

In [None]:
# Training columns of each kind that contain at least one missing value.
missing_cat_cols = [name for name in categorical_feats if train[name].isnull().any()]
missing_num_cols = [name for name in numerical_feats if train[name].isnull().any()]

In [None]:
# Test columns of each kind that contain at least one missing value.
missing_cat_cols_test = [name for name in categorical_feats_test if test[name].isnull().any()]
missing_num_cols_test = [name for name in numerical_feats_test if test[name].isnull().any()]

### According to the dataset documentation, NaN values in certain categorical columns indicate that the feature is absent, so they are filled with 'None'.

In [None]:
def fill_missing_values(df, cat_cols, num_cols):
    """Fill missing values of the listed columns of ``df`` in place.

    Args:
        df: DataFrame to modify.
        cat_cols: Categorical column names to fill. A column whose name
            contains one of the markers below is filled with the string
            'None'; any other column is filled with its most frequent value.
        num_cols: Numerical column names to fill with the column median.

    Returns:
        None. ``df`` is mutated.
    """
    # Substrings identifying the columns that get the literal 'None'.
    absent_markers = ('Pool', 'Garage', 'Fireplace', 'Bsmt', 'Alley', 'Fence', 'MiscFeature')

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that form is chained assignment,
    # which pandas deprecates and which does not update ``df`` at all when
    # Copy-on-Write is enabled.
    for col in cat_cols:
        if any(marker in col for marker in absent_markers):
            df[col] = df[col].fillna('None')
        else:
            df[col] = df[col].fillna(df[col].mode()[0])

    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

# Impute train and test independently, each with its own lists of
# incomplete columns.
for frame, cat_cols_with_na, num_cols_with_na in (
    (train, missing_cat_cols, missing_num_cols),
    (test, missing_cat_cols_test, missing_num_cols_test),
):
    fill_missing_values(frame, cat_cols_with_na, num_cols_with_na)

In [None]:
# NOTE(review): the fill_missing_values calls earlier in this notebook
# median-fill every numerical column that had missing values. If LotFrontage
# was among them, the neighbourhood-median fill below finds nothing left to
# fill; move it ahead of those calls if it is meant to take precedence.
for frame in (train, test):
    frame['LotFrontage'] = frame.groupby('Neighborhood')['LotFrontage'].transform(
        lambda values: values.fillna(values.median())
    )

    # Rows whose GarageType is the 'None' placeholder get a build year of 0.
    # Series.mask does this in one vectorised step instead of a Python-level
    # apply over every row.
    frame['GarageYrBlt'] = frame['GarageYrBlt'].mask(frame['GarageType'] == 'None', 0)

# Feature Engineering

In [None]:
# Age features: sale year minus construction year and minus remodel year.
for frame in (train, test):
    frame['HouseAge'] = frame['YrSold'] - frame['YearBuilt']
    frame['RemodAge'] = frame['YrSold'] - frame['YearRemodAdd']

In [None]:
# Aggregate features, built the same way for train and test.
for frame in (train, test):
    # Half baths (above ground and basement) are weighted 0.5 each.
    frame['TotalBathrooms'] = (
        frame['FullBath']
        + (0.5 * frame['HalfBath'])
        + frame['BsmtFullBath']
        + (0.5 * frame['BsmtHalfBath'])
    )

    # Sum of the four porch columns plus the wood deck column.
    frame['TotalPorchSF'] = (
        frame['OpenPorchSF']
        + frame['3SsnPorch']
        + frame['EnclosedPorch']
        + frame['ScreenPorch']
        + frame['WoodDeckSF']
    )

    # NOTE(review): despite its name, TotalRooms adds TotalBsmtSF to
    # TotRmsAbvGrd - confirm that mixing these two columns is intended.
    frame['TotalRooms'] = frame['TotRmsAbvGrd'] + frame['TotalBsmtSF']

# Encoding

In [None]:
# Separate the target and the identifiers from the feature columns.
y = train['SalePrice']
train_ids = train['Id']
test_ids = test['Id']

train = train.drop(columns=['SalePrice', 'Id'])
test = test.drop(columns='Id')

# One-hot encode train and test together. Encoding them separately with
# drop_first=True lets each frame drop a different baseline level whenever a
# category occurs in only one of them, so identically named dummy columns
# would no longer mean the same thing in both frames.
# The concat keys let each frame be recovered with its original row index.
combined = pd.get_dummies(
    pd.concat([train, test], keys=['train', 'test']),
    drop_first=True,
)
train = combined.loc['train']
test = combined.loc['test']

# Model Training and Evaluation

In [None]:
# Give train and test the same columns. The outer join keeps the union of
# both column sets; a column missing from one frame is created there and
# filled with 0.
train, test = train.align(test, axis=1, join='outer', fill_value=0)

X = train

# Hold out 20% of the training rows for evaluation. X_test / y_test are this
# hold-out split, not the competition test set stored in `test`.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=101)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Candidate models, keyed by the name used in `params` and in the reports.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'RandomForest': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
    'CatBoost': CatBoostRegressor(verbose=0),
    # The stacking ensemble combines fresh, default-configured copies of every
    # base model above through a linear meta-learner.
    'Stacking': StackingRegressor(
        estimators=[
            ('LinearRegression', LinearRegression()),
            ('Ridge', Ridge()),
            ('Lasso', Lasso()),
            ('ElasticNet', ElasticNet()),
            ('RandomForest', RandomForestRegressor()),
            ('XGBRegressor', XGBRegressor()),
            ('CatBoost', CatBoostRegressor(verbose=0)),
        ],
        final_estimator=LinearRegression(),
    ),
}

# Hyperparameter grids for GridSearchCV; an empty grid evaluates the model
# with its constructor defaults only.
params = {
    'LinearRegression': {},
    'Ridge': {'alpha': [0.1, 1.0, 10.0]},
    'Lasso': {'alpha': [0.01, 0.1, 1.0]},
    'ElasticNet': {'alpha': [0.01, 0.1, 1.0], 'l1_ratio': [0.1, 0.5, 0.9]},
    # 1.0 (use all features) replaces 'auto', which meant the same thing for
    # regressors but was removed in scikit-learn 1.3; with 'auto' those grid
    # candidates fail to fit and are scored as NaN.
    'RandomForest': {'n_estimators': [100, 200], 'max_features': [1.0, 'sqrt', 'log2']},
    'XGBRegressor': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]},
    'CatBoost': {'iterations': [100, 200], 'learning_rate': [0.01, 0.1]},
    'Stacking': {},  # The stacking model is fitted without a grid search
}

# Train and Evaluate Models

In [None]:
# Fit every candidate on the training split and score it on the hold-out
# split. `results` maps model name -> metrics (plus the winning grid
# parameters for the grid-searched models).
results = {}
for model_name, model in models.items():
    param = params[model_name]

    if model_name == 'Stacking':
        # The stacking ensemble is fitted directly, without a grid search.
        model.fit(X_train, y_train)
        fitted = model
        entry = {}
    else:
        grid = GridSearchCV(model, param, cv=5, scoring='neg_mean_squared_error')
        grid.fit(X_train, y_train)
        fitted = grid.best_estimator_
        entry = {'best_params': grid.best_params_}

    y_pred = fitted.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    entry.update(mse=mse, mae=mae, rmse=rmse, r2=r2)
    results[model_name] = entry

    print(f"{model_name} Mean Absolute Error (MAE): {mae:.2f}")
    print(f"{model_name} Mean Squared Error (MSE): {mse:.2f}")
    print(f"{model_name} Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"{model_name} R-squared (R2): {r2:.2f}")

# Select the Best Model

In [None]:
# Pick the model with the lowest hold-out MSE.
best_model_name = min(results, key=lambda name: results[name]['mse'])
best_model = models[best_model_name]

# GridSearchCV fits clones, so the estimator stored in `models` still has its
# constructor defaults. Apply the winning grid parameters so that the model
# refitted later matches the one whose scores were reported. The stacking
# entry has no 'best_params' and is left unchanged.
tuned_params = results[best_model_name].get('best_params')
if tuned_params:
    best_model.set_params(**tuned_params)

In [None]:
# Report the winning model, its tuned parameters (if any) and its metrics.
best_result = results[best_model_name]

print(f"\nBest Model: {best_model_name}")

if best_model_name == 'Stacking':
    print("No hyperparameters to tune for the Stacking model.")
else:
    print(f"Best Parameters: {best_result['best_params']}")

for label, metric_key in (('MAE', 'mae'), ('MSE', 'mse'), ('RMSE', 'rmse'), ('R2', 'r2')):
    print(f"Best {label}: {best_result[metric_key]:.2f}")

# Train the Best Model on the Entire Data

In [None]:
# Refit the selected model on all training rows, then predict the
# competition test set. fit() returns the estimator, so the calls chain.
test_predictions = best_model.fit(X, y).predict(test)

# Save Predictions

In [None]:
# Pair each test Id with its predicted price and write the submission file
# without the DataFrame index.
submission_columns = {'Id': test_ids, 'SalePrice': test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission.
submission.head()