In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

# Raise pandas' display limits so wide/long frames are shown in full.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

In [18]:
# Read the training and test splits from the local data/ directory.
train_path, test_path = "data/train.csv", "data/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [19]:
# Keep the test identifiers (they are written to the submission file at the
# end), then remove the "Id" column from both frames.
test_id = test_df["Id"]

train_df, test_df = (frame.drop(columns=["Id"]) for frame in (train_df, test_df))

In [20]:
# Split the training columns by dtype in a single pass: object-dtype columns
# are treated as categorical, every other column as numeric. "Id" and
# "SalePrice" are never listed as numeric features.
cat_cols, num_cols = [], []
for col in train_df.columns:
    if train_df[col].dtype == "O":
        cat_cols.append(col)
    elif col not in ("Id", "SalePrice"):
        num_cols.append(col)

In [21]:
# Categorical columns that have at least one missing value in the training
# frame; handle_none_missing_value() fills these with the string "None".
none_columns = [col for col in cat_cols if train_df[col].isnull().any()]
def handle_none_missing_value(dataframe):
    """Fill missing values in the ``none_columns`` columns with the string "None".

    ``none_columns`` is the module-level list of categorical columns that have
    missing values in the training frame.

    Parameters:
    - dataframe: pandas DataFrame containing every column in ``none_columns``;
      it is modified in place.

    Returns:
    - the same DataFrame object, with the listed columns filled.

    Raises:
    - KeyError if a column in ``none_columns`` is absent from ``dataframe``.
    """
    for col in none_columns:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the dataframe[col] selection: the in-place
        # form is chained assignment and does not update the parent frame when
        # pandas Copy-on-Write is active.
        dataframe[col] = dataframe[col].fillna("None")

    return dataframe

In [22]:
def fill_lot_frontage_by_group(dataframe, group_col):
    """Impute missing ``LotFrontage`` values with the mean of their group.

    Rows are grouped by ``group_col``; inside each group, missing
    ``LotFrontage`` entries are replaced by that group's mean. A group with no
    observed value has a NaN mean, so its entries stay missing.

    Parameters:
    - dataframe: pandas DataFrame with ``LotFrontage`` and ``group_col``
      columns; it is modified in place.
    - group_col: name of the column to group by.

    Returns:
    - the same DataFrame object with ``LotFrontage`` overwritten.
    """
    def _fill_with_group_mean(group_values):
        return group_values.fillna(group_values.mean())

    frontage_by_group = dataframe.groupby(group_col)['LotFrontage']
    dataframe['LotFrontage'] = frontage_by_group.transform(_fill_with_group_mean)

    return dataframe

In [23]:
def fill_cols_with_zero(dataframe):
    """Replace missing values with 0 in every ``num_cols`` column that has any.

    ``num_cols`` is the module-level list of numeric feature columns.

    Parameters:
    - dataframe: pandas DataFrame containing every column in ``num_cols``;
      it is modified in place.

    Returns:
    - the same DataFrame object, with the numeric gaps filled.

    Raises:
    - KeyError if a column in ``num_cols`` is absent from ``dataframe``.
    """
    zero_columns = [col for col in num_cols if dataframe[col].isnull().any()]

    for col in zero_columns:
        # Assign back rather than fillna(inplace=True) on a column selection:
        # the chained in-place form does not update the parent frame when
        # pandas Copy-on-Write is active.
        dataframe[col] = dataframe[col].fillna(0)

    return dataframe

In [24]:
def fill_cols_with_mode(dataframe):
    """Fill the remaining categorical gaps with each column's most frequent value.

    Only columns in the module-level ``cat_cols`` list that are not in
    ``none_columns`` and that currently contain missing values are touched.

    Parameters:
    - dataframe: pandas DataFrame containing the categorical columns; it is
      modified in place.

    Returns:
    - the same DataFrame object, with the gaps filled.
    """
    remaining_cols = [
        col
        for col in cat_cols
        if col not in none_columns and dataframe[col].isnull().any()
    ]

    for col in remaining_cols:
        modes = dataframe[col].mode()
        if modes.empty:
            # An entirely missing column has no mode; leave it untouched
            # instead of failing on modes[0].
            continue
        # Assign back rather than fillna(inplace=True) on a column selection:
        # the chained in-place form does not update the parent frame when
        # pandas Copy-on-Write is active.
        dataframe[col] = dataframe[col].fillna(modes.iloc[0])

    return dataframe

In [25]:
def add_features(dataframe):
    """Add the engineered ``New_*`` feature columns to ``dataframe``.

    Columns created:
    - New_Age: YrSold - YearRemodAdd
    - New_HouseTotalSf: TotalBsmtSF + 1stFlrSF + 2ndFlrSF
    - New_TotalBath: full baths plus half baths weighted by 0.5
      (basement and above ground)
    - New_OverallQualCond: OverallQual * OverallCond
    - New_FrontagePerArea: LotFrontage / LotArea

    Parameters:
    - dataframe: pandas DataFrame holding the source columns named above;
      it is modified in place.

    Returns:
    - the same DataFrame object with the new columns added.

    Raises:
    - KeyError if a source column is missing.
    """
    # Both operands must come from the frame being processed. Reading
    # YearRemodAdd from the global training frame would pair test rows with
    # unrelated training rows through index alignment.
    dataframe["New_Age"] = dataframe["YrSold"] - dataframe["YearRemodAdd"]
    dataframe["New_HouseTotalSf"] = (
        dataframe["TotalBsmtSF"] + dataframe["1stFlrSF"] + dataframe["2ndFlrSF"]
    )
    dataframe["New_TotalBath"] = (
        dataframe["BsmtFullBath"]
        + (0.5 * dataframe["BsmtHalfBath"])
        + dataframe["FullBath"]
        + (0.5 * dataframe["HalfBath"])
    )
    dataframe["New_OverallQualCond"] = (
        dataframe["OverallQual"] * dataframe["OverallCond"]
    )
    dataframe["New_FrontagePerArea"] = dataframe["LotFrontage"] / dataframe["LotArea"]

    return dataframe

In [26]:
# Training frame: "None" for the flagged categorical gaps, group-mean
# LotFrontage by Neighborhood, then 0 for the remaining numeric gaps.
train_df = (
    train_df
    .pipe(handle_none_missing_value)
    .pipe(fill_lot_frontage_by_group, "Neighborhood")
    .pipe(fill_cols_with_zero)
)

# Test frame: the same three steps, followed by a mode fill for the
# categorical columns that are not covered by the "None" fill.
test_df = (
    test_df
    .pipe(handle_none_missing_value)
    .pipe(fill_lot_frontage_by_group, "Neighborhood")
    .pipe(fill_cols_with_zero)
    .pipe(fill_cols_with_mode)
)

In [27]:
# Add the engineered New_* columns to the training frame first, then to the
# test frame.
train_df = train_df.pipe(add_features)
test_df = test_df.pipe(add_features)

In [28]:
# Outliers: remove training rows with a very large living area (> 4000) that
# nevertheless sold for less than 300000.
large_but_cheap = (train_df["GrLivArea"] > 4000) & (train_df["SalePrice"] < 300000)
outlier_index = train_df[large_but_cheap].index
train_df = train_df.drop(outlier_index)

In [29]:
def outlier_threshold(dataframe, col, q1=0.05, q3=0.95):
    """Compute quantile-based lower and upper outlier limits for a column.

    The limits lie 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile.

    Parameters:
    - dataframe: pandas DataFrame holding the column.
    - col: name of the column to analyse.
    - q1: lower quantile (default 0.05).
    - q3: upper quantile (default 0.95).

    Returns:
    - (lower_limit, upper_limit) tuple.
    """
    column_values = dataframe[col]
    low_quantile = column_values.quantile(q1)
    high_quantile = column_values.quantile(q3)
    spread = high_quantile - low_quantile

    return low_quantile - spread*1.5, high_quantile + spread*1.5

In [30]:
def check_outlier(dataframe, col_name, q1=0.05, q3=0.95):
    """Report whether ``col_name`` has any value outside its outlier limits.

    The limits come from ``outlier_threshold(dataframe, col_name, q1, q3)``.

    Parameters:
    - dataframe: pandas DataFrame holding the column.
    - col_name: name of the column to test.
    - q1: lower quantile passed to ``outlier_threshold`` (default 0.05).
    - q3: upper quantile passed to ``outlier_threshold`` (default 0.95).

    Returns:
    - True if at least one row lies below the lower or above the upper limit,
      otherwise False.
    """
    lower_limit, upper_limit = outlier_threshold(dataframe, col_name, q1, q3)
    values = dataframe[col_name]
    outside_limits = (values > upper_limit) | (values < lower_limit)
    # Test the mask itself, not the truthiness of the filtered rows' values:
    # an outlier row made only of 0/False cells would otherwise be reported as
    # "no outlier".
    return bool(outside_limits.any())

In [31]:
def replace_with_thresholds(dataframe, col, q1=0.05, q3=0.95):
    """Cap the values of ``col`` at its outlier limits.

    The limits come from ``outlier_threshold(dataframe, col, q1, q3)``. Values
    below the lower limit are raised to it, values above the upper limit are
    lowered to it, and missing values are left as they are.

    Parameters:
    - dataframe: pandas DataFrame holding the column; it is modified in place.
    - col: name of the column to cap.
    - q1: lower quantile passed to ``outlier_threshold`` (default 0.05).
    - q3: upper quantile passed to ``outlier_threshold`` (default 0.95).

    Returns:
    - the same DataFrame object with ``col`` capped.
    """
    lower_limit, upper_limit = outlier_threshold(dataframe, col, q1, q3)
    # clip() builds a new column with a suitable dtype, so float limits can be
    # applied to an integer column without writing incompatible values into it
    # element by element.
    dataframe[col] = dataframe[col].clip(lower=lower_limit, upper=upper_limit)
    return dataframe

In [32]:
# Encoding ordinal variables
def apply_mappings(dataframe, mapping_dict):
    """
    Replace the values of selected columns using per-column lookup tables.

    Parameters:
    - dataframe: pandas DataFrame to encode; it is modified in place
    - mapping_dict: dictionary with column names as keys and their respective
      mapping dictionaries as values; keys that are not columns of
      ``dataframe`` are skipped

    Returns:
    - dataframe: the same DataFrame with the mappings applied; values that
      have no entry in a column's mapping become NaN
    """
    columns_to_encode = [name for name in mapping_dict if name in dataframe.columns]

    for name in columns_to_encode:
        lookup = mapping_dict[name]
        dataframe[name] = dataframe[name].map(lookup)

    return dataframe

# Ordinal encodings per column. Most quality/condition columns share the same
# Ex > Gd > TA > Fa > Po scale; columns whose gaps were filled with the string
# "None" additionally map 'None' to 0.
_quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
_quality_scale_with_none = {**_quality_scale, 'None': 0}

mapping_dict = {
    'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'None': 0},
    'Fence': {'GdPrv': 4, 'GdWo': 3, 'MnPrv': 2, 'MnWw': 1, 'None': 0},
    'GarageQual': dict(_quality_scale_with_none),
    'GarageCond': dict(_quality_scale_with_none),
    'FireplaceQu': dict(_quality_scale_with_none),
    'KitchenQual': dict(_quality_scale),
    'HeatingQC': dict(_quality_scale),
    'BsmtCond': dict(_quality_scale_with_none),
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0},
    'BsmtQual': dict(_quality_scale_with_none),
    'ExterQual': dict(_quality_scale),
    'ExterCond': dict(_quality_scale),
}

# Encode the ordinal columns of both frames with the same lookup tables
# (training frame first, then test frame).
train_df, test_df = (
    apply_mappings(frame, mapping_dict) for frame in (train_df, test_df)
)

In [33]:
# Columns encoded through the ordinal mappings; every other categorical column
# is one-hot encoded.
ordinal_col = [
    'PoolQC',
    'Fence',
    'GarageQual',
    'GarageCond',
    'FireplaceQu',
    'KitchenQual',
    'HeatingQC',
    'BsmtCond',
    'BsmtExposure',
    'BsmtQual',
    'ExterQual',
    'ExterCond',
]
_ordinal_lookup = set(ordinal_col)
remaining_cat_cols = [col for col in cat_cols if col not in _ordinal_lookup]

In [34]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """One-hot encode the given columns with ``pd.get_dummies``.

    Parameters:
    - dataframe: pandas DataFrame to encode; it is not modified.
    - categorical_cols: list of column names to expand into dummy columns.
    - drop_first: forwarded to ``pd.get_dummies``; when True the first level
      of each encoded column gets no dummy column.

    Returns:
    - a new DataFrame in which the listed columns are replaced by their
      dummy columns.
    """
    return pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
    )

In [35]:
# Stack the training rows on top of the test rows (with a fresh 0..n-1 index)
# and one-hot encode them together, so both splits end up with the same dummy
# columns.
stacked_frames = [train_df, test_df]
combined_df = one_hot_encoder(
    pd.concat(stacked_frames, axis=0, ignore_index=True),
    remaining_cat_cols,
    drop_first=True,
)

In [36]:
# Split the encoded frame back by position: the first len(train_df) rows are
# the training rows, the rest are the test rows.
n_train_rows = len(train_df)
train_df = combined_df.iloc[:n_train_rows]
test_df = combined_df.iloc[n_train_rows:]

In [37]:
# Model: separate the target from the features, then set up 5-fold shuffled
# cross-validation, the XGBoost regressor and the hyper-parameter grid.
target_col = 'SalePrice'
y_train = train_df[target_col]
X_train = train_df.drop(columns=[target_col])

kf = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

param_grid = dict(
    n_estimators=[200, 300, 500],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
)

In [38]:
# Exhaustive search over param_grid, scored by negative mean absolute error on
# the kf folds, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=kf,
    verbose=1,
    n_jobs=-1,
)

In [39]:
# Run the search on the training data and report the winning parameter set.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(best_params)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.5}


In [40]:
# Estimator selected by the grid search; used below to predict on the test set.
best_model = grid_search.best_estimator_

In [41]:
# Test features: the test rows without the SalePrice column. drop() returns a
# new frame, so test_df itself is left unchanged.
X_test = test_df.drop(columns=["SalePrice"])

In [42]:
# Predict SalePrice for the test features with the selected estimator.
y_pred = best_model.predict(X_test)

In [43]:
# Pair each saved test Id with its predicted SalePrice and write the result,
# without the index, to submission.csv.
submission_columns = {'Id': test_id, 'SalePrice': y_pred}
submission_df = pd.DataFrame(submission_columns)

submission_df.to_csv('submission.csv', index=False)