In [736]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings

# Install a blanket "ignore" filter: every warning issued after this point is hidden,
# including diagnostics from pandas, scikit-learn and XGBoost.
warnings.filterwarnings("ignore")

In [737]:
# Kaggle-hosted locations of the same files (disabled):
# train_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
# test_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"

# Active locations: CSV files under ./data, relative to the working directory.
train_file_path = "data/train.csv"
test_file_path = "data/test.csv"

In [738]:
# `data` is the training table (it contains the SalePrice target used below);
# `test` is the table the final predictions are made for.
data = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

Calculate the limits for each variable

In [739]:
def remove_outliers(data, column, m=3):
    """Return the limits lying ``m`` standard deviations around the mean of a column.

    Despite its name, this helper does not remove anything: it only computes
    the bounds, and the caller is responsible for filtering with them.

    Args:
        data: Table (e.g. a DataFrame) that can be indexed by ``column``.
        column: Name of the column to summarise.
        m: Half-width of the accepted band, in standard deviations.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.
    """
    values = data[column]
    center = np.mean(values)
    half_width = m * np.std(values)
    return center - half_width, center + half_width

Compute the quartiles and interquartile range (IQR) of the target column (SalePrice), used below to remove outliers

In [740]:
# First and third quartile of the sale price, taken in a single quantile call,
# and the interquartile range between them.
Q1, Q3 = data['SalePrice'].quantile([0.25, 0.75])
IQR = Q3 - Q1

Set the outlier thresholds at 0.3 × IQR beyond the quartiles and remove the rows that fall outside them

In [741]:
# Accepted SalePrice band: the interquartile range widened by 0.3 * IQR on each side.
# NOTE(review): 0.3 is far tighter than the conventional 1.5 * IQR fence - confirm it is intended.
lower_bound, upper_bound = Q1 - 0.3 * IQR, Q3 + 0.3 * IQR

# Keep only the rows whose price lies inside the band (both ends inclusive).
data = data.loc[data['SalePrice'].between(lower_bound, upper_bound)]

Define the variable columns of interest and set the standard deviation threshold to 3

In [742]:
# Columns that are filtered, one after another, by the mean +/- m * std rule.
columns = ["YearBuilt", "GarageYrBlt", "GrLivArea", "GarageArea", "OverallQual", "SalePrice"]

# Half-width of the accepted band, in standard deviations.
m = 3

Calculate limits for each variable and remove outliers

In [743]:
# Filter sequentially: the limits of each column are computed on the rows that
# survived the previous columns, so the order of `columns` affects the result.
for column in columns:
    lower_bound, upper_bound = remove_outliers(data, column, m)
    # Inclusive band check; a NaN in `column` fails the check, so such rows are dropped too.
    data = data.loc[data[column].between(lower_bound, upper_bound)]

Reset the index of the data

In [744]:
# Renumber the remaining rows 0..n-1 and discard the old index labels.
data = data.reset_index(drop=True)

# Feature Engineering

Create the label encoder object and apply it

In [745]:
# A single encoder instance; note that every fit on it replaces the classes learned before.
Label_pre = LabelEncoder()
# Columns of the training frame whose dtype is neither integer nor float.
data_cols = data.select_dtypes(exclude=['int', 'float']).columns
label_col = list(data_cols)

In [746]:
# Label encoding is deliberately NOT applied to `data` here.
#
# `map_values` (applied to both `data` and `test` further down) looks every
# category up by its original string label, e.g. 'RL' or 'Gd'. Overwriting the
# columns in `label_col` with LabelEncoder integers first would make each of
# those lookups miss, turning the mapped training columns into NaN. The encoding
# would also exist only for `data`: `test` is never label-encoded, so the two
# frames would disagree. The categorical columns therefore stay as raw strings
# until `map_values` converts them.

Map categorical values to numeric codes

In [747]:
def map_values(dataset):
    """Replace the categorical house-attribute columns of ``dataset`` with integer codes.

    Every column named in the lookup table below is translated exactly once,
    from its raw label to an integer code:

    * Values are compared as strings, so numeric labels (``MSSubClass`` holds
      numbers such as 60) match their string keys.
    * Missing values are looked up under the ``'None'`` key, which many of the
      tables define for "feature absent".
    * Any label without an entry in its column's table becomes NaN.

    Several columns used to be mapped twice in a row; the second lookup then ran
    on already-encoded integers and erased the column.  Only one table per
    column is kept now: the later, complete enumeration, so that every quality
    column shares the same ``Ex`` = 1 ... ``Po`` = 5 scale.

    Args:
        dataset: DataFrame containing every column named in the table, holding
            raw (not previously encoded) labels.  It is modified in place.

    Returns:
        The same ``dataset`` object, for call chaining.
    """
    codes_by_column = {
        "MSSubClass": {'180': 1,
                       '30': 2, '45': 2,
                       '190': 3, '50': 3, '90': 3,
                       '85': 4, '40': 4, '160': 4,
                       '70': 5, '20': 5, '75': 5, '80': 5, '150': 5,
                       '120': 6, '60': 6},
        "MSZoning": {'C (all)': 1, 'RH': 2, 'RM': 2, 'RL': 3, 'FV': 4},
        "Neighborhood": {'MeadowV': 1,
                         'IDOTRR': 2, 'BrDale': 2,
                         'OldTown': 3, 'Edwards': 3, 'BrkSide': 3,
                         'Sawyer': 4, 'Blueste': 4, 'SWISU': 4, 'NAmes': 4,
                         'NPkVill': 5, 'Mitchel': 5,
                         'SawyerW': 6, 'Gilbert': 6, 'NWAmes': 6,
                         'Blmngtn': 7, 'CollgCr': 7, 'ClearCr': 7, 'Crawfor': 7,
                         'Veenker': 8, 'Somerst': 8, 'Timber': 8,
                         'StoneBr': 9,
                         'NoRidge': 10, 'NridgHt': 10},
        "BldgType": {'2fmCon': 1, 'Duplex': 1, 'Twnhs': 1, '1Fam': 2, 'TwnhsE': 2},
        "HouseStyle": {'1.5Unf': 1,
                       '1.5Fin': 2, '2.5Unf': 2, 'SFoyer': 2,
                       '1Story': 3, 'SLvl': 3,
                       '2Story': 4, '2.5Fin': 4},
        "Foundation": {'Slab': 1,
                       'BrkTil': 2, 'CBlock': 2, 'Stone': 2,
                       'Wood': 3, 'PConc': 4},
        "BsmtExposure": {'None': 1, 'No': 2, 'Av': 3, 'Mn': 3, 'Gd': 4},
        "Heating": {'Floor': 1, 'Grav': 1, 'Wall': 2, 'OthW': 3, 'GasW': 4, 'GasA': 5},
        "Street": {'Grvl': 1, 'Pave': 2},
        "Alley": {'None': 1, 'Grvl': 2, 'Pave': 3},
        "LotShape": {'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4},
        "LandContour": {'Bnk': 1, 'Lvl': 2, 'Low': 3, 'HLS': 4},
        "Utilities": {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4},
        "LotConfig": {'Inside': 1, 'Corner': 2, 'FR2': 3, 'FR3': 4, 'CulDSac': 5},
        "LandSlope": {'Sev': 1, 'Mod': 2, 'Gtl': 3},
        "Condition1": {'Artery': 1, 'Feedr': 2, 'RRAe': 3, 'Norm': 4, 'RRAn': 5,
                       'PosN': 6, 'PosA': 7, 'RRNe': 8, 'RRNn': 9},
        "Condition2": {'Artery': 1, 'Feedr': 2, 'RRAe': 3, 'Norm': 4, 'RRAn': 5,
                       'PosN': 6, 'PosA': 7, 'RRNe': 8, 'RRNn': 9},
        "RoofStyle": {'Flat': 1, 'Gable': 2, 'Gambrel': 3, 'Hip': 4, 'Mansard': 5, 'Shed': 6},
        "RoofMatl": {'ClyTile': 1, 'CompShg': 2, 'Membran': 3, 'Metal': 4, 'Roll': 5,
                     'Tar&Grv': 6, 'WdShake': 7, 'WdShngl': 8},
        "Exterior1st": {'AsbShng': 1, 'AsphShn': 2, 'BrkComm': 3, 'BrkFace': 4, 'CBlock': 5,
                        'CemntBd': 6, 'HdBoard': 7, 'ImStucc': 8, 'MetalSd': 9, 'Other': 10,
                        'Plywood': 11, 'PreCast': 12, 'Stone': 13, 'Stucco': 14, 'VinylSd': 15,
                        'Wd Sdng': 16, 'WdShing': 17},
        "Exterior2nd": {'AsbShng': 1, 'AsphShn': 2, 'Brk Cmn': 3, 'BrkFace': 4, 'CBlock': 5,
                        'CmentBd': 6, 'HdBoard': 7, 'ImStucc': 8, 'MetalSd': 9, 'Other': 10,
                        'Plywood': 11, 'PreCast': 12, 'Stone': 13, 'Stucco': 14, 'VinylSd': 15,
                        'Wd Sdng': 16, 'Wd Shng': 17},
        "MasVnrType": {'BrkCmn': 1, 'BrkFace': 2, 'CBlock': 3, 'None': 4, 'Stone': 5},
        "ExterQual": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5},
        "ExterCond": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5},
        "BsmtQual": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, 'None': 6},
        "BsmtCond": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, 'None': 6},
        "BsmtFinType1": {'GLQ': 1, 'ALQ': 2, 'BLQ': 3, 'Rec': 4, 'LwQ': 5, 'Unf': 6, 'None': 7},
        "BsmtFinType2": {'GLQ': 1, 'ALQ': 2, 'BLQ': 3, 'Rec': 4, 'LwQ': 5, 'Unf': 6, 'None': 7},
        "HeatingQC": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5},
        "CentralAir": {'N': 0, 'Y': 1},
        "Electrical": {'SBrkr': 1, 'FuseA': 2, 'FuseF': 3, 'FuseP': 4, 'Mix': 5},
        "KitchenQual": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5},
        "Functional": {'Typ': 1, 'Min1': 2, 'Min2': 3, 'Mod': 4, 'Maj1': 5, 'Maj2': 6,
                       'Sev': 7, 'Sal': 8},
        "FireplaceQu": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, 'None': 6},
        "GarageType": {'2Types': 1, 'Attchd': 2, 'Basment': 3, 'BuiltIn': 4, 'CarPort': 5,
                       'Detchd': 6, 'None': 7},
        "GarageFinish": {'Fin': 1, 'RFn': 2, 'Unf': 3, 'None': 4},
        "GarageQual": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, 'None': 6},
        "GarageCond": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, 'None': 6},
        "PavedDrive": {'Y': 1, 'P': 2, 'N': 3},
        "PoolQC": {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'None': 5},
        "Fence": {'GdPrv': 1, 'MnPrv': 2, 'GdWo': 3, 'MnWw': 4, 'None': 5},
        "MiscFeature": {'Elev': 1, 'Gar2': 2, 'Othr': 3, 'Shed': 4, 'TenC': 5, 'None': 6},
        "SaleType": {'WD': 1, 'CWD': 2, 'VWD': 3, 'New': 4, 'COD': 5, 'Con': 6, 'ConLw': 7,
                     'ConLI': 8, 'ConLD': 9, 'Oth': 10},
        "SaleCondition": {'Normal': 1, 'Abnorml': 2, 'AdjLand': 3, 'Alloca': 4, 'Family': 5,
                          'Partial': 6},
    }

    def _encode(value, codes):
        # Normalise one raw cell to the string form used by the table keys, then look it up.
        if pd.isna(value):
            label = 'None'  # missing value -> the table's "feature absent" entry, if any
        elif isinstance(value, float) and value.is_integer():
            label = str(int(value))  # 60.0 -> '60', so float-typed numeric labels still match
        else:
            label = str(value)
        return codes.get(label, np.nan)

    for column, codes in codes_by_column.items():
        # `codes=codes` binds the current table to this column's lookup function.
        dataset[column] = dataset[column].map(lambda value, codes=codes: _encode(value, codes))

    return dataset


# Apply the same category-to-number mapping to the training and the test table.
data = map_values(data)
test = map_values(test)

# Model training

Selecting the columns for model training

In [748]:
# The regression target, and every remaining column of `data` as the model inputs.
target = 'SalePrice'
labels = data[target]
features = data.drop(columns=target)

Separate Numeric and Categorical Columns

In [749]:
# Partition the feature columns by dtype: numeric ones versus everything else.
numeric_features = list(features.select_dtypes(include='number').columns)
categorical_features = list(features.select_dtypes(exclude='number').columns)

## Creating Pipelines

In [750]:
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Remaining columns: fill gaps with the most frequent value, then one-hot encode;
# categories not seen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

Splitting the data into training and testing sets

In [751]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
# Fit the preprocessing on the training part only, then apply the fitted transform to the hold-out part.
# From here on X_train / X_test hold the transformed matrices, not the original frames.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Using Regression Models

Using XGBRegressor model

In [None]:
# Hyper-parameter grid: 3 * 3 * 3 * 2 = 54 combinations, each fitted on 3 folds.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'colsample_bytree': [0.3, 0.7]
}

# Early stopping is configured on the estimator: current XGBoost releases no longer
# accept `early_stopping_rounds` as a fit() argument. The validation data itself is
# still supplied through fit() and forwarded by GridSearchCV to every fit.
grid_search = GridSearchCV(estimator=XGBRegressor(early_stopping_rounds=10), param_grid=param_grid, cv=3,
                           scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
# NOTE(review): the hold-out split doubles as the early-stopping validation set, so
# X_test / y_test influence the fitted models and are not an unbiased evaluation set.
grid_search.fit(X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=True)
# Model refitted on the whole training split with the best parameter combination.
best_model = grid_search.best_estimator_

In [753]:
# model = XGBRegressor()
# model.fit(X_train, y_train)
# predictions = best_model.predict(X_test)

In [754]:
# sample_submission = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
# Template file whose SalePrice column is overwritten with the predictions below.
sample_submission = pd.read_csv('data/sample_submission.csv')

# Reuse the preprocessor already fitted on the training split (transform only, no refit).
test_preprocessed = preprocessor.transform(test)

test_predictions = best_model.predict(test_preprocessed)

# Positional assignment: assumes the template rows are in the same order as `test` - TODO confirm.
sample_submission['SalePrice'] = test_predictions

sample_submission.to_csv('submission.csv', index=False)

1459 1459
