In [98]:
# For data manipulation and analysis
import pandas as pd

# For numerical operations
import numpy as np

# For creating visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# For regular expressions
import re

# For statistical functions
import scipy.stats
# Algorithms
from sklearn.neighbors import KNeighborsRegressor

In [99]:
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# For scikit-learn features
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score

In [100]:
import optuna

# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

In [101]:
df_train = pd.read_csv("Data/train.csv")
df_test = pd.read_csv("Data/test.csv")

In [102]:
df_train.shape

(1460, 81)

In [103]:
# Comparing two dataframes to find any missing column
def compare_columns(df1, df2):
    return set(df1.columns) ^ set(df2.columns)

In [104]:
# Defining new columns and variables
target = df_train["SalePrice"]
test_id = df_test["Id"]

In [105]:
# Dropping unnecessary columns
df_train.drop(["Id","SalePrice"], axis = 1, inplace = True) 
df_test.drop(["Id"], axis = 1, inplace = True)

In [106]:
# Merging datasets
df_1 = pd.concat([df_train, df_test], ignore_index = True)
df_1.shape

(2919, 79)

In [107]:
df_2 = df_1.copy()

In [108]:
# Originally numerical --> coverted to categorical
df_2["MSSubClass"] = df_2["MSSubClass"].astype(str)

In [109]:
# Filling missing value using a constant value - "None"
for column in ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]:
    #df_2[column] = df_2[column].fillna("None")
    df_2.drop(columns=[column], inplace=True)
    
# Filling missing values by using the most frequent value in each column
for column in ["MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "MasVnrType", "Electrical", "KitchenQual", "Functional", "SaleType"]:
    df_2[column] = df_2[column].fillna(df_2[column].mode()[0])

In [110]:
df_3 = df_2.copy()

In [111]:
def knn_impute(df, na_target):
    df = df.copy()
    
    # Selecting only numeric columns
    df_numeric = df.select_dtypes(np.number)
    
    # Identifying non-missing columns
    non_na_columns = df_numeric.loc[:, df_numeric.isna().sum() == 0].columns
    
    # Splitting data into train and test sets
    train_y = df_numeric.loc[df_numeric[na_target].isna() == False, na_target] 
    train_X = df_numeric.loc[df_numeric[na_target].isna() == False, non_na_columns]
    test_X = df_numeric.loc[df_numeric[na_target].isna() == True, non_na_columns]
    
    # KNN model
    knn = KNeighborsRegressor()
    knn.fit(train_X, train_y)
    
    # Predicting missing values
    y_predict = knn.predict(test_X)
    
    # Filling missing values
    df.loc[df[na_target].isna() == True, na_target] = y_predict

    return df

In [112]:
for column in ["LotFrontage","MasVnrArea","BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","BsmtFullBath","BsmtHalfBath","GarageYrBlt","GarageCars","GarageArea"]:
    df_3 = knn_impute(df_3, column)

In [113]:
df_4 = df_3.copy()


In [114]:
# Square feet per room
#df_4["SqFtPerRoom"] = df_4["GrLivArea"] / (df_4["TotRmsAbvGrd"] + df_4["FullBath"] + df_4["HalfBath"] + df_4["KitchenAbvGr"])

# Total home quality
df_4["Total_Home_Quality"] = df_4["OverallQual"] + df_4["OverallCond"]

# Total number of bathrooms
df_4["Total_Bathrooms"] = (df_4["FullBath"] + (0.5 * df_4["HalfBath"]) + df_4["BsmtFullBath"] + (0.5 * df_4["BsmtHalfBath"]))

In [115]:
df_5 = df_4.copy()

In [116]:
# Selecting numerical columns and creating a dataframe
df_skew = pd.DataFrame(df_5.select_dtypes(np.number).columns, columns = ["Feature"])

# Calculating skewness
df_skew["Skew"] = df_skew["Feature"].apply(lambda feature: scipy.stats.skew(df_5[feature]))

# Calculating absolute skewness
df_skew["Absolute Skew"] = df_skew["Skew"].apply(abs)

# Flagging highly skewed features
df_skew["Skewed"] = df_skew["Absolute Skew"].apply(lambda x: True if x >= 0.5 else False)

# Display
df_skew[df_skew["Skewed"] == True]

Unnamed: 0,Feature,Skew,Absolute Skew,Skewed
0,LotFrontage,1.340751,1.340751,True
1,LotArea,12.822431,12.822431,True
3,OverallCond,0.570312,0.570312,True
4,YearBuilt,-0.599806,0.599806,True
6,MasVnrArea,2.603682,2.603682,True
7,BsmtFinSF1,1.425516,1.425516,True
8,BsmtFinSF2,4.146111,4.146111,True
9,BsmtUnfSF,0.919322,0.919322,True
10,TotalBsmtSF,1.162806,1.162806,True
11,1stFlrSF,1.469604,1.469604,True


In [None]:
# elemenate outliers using log:
for column in df_skew.query("Skewed == True")["Feature"].values:
    df_5[column] = np.log1p(df_5[column])

In [None]:
# To make the MoSold column between -1 and 1:
df_5["MoSold"] = (-np.cos(0.5236 * df_5["MoSold"]))

In [119]:
df_6 = df_5.copy()


In [120]:
df_6 = pd.get_dummies(df_6)

In [121]:
df_7 = df_6.copy()


In [122]:
scaler = StandardScaler()
scaler.fit(df_7)

df_7 = pd.DataFrame(scaler.transform(df_7), index=df_7.index, columns=df_7.columns)

In [123]:
df_8 = df_7.copy()


In [124]:
# Target with log transformation
log_target = np.log(target)
log_target

0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
          ...    
1455    12.072541
1456    12.254863
1457    12.493130
1458    11.864462
1459    11.901583
Name: SalePrice, Length: 1460, dtype: float64

In [125]:
train_set = df_8.loc[: df_train.index.max() , : ].copy()
test_set = df_8.loc[df_train.index.max() + 1: , : ].reset_index(drop = True).copy()

In [126]:
train_set.shape

(1460, 243)

In [127]:
# Merge train set and log target
df_8 = pd.concat([train_set, log_target.rename("log_target")], axis = 1)

In [128]:
# Define X and y
X = df_8.iloc[: , : 319]
y = df_8["log_target"]

In [167]:
from lightgbm import LGBMRegressor

models = {
    "CatBoost": CatBoostRegressor(
        n_estimators=860,
        learning_rate=0.02,
        depth=6,
        l2_leaf_reg=0.35,
        random_state=42,
        verbose=0     
    ),
    "XGBoost": XGBRegressor(
        n_estimators=700,
        learning_rate=0.02,
        max_depth=7,  # XGBoost uses max_depth instead of depth
        subsample=0.5,
        colsample_bytree=0.1,
        random_state=42,
        verbosity=0        
    ),
    "LightGBM": LGBMRegressor(
        n_estimators=800,
        learning_rate=0.02,
        max_depth=7,
        subsample=0.6,
        colsample_bytree=0.7,
        random_state=42,
        n_jobs=-1
    )
}

In [168]:
# Training models
for name, model in models.items():
    model.fit(train_set, log_target)
    print(name + " trained.")

CatBoost trained.
XGBoost trained.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3603
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 158
[LightGBM] [Info] Start training from score 12.024051
LightGBM trained.


In [169]:
# Evaluating models
results = {}

kf = KFold(n_splits = 10)

for name, model in models.items():
    result = np.exp(np.sqrt(-cross_val_score(model, train_set, log_target, scoring = "neg_mean_squared_error", cv = kf)))
    results[name] = result

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3492
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 154
[LightGBM] [Info] Start training from score 12.026856
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3471
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 153
[LightGBM] [Info] Start training from score 12.018892
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3489
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 155
[LightGBM] [Info] Start t

In [170]:
# Results
for name, result in results.items():
    print(name)
    print(np.mean(result))
    print(np.std(result))
    print("----------------")

CatBoost
1.123874304336978
0.019517399290236475
----------------
XGBoost
1.1292943357612921
0.021484339700009347
----------------
LightGBM
1.132433309710658
0.02151248824318779
----------------


In [171]:
final_predictions = (
    0.5 * np.exp(models["CatBoost"].predict(test_set)) +
    0.3 * np.exp(models["XGBoost"].predict(test_set)) +
    0.2 * np.exp(models["LightGBM"].predict(test_set))
)

In [172]:
submission = pd.concat([test_id, pd.Series(final_predictions, name = "SalePrice")], axis = 1)
submission

Unnamed: 0,Id,SalePrice
0,1461,124573.659052
1,1462,160745.468273
2,1463,181512.288550
3,1464,193338.322599
4,1465,189430.990293
...,...,...
1454,2915,84683.309122
1455,2916,82618.766224
1456,2917,167176.096746
1457,2918,117226.870962


In [173]:
submission.to_csv("submission.csv", index = False, header = True)