Load Data

In [25]:
# import Libraries
from sklearn.ensemble import RandomForestRegressor,StackingRegressor,GradientBoostingRegressor
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV,cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

In [26]:
# load data
df = pd.read_csv(
    r'C:\Users\ahmad\Documents\Coding Projects\Kaggle Projects\house-prices-advanced-regression-techniques\train.csv') 
df_test = pd.read_csv(
    r'C:\Users\ahmad\Documents\Coding Projects\Kaggle Projects\house-prices-advanced-regression-techniques\test.csv'
)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)

Preprocessing

Function to find General Info

In [27]:
#function that gives the following information for each column
    # 1. number of unique values 
    # 2. the data type 
    # 3. the null ratio 
def general_info(x):
    # Get unique values and data types in each column
    datatypes = x.dtypes
    unique_values = x.nunique()
    #get null ratio for each column
    nulls = x.isnull().sum()
    nullratios = nulls / x.shape[0]
    general_info = pd.DataFrame({'datatype': datatypes,'unique_values' : unique_values,'nullratios': nullratios})
    #print(general_info)
    return general_info
info = general_info(df)
print(df.shape)


(1460, 81)


Function that cleans NaNs

In [28]:
# function that cleans the data set by doing the following:
# 1. remove all columns with more than 25% missing data
# 2 fill all missing float data types with the mean value of their column
# 3. fill all missing categorical data with most previously filled category within that column
def clean_NaNs(x, info_x, x_test):
    # keep all rows with less than 25% NaN values
    x = x.loc[:, info_x["nullratios"] < 0.25]
    x_test = x_test[x.columns[x.columns != "SalePrice"]]
    info_x = general_info(x)
    # fix all the NaNs where floats becomes means
    float_nan_columns = info_x[
        (info_x["datatype"] == "float64") & (info_x["nullratios"] > 0)
    ].index.tolist()
    # print(float_nan_columns)
    means = x[float_nan_columns].mean()
    print("means: ", means)
    x.loc[:, float_nan_columns] = x[float_nan_columns].fillna(means)
    x_test.loc[:, float_nan_columns] = x_test[float_nan_columns].fillna(means)
    info_x = general_info(x)
    print(info_x.loc[info_x["nullratios"] > 0, :])
    # fix all the NaNs where objects becomes ffill()
    x = x.ffill()
    x_test = x_test.ffill()
    info_x = general_info(x)
    print(info_x.loc[info_x["nullratios"] > 0, :])
    # fix info
    return x, info_x, x_test


df, info, df_test = clean_NaNs(df, info, df_test)
# print(info.loc[info['nullratios']>0,:]) # should have no outputs

means:  LotFrontage      70.049958
MasVnrArea      103.685262
GarageYrBlt    1978.506164
dtype: float64
             datatype  unique_values  nullratios
BsmtQual       object              4    0.025342
BsmtCond       object              4    0.025342
BsmtExposure   object              4    0.026027
BsmtFinType1   object              6    0.025342
BsmtFinType2   object              6    0.026027
Electrical     object              5    0.000685
GarageType     object              6    0.055479
GarageFinish   object              3    0.055479
GarageQual     object              5    0.055479
GarageCond     object              5    0.055479
Empty DataFrame
Columns: [datatype, unique_values, nullratios]
Index: []


Functions that do Label Encoding or One-Hot encoding

In [29]:
print(df.shape)


# function that separates all the categories into labeled numbers
def Label_encode(x, x_info, x_test):
    object_columns = x_info.loc[x_info["datatype"] == "object"].index.tolist()
    print(object_columns)
    x[object_columns] = x[object_columns].astype("category")
    x_test[object_columns] = x_test[object_columns].astype("category")
    x[object_columns] = x[object_columns].apply(lambda col: col.cat.codes)
    x_test[object_columns] = x_test[object_columns].apply(lambda col: col.cat.codes)
    x_info = general_info(x)
    return x, x_info, object_columns, x_test


df, info, objectColumns, df_test = Label_encode(df, info, df_test)


def OH_encode(x, x_info):
    object_columns = x_info.loc[x_info["datatype"] == "object"].index.tolist()
    x = pd.get_dummies(x, columns=object_columns, drop_first=False)
    x_info = general_info(x)
    return x, x_info


# df, info = OH_encode(df, info)
print(df.head())
print(df_test.head())
# print(info)

(1460, 75)
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
   Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  LandContour  Utilities  LotConfig  LandSlope  Neighborhood  Condition1  Condition2  BldgType  HouseStyle  OverallQual  OverallCond  YearBuilt  YearRemodAdd  RoofStyle  RoofMatl  Exterior1st  Exterior2nd  MasVnrArea  ExterQual  ExterCond  Foundation  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  BsmtFinSF1  BsmtFinType2  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  Heating  HeatingQC  CentralAir  Electrical  1stFlrSF

Prepare the Data to fit testing requirements

In [30]:
# prepare inputs and outputs for training and testing
# Split the data
x = df.drop("SalePrice", axis=1)
y = df["SalePrice"]
print(x.shape)
print(y.shape)
y = np.log1p(y)

(1460, 74)
(1460,)


PCA

In [31]:
'''model = PCA(n_components = 60)
x_train = model.fit_transform(x_train)
x_test = model.transform(x_test)'''

'model = PCA(n_components = 60)\nx_train = model.fit_transform(x_train)\nx_test = model.transform(x_test)'

Model Selection

In [32]:
# used only for StackingRegressor
'''
baseModels = [('rf',RandomForestRegressor(n_estimators=50)),('xgb',XGBRegressor())]
finalModel = Lasso()
'''


# initialize the model
# model = RandomForestRegressor(n_estimators=100, random_state=42)
# model = GradientBoostingRegressor()
# model = StackingRegressor( estimators=baseModels,final_estimator=finalModel,cv =5)
# model = Lasso();
'''
model = XGBRegressor();
modelHyp =  {
    'n_estimators': [50, 100, 150, 200],                   # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1, 0.2],               # Step size shrinkage
    'max_depth': [3, 5, 7, 9],                             # Maximum depth of a tree
    'min_child_weight': [1, 3, 5, 7],                      # Minimum sum of instance weight
    'subsample': [0.6, 0.8, 1.0],                          # Fraction of samples used for fitting trees
    'colsample_bytree': [0.6, 0.8, 1.0],                   # Fraction of features used to build each tree
    'gamma': [0, 0.1, 0.2, 0.3],                           # Minimum loss reduction required for partition
    'alpha': [0, 0.1, 0.5, 1.0],                           # L1 regularization term
    'lambda': [1, 1.5, 2, 5],                              # L2 regularization term
    'booster': ['gbtree', 'gblinear', 'dart'],              # Type of boosting model
    'scale_pos_weight': [1, 2, 3, 5]                       # Weight of positive class in binary classification
}

rcv = RandomizedSearchCV( model, param_distributions=modelHyp, cv=5, scoring="neg_mean_squared_error")
# run the model
rcv.fit(x,y)
print(rcv.best_score_)
print(rcv.best_params_)

# results from running RCV
{
    "subsample": 0.8,
    "scale_pos_weight": 3,
    "n_estimators": 200,
    "min_child_weight": 3,
    "max_depth": 5,
    "learning_rate": 0.1,
    "lambda": 1,
    "gamma": 0,
    "colsample_bytree": 1.0,
    "booster": "dart",
    "alpha": 1.0,
}
'''
'''
model = XGBRegressor();
modelHyp =  {
    'n_estimators': [50, 100, 150, 200],                   # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1, 0.2],               # Step size shrinkage
    'max_depth': [3, 5, 7, 9],                             # Maximum depth of a tree
    'min_child_weight': [1, 3, 5, 7],                      # Minimum sum of instance weight
    'subsample': [0.6, 0.8, 1.0],                          # Fraction of samples used for fitting trees
    'colsample_bytree': [0.6, 0.8, 1.0],                   # Fraction of features used to build each tree
    'gamma': [0, 0.1, 0.2, 0.3],                           # Minimum loss reduction required for partition
    'alpha': [0, 0.1, 0.5, 1.0],                           # L1 regularization term
    'lambda': [1, 1.5, 2, 5],                              # L2 regularization term
    'booster': ['gbtree', 'gblinear', 'dart'],              # Type of boosting model
    'scale_pos_weight': [1, 2, 3, 5]                       # Weight of positive class in binary classification
}

rcv = RandomizedSearchCV( model, param_distributions=modelHyp, cv=5, scoring="neg_mean_squared_error")
# run the model
rcv.fit(x,y)
print(rcv.best_score_)
print(rcv.best_params_)

# results from running RCV
{
    "subsample": 0.8,
    "scale_pos_weight": 3,
    "n_estimators": 200,
    "min_child_weight": 3,
    "max_depth": 5,
    "learning_rate": 0.1,
    "lambda": 1,
    "gamma": 0,
    "colsample_bytree": 1.0,
    "booster": "dart",
    "alpha": 1.0,
}
'''

# Define your XGBRegressor model with the desired hyperparameters
model = XGBRegressor(
    subsample=0.8,
    scale_pos_weight=3,
    n_estimators=200,
    min_child_weight=3,
    max_depth=5,
    learning_rate=0.1,
    reg_lambda=1,
    gamma=0,
    colsample_bytree=1.0,
    booster="dart",
    reg_alpha=1.0,
)
model.fit(x,y)
# Perform cross-validation with negative mean squared error scoring
print(cross_val_score(model, x, y, cv=5, scoring="neg_mean_squared_error"))

[-0.01445348 -0.01884542 -0.02033203 -0.01335542 -0.01800175]


Run Model on Testing Data

In [34]:
y_pred = model.predict(df_test)
SalePrice = np.expm1(y_pred)
SalePrice = (SalePrice/100)*100
print(SalePrice)
submission = pd.DataFrame({'Id':df_test['Id'],'SalePrice':SalePrice.flatten()})
print(submission)

[126137.07 156395.36 182474.   ... 160259.39 110193.21 211366.1 ]
        Id      SalePrice
0     1461  126137.070312
1     1462  156395.359375
2     1463  182474.000000
3     1464  191656.187500
4     1465  185739.140625
...    ...            ...
1454  2915   77265.617188
1455  2916   76530.859375
1456  2917  160259.390625
1457  2918  110193.210938
1458  2919  211366.093750

[1459 rows x 2 columns]


Save Submission

In [35]:
submission.to_csv("submission.csv", index=False)
print(pd.read_csv("submission.csv").head())

     Id  SalePrice
0  1461  126137.07
1  1462  156395.36
2  1463  182474.00
3  1464  191656.19
4  1465  185739.14
