In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Google Drive share link for the dataset. The file id is the second-to-last
# "/"-separated segment of the link; it is spliced into Drive's direct-download
# endpoint so pandas can read the CSV straight from the URL.
url = "https://drive.google.com/file/d/1mnHCEXUJz5pxyHi41f0OMEFlKvt1sDGn/view?usp=sharing"
*_, file_id, _ = url.split('/')
path = f'https://drive.google.com/uc?export=download&id={file_id}'
training_data = pd.read_csv(path)

In [48]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [49]:
# Peek at the first record only, to inspect the available columns.
training_data.head(n=1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500


In [52]:
# Columns excluded from modelling.
# 'Unnamed: 0' is added to the original list: in the preview it is 0 on the row
# whose Id is 1, so it looks like the row index written out when the CSV was
# saved -- NOTE(review): confirm. Like 'Id', it should not be fed to the model.
COLUMNS_TO_DROP = [
    'Unnamed: 0',
    'Id',
    'Utilities',
    'PoolQC',
    'Alley',
    'Fence',
    'Condition2',
    'BsmtFullBath',
]

# Rebind instead of mutating in place, and ignore already-missing labels, so
# that re-running this cell does not raise KeyError on the second execution.
# Trade-off: errors='ignore' will also silently skip a misspelled column name.
training_data = training_data.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [53]:

# Separate the target column from the predictors.
y = training_data['SalePrice']
X = training_data.drop(columns=['SalePrice'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Partition the training columns by dtype; only the column labels of these two
# frames are used below, to route columns into the matching branch.
X_num = X_train.select_dtypes(include="number").copy()
X_cat = X_train.select_dtypes(exclude="number").copy()

# Numeric branch: fill missing values with the column mean.
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Categorical branch: fill missing values with the literal "N_A", then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categoric_pipe = make_pipeline(
    SimpleImputer(fill_value="N_A", strategy="constant"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Apply each branch to its own set of columns. The transformer names
# ("num_pipe", "cat_pipe") are part of the parameter paths used when tuning.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [54]:
!pip install xgboost

import sys
!{sys.executable} -m pip install xgboost

import xgboost
print(xgboost.__version__)
from sklearn.preprocessing import MaxAbsScaler
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest, f_regression


#model = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

full_pipeline = make_pipeline(preprocessor, 
                              MaxAbsScaler(),
                              XGBRegressor())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "xgbregressor__max_depth":[3,4],
    "xgbregressor__eta":[0.1,0.15],
    "xgbregressor__subsample":[0.3,0.4],
    "xgbregressor__colsample_bytree":[0.3]
}

xgb_search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

xgb_search.fit(X_train, y_train)
 

print(
        f"""
        MSE: {mean_squared_error(xgb_search.predict(X_test), y_test)}
        RMSE: {mean_squared_error(xgb_search.predict(X_test), y_test)**0.5}
        MAE: {mean_absolute_error(xgb_search.predict(X_test), y_test)}
        MAPE: {mean_absolute_percentage_error(xgb_search.predict(X_test), y_test)}
        R2 Score: {r2_score(xgb_search.predict(X_test), y_test)}
        """
    )

1.7.3
Fitting 5 folds for each of 16 candidates, totalling 80 fits

        MSE: 519054888.6841625
        RMSE: 22782.77614085172
        MAE: 14821.798359910103
        MAPE: 0.08081398447949326
        R2 Score: 0.9032645772214102
        


In [55]:
# store the trained pipeline
import pickle

# NOTE(review): absolute, machine-specific location kept unchanged so existing
# consumers of this file still find it; consider a configurable relative path.
MODEL_PATH = '/Users/ranrandai/Downloads/models/trained_pipe_xgboost.sav'

# Serialise the whole fitted GridSearchCV object. The context manager
# guarantees the handle is flushed and closed even if pickling fails; the
# original passed a bare open(...) that was never closed.
with open(file=MODEL_PATH, mode='wb') as model_file:
    pickle.dump(xgb_search, model_file)