In [4]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Create Grid Search parameters, pre processing pipeline and regressor. 

In [6]:
Y_train = df_train["SalePrice"]
X_train = df_train.loc[:, df_train.columns != "SalePrice"]

# one hot encode categorical features
categorical_features = X_train.select_dtypes(include=['object']).columns

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

## Hyper Parameter Optimization
n_estimators = [1100, 1500]
max_depth = [10, 15]
learning_rate=[0.05,0.20]
min_child_weight=[1,4]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth':max_depth,
    'regressor__learning_rate':learning_rate,
    'regressor__min_child_weight':min_child_weight
    }

# xgboost pipeline
regressor = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("regressor", XGBRegressor())]
    )

# grid search object
regressor_cv = GridSearchCV(
    estimator = regressor, 
    param_grid = hyperparameter_grid,
    cv = 3, 
    verbose = 1
)

fit regressor 

In [7]:
regressor_cv.fit(X_train, Y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl...
                                                     min_child_weight=None,
                                                     missing=nan,
                                                     monotone_constraints=None,
                                                     n_estimators=100,
                               

grid search results 

In [8]:
pd.DataFrame(regressor_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__learning_rate,param_regressor__max_depth,param_regressor__min_child_weight,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2.001017,0.07054,0.031805,0.001685,0.05,10,1,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.760744,0.728111,0.739279,0.742711,0.013542,1
1,2.610376,0.04998,0.044902,0.00421,0.05,10,1,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.760715,0.728048,0.739243,0.742669,0.013554,2
2,1.580752,0.05982,0.019119,0.00286,0.05,10,4,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.757726,0.739057,0.699419,0.732067,0.024311,7
3,2.194642,0.063992,0.033505,0.001783,0.05,10,4,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.757537,0.73748,0.699058,0.731358,0.024263,8
4,3.567076,0.179403,0.056818,0.008749,0.05,15,1,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.767542,0.708646,0.723552,0.733247,0.025002,5
5,4.346302,0.287023,0.084219,0.005892,0.05,15,1,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.767542,0.708646,0.723552,0.733246,0.025002,6
6,2.369805,0.122153,0.039284,0.004536,0.05,15,4,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.749871,0.729341,0.70224,0.727151,0.019507,11
7,3.01201,0.246838,0.050963,0.010187,0.05,15,4,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.749851,0.728803,0.702209,0.726955,0.019493,12
8,1.844528,0.107494,0.026025,0.001486,0.2,10,1,1100,"{'regressor__learning_rate': 0.2, 'regressor__...",0.770292,0.730441,0.712053,0.737595,0.024308,4
9,2.432038,0.14838,0.03553,0.003339,0.2,10,1,1500,"{'regressor__learning_rate': 0.2, 'regressor__...",0.770292,0.730441,0.712053,0.737595,0.024308,3


make submission file 

In [9]:
df_test["SalePrice"] = regressor_cv.predict(df_test)

submission = df_test[["Id","SalePrice"]]

submission.to_csv("submission.csv", index=False)