In [177]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.base import TransformerMixin
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
import joblib
from sklearn.pipeline import Pipeline

# Data Importing & First Look

In [178]:
df = pd.read_csv("/Users/baranalp.ozkan/Desktop/house/house_price_1.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [179]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [180]:
# for intuitive analysis
# profile = ProfileReport(df, title="Pandas Profiling Report")
# profile.to_file("your_report.html")

# Data Preparation

### Dropping & Filling Missing Values

In [181]:
df['MSSubClass'] = df['MSSubClass'].astype('object') # transformed from int64 to object data type

dfc =df.copy()

# dropping columns due to high number of missing values 
drop_cols = ['MiscFeature','PoolQC'] 
dfc = dfc.drop(drop_cols,axis=1)

# filling missing values
fill_miss_cols = ['MasVnrArea','Fence','GarageCond','GarageQual','GarageFinish','GarageYrBlt','GarageType','FireplaceQu','Electrical','BsmtFinType2','BsmtFinType1','BsmtExposure','BsmtCond','BsmtQual','MasVnrType','Alley','LotFrontage']
for i in fill_miss_cols:
    if dfc[i].dtypes in ["int64","float64"] :
        dfc[i].fillna(-999.0,inplace=True)
    elif dfc[i].dtypes in ["object"] :
        dfc[i].fillna('unknown',inplace=True)

# dropping columns due to Correlations
dfc = dfc.drop(columns=['Id','GarageYrBlt','TotRmsAbvGrd','1stFlrSF','ScreenPorch','PoolArea'])

for i in dfc :
    if (dfc[i].dtypes in ["int64","float64"]) and (abs(dfc.corr()['SalePrice'][i]) < 0.03):
        dfc = dfc.drop(columns = i)
        print(f"{i} column dropped")



LotFrontage column dropped
BsmtFinSF2 column dropped
LowQualFinSF column dropped
BsmtHalfBath column dropped
MiscVal column dropped
YrSold column dropped


In [182]:
dfc.isnull().sum() # checking for missing values after cleaning

MSSubClass       0
MSZoning         0
LotArea          0
Street           0
Alley            0
                ..
Fence            0
MoSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 67, dtype: int64

### Encoding

In [183]:
# manual encoding
qual_cols = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond']
a={'Ex':5,'Gd': 4, 'TA':3,'Fa' : 2, 'Po':1,'NA':0}
dfc[qual_cols]=dfc[qual_cols].replace(a)

dfc['BsmtExposure'] = dfc['BsmtExposure'].replace({'Gd':4,'Av':3,'Mn':2,'No':1,'NA':0})

dfc['CentralAir'] = dfc['CentralAir'].replace({'N':0,'Y':1})

In [184]:
categorical_columns = dfc.columns[dfc.dtypes==object].tolist()

In [185]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

hot = ohe.fit_transform(dfc[categorical_columns].astype(str))

In [186]:
# Numeric data exported as a cold dataframe
cold_df = dfc.select_dtypes(exclude=["object"])
cold_df.head()
cold_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 30 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   OverallQual    1460 non-null   int64  
 2   OverallCond    1460 non-null   int64  
 3   YearBuilt      1460 non-null   int64  
 4   YearRemodAdd   1460 non-null   int64  
 5   MasVnrArea     1460 non-null   float64
 6   ExterQual      1460 non-null   int64  
 7   ExterCond      1460 non-null   int64  
 8   BsmtFinSF1     1460 non-null   int64  
 9   BsmtUnfSF      1460 non-null   int64  
 10  TotalBsmtSF    1460 non-null   int64  
 11  HeatingQC      1460 non-null   int64  
 12  CentralAir     1460 non-null   int64  
 13  2ndFlrSF       1460 non-null   int64  
 14  GrLivArea      1460 non-null   int64  
 15  BsmtFullBath   1460 non-null   int64  
 16  FullBath       1460 non-null   int64  
 17  HalfBath       1460 non-null   int64  
 18  BedroomA

In [187]:
# cold dataframe transformed to sparse matrix
from scipy.sparse import csr_matrix
cold = csr_matrix(cold_df)

In [188]:
# hot and cold sparse matrixes merged
from scipy.sparse import hstack
final_sparse_matrix = hstack((hot, cold))

In [189]:
# visualization of final data frame that we created above
final_df = pd.DataFrame(final_sparse_matrix.toarray())
final_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,274,275,276,277,278,279,280,281,282,283
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,4.0,0.0,2.0,548.0,0.0,61.0,0.0,0.0,2.0,208500.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,2.0,460.0,298.0,0.0,0.0,0.0,5.0,181500.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,4.0,1.0,2.0,608.0,0.0,42.0,0.0,0.0,9.0,223500.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,3.0,642.0,0.0,35.0,272.0,0.0,2.0,140000.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,4.0,1.0,3.0,836.0,192.0,84.0,0.0,0.0,12.0,250000.0


In [190]:
final_csr_matrix = final_sparse_matrix.tocsr()

## Cross Validation & Grid Searching

In [191]:
# Train Data Set and Test Data Set splitted

from sklearn.model_selection import train_test_split
X = final_df.iloc[:,:-1]
y = final_df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=2)

In [192]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRegressor

In [193]:
# Cross Validation function defined

kfold = KFold(n_splits=5, shuffle=True, random_state=2)

def cross_val(model):
    scores = cross_val_score(model, 
                             X_train_transformed, 
                             y_train, 
                             scoring='neg_root_mean_squared_error', 
                             cv=kfold)
    rmse = (-scores.mean())
    return rmse


In [194]:
X_train_transformed = X_train # for pipeline processing dataframe's name is changed
cross_val(XGBRegressor(missing=-999.0))

32812.091158280135

### For Grid Searching X_train splitted again

In [195]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train_transformed, y_train, random_state=2)

In [196]:
# Function defined for finding n_estimators 

def n_estimators(model):
    eval_set = [(X_test_2, y_test_2)]
    eval_metric="rmse"
    model.fit(X_train_2, y_train_2, 
              eval_metric=eval_metric, 
              eval_set=eval_set, 
              early_stopping_rounds=100)
    y_pred = model.predict(X_test_2)
    rmse = MSE(y_test_2, y_pred)**0.5
    return rmse

In [197]:
n_estimators(XGBRegressor(n_estimators=5000, missing=-999.0))


[0]	validation_0-rmse:138700.75115
[1]	validation_0-rmse:99694.86166
[2]	validation_0-rmse:73746.38241
[3]	validation_0-rmse:58023.15270
[4]	validation_0-rmse:49498.22864
[5]	validation_0-rmse:44667.91236
[6]	validation_0-rmse:42961.51405
[7]	validation_0-rmse:42541.29014
[8]	validation_0-rmse:42759.52876
[9]	validation_0-rmse:42814.50685
[10]	validation_0-rmse:42911.01284
[11]	validation_0-rmse:43220.42741
[12]	validation_0-rmse:43285.53899
[13]	validation_0-rmse:43517.55246




[14]	validation_0-rmse:43678.61516
[15]	validation_0-rmse:44003.47844
[16]	validation_0-rmse:44147.41795
[17]	validation_0-rmse:44191.67874
[18]	validation_0-rmse:44474.57942
[19]	validation_0-rmse:44533.56855
[20]	validation_0-rmse:44653.82019
[21]	validation_0-rmse:44688.23196
[22]	validation_0-rmse:44705.07118
[23]	validation_0-rmse:44775.23872
[24]	validation_0-rmse:44858.79825
[25]	validation_0-rmse:44749.75745
[26]	validation_0-rmse:44825.49546
[27]	validation_0-rmse:44743.79959
[28]	validation_0-rmse:44690.02733
[29]	validation_0-rmse:44647.17885
[30]	validation_0-rmse:44639.94566
[31]	validation_0-rmse:44649.39407
[32]	validation_0-rmse:44594.19501
[33]	validation_0-rmse:44579.49994
[34]	validation_0-rmse:44580.29979
[35]	validation_0-rmse:44572.78096
[36]	validation_0-rmse:44583.23068
[37]	validation_0-rmse:44586.27941
[38]	validation_0-rmse:44460.86650
[39]	validation_0-rmse:44460.96750
[40]	validation_0-rmse:44457.02419
[41]	validation_0-rmse:44451.52700
[42]	validation_0-rm

42541.29042554407

Using our default model, 7 estimators currently gives the best estimate. That will be our starting point.

[7]	validation_0-rmse:42541.29014

## GridSearch function defined and used for the hyperparameters analysis

In [198]:
def grid_search(params, reg=XGBRegressor(missing=-999.0)):
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)
    grid_reg.fit(X_train_transformed, y_train)
    best_params = grid_reg.best_params_
    print("Best params:", best_params)
    best_score = np.sqrt(-grid_reg.best_score_)
    print("Best score:", best_score)

In [199]:
grid_search(params={'max_depth':[1, 2, 3, 4, 6, 7, 8],
                     'n_estimators':[7]})

Best params: {'max_depth': 8, 'n_estimators': 7}
Best score: 37372.96084256663


In [143]:
grid_search(params={'max_depth':[7, 8, 9,10],
                    'min_child_weight':[1,2,3,4,5],
                    'n_estimators':[7]})

Best params: {'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 7}
Best score: 36725.001929386046


In [144]:
# For ensuring that we chose optimal n_estimators value tried with another high number
grid_search(params={'max_depth':[9],
                    'min_child_weight':[4,5],
                    'subsample':[0.5, 0.6, 0.7, 0.8, 0.9],
                    'n_estimators':[7, 50]})

Best params: {'max_depth': 9, 'min_child_weight': 4, 'n_estimators': 50, 'subsample': 0.8}
Best score: 34903.57067342321


In [145]:
grid_search(params={'max_depth':[8],
                    'min_child_weight':[3, 4],
                    'subsample':[0.6, 0.7, 0.8],
                    'colsample_bytree':[0.6, 0.7, 0.8, 0.9],
                    'n_estimators':[50]})


Best params: {'colsample_bytree': 0.7, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 50, 'subsample': 0.8}
Best score: 33065.0476540859


In [146]:
grid_search(params={'max_depth':[8],
                    'min_child_weight':[4],
                    'subsample':[.8],
                    'colsample_bytree':[0.8],
                    'colsample_bylevel':[0.6, 0.7, 0.8, 0.9, 1],
                    'colsample_bynode':[0.6, 0.7, 0.8, 0.9, 1],
                    'n_estimators':[50]})


Best params: {'colsample_bylevel': 0.6, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 50, 'subsample': 0.8}
Best score: 32695.511204336915


In [147]:
grid_search(params={'max_depth':[7],
                    'min_child_weight':[4],
                    'subsample':[.8],
                    'colsample_bytree':[0.8],
                    'colsample_bylevel':[0.8],
                    'colsample_bynode':[0.6],
                    'n_estimators':[50]})


Best params: {'colsample_bylevel': 0.8, 'colsample_bynode': 0.6, 'colsample_bytree': 0.8, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 50, 'subsample': 0.8}
Best score: 34964.99809820754


## Model's parameters are defined 

In [200]:
xgbr = XGBRegressor(max_depth=7, 
             min_child_weight=4, 
             subsample=0.8, 
             colsample_bytree=0.8, 
             colsample_bylevel=0.8, 
             colsample_bynode=0.6,
             n_estimators =100,
             missing=-999.0,
            learning_rate=0.1)


In [201]:
xgbr.fit(X_train.values,y_train.values)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.8, colsample_bynode=0.6, colsample_bytree=0.8,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=4,
             missing=-999.0, monotone_constraints='()', n_estimators=100,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [202]:
pred = xgbr.predict(X_test.values)

## Model Metrics

In [203]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

mse = MSE(y_test, pred)
r2 = r2_score(y_test, pred)

print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))
print(f"R2: {r2}")

MSE: 682126902.54
RMSE: 26117.56
R2: 0.8969949303728533


## Feature Importance

In [218]:
# getting feature names from one hot encoded dataframe, popping sales price column
names = ohe.get_feature_names() 
tot = list(names)+list(cold_df.columns)
tot.pop()

'SalePrice'

In [219]:
# features are sorted for their importance
feature_sorted = pd.DataFrame(xgbr.feature_importances_,
                                   index = tot,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_sorted.head(20)

Unnamed: 0,importance
GarageCars,0.207554
x20_5,0.168835
OverallQual,0.109026
KitchenQual,0.108699
ExterQual,0.061102
FullBath,0.025705
GrLivArea,0.020495
CentralAir,0.014566
x9_NoRidge,0.012968
BsmtFinSF1,0.011783


In [221]:
# Reversing feature names that one hot encoded
ilk20 =feature_sorted.head(20).index
ilk20 = list(ilk20)
col_names = list(dfc[categorical_columns].columns)

for i in range(len(ilk20)):
    if ilk20[i][0] == "x":        
        j = ilk20[i].index("_")
        ilk20[i] = col_names[int(ilk20[i][1:j])]
ilk20

['GarageCars',
 'BsmtQual',
 'OverallQual',
 'KitchenQual',
 'ExterQual',
 'FullBath',
 'GrLivArea',
 'CentralAir',
 'Neighborhood',
 'BsmtFinSF1',
 'Alley',
 '2ndFlrSF',
 'FireplaceQu',
 'LandContour',
 'GarageType',
 'TotalBsmtSF',
 'Exterior1st',
 'GarageFinish',
 'LandSlope',
 'Exterior2nd']

## Pipeline

In [125]:
# 2nd Way
# Creating our Transformer Classs for missing values

from sklearn.base import TransformerMixin 
class NullValueImputer(TransformerMixin):
    def __init__(self):
        None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        for column in X.columns.tolist():
            if column in X.columns[X.dtypes==object].tolist():
                X[column] = X[column].fillna(X[column].mode())
            else:
                X[column]=X[column].fillna(-999.0)
        return X


In [126]:
# 2nd Way
# Creating a Transformer Class for PipeLine for automatization above processes

class SparseMatrix(TransformerMixin):
    def __init__(self):
        None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        categorical_columns = X.columns[X.dtypes==object].tolist()
        ohe = OneHotEncoder()
        hot = ohe.fit_transform(X[categorical_columns])
        cold_df = X.select_dtypes(exclude=["object"])
        cold = csr_matrix(cold_df)
        final_sparse_matrix = hstack((hot, cold))
        final_csr_matrix = final_sparse_matrix.tocsr()
        return final_csr_matrix


In [222]:
from sklearn.pipeline import Pipeline
data_pipeline = Pipeline([('null_imputer', NullValueImputer()), ('sparse', SparseMatrix())])

from sklearn.model_selection import train_test_split
X = dfc.iloc[:,:-1]
y = dfc.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=2)

X_train_transformed = data_pipeline.fit_transform(X_train)

# After data preparation processes, grid search and model preperation can be done with these classes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = X[column].fillna(X[column].mode())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = X[column].fillna(X[column].mode())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column]=X[column].fillna(-999.0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

## Model Deployment for Streamlit

In [223]:
single_row = dfc.mode(axis=0)
single_row = single_row.iloc[:,:-1] # SalesPrice column dropped
display(single_row)

import joblib
joblib.dump(ohe, 'ohe.joblib')

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,Fence,MoSold,SaleType,SaleCondition
0,20,RL,7200,Pave,unknown,Reg,Lvl,AllPub,Inside,Gtl,...,3,Y,0,0,0,0,unknown,6,WD,Normal


['ohe.joblib']

In [226]:
xgbr.save_model('housepricexgb_final.model')