In [1]:
# import libraries 
!pip install --upgrade setuptools
!pip install --upgrade pip
!pip install xgboost



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# magic word for producing visualizations in notebook
%matplotlib inline

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV



In [2]:
#import datasets
df_train = pd.read_csv(r"C:\Users\uallakulov\ds_projects\capstone_houseprices\train.csv")
df_test = pd.read_csv(r"C:\Users\uallakulov\ds_projects\capstone_houseprices\test.csv")
ids = df_test['Id'].values

In [3]:
df_train.shape

(1460, 81)

In [4]:
df_test.shape

(1459, 80)

In [5]:
df_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [6]:
#let's have a look at data description
#desc = open("ds_projects/capstone_houseprices/data_description.txt", "r")
#print(desc.read())

In [7]:
def percent_missing(df):
    '''
    Calculates the percentage of missing values in a dataset.
    
    Args:
    inputs: Dataset
    returns: Dataset showing the percentage of missing values per column.
    '''
    percent_missing = df.isnull().sum()* 100/len(df)
    percent_missing_df = pd.DataFrame({'col': df.columns,
                                 'percent_missing': percent_missing})
    return percent_missing_df

In [8]:
missing_train = percent_missing(df_train)

In [9]:
missing_train["percent_missing"].sort_values(ascending = False).head(15)

PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
FireplaceQu     47.260274
LotFrontage     17.739726
GarageCond       5.547945
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
BsmtExposure     2.602740
BsmtFinType2     2.602740
BsmtFinType1     2.534247
BsmtCond         2.534247
Name: percent_missing, dtype: float64

Most variables have less than 6% values missing. For this analysis, I will drop variables that have more than 20% values missing.

In [10]:
#select columns to remove
missing_remove = list(missing_train[missing_train["percent_missing"]>19]["col"])
missing_remove

['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

In [11]:
train = df_train.drop(missing_remove, inplace = False, axis = 1)
test = df_test.drop(missing_remove, inplace = False, axis = 1)

In [12]:
train.shape, test.shape

((1460, 76), (1459, 75))

In [13]:
# list of categorical variables
categorical_col = ["MSSubClass","MSZoning", "Street", "LotShape", "LandContour", 
                   "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1",
                   "Condition2", "BldgType", "HouseStyle", "RoofStyle", "Exterior1st",
                   "Exterior2nd", "MasVnrType", "Foundation", "Heating", "Electrical", 
                   "GarageType", "SaleType", "SaleCondition", "CentralAir", "RoofMatl"]

In [14]:
# check if they happen to be among remove variables
#missing_remove.isin(categorical_col)

In [15]:
# list of variables to be treated as ordinal
ordinal_col = ["OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", 
               "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual", 
               "Functional", "GarageFinish", "GarageQual", "GarageCond", 
               "PavedDrive"]

In [16]:
#Datetime
#MoSold: Month Sold (MM) 
#YrSold: Year Sold (YYYY)


In [17]:
x_train = train.drop("Id", axis = 1, inplace = False)
df_test = test.drop("Id", axis = 1, inplace = False)

In [18]:
x_train.shape

(1460, 75)

In [19]:
df_test.shape

(1459, 74)

In [20]:
list(set(x_train.columns) - set(df_test.columns))

['SalePrice']

In [21]:
SalePrice = x_train['SalePrice']

In [22]:
x_train.drop('SalePrice', inplace = True, axis = 1) 

In [23]:
x_train.shape, df_test.shape

((1460, 74), (1459, 74))

In [24]:
train_objs_num = len(x_train)

In [25]:
dataset = pd.concat(objs = [x_train, df_test], axis=0)

In [26]:
# get dummies on the combined data first
dataset_preprocessed  = pd.get_dummies(dataset, columns = categorical_col, prefix = categorical_col, dummy_na = False, drop_first = True)

In [27]:
x_train = dataset_preprocessed[:train_objs_num]
df_test = dataset_preprocessed[train_objs_num:]

In [28]:
x_train.shape, df_test.shape

((1460, 209), (1459, 209))

In [29]:
x_train[ordinal_col].head()

Unnamed: 0,OverallQual,OverallCond,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,KitchenQual,Functional,GarageFinish,GarageQual,GarageCond,PavedDrive
0,7,5,Gd,TA,Gd,TA,No,GLQ,Unf,Ex,Gd,Typ,RFn,TA,TA,Y
1,6,8,TA,TA,Gd,TA,Gd,ALQ,Unf,Ex,TA,Typ,RFn,TA,TA,Y
2,7,5,Gd,TA,Gd,TA,Mn,GLQ,Unf,Ex,Gd,Typ,RFn,TA,TA,Y
3,7,5,TA,TA,TA,Gd,No,ALQ,Unf,Gd,Gd,Typ,Unf,TA,TA,Y
4,8,5,Gd,TA,Gd,TA,Av,GLQ,Unf,Ex,Gd,Typ,RFn,TA,TA,Y


In [30]:
x_train.isnull().sum()

LotFrontage         259
LotArea               0
OverallQual           0
OverallCond           0
YearBuilt             0
                   ... 
RoofMatl_Metal        0
RoofMatl_Roll         0
RoofMatl_Tar&Grv      0
RoofMatl_WdShake      0
RoofMatl_WdShngl      0
Length: 209, dtype: int64

In [31]:
# try imputing before dealing with ordinal data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')


In [32]:
imputed_train=pd.DataFrame(imputer.fit_transform(x_train))
imputed_train.columns=x_train.columns
imputed_train.index=x_train.index

In [33]:
imputed_test=pd.DataFrame(imputer.fit_transform(df_test))
imputed_test.columns=df_test.columns
imputed_test.index=df_test.index

In [34]:
imputed_train.shape, imputed_test.shape

((1460, 209), (1459, 209))

In [35]:
imputed_train.isnull().sum()

LotFrontage         0
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
                   ..
RoofMatl_Metal      0
RoofMatl_Roll       0
RoofMatl_Tar&Grv    0
RoofMatl_WdShake    0
RoofMatl_WdShngl    0
Length: 209, dtype: int64

In [36]:
imputed_train.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,...,SaleCondition_Normal,SaleCondition_Partial,CentralAir_Y,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl
0,65,8450,7,5,2003,2003,196,Gd,TA,Gd,...,1,0,1,1,0,0,0,0,0,0
1,80,9600,6,8,1976,1976,0,TA,TA,Gd,...,1,0,1,1,0,0,0,0,0,0
2,68,11250,7,5,2001,2002,162,Gd,TA,Gd,...,1,0,1,1,0,0,0,0,0,0
3,60,9550,7,5,1915,1970,0,TA,TA,TA,...,0,0,1,1,0,0,0,0,0,0
4,84,14260,8,5,2000,2000,350,Gd,TA,Gd,...,1,0,1,1,0,0,0,0,0,0


In [37]:
#make sure no strings exist
#x_train.loc[:, x_train.dtypes == object]

In [38]:
#ordinal_col = ["BsmtCond", 
#               "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual", 
#               "Functional", "GarageFinish", "GarageQual", "GarageCond", 
 #              "PavedDrive"]

In [39]:
#"OverallQual", "OverallCond", "ExterQual", "ExterCond","BsmtQual",

In [40]:
# deal with ordinal data first
from sklearn.preprocessing import LabelEncoder

# make an encoder object
#x_train.replace("nan", np.nan, inplace = True, regex = True)
#df_train.replace("nan", np.nan, inplace = True, regex = True)
encoder = LabelEncoder()

# fit and transform
for col in ordinal_col:
    encoder.fit(pd.concat([imputed_train[col], imputed_test[col]], axis=0, sort=False))
    imputed_train[col] = encoder.transform(imputed_train[col])
    imputed_test[col] = encoder.transform(imputed_test[col])

print('Train data shape is {}'.format(imputed_train.shape))
print('Test data shape is {}'.format(imputed_test.shape))

Train data shape is (1460, 209)
Test data shape is (1459, 209)


In [41]:
imputed_train.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,...,SaleCondition_Normal,SaleCondition_Partial,CentralAir_Y,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl
0,65,8450,6,4,2003,2003,196,2,4,2,...,1,0,1,1,0,0,0,0,0,0
1,80,9600,5,7,1976,1976,0,3,4,2,...,1,0,1,1,0,0,0,0,0,0
2,68,11250,6,4,2001,2002,162,2,4,2,...,1,0,1,1,0,0,0,0,0,0
3,60,9550,6,4,1915,1970,0,3,4,3,...,0,0,1,1,0,0,0,0,0,0
4,84,14260,7,4,2000,2000,350,2,4,2,...,1,0,1,1,0,0,0,0,0,0


In [42]:
X = imputed_train.astype(float)

In [43]:
y = df_train.SalePrice.values

In [44]:
X_test_fin = imputed_test.astype(float)

In [45]:
X.dtypes

LotFrontage         float64
LotArea             float64
OverallQual         float64
OverallCond         float64
YearBuilt           float64
                     ...   
RoofMatl_Metal      float64
RoofMatl_Roll       float64
RoofMatl_Tar&Grv    float64
RoofMatl_WdShake    float64
RoofMatl_WdShngl    float64
Length: 209, dtype: object

In [46]:
sample_submission = pd.read_csv(r"C:\Users\uallakulov\ds_projects\capstone_houseprices\sample_submission.csv")

In [47]:
sample_submission.shape

(1459, 2)

In [48]:
sample_submission.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [49]:
grid_search.best_params_

NameError: name 'grid_search' is not defined

# iterative imputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
iter_imp = IterativeImputer(max_iter=10, random_state=0)

iter_train=pd.DataFrame(iter_imp.fit_transform(x_train))
iter_train.columns=x_train.columns
iter_train.index=x_train.index

In [51]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBRegressor
import xgboost as xgb

pipeline3 = Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('XGB', XGBRegressor())
    ])
    
parameters3 = {
        'XGB__eta': [0.04], 
        'XGB__max_depth': [3],
        'XGB__n_estimators': [900],
        'XGB__learning_rate': [0.06], 
        'XGB__eval_metric ' : ["rmse"], 
        'XGB__max_depth': range(3, 6, 1),
        'XGB__min_child_weight': [3],
        'XGB__subsample': [0.7],
        'XGB__colsample_bytree': [0.82]
    }

grid_search3 = GridSearchCV(pipeline3, param_grid = parameters3, n_jobs = 5, cv = 2, verbose=True)

In [52]:
grid_search3.fit(X, y)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   4 out of   6 | elapsed:   11.7s remaining:    5.8s
[Parallel(n_jobs=5)]: Done   6 out of   6 | elapsed:   15.0s finished


Parameters: { eval_metric  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('XGB',
                                        XGBRegressor(base_score=None,
                                                     booster=None,
                                                     colsample_bylevel=None,
                                                     colsample_bynode=None,
                                                     colsample_bytree=None,
                                                     gamma=None, gpu_id=None,
                                                     importance_type='gain',
                                                     interaction_constraints=None,
                                                     learning_rate=None,
                                                     max_delta_step=None,
                                                     max_depth=None,
                                         

In [55]:
pred3 = grid_search3.predict(X_test_fin)

In [57]:
%cd "C:\Users\uallakulov\ds_projects\capstone_houseprices"

C:\Users\uallakulov\ds_projects\capstone_houseprices


In [59]:
submission3 = pd.DataFrame()
submission3['Id'] = test['Id']
submission3['SalePrice'] = pred3
submission3.to_csv('submission3.csv', index=False)

In [60]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBRegressor
import xgboost as xgb

pipeline4 = Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('XGB', XGBRegressor())
    ])
    
parameters4 = {
        'XGB__eta': [0.04], 
        'XGB__max_depth': [3],
        'XGB__n_estimators': [900],
        'XGB__learning_rate': [0.06, 0.07, 0.08], 
        'XGB__eval_metric ' : ["rmse"], 
        'XGB__max_depth': range(1, 4, 1),
        'XGB__min_child_weight': [3],
        'XGB__subsample': [0.5, 0.7],
        'XGB__colsample_bytree': [0.82]
    }

grid_search4 = GridSearchCV(pipeline4, param_grid = parameters4, n_jobs = 5, cv = 2, verbose=True)

In [61]:
grid_search4.fit(X, y)

Fitting 2 folds for each of 18 candidates, totalling 36 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  36 out of  36 | elapsed:   44.3s finished


Parameters: { eval_metric  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                                       ('XGB',
                                        XGBRegressor(base_score=None,
                                                     booster=None,
                                                     colsample_bylevel=None,
                                                     colsample_bynode=None,
                                                     colsample_bytree=None,
                                                     gamma=None, gpu_id=None,
                                                     importance_type='gain',
                                                     interaction_constraints=None,
                                                     learning_rate=None,
                                                     max_delta_step=None,
                                                     max_depth=None,
                                         

In [62]:
pred4 = grid_search4.predict(X_test_fin)

In [72]:
grid_search4.best_params_

{'XGB__colsample_bytree': 0.82,
 'XGB__eta': 0.04,
 'XGB__eval_metric ': 'rmse',
 'XGB__learning_rate': 0.06,
 'XGB__max_depth': 3,
 'XGB__min_child_weight': 3,
 'XGB__n_estimators': 900,
 'XGB__subsample': 0.7}

In [77]:
grid_search4.best_estimator_ 

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('XGB',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=0.82, eta=0.04,
                              eval_metric ='rmse', gamma=0, gpu_id=-1,
                              importance_type='gain',
                              interaction_constraints='', learning_rate=0.06,
                              max_delta_step=0, max_depth=3, min_child_weight=3,
                              missing=nan, monotone_constraints='()',
                              n_estimators=900, n_jobs=8, num_parallel_tree=1,
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, subsample=0.7,
                              tree_method='exact', validate_parameters=1,
                              verbosity=None))])

In [73]:
grid_search4.best_score_ 

0.884523927780772

In [63]:
submission4 = pd.DataFrame()
submission4['Id'] = test['Id']
submission4['SalePrice'] = pred4
submission4.to_csv('submission4.csv', index=False)