In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
X = pd.read_csv('https://storage.googleapis.com/kaggle-competitions-data/kaggle/10211/train.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1561901199&Signature=Tok2i%2Fddx3XDpkbe%2FHOcxxVSAqGPMYC%2B1IJZMzuO6yH4BFQDUFe5zk9daZ0iQPJXBdg4TodP4eV9Z2Ycu3cPCJ%2FmGXjYlZDBwwRSZOh9oprBi4CKR5EavLNgk7BZ7Z6PgTxaPOcm3nGA6PzoFR1wI0rZOZPgl5rADNo3CVvjomaL9VK%2BQF9PCk6wsKNplnov6xbPGIiO9ZD89TCtMEG%2F0s6jixCgNysCFnfpf2IBYA3fIKtUdPyyQxWoselztESRuKkAVp20gUgSZgYG5OZnLG9X1FnWHJlm2lr48wclflCsifWkXItOxv%2FA4UgeUhxVd2D75zweqMa5tmrE4JXLtQ%3D%3D', index_col='Id')
X_test_full = pd.read_csv('https://storage.googleapis.com/kaggle-competitions-data/kaggle/10211/test.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1561901250&Signature=VV82UXmJARoOBlo4OZ4Cxag%2F9nFeOKvz2VCkISsDHvsjEkuun5FLDEPiIdBhLorM0yymalzb0cE%2FKrvQgTEHu0kWelUwmXHzso6AeyNYErmlKVrQll7G3ySY6ghWOsx6TGgY20XKPBj0hb%2FPtKMt4AqaErpKboQofb9PQgDusUoudonRXhosfr8i1VEXV9a3wLyVRuMxnIhKveStr0Y87oNO9UJOOLYfl9ZXYjP7RR2Doj1CWbYgMoC3cFdGbwlZatBKVE95VLL%2B3v80qYD4BRVt9%2FMuO929z%2FEychB7ZZeb%2Byc02zhriAGN1mBmNxy3rBdWv01nIxmy4tYM%2F5nQYQ%3D%3D', index_col='Id')

Checking for NA in Target Column in train data. There is no NA

In [3]:
X.SalePrice.isnull().sum()

0

In [4]:
# Separating target value and rest of data in train data set
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

In [5]:
# Checking the size of train dataset
X.shape

(1460, 79)

In [6]:
# Calculating number of NA in each column
missing_val_count_by_column = (X.isnull().sum())
missing_val_count_by_column[missing_val_count_by_column > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [7]:
# So, a lot of variables are missing in columns: Alley, PoolQC, Fence, MiscFeature (more than half) - 
# so I decided to exclude this vars from model prediction.
X = X.drop(['Alley', 'PoolQC','Fence','MiscFeature'], axis=1)
X_test_full = X_test_full.drop(['Alley', 'PoolQC','Fence','MiscFeature'], axis=1)
X.shape

(1460, 75)

In [8]:
# We also want to leave columns only with unique number of categorical variables less than 10, 
# so I decided to select categorical columns with relatively low cardinality
categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and 
                        X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
all_cols = categorical_cols + numerical_cols
X_cor = X[all_cols].copy()
X_test = X_test_full[all_cols].copy()

In [9]:
# # After preparing data with necessary columns, we can write pipeline for filling NA and OneHotEncoder of categorical variables to preprocess categorical variables
# numerical_transformer = SimpleImputer(strategy='median')

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

In [12]:
X_cor = pd.get_dummies(X_cor)
X_test = pd.get_dummies(X_test)

si = SimpleImputer(strategy='median')

X_cor_ed = pd.DataFrame(si.fit_transform(X_cor))
X_cor_ed.columns = X_cor.columns

X_test_ed = pd.DataFrame(si.fit_transform(X_test))
X_test_ed.columns = X_test.columns

In [13]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X_cor_ed, y, train_size=0.8, test_size=0.2,
                                                               random_state=0)

In [49]:
train_cols = X_valid.columns
test_cols = X_test.columns
common_cols = train_cols.intersection(test_cols)
train_not_test = train_cols.difference(test_cols)
train_not_test

Index(['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
       'Electrical_Mix', 'GarageQual_Ex', 'Heating_Floor', 'Heating_OthW',
       'HouseStyle_2.5Fin', 'RoofMatl_ClyTile', 'RoofMatl_Membran',
       'RoofMatl_Metal', 'RoofMatl_Roll', 'Utilities_NoSeWa'],
      dtype='object')

In [47]:
X_train.drop(train_not_test, axis=1)
X_valid.drop(train_not_test, axis=1)
X_test.shape

(1459, 206)

In [32]:
model = XGBRegressor(n_estimators = 900, learning_rate = 0.04, max_depth = 4, reg_alpha = 1, reg_lambda = 1, 
                     subsample = 0.5)

# parameters = {
#                'subsample': [0.5, 0.7, 0.8, 0.9]
#              }

# grid_search = GridSearchCV(model, param_grid = parameters, cv = 3)

# grid_search.fit(X_train, y_train)

In [34]:
# grid_search.best_params_

model.fit(X_train, y_train)
preds = model.predict(X_valid)

# # Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

score2 = np.sqrt(mean_squared_error(np.log(preds), np.log(y_valid)))
print('RMSE:', score2)

MAE: 15750.379468107876
RMSE: 0.12437173923199994


In [9]:
# n_estimators=800, max_depth=4, learning_rate=0.05, - MAE 16492.056493471748
# n_estimators = 900, learning_rate = 0.04, max_depth = 4, reg_alpha = 1, reg_lambda = 1, subsample = 0.5
#                                        - MAE MAE: 15750.379468107876

In [44]:
train_not_test

Index(['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
       'Electrical_Mix', 'GarageQual_Ex', 'Heating_Floor', 'Heating_OthW',
       'HouseStyle_2.5Fin', 'RoofMatl_ClyTile', 'RoofMatl_Membran',
       'RoofMatl_Metal', 'RoofMatl_Roll', 'Utilities_NoSeWa'],
      dtype='object')

In [None]:
model2 = RandomForestRegressor(oob_score = True)

parameters2 = {'n_estimators': [20, 50, 100, 200, 250, 400],
              'criterion': ['mse', 'mae'],
              'max_depth': [3, 4, 5, 7, 8],
              'min_samples_split': [4, 6, 8, 10, 12]
             }

grid_search2 = GridSearchCV(model, param_grid = parameters2, cv = 3)

grid_search2.fit(X_train_ed, y_train)

In [None]:
grid_search2.best_params_
grid_search2.oob_score_

In [None]:
## Cначала нужно позаполнять пропуски, а потом делать get_dummies

# numerical_transformer = SimpleImputer(strategy='median')

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])