In [216]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso
import warnings
warnings.filterwarnings('ignore')

In [217]:
X = pd.read_csv('/Users/avyny/Documents/Data_analyses-Kaggle/Data_House_Pricing/train.csv', index_col='Id')
X_test_full = pd.read_csv('/Users/avyny/Documents/Data_analyses-Kaggle/Data_House_Pricing/test.csv', index_col='Id')
# X = pd.read_csv('D:\\Learning_IT\\Data_analyses-Kaggle\\Data_House_Pricing\\train.csv', index_col='Id')
# X_test_full = pd.read_csv('D:\\Learning_IT\\Data_analyses-Kaggle\\Data_House_Pricing\\test.csv', index_col='Id')

In [218]:
# Separating target value and rest of data in train data set
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

In [219]:
# Calculating number of NA in each column
missing_val_count_by_column = (X.isnull().sum())
missing_val_count_by_column[missing_val_count_by_column > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [220]:
# So, a lot of variables are missing in columns: Alley, PoolQC, Fence, MiscFeature (more than half) - 
# so I decided to exclude these vars from model prediction.
# Also it was decided to drop "Utilities" variable, because it is almost constant (except 1 value, that test-set doesn't 
# include)

# 'TotalBsmtSF' is sum of 'BsmtUnfSF' + 'BsmtFinSF2' + 'BsmtFinSF1'
# ???? TotRmsAbvGrd

'Alley', 'PoolQC','Fence','MiscFeature', 'Utilities', 'Street', 'PoolArea', 'MiscVal'
X = X.drop(['Alley', 'PoolQC','Fence','MiscFeature', 'Utilities', 'Street', 'PoolArea', 'MiscVal', 'TotalBsmtSF', '3SsnPorch'], axis=1)
X_test_full = X_test_full.drop(['Alley', 'PoolQC','Fence','MiscFeature', 'Utilities', 'Street', 'PoolArea', 'MiscVal', 'TotalBsmtSF', '3SsnPorch'], axis=1)

In [221]:
# We also want to leave columns only with unique number of categorical variables less than 10, 
# so I decided to select categorical columns with relatively low cardinality

categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
# categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and 
#                         X[cname].dtype == "object"]
numerical_median = ['OverallCond', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                   'BsmtUnfSF', 'LowQualFinSF', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                   'TotRmsAbvGrd', 'GarageYrBlt', 'GarageCars', 'EnclosedPorch', 'ScreenPorch', 'MoSold', 'YrSold']

numerical_mean = ['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF']

numerical_frequent = ['MSSubClass', 'BsmtFullBath', 'BsmtHalfBath', 'Fireplaces']

all_cols = categorical_cols + numerical_median + numerical_mean + numerical_frequent
X_cor = X[all_cols].copy()
X_test = X_test_full[all_cols].copy()

**HERE WILL START PART WITH PIPELINE AND OneHotEncoder**

In [222]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X_cor, y, train_size=0.8, test_size=0.2,
                                                               random_state=0)

numerical_transformer_mean = SimpleImputer(strategy='mean')
numerical_transformer_median = SimpleImputer(strategy='median')
numerical_transformer_freq = SimpleImputer(strategy='most_frequent')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num1', numerical_transformer_mean, numerical_mean),
        ('num2', numerical_transformer_median, numerical_median),
        ('num3', numerical_transformer_freq, numerical_frequent),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = XGBRegressor(colsample_bytree = 0.7, learning_rate=0.03, max_depth=6, n_estimators=850)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

In [223]:
## ! Grid Search for Pipeline

# parameters = dict(model__max_depth = [3, 4, 5], 
#                   model__learning_rate = [0.01, 0.03, 0.04, 0.05, 0.06],
#                   model__n_estimators = [100, 300, 400, 450, 550, 650],
#                   model__colsample_bytree = [0.7, 0.8, 1]
#                  )
# cv = GridSearchCV(clf, param_grid=parameters)

# cv.fit(X_train, y_train)

In [224]:
clf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num1',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['LotFrontage', 'LotArea',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'GarageArea', 'Woo

In [225]:
preds11 = clf.predict(X_valid)

# # Evaluate the model
score11 = mean_absolute_error(y_valid, preds11)
print('MAE:', score11)

rmse = np.sqrt(mean_squared_error(y_valid, preds11))
print('RMSE:', rmse)

rmse2 = np.sqrt(mean_squared_error(np.log(preds11), np.log(y_valid)))
print("Root Mean Squared Error:" , rmse2)

MAE: 15570.23527129709
RMSE: 29632.487944218585
Root Mean Squared Error: 0.12486422381380102


In [226]:
# cross_val_score(clf, X_train, y_train, cv = 5).mean()

**HERE THIS PART WAS ENDED**

**HERE START PART WITH MANUAL PREPROCESSING**

In [236]:
X_cor = pd.get_dummies(X_cor)
X_test = pd.get_dummies(X_test)

si = SimpleImputer(strategy='mean')

X_cor_ed = pd.DataFrame(si.fit_transform(X_cor))
X_cor_ed.columns = X_cor.columns

X_test_ed = pd.DataFrame(si.fit_transform(X_test))
X_test_ed.columns = X_test.columns
X_test_ed.index = X_test.index

In [237]:
train_cols = X_cor_ed.columns
test_cols = X_test_ed.columns
common_cols = train_cols.intersection(test_cols)
train_not_test = train_cols.difference(test_cols)
train_not_test

Index(['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
       'Electrical_Mix', 'Exterior1st_ImStucc', 'Exterior1st_Stone',
       'Exterior2nd_Other', 'GarageQual_Ex', 'Heating_Floor', 'Heating_OthW',
       'HouseStyle_2.5Fin', 'RoofMatl_ClyTile', 'RoofMatl_Membran',
       'RoofMatl_Metal', 'RoofMatl_Roll'],
      dtype='object')

In [238]:
# So now we will take columns that are in common between train and test datasets.
X_cor_ed = X_cor_ed.drop(train_not_test, axis=1)


In [239]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X_cor_ed, y, train_size=0.8, test_size=0.2,
                                                               random_state=0)

In [240]:
model_xgb = XGBRegressor(colsample_bytree = 0.7, learning_rate=0.03, max_depth=6, n_estimators=850)

model_xgb.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.03, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=None, n_estimators=850,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [241]:
preds = model_xgb.predict(X_valid)

# # Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

rmse = np.sqrt(mean_squared_error(y_valid, preds))
print('RMSE:', rmse)

rmse2 = np.sqrt(mean_squared_error(np.log(preds), np.log(y_valid)))
print("Root Mean Squared Error:" , rmse2)

MAE: 16205.5849609375
RMSE: 27940.01413820555
Root Mean Squared Error: 0.12718326053139634


In [235]:
# cross_val_score(model_xgb, X_train, y_train, cv = 5).mean()

In [28]:
## Writing output to file

preds_out = model_xgb.predict(X_test_ed)
output = pd.DataFrame({'Id': X_test_ed.index,
                       'SalePrice': preds_out})

output.to_csv('submission_1.csv', index=False)

In [None]:
# RESULTS:

# n_estimators=800, max_depth=4, learning_rate=0.05, - MAE 16492.056493471748
# n_estimators = 900, learning_rate = 0.04, max_depth = 4, reg_alpha = 1, reg_lambda = 1, subsample = 0.5
#                                        - MAE MAE: 15750.379468107876
# n_estimators = 900, learning_rate = 0.04, subsample = 0.8 - MAE: 15617.851789918665

# imputer mean: model n_estimators = 900, learning_rate = 0.04, subsample = 0.8 - MAE: 15036.240002497147
 
# pipeline with imputer mean for num, delete FireplaceQu, without limitation for categoric variables + 
#  + XGB n_estimators = 900, learning_rate = 0.04, subsample = 0.8 - MAE: 15023.990341395547


**END OF XGBOOST**


**2-ND MODEL RANDOM FOREST**

In [242]:
model_rf = RandomForestRegressor(criterion = 'mse', max_depth = 5, min_samples_split = 4, n_estimators = 400)
model_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=4,
                      min_weight_fraction_leaf=0.0, n_estimators=400,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [243]:
# cross_val_score(model_rf, X_train, y_train, cv = 5).mean()

In [244]:
preds_rf = model_rf.predict(X_valid)

# # Evaluate the model
score2 = mean_absolute_error(y_valid, preds_rf)
print('MAE:', score2)

rmse2 = np.sqrt(mean_squared_error(np.log(preds_rf), np.log(y_valid)))
print("Root Mean Squared Error:" , rmse2)

MAE: 20034.07882477593
Root Mean Squared Error: 0.15706657559926132


In [245]:
## Writing output to file

preds_out = model_rf.predict(X_test_ed)
output = pd.DataFrame({'Id': X_test_ed.index,
                       'SalePrice': preds_out})

output.to_csv('submission_2.csv', index=False)