## random forest with Grid search

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import csv

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

df_train = pd.read_csv('train_norm_3.csv')
df_test = pd.read_csv('test_norm_3.csv')

In [2]:
# Select  all features, but SalePrice
x = df_train[['OverallQual', 'GrLivArea', 'Neighborhood', 'GarageCars', 'ExterQual',
       'TotalBsmtSF', 'YearBuilt', '2ndFlrSF', 'KitchenQual', 'BsmtFinSF1',
       'BsmtQual', 'LotArea', 'FullBath', 'YearRemodAdd', 'MasVnrArea',
       'LotFrontage', 'Fireplaces', 'OpenPorchSF', 'BsmtUnfSF', 'WoodDeckSF',
       'OverallCond', 'MoSold', 'BsmtFinType1', 'BsmtExposure',
       'Exterior1st', 'BedroomAbvGr', 'Exterior2nd', 'MSSubClass',
       'HouseStyle', 'BsmtFullBath', 'Foundation', 'MSZoning', 'SaleCondition'
       ]]

x_test = df_test[['OverallQual', 'GrLivArea', 'Neighborhood', 'GarageCars', 'ExterQual',
       'TotalBsmtSF', 'YearBuilt', '2ndFlrSF', 'KitchenQual', 'BsmtFinSF1',
       'BsmtQual', 'LotArea', 'FullBath', 'YearRemodAdd', 'MasVnrArea',
       'LotFrontage', 'Fireplaces', 'OpenPorchSF', 'BsmtUnfSF', 'WoodDeckSF',
       'OverallCond', 'MoSold', 'BsmtFinType1', 'BsmtExposure',
       'Exterior1st', 'BedroomAbvGr', 'Exterior2nd', 'MSSubClass',
       'HouseStyle', 'BsmtFullBath', 'Foundation', 'MSZoning', 'SaleCondition'
       ]]

#regressor values
#df_train['SalePrice'] = np.exp(df_train['SalePrice'])
#df_train['SalePrice'] = df_train['SalePrice'].astype(np.int64, copy=False)

y = pd.read_csv('train_norm_3.csv')['SalePrice']


In [3]:
# thanks to 
# https://habr.com/ru/company/ods/blog/324402/

# Инициализируем страифицированную разбивку нашего датасета для валидации
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)

# defauly parameters model
rfr = RandomForestRegressor(random_state=33)

# Обучаем на тренировочном датасете
results = cross_val_score(rfr, x, y, cv=skf)

# Оцениваем долю верных ответов на тестовом датасете
print("CV accuracy score: {:.2f}%".format(results.mean()*100))

CV accuracy score: 87.13%


In [4]:
# Perform train, test, split
#x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.2)

#df_result_x = pd.DataFrame()
df_result_x_test = pd.DataFrame(data={'Id': df_test['Id'], 'SalePrice': 0})

In [5]:
# Инициализируем валидацию
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)

In [None]:
# Создаем списки для сохранения точности на тренировочном и тестовом датасете
'''train_acc = []
test_acc = []
temp_train_acc = []
temp_test_acc = []
trees_grid = [5, 10, 15, 20, 30, 50, 75, 100]

# Обучаем на тренировочном датасете
for ntrees in trees_grid:
    rfr = RandomForestRegressor(n_estimators=ntrees, random_state=33, n_jobs=-1, oob_score=True)
    temp_train_acc = []
    temp_test_acc = []
    for train_index, test_index in skf.split(x, y):
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rfr.fit(X_train, y_train)
        temp_train_acc.append(rfr.score(X_train, y_train))
        temp_test_acc.append(rfr.score(X_test, y_test))
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)

train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
print("Best accuracy on CV is {:.2f}% with {} trees".format(max(test_acc.mean(axis=1))*100, 
                                                        trees_grid[np.argmax(test_acc.mean(axis=1))]))'''

In [6]:
parameters = {'max_features': [15,18], 'max_depth': [20,25], 'n_estimators': [300, 400]}
rfr = RandomForestRegressor(random_state=33, n_jobs=-1, oob_score=True)
gcv = GridSearchCV(rfr, parameters, n_jobs=-1, cv=skf, verbose=1)
gcv.fit(x, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=33, shuffle=True),
             estimator=RandomForestRegressor(n_jobs=-1, oob_score=True,
                                             random_state=33),
             n_jobs=-1,
             param_grid={'max_depth': [20, 25], 'max_features': [15, 18],
                         'n_estimators': [300, 400]},
             verbose=1)

In [7]:
print(f'Best estimator: {gcv.best_estimator_}')
print(f'Best score: {gcv.best_score_}')
print(f'Best params: {gcv.best_params_}')

Best estimator: RandomForestRegressor(max_depth=25, max_features=15, n_estimators=400,
                      n_jobs=-1, oob_score=True, random_state=33)
Best score: 0.8835801707836728
Best params: {'max_depth': 25, 'max_features': 15, 'n_estimators': 400}


In [10]:
best_rfr = RandomForestRegressor(max_depth=25, max_features=12, 
                      min_impurity_decrease=0.0, min_impurity_split=None,                      
                      min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
                      oob_score=True)
best_rfr.fit(x, y)

df_result_x_test['SalePrice'] = best_rfr.predict(x_test)

In [11]:

#df_result_x_test['SalePrice'] = np.exp(df_result_x_test['SalePrice'])
df_result_x_test['SalePrice'] = df_result_x_test['SalePrice'].astype(np.int64, copy=False)
df_result_x_test.to_csv('rf_grid_submission_3.csv', index=False)