### 1. Importing the Libraries

In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

### 2. Loading the Data

In [2]:
train_data_path = '../datasets/house-prices-advanced-regression-techniques/train.csv'
test_data_path = '../datasets/house-prices-advanced-regression-techniques/test.csv'
submission_data_path = '../datasets/house-prices-advanced-regression-techniques/sample_submission.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
submission = pd.read_csv(submission_data_path)

print(train_data.shape)
print(test_data.shape)

train_data.head(5)

(1460, 81)
(1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### 3. Data Preprocessing

In [3]:
# combine train and test data
all_data = pd.concat([train_data, test_data], axis=0, sort=False)
print(all_data.shape)

# filter the object columns and numeric columns
object_cols = all_data.select_dtypes(include=['object']).columns
numeric_cols = all_data.select_dtypes(exclude=['object']).columns

# dummy the object colomns
all_data = pd.get_dummies(all_data, columns=object_cols)

print(all_data.shape)

# fill missing values
all_data.fillna(all_data.mean(), inplace=True)

# check for missing values
missing_values = all_data.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values.sort_values(ascending=False, inplace=True)

print(missing_values)


(2919, 81)
(2919, 289)
Series([], dtype: int64)


In [4]:
# split all_data back to train and test data
train_data = all_data.iloc[:train_data.shape[0], :]
test_data = all_data.iloc[train_data.shape[0]:, :]

print(train_data.shape)
print(test_data.shape)

train_data.head(5)

(1460, 289)
(1459, 289)


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706.0,...,False,False,False,True,False,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978.0,...,False,False,False,True,False,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486.0,...,False,False,False,True,False,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216.0,...,False,False,False,True,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655.0,...,False,False,False,True,False,False,False,False,True,False


In [5]:
# split the data into training and validation sets
from sklearn.model_selection import train_test_split

X = train_data.drop(['Id', 'SalePrice'], axis=1)
y = train_data['SalePrice']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape)

(1168, 287)


In [6]:
# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

In [7]:
print(model.score(X_train, y_train))
print(model.score(X_val, y_val))

0.9799183941702947
0.8381545091284197


In [8]:
# search for the best hyperparameters
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in range(100, 1000, 100)]
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in range(10, 110, 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

rf_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=0, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train, y_train)

# print the best parameters
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


148 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
68 fits failed with the following error:
Traceback (most recent call last):
  File "E:\envis\Anaconda3\envs\dl_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\envis\Anaconda3\envs\dl_env\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "E:\envis\Anaconda3\envs\dl_env\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "E:\envis\Anaconda3\envs\dl_env\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints


{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [9]:
# Evaluate the model with the best hyperparameters
best_model = rf_random.best_estimator_
print(best_model.score(X_train, y_train))
print(best_model.score(X_val, y_val))

0.999999993790923
0.8446205404461484


In [18]:
# predict the test data
predictions = best_model.predict(test_data)

submission['SalePrice'] = predictions
submission.to_csv('submission.csv', index=False)