**Team Name:** DataNerds

**Student Details:**

--------------------------

Name : Reshma Vijay Jawale

Student Id : A0236581B

--------------------------
Name : Aiden Low Yew Woei

Student Id : A0121969W

--------------------------
Name : Raivat Bhupesh Shah 

Student Id : A0184879A

--------------------------

## Setting up the Notebook

In [1]:
%load_ext autoreload
%autoreload 2

In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from IPython.display import display
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Cleaning methods
import library_code.cleaning as cleaning
import library_code.constants as constants
import library_code.imputation as imputation
import library_code.auxiliary as auxiliary

# Scikit learn
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge

from scipy.spatial import Voronoi, voronoi_plot_2d
from matplotlib.patches import Rectangle, Circle
from matplotlib.colors import ListedColormap

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# For adaboost
from sklearn.ensemble import AdaBoostClassifier


import os
for dirname, _, filenames in os.walk('data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df_prices_train = pd.read_csv("../data/train.csv")
df_prices_test = pd.read_csv("../data/test.csv")

# Data Cleaning

The data cleaning steps here are based off our EDA notebook.

In [47]:
# First, drop all fields in TO_IGNORE as these columns are either redundant
# e.g. we drop model because we have type
# or they only have one unique value, e.g. 'market_segment' and 'type_of_area'.
df_prices_train = df_prices_train.drop(columns=constants.TO_IGNORE)
df_prices_test = df_prices_test.drop(columns=constants.TO_IGNORE)

In [48]:
# Then, we clean up the tenure fields to 3 categories: 60 years, 99 years, and freehold/999
df_prices_train['tenure'] = df_prices_train['tenure'].apply(cleaning.tenure_to_binary)
df_prices_test['tenure'] = df_prices_test['tenure'].apply(cleaning.tenure_to_binary)
# Perform one hot encoding
df_prices_train = cleaning.categorical_to_onehot(df_prices_train, ['tenure', 'type', 'region', 'planning_area'])
df_prices_test = cleaning.categorical_to_onehot(df_prices_test, ['tenure', 'type', 'region', 'planning_area'])
# Remove 'planning_area_seletar' column in test as that's not seen in train.
df_prices_test.drop(columns=['planning_area_seletar'], inplace=True)

## Cleaning values for bedroom and bathroom

Some of the values in the bedroom column are math expressions, e.g. '4+1. We're not sure what this means! Is it 5 bedrooms? Is the +1 because it is not a *full room* (e.g. a servant quarter or living room) or is the +1 referring to a bathroom? Due to these different possibilities, we follow an iterative approach where for we take 4+1 as 5, 4.5 (to quantify only half a room) and 4 (to quantify no room). We will go with the representation that gives us the most accuracy

In [50]:
df_prices_train['bedrooms'] = df_prices_train['bedrooms'].apply(cleaning.process_bedroom_sum)
df_prices_test['bedrooms'] = df_prices_test['bedrooms'].apply(cleaning.process_bedroom_sum)
df_prices_train[df_prices_train['bedrooms'].isna()]

Unnamed: 0,bedrooms,bathrooms,district,lat,lng,no_of_units,area_size,price,tenure_60,tenure_99,...,planning_area_sembawang,planning_area_sengkang,planning_area_serangoon,planning_area_singapore river,planning_area_southern islands,planning_area_tampines,planning_area_tanglin,planning_area_toa payoh,planning_area_woodlands,planning_area_yishun
59,,1.0,15,1.314620,103.932237,116.0,409.0,715000.0,0,0,...,0,0,0,0,0,0,0,0,0,0
111,,,6,1.292988,103.851047,39.0,6609.0,27423000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
203,,1.0,2,1.274644,103.844742,360.0,365.0,990000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
296,,1.0,2,1.274644,103.844742,360.0,365.0,1100000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
443,,,22,1.335557,103.742417,738.0,506.0,1045000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25849,,1.0,18,1.376176,103.960488,473.0,571.0,693000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
25866,,2.0,23,1.379727,103.760191,338.0,624.0,825000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
25884,,1.0,1,1.277083,103.849181,510.0,441.0,1099800.0,0,1,...,0,0,0,0,0,0,0,0,0,0
25916,,1.0,7,1.297510,103.856297,522.0,409.0,1098900.0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Adaboost

In [64]:
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train['price'].values.ravel()

model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())
# Set up pipeline with imputer for proper grid search
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('model', model)])

# define grid search for hyperparameters
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
    'model__base_estimator__max_depth': [i for i in range(1, 21, 2)],
    'model__n_estimators': [50, 100, 150, 250, 500],
    'model__learning_rate': [0.1, 1.0],
}
# Repeats set at 3 to reduce model 
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)

mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, cv=cv, n_jobs=-1, scoring=mse, verbose=1)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 107
max_resources_: 26048
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 500
n_resources: 107
Fitting 10 folds for each of 500 candidates, totalling 5000 fits
----------
iter: 1
n_candidates: 167
n_resources: 321
Fitting 10 folds for each of 167 candidates, totalling 1670 fits
----------
iter: 2
n_candidates: 56
n_resources: 963
Fitting 10 folds for each of 56 candidates, totalling 560 fits
----------
iter: 3
n_candidates: 19
n_resources: 2889
Fitting 10 folds for each of 19 candidates, totalling 190 fits




----------
iter: 4
n_candidates: 7
n_resources: 8667
Fitting 10 folds for each of 7 candidates, totalling 70 fits


In [65]:
# Get RMSE of grid results best score
best_rmse = (-grid_results.best_score_) ** 0.5
print("Best: %f using %s" % (best_rmse, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 1002524.512938 using {'imputer__n_neighbors': 3, 'model__base_estimator__max_depth': 13, 'model__learning_rate': 1.0, 'model__n_estimators': 50}


In [66]:
clf = grid_results
X_test = df_prices_test[:]
y_pred = clf.predict(X_test)

In [67]:
result = pd.DataFrame({"Predicted": y_pred})
result.index.name = "Id"
result.to_csv("submission_adaboost.csv")

# Random Forest

In [71]:
# Get data
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train[['price']].values.ravel()

In [72]:
X_train.shape

(26048, 55)

In [97]:
model = RandomForestRegressor(n_jobs=-1)
# Set up pipeline with imputer for proper grid search
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('model', model)])

# define grid search for hyperparameters
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
    'model__max_features': ['auto', 'sqrt'],
    'model__max_depth': [i for i in range(1, 21, 2)],
    'model__n_estimators': [25, 50, 100, 150, 250, 500],
}


cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# define grid search for hyperparameters
num_features = X_train.shape[1]

mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring=mse, verbose=1)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 107
max_resources_: 26048
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 600
n_resources: 107
Fitting 10 folds for each of 600 candidates, totalling 6000 fits
----------
iter: 1
n_candidates: 200
n_resources: 321
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
----------
iter: 2
n_candidates: 67
n_resources: 963
Fitting 10 folds for each of 67 candidates, totalling 670 fits
----------
iter: 3
n_candidates: 23
n_resources: 2889
Fitting 10 folds for each of 23 candidates, totalling 230 fits




----------
iter: 4
n_candidates: 8
n_resources: 8667
Fitting 10 folds for each of 8 candidates, totalling 80 fits


In [98]:
# Get RMSE of grid results best score
best_rmse = (-grid_results.best_score_) ** 0.5
print("Best: %f using %s" % (best_rmse, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 1159476.955259 using {'imputer__n_neighbors': 3, 'model__max_depth': 17, 'model__max_features': 'auto', 'model__n_estimators': 100}


In [99]:
clf = grid_results
X_test = df_prices_test.loc[:, df_prices_test.columns != 'price']
y_pred = clf.predict(X_test)

In [100]:
result = pd.DataFrame({"Predicted": y_pred})
result.index.name = "Id"
result.to_csv("submission_forest.csv")

# Bagging Regressor

In [77]:
# Get data
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train[['price']].values.ravel()
print(X_train.shape)
model = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_jobs=-1)
# Set up pipeline with imputer for proper grid search
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('model', model)])

# define grid search for hyperparameters
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
    'model__max_features': [0.5, 0.7, 0.9, 1],
    'model__base_estimator__max_depth': [i for i in range(1, 21, 2)],
    'model__n_estimators': [50, 100, 150, 250, 500],
}


cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# define grid search for hyperparameters
num_features = X_train.shape[1]

mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring=mse, verbose=1)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

(26048, 55)
n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 35
max_resources_: 26048
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1000
n_resources: 35
Fitting 10 folds for each of 1000 candidates, totalling 10000 fits
----------
iter: 1
n_candidates: 334
n_resources: 105
Fitting 10 folds for each of 334 candidates, totalling 3340 fits
----------
iter: 2
n_candidates: 112
n_resources: 315
Fitting 10 folds for each of 112 candidates, totalling 1120 fits
----------
iter: 3
n_candidates: 38
n_resources: 945
Fitting 10 folds for each of 38 candidates, totalling 380 fits
----------
iter: 4
n_candidates: 13
n_resources: 2835
Fitting 10 folds for each of 13 candidates, totalling 130 fits
----------
iter: 5
n_candidates: 5
n_resources: 8505
Fitting 10 folds for each of 5 candidates, totalling 50 fits




----------
iter: 6
n_candidates: 2
n_resources: 25515
Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [78]:
# Get RMSE of grid results best score
best_rmse = (-grid_results.best_score_) ** 0.5
print("Best: %f using %s" % (best_rmse, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 796625.230065 using {'imputer__n_neighbors': 3, 'model__base_estimator__max_depth': 17, 'model__max_features': 0.9, 'model__n_estimators': 100}


In [79]:
# Predict
clf = grid_results
X_test = df_prices_test.loc[:, df_prices_test.columns != 'price']
y_pred = clf.predict(X_test)

In [80]:
result = pd.DataFrame({"Predicted": y_pred})
result.index.name = "Id"
result.to_csv("submission_bagging.csv")

# GradientBoostingRegressor

In [81]:
# Get data
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train[['price']].values.ravel()
print(X_train.shape)
model = GradientBoostingRegressor()
# Set up pipeline with imputer for proper grid search
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('model', model)])

# define grid search for hyperparameters
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
    'model__subsample': [0.2, 0.5, 0.9, 1],
    'model__max_features': ['auto', 'sqrt'],
    'model__max_depth': [i for i in range(1, 21, 2)],
    'model__n_estimators': [50, 100, 150, 250, 500],
}


cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# define grid search for hyperparameters
num_features = X_train.shape[1]

mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring=mse, verbose=1)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

(26048, 55)
n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 35
max_resources_: 26048
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 2000
n_resources: 35
Fitting 10 folds for each of 2000 candidates, totalling 20000 fits
----------
iter: 1
n_candidates: 667
n_resources: 105
Fitting 10 folds for each of 667 candidates, totalling 6670 fits
----------
iter: 2
n_candidates: 223
n_resources: 315
Fitting 10 folds for each of 223 candidates, totalling 2230 fits
----------
iter: 3
n_candidates: 75
n_resources: 945
Fitting 10 folds for each of 75 candidates, totalling 750 fits
----------
iter: 4
n_candidates: 25
n_resources: 2835
Fitting 10 folds for each of 25 candidates, totalling 250 fits
----------
iter: 5
n_candidates: 9
n_resources: 8505
Fitting 10 folds for each of 9 candidates, totalling 90 fits
----------
iter: 6
n_candidates: 3
n_resources: 25515
Fitting 10 folds for each of 3 candidates, totalling 30 fits




In [82]:
# Get RMSE of grid results best score
best_rmse = (-grid_results.best_score_) ** 0.5
print("Best: %f using %s" % (best_rmse, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 759509.398026 using {'imputer__n_neighbors': 5, 'model__max_depth': 7, 'model__max_features': 'auto', 'model__n_estimators': 500, 'model__subsample': 0.5}


In [83]:
# Predict
clf = grid_results
X_test = df_prices_test.loc[:, df_prices_test.columns != 'price']
y_pred = clf.predict(X_test)

In [84]:
result = pd.DataFrame({"Predicted": y_pred})
result.index.name = "Id"
result.to_csv("submission_gradient_boosting_no_aux_add_inflation.csv")

# Linear Regression

In [85]:
# Get data
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train[['price']].values.ravel()
print(X_train.shape)
model = LinearRegression()
# Set up pipeline with imputer for proper grid search
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('model', model)])

# define grid search for hyperparameters
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
    'model__fit_intercept': [False, True],
}


cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# define grid search for hyperparameters
num_features = X_train.shape[1]

mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring=mse, verbose=1)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

(26048, 55)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 2894
max_resources_: 26048
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 10
n_resources: 2894
Fitting 10 folds for each of 10 candidates, totalling 100 fits
----------
iter: 1
n_candidates: 4
n_resources: 8682
Fitting 10 folds for each of 4 candidates, totalling 40 fits




----------
iter: 2
n_candidates: 2
n_resources: 26046
Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [86]:
# Get RMSE of grid results best score
best_rmse = (-grid_results.best_score_) ** 0.5
print("Best: %f using %s" % (best_rmse, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 1912274.814626 using {'imputer__n_neighbors': 15, 'model__fit_intercept': False}


In [87]:
# Predict
clf = grid_results
X_test = df_prices_test.loc[:, df_prices_test.columns != 'price']
y_pred = clf.predict(X_test)

In [88]:
result = pd.DataFrame({"Predicted": y_pred})
result.index.name = "Id"
result.to_csv("submission_linear_regression.csv")

# Bayesian Regression

In [89]:
# Get data
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train[['price']].values.ravel()
print(X_train.shape)
model = BayesianRidge()
# Set up pipeline with imputer for proper grid search
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('model', model)])

# define grid search for hyperparameters
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
}


cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# define grid search for hyperparameters
num_features = X_train.shape[1]

mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring=mse, verbose=1)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

(26048, 55)
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 8682
max_resources_: 26048
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 5
n_resources: 8682
Fitting 10 folds for each of 5 candidates, totalling 50 fits
----------
iter: 1
n_candidates: 2
n_resources: 26046
Fitting 10 folds for each of 2 candidates, totalling 20 fits




In [90]:
# Get RMSE of grid results best score
best_rmse = (-grid_results.best_score_) ** 0.5
print("Best: %f using %s" % (best_rmse, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 1912495.609717 using {'imputer__n_neighbors': 3}


In [91]:
# Predict
clf = grid_results
X_test = df_prices_test.loc[:, df_prices_test.columns != 'price']
y_pred = clf.predict(X_test)

In [92]:
result = pd.DataFrame({"Predicted": y_pred})
result.index.name = "Id"
result.to_csv("submission_bayesian_regression.csv")

# KNeighbors Regression

In [93]:
# Get data
X_train = df_prices_train.loc[:, df_prices_train.columns != 'price']
y_train = df_prices_train[['price']].values.ravel()
print(X_train.shape)
model = KNeighborsRegressor()
# Set up pipeline with imputer for proper grid search
# Reference: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('imputer', KNNImputer()), ('model', model)])

# define grid search for hyperparameters
grid = {
    'imputer__n_neighbors': [3, 5, 7, 11, 15],
    'model__n_neighbors': [i for i in range(1, 32, 2)]
}


cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# define grid search for hyperparameters
num_features = X_train.shape[1]

mse = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = HalvingGridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring=mse, verbose=1)
# Execute the grid search
grid_results = grid_search.fit(X_train, y_train)

(26048, 55)
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 964
max_resources_: 26048
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 80
n_resources: 964
Fitting 10 folds for each of 80 candidates, totalling 800 fits
----------
iter: 1
n_candidates: 27
n_resources: 2892
Fitting 10 folds for each of 27 candidates, totalling 270 fits
----------
iter: 2
n_candidates: 9
n_resources: 8676
Fitting 10 folds for each of 9 candidates, totalling 90 fits
----------
iter: 3
n_candidates: 3
n_resources: 26028
Fitting 10 folds for each of 3 candidates, totalling 30 fits


In [94]:
# Get RMSE of grid results best score
best_rmse = (-grid_results.best_score_) ** 0.5
print("Best: %f using %s" % (best_rmse, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 1033868.471874 using {'imputer__n_neighbors': 11, 'model__n_neighbors': 3}


In [95]:
# Predict
clf = grid_results
X_test = df_prices_test.loc[:, df_prices_test.columns != 'price']
y_pred = clf.predict(X_test)

In [96]:
result = pd.DataFrame({"Predicted": y_pred})
result.index.name = "Id"
result.to_csv("submission_kneighbors.csv")