In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn import metrics

In [3]:
# Read both CSV files from the working directory in one pass: the first frame
# is used for fitting / cross-validation below, the second only for scoring.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_validation.csv", "test.csv")
)

In [14]:
# Split each frame into predictors (every column except 'price_log') and the
# 'price_log' target. `drop` returns a new frame, so `train` / `test` are
# not modified.
X, y = train.drop(columns=["price_log"]), train["price_log"]
X_test, y_test = test.drop(columns=["price_log"]), test["price_log"]

## Support Vector Regression (SVR)

In [46]:
# Candidate values for the SVR regularisation strength (C) and the width of
# the epsilon-insensitive tube; every combination is evaluated.
# NOTE(review): the recorded run selects C=1, the upper edge of this grid --
# consider extending the C range to confirm the optimum is not further out.
parameters = dict(
    C=[0.6, 0.7, 0.8, 0.9, 1],
    epsilon=[0.08, 0.1, 0.2],
)
svr = SVR(gamma='scale')

# 5-fold cross-validated search scored by negated MSE (higher is better).
clf = GridSearchCV(
    estimator=svr,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.6, 0.7, 0.8, 0.9, 1],
                         'epsilon': [0.08, 0.1, 0.2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [47]:
# Per-candidate cross-validation report of the SVR search (a dict of arrays
# with fit/score timings and the parameter settings tried, among other entries).
clf.cv_results_

{'mean_fit_time': array([3.86645045, 3.60759859, 2.5534668 , 4.13312759, 3.82270217,
        2.60684485, 4.1533947 , 3.80577197, 2.72263689, 4.27774644,
        3.95155673, 2.72351522, 4.32143235, 4.06672835, 2.69856801]),
 'std_fit_time': array([0.08271776, 0.03431746, 0.06261345, 0.08176882, 0.12472308,
        0.04838934, 0.03709456, 0.10118939, 0.04775751, 0.12143161,
        0.11533965, 0.05864693, 0.04748071, 0.0648038 , 0.03676051]),
 'mean_score_time': array([0.74354591, 0.70281525, 0.49830661, 0.78875618, 0.72040472,
        0.50081539, 0.77868805, 0.70996242, 0.51421213, 0.78244424,
        0.72273269, 0.51182671, 0.78002644, 0.70853157, 0.49456959]),
 'std_score_time': array([0.01706345, 0.03171981, 0.00944039, 0.02735362, 0.01885887,
        0.01024734, 0.02703302, 0.01513744, 0.01119915, 0.02337924,
        0.01878756, 0.00918123, 0.02729831, 0.01402829, 0.00388754]),
 'param_C': masked_array(data=[0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.9, 0.9,
                    

In [48]:
# SVR configured with the best-scoring parameter combination found by the search.
clf.best_estimator_

SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [49]:
# Cross-validated score of the best SVR candidate. The search was scored with
# 'neg_mean_squared_error', so this is a negated MSE (closer to zero is better).
clf.best_score_

-0.14707204848678226

In [50]:
# Predict the held-out test rows with the tuned SVR search object and report
# the plain (non-negated) mean squared error against the true targets.
svr_pred = clf.predict(X=X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=svr_pred)

0.12942335965135573

## K-Nearest Neighbors Regression (KNR)

In [20]:
from sklearn.neighbors import KNeighborsRegressor

In [27]:
# Neighbourhood sizes to try for the k-nearest-neighbours regressor.
# NOTE(review): the recorded run selects n_neighbors=8, the largest value in
# this grid -- consider trying larger neighbourhoods as well.
parameters = dict(n_neighbors=[2, 4, 6, 7, 8])
knr = KNeighborsRegressor()

# 5-fold cross-validated search scored by negated MSE (higher is better).
knr_clf = GridSearchCV(
    estimator=knr,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
knr_clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [2, 4, 6, 7, 8]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [28]:
# Per-candidate cross-validation report of the KNR search (a dict of arrays:
# fit/score timings, the n_neighbors values tried, and per-split test scores).
knr_clf.cv_results_

{'mean_fit_time': array([0.01809177, 0.01619091, 0.01716394, 0.0159287 , 0.01709967]),
 'std_fit_time': array([0.00290822, 0.00087049, 0.00067418, 0.0010843 , 0.00197975]),
 'mean_score_time': array([1.12645741, 1.27125182, 1.32766223, 1.23669729, 1.33978896]),
 'std_score_time': array([0.04005603, 0.09805055, 0.08025148, 0.04139996, 0.15899738]),
 'param_n_neighbors': masked_array(data=[2, 4, 6, 7, 8],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 2},
  {'n_neighbors': 4},
  {'n_neighbors': 6},
  {'n_neighbors': 7},
  {'n_neighbors': 8}],
 'split0_test_score': array([-0.24120197, -0.2134189 , -0.20603189, -0.20368285, -0.20312995]),
 'split1_test_score': array([-0.22768972, -0.19947745, -0.18901656, -0.19028257, -0.18852334]),
 'split2_test_score': array([-0.2477454 , -0.21709775, -0.2089788 , -0.20368107, -0.20458208]),
 'split3_test_score': array([-0.25011848, -0.21601682, -0.20686638, -0.20124

In [29]:
# KNeighborsRegressor configured with the best-scoring n_neighbors from the search.
knr_clf.best_estimator_

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                    weights='uniform')

In [30]:
# Cross-validated score of the best KNR candidate. The search was scored with
# 'neg_mean_squared_error', so this is a negated MSE (closer to zero is better).
knr_clf.best_score_

-0.1988059931419575

In [31]:
# Predict the held-out test rows with the tuned KNR search object and report
# the plain (non-negated) mean squared error against the true targets.
knr_pred = knr_clf.predict(X=X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=knr_pred)

0.18055357984092313

## Random Forest

In [41]:
# Forest sizes to compare; tree depth is left unlimited and the seed is fixed
# so repeated runs build the same forests.
# NOTE(review): the recorded run shows roughly 36-42 s per fit for these
# candidates, so this cell is slow to re-run.
parameters = dict(n_estimators=[250, 260, 270, 280, 290, 300])
rf = RandomForestRegressor(max_depth=None, random_state=0)

# 5-fold cross-validated search scored by negated MSE (higher is better).
rf_clf = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
rf_clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=0,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'n_estimators': [250, 260, 270, 28

In [42]:
# Per-candidate cross-validation report of the random-forest search (a dict of
# arrays: fit/score timings, the n_estimators values tried, per-split test scores).
rf_clf.cv_results_

{'mean_fit_time': array([35.74517717, 36.31204667, 38.03263984, 39.32788081, 40.54379725,
        42.117378  ]),
 'std_fit_time': array([0.80081411, 0.17992131, 0.66766007, 0.47499086, 0.31338867,
        0.64853877]),
 'mean_score_time': array([0.09296279, 0.09460621, 0.09788475, 0.1019093 , 0.10641284,
        0.11338449]),
 'std_score_time': array([0.00297915, 0.00306045, 0.00144591, 0.00324883, 0.00289106,
        0.00563061]),
 'param_n_estimators': masked_array(data=[250, 260, 270, 280, 290, 300],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 250},
  {'n_estimators': 260},
  {'n_estimators': 270},
  {'n_estimators': 280},
  {'n_estimators': 290},
  {'n_estimators': 300}],
 'split0_test_score': array([-0.14299984, -0.14312392, -0.14322671, -0.14324242, -0.14317985,
        -0.14324043]),
 'split1_test_score': array([-0.12955131, -0.12959765, -0.12997156, -0.12991775, -0.12992666,
    

In [43]:
# RandomForestRegressor configured with the best-scoring n_estimators from the search.
rf_clf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=260,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [44]:
# Cross-validated score of the best random-forest candidate. The search was
# scored with 'neg_mean_squared_error', so this is a negated MSE (closer to
# zero is better).
rf_clf.best_score_

-0.13516532730340775

In [45]:
# Predict the held-out test rows with the tuned random-forest search object and
# report the plain (non-negated) mean squared error against the true targets.
rf_pred = rf_clf.predict(X=X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=rf_pred)

0.12415653396840075