**SVM Regressor**

In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR, SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.kernel_approximation import Nystroem
from utilities import cross_val_metrics_calculate

In [2]:
data = pd.read_csv("data/train_data.csv")
# Features are every column except the first and the last; the last column is the target.
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [3]:
# Keep the feature column labels so fitted coefficients can be printed by name later.
feature_names = X.columns
feature_names

Index(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'],
      dtype='object')

In [4]:
# Shared splitter for the manual cross-validation runs below.
n_folds = 5
kfold = KFold(n_splits=n_folds)

*Linear SVM Regressor*

In [10]:
# Hyper-parameter search for a linear SVR on the raw (unscaled) features.
# Four metrics are recorded per candidate; the estimator refit on the full
# data is the candidate with the best MAPE score.
l_svm_param_grid = {
    'C': [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
    'loss': ['epsilon_insensitive'],
}
l_svm_scoring = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'neg_mean_absolute_percentage_error',
]
l_svm_search = GridSearchCV(
    estimator=LinearSVR(max_iter=5000),
    param_grid=l_svm_param_grid,
    scoring=l_svm_scoring,
    cv=5,
    refit='neg_mean_absolute_percentage_error',
)

l_svm_search.fit(X, y)



In [11]:
# Take the tuned linear SVR and score it again with the shared K-fold splitter.
# cross_val_metrics_calculate is a project helper; judging by the printed
# result it returns a dict with mse / rmse / mae / mape entries.
l_svm = l_svm_search.best_estimator_

linear_svr_folds = kfold.split(X)
cv_results = cross_val_metrics_calculate(l_svm, X, y, linear_svr_folds)
print(cv_results)



{'mse': 134101.78129926388, 'rmse': 240.2584499494611, 'mae': 15.187090152543181, 'mape': 4.267543845429522}




In [14]:
# Refit on the full training set, then list one learned weight per input column.
l_svm.fit(X, y)
print("Coefficients of each feature:")
for position, weight in enumerate(l_svm.coef_):
    print("%s: %.10f" % (feature_names[position], weight))

Coefficients of each feature:
Area (m2): -0.0000000015
Property Type: 0.0000193777
Bedrooms: 0.0000578016
Bathrooms: 0.0000494659
Address: 0.0001395240
Law Document: 0.0000039769
Quarter: 0.0000074846
Year: 0.0052708441
Latitude: 0.0000604402
Longitude: 0.0002795101
Postal Code: -0.0000271687
Importance: 0.0000000578
Place Rank: 0.0000786468
City: 0.0000051943


In [12]:
# Standardize the features, then rerun the same grid search on the scaled data.
# make_pipeline stores `l_svm_search` itself (not a copy), so fitting this
# pipeline refits `l_svm_search`; afterwards `l_svm_search.best_estimator_` is
# the model tuned on standardized features.
# NOTE(review): the scaler is fitted on all rows before the inner 5-fold search
# runs, so each validation fold has already influenced the scaling statistics.
l_svm_search_s = make_pipeline(StandardScaler(), l_svm_search)

l_svm_search_s.fit(X, y)

In [13]:
# Wrap the currently tuned linear SVR behind a scaler so that, during
# cross-validation, the scaler is fitted as part of the evaluated pipeline.
tuned_linear_svr = l_svm_search.best_estimator_
l_svm_with_standardize = make_pipeline(StandardScaler(), tuned_linear_svr)

standardized_folds = kfold.split(X)
cv_results = cross_val_metrics_calculate(l_svm_with_standardize, X, y, standardized_folds)
print(cv_results)

{'mse': 1023.0684031420649, 'rmse': 30.2482949077709, 'mae': 7.017375739840235, 'mape': 1.5841100002998783}


In [23]:
# Refit the scaler + linear SVR pipeline on the full training set and print the
# learned weight of each (standardized) feature.
l_svm_with_standardize.fit(X, y)

# Read the coefficients from the regressor that lives inside the fitted
# pipeline. Going through `l_svm_search.best_estimator_` only works while that
# attribute still points at the very same object, which stops being true as
# soon as the grid search is refit or cells are run in a different order.
fitted_linear_svr = l_svm_with_standardize.steps[-1][1]

print("Coefficients of each feature:")
for name, weight in zip(feature_names, fitted_linear_svr.coef_):
    print("%s: %.10f" % (name, weight))

Coefficients of each feature:
Area (m2): -0.0399357970
Property Type: 1.3690010636
Bedrooms: 1.9732266217
Bathrooms: 1.3043910279
Address: -0.0146818534
Law Document: -0.1994166355
Quarter: 0.4214208710
Year: 1.5762023885
Latitude: -0.4340643665
Longitude: -0.1141937025
Postal Code: -0.1437087766
Importance: 0.1277492228
Place Rank: 0.2179418878
City: 0.4541806188


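- With data standardization, the cross-validated MAE drops from about 15.19 to 7.02 and the MAPE from about 4.27% to 1.58% (see the printed results above)
- Most important features (largest standardized coefficients): Property Type, Bedrooms, Bathrooms, Year
- Linear SVM regressor with data standardization performs better than the linear models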

*Non-linear SVM Regressor*
(Kernel SVR takes too long to train on this data, so a Nystroem kernel approximation followed by a linear SVR is used instead.)

In [None]:
# Nystroem feature map configured with an RBF kernel.
# NOTE(review): `kernel` is not referenced by any later cell shown here; the
# pipelines below construct their own Nystroem instances.
kernel = Nystroem(kernel='rbf')

In [8]:
# Grid search over an approximate-kernel SVR:
#   StandardScaler -> Nystroem feature map -> LinearSVR.
# Nystroem picks its basis rows at random, so it is seeded here to make the
# search (and the resulting best_params_) reproducible from run to run.
# `degree` is only relevant to the 'poly' kernel.
# The refit metric here is MAE.
svm_search = GridSearchCV(
    make_pipeline(StandardScaler(),
                  Nystroem(random_state=0),
                  LinearSVR(max_iter=5000)),
    param_grid={
        'nystroem__kernel': ['rbf', 'poly', 'sigmoid'],
        'nystroem__degree': [2],
        # coef0 was left out of the search: 'nystroem__coef0': [0.1, 0.25, 0.5]
        'linearsvr__C': [0.05, 0.1, 0.25, 0.5, 1, 2],
        'linearsvr__epsilon': [0.1, 0.25, 0.5],
        'linearsvr__loss': ['squared_epsilon_insensitive'],
    },
    scoring=['neg_mean_squared_error',
             'neg_root_mean_squared_error',
             'neg_mean_absolute_error',
             'neg_mean_absolute_percentage_error'],
    cv=5,
    refit='neg_mean_absolute_error',
)

svm_search.fit(X, y)

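*Feature importance cannot be read directly from an SVM that uses a kernel (or a kernel approximation): its coefficients refer to the transformed feature space, not to the original input features.*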

In [13]:
# Show the parameter combination selected by the kernel-approximation search.
svm_search.best_params_

{'linearsvr__C': 0.1,
 'linearsvr__epsilon': 0.1,
 'linearsvr__loss': 'squared_epsilon_insensitive',
 'nystroem__degree': 2,
 'nystroem__kernel': 'rbf'}

In [19]:
# Rebuild the best configuration found by the kernel-approximation search and
# score it with the shared K-fold splitter.
# - Nystroem and LinearSVR are seeded so the reported metrics are reproducible.
# - max_iter matches the value used during the search (5000) instead of
#   silently falling back to the library default.
# NOTE(review): n_components=500 was not part of the search, which used
# Nystroem's default; it is kept here as in the original evaluation.
svm = make_pipeline(
    StandardScaler(),
    Nystroem(kernel='rbf', n_components=500, random_state=0),
    LinearSVR(C=0.1,
              loss='squared_epsilon_insensitive',
              epsilon=0.1,
              max_iter=5000,
              random_state=0),
)

cv_results = cross_val_metrics_calculate(svm, X, y, kfold.split(X))
print(cv_results)

{'mse': 582.879264272432, 'rmse': 24.012337977598825, 'mae': 7.850612256544139, 'mape': 3.110285803898443}


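- The RBF-kernel model is still worse than the standardized linear SVM on MAE (7.85 vs 7.02) and MAPE (3.11% vs 1.58%), although its MSE/RMSE is lower (MSE 582.9 vs 1023.1)
- Linear SVM with data standardization works best on the absolute-error metrics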

In [14]:
import pickle

# Persist the standardized linear-SVR pipeline.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; a bare open(...) passed to pickle.dump is never closed.
# NOTE(review): the file is a pickle despite the .h5 extension; only unpickle
# it from trusted sources.
with open("models/SVM.h5", 'wb') as model_file:
    pickle.dump(l_svm_with_standardize, model_file)