# SVM Regressor

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR, SVR, LinearSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.kernel_approximation import RBFSampler
import pickle
from utilities import cross_val_metrics_calculate

In [None]:
data = pd.read_csv("../data/train_data_2nd.csv")
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

In [None]:
# data for property type classification
y = data['Property Type']
X = data.loc[:, data.columns != 'Property Type'].iloc[:, 1:]

In [6]:
feature_names = X.columns
feature_names

Index(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'],
      dtype='object')

In [7]:
n_folds = 5
kfold = KFold(n_folds)

# Linear SVM Regressor

In [None]:
l_svm_search = GridSearchCV(
    LinearSVR(max_iter=5000),
    param_grid={
        'C':[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
        'loss':['epsilon_insensitive', 'squared_epsilon_insensitive']
        },
    scoring=[
        'neg_mean_squared_error',
        'neg_root_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_mean_absolute_percentage_error'
        ],
    cv=5,
    refit='neg_mean_absolute_percentage_error'
    )

l_svm_search.fit(X, y)



In [8]:
# l_svm = l_svm_search.best_estimator_
l_svm = LinearSVR(C=0.005, loss='squared_epsilon_insensitive', max_iter=5000)

cv_results = cross_val_metrics_calculate(l_svm, X, y, kfold.split(X))
print(cv_results)

{'mse': 23115.333628103814, 'rmse': 87.00791632104819, 'mae': 10.308093028304896, 'mape': 4.174366353658648, 'medae': 6.790877989887582, 'medape': 1.1371680331551957}


In [None]:
l_svm.fit(X, y)
print("Coefficients of each feature:")
for i in range(l_svm.n_features_in_):
    print("%s: %.10f" % (feature_names[i],l_svm.coef_[i]))

Coefficients of each feature:
Area (m2): 0.0000005893
Property Type: 0.0000257131
Bedrooms: 0.0000807941
Bathrooms: 0.0000664564
Address: 0.0000932602
Law Document: 0.0000005562
Quarter: 0.0000081148
Year: 0.0053519921
Latitude: 0.0000617423
Longitude: 0.0002850741
Postal Code: -0.0000263517
Importance: 0.0000000267
Place Rank: 0.0000835416
City: 0.0000062133


**With standardize**

In [None]:
l_svm_search_s = make_pipeline(StandardScaler(), l_svm_search)

l_svm_search_s.fit(X, y)

In [None]:
print(l_svm_search.best_params_)

{'C': 0.005, 'loss': 'epsilon_insensitive'}


In [9]:
l_svm_with_standardize = make_pipeline(StandardScaler(), LinearSVR(C=0.005, max_iter=5000))

cv_results = cross_val_metrics_calculate(l_svm_with_standardize, X, y, kfold.split(X))
print(cv_results)

{'mse': 22587.302139438918, 'rmse': 86.06140995420118, 'mae': 7.7574058885234844, 'mape': 1.6317430053928306, 'medae': 2.0917325049992272, 'medape': 0.4521011081372531}


In [None]:
l_svm_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for i in range(l_svm_with_standardize.n_features_in_):
    print("%s: %.10f" % (feature_names[i],l_svm_search.best_estimator_.coef_[i]))

Coefficients of each feature:
Area (m2): 0.0356162139
Property Type: 1.2829686397
Bedrooms: 1.7216941057
Bathrooms: 1.2293439432
Address: -0.0280893710
Law Document: -0.1801567485
Quarter: 0.3509453503
Year: 1.4329003094
Latitude: -0.2765188697
Longitude: -0.0959793968
Postal Code: -0.0992826549
Importance: 0.1062859484
Place Rank: 0.1637884788
City: 0.2916990360


- With data standardization: MAE from 10.3 -> 7,75, MAPE from 4.17% -> 1.63%
- Important features: Property type, bedrooms, bathrooms, year
- Linear SVM Regressor with data standardization > linear models

In [None]:
import pickle

pickle.dump(l_svm_with_standardize, open("../models/LinearSVM.h5", 'wb'))

# Non-linear SVM Regressor
(SVR takes too long; use kernel map with linear SVR)

In [10]:
# kernel = Nystroem(kernel='rbf')
kernel = RBFSampler()

In [None]:
svm_search = GridSearchCV(
    make_pipeline(
        RBFSampler(),
        LinearSVR(max_iter=5000)
        ),
    param_grid={
        # 'nystroem__kernel':['rbf', 'poly', 'sigmoid'],
        # 'nystroem__degree':[2],
        # 'nystroem__coef0':[0.1, 0.25, 0.5],
        'linearsvr__C':[0.05, 0.1, 0.25, 0.5, 1, 2],
        'linearsvr__epsilon':[0.1, 0.25, 0.5],
        'linearsvr__loss':['epsilon_insensitive','squared_epsilon_insensitive']
        },
    scoring=[
        'neg_mean_squared_error',
        'neg_root_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_mean_absolute_percentage_error'
        ],
    cv=5,
    refit='neg_mean_absolute_percentage_error'
    )

svm_search.fit(X, y)

*cannot infer feature importance from SVM model with kernel function*

In [None]:
svm_search.best_params_

{'linearsvr__C': 0.05,
 'linearsvr__epsilon': 0.1,
 'linearsvr__loss': 'epsilon_insensitive'}

In [11]:
# svm = svm_search.best_estimator_
svm = make_pipeline(
    RBFSampler(),
    LinearSVR(C=0.05, epsilon=0.1, loss='epsilon_insensitive', max_iter=5000)
)
cv_results = cross_val_metrics_calculate(svm, X, y, kfold.split(X))
print(cv_results)

{'mse': 656.7462367986491, 'rmse': 25.515847839733073, 'mae': 7.781936569214906, 'mape': 1.72594090509446, 'medae': 2.637199540323741, 'medape': 0.5992641512897088}


**With standardize**

In [None]:
svm_search_s = make_pipeline(StandardScaler(), svm_search)

svm_search_s.fit(X, y)

In [12]:
svm_with_standardize = make_pipeline(
    StandardScaler(),
    RBFSampler(),
    LinearSVR(C=0.05, epsilon=0.1, max_iter=5000)
)
cv_results = cross_val_metrics_calculate(svm_with_standardize, X, y, kfold.split(X))
print(cv_results)

{'mse': 649.6559725058875, 'rmse': 25.376361079992048, 'mae': 7.5817250883385565, 'mape': 1.7388104204124304, 'medae': 2.6078324744334695, 'medape': 0.5552883649509782}


- With RBF kernel: MAE & MAPE slightly increase, but RMSE drops from 86.14 -> 25.5 -> handles high-price estates better
- Standardization does not change results much, slightly reduces RMSE and MAE

In [None]:
# import pickle

# pickle.dump(l_svm_with_standardize, open("models/LinearSVM.h5", 'wb'))
pickle.dump(svm_with_standardize, open("SVM.h5", 'wb'))

# SVM Property Type Classifier

# Linear SVM Classifier

In [None]:
l_svm_search = GridSearchCV(
    LinearSVC(max_iter=10000),
    param_grid={
        'penalty':['l1','l2'],
        'C':[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5]
    },
    scoring=[
        'accuracy'
    ],
    cv=5,
    refit='accuracy'
)

l_svm_search.fit(X, y)



In [None]:
l_svm = l_svm_search.best_estimator_ # LinearSVC(C=0.05, penalty='l1', max_iter=10000)

cv_results = cross_val_metrics_calculate(l_svm, X, y, kfold.split(X), metrics=['accuracy','precision','recall','f1'])
print(cv_results)



{'accuracy': 0.5765262153087554, 'precision': 0.46789489383365596, 'recall': 0.4174422566367131, 'f1': 0.4007271037934016}




In [None]:
l_svm.fit(X, y)
print("Coefficients of each feature:")
for i in range(l_svm.n_features_in_):
    print("%s: %.10f" % (feature_names[i],l_svm.coef_[0][i]))

Coefficients of each feature:
Area (m2): -0.0000000403
Bedrooms: -0.2362895643
Bathrooms: -0.1480065340
Address: -0.0000425594
Law Document: -0.0005230429
Quarter: 0.0464684728
Year: 0.0004624020
Latitude: 0.0322955718
Longitude: -0.0011906050
Postal Code: 0.0000047309
Importance: -0.0591763825
Place Rank: -0.0156131525
City: -0.1591731381
Price (billion VND): -0.0180024229




- Bedrooms, bathrooms, and city matter the most (city may be due to value issues, check with standardized data)
- Importance, price, and place rank can indicate property types

In [None]:
pickle.dump(l_svm, open("../models/LinearSVM_Classifier.h5", 'wb'))

**With standardize**

In [None]:
l_svm_search_s = make_pipeline(StandardScaler(), l_svm_search)

l_svm_search_s.fit(X, y)

In [None]:
l_svm_with_standardize = make_pipeline(StandardScaler(), l_svm_search.best_estimator_)

cv_results = cross_val_metrics_calculate(l_svm_with_standardize, X, y, kfold.split(X), metrics=['accuracy','precision','recall','f1'])
print(cv_results)

{'accuracy': 0.5875673855476655, 'precision': 0.4895897379594391, 'recall': 0.4342324181193429, 'f1': 0.42921336672469257}


In [None]:
# l_svm_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for j in range(l_svm_with_standardize.named_steps['linearsvc'].coef_.shape[0]):
  print("##########")
  for i in range(l_svm_with_standardize.named_steps['linearsvc'].n_features_in_):
      print("%s: %.10f" % (feature_names[i],l_svm_with_standardize.named_steps['linearsvc'].coef_[j][i]))

Coefficients of each feature:
##########
Area (m2): -0.0016976590
Bedrooms: -0.6253033318
Bathrooms: -0.4092894148
Address: -0.0835677401
Law Document: -0.0197023004
Quarter: 0.0553859302
Year: 0.1733095916
Latitude: 0.1782659056
Longitude: -0.0113312217
Postal Code: 0.1109473431
Importance: -0.0203689799
Place Rank: -0.0319370231
City: -0.0503197679
Price (billion VND): -0.5400925288
##########
Area (m2): 0.0000000000
Bedrooms: -0.4266597383
Bathrooms: -0.1374803939
Address: 0.0006175833
Law Document: -0.0696374151
Quarter: 0.0348272160
Year: 0.0934771405
Latitude: 0.0000000000
Longitude: 0.0000000000
Postal Code: 0.0022154859
Importance: -0.0256886329
Place Rank: 0.0013020727
City: -0.1153637327
Price (billion VND): -0.3871552792
##########
Area (m2): -0.0038480802
Bedrooms: 0.0775383840
Bathrooms: 0.0423574705
Address: 0.0171435853
Law Document: 0.0860702820
Quarter: -0.0084398851
Year: 0.0482698728
Latitude: -0.3126323410
Longitude: -0.0280814392
Postal Code: -0.1529548514
Importan

- Most important features across all classes: Bedrooms, bathrooms, price
- 2 classes have high area and city coefficients (nhà riêng & biệt thự)
- 1 class (căn hộ studio) does not depend on area, latitude, longitude; possibly due to L1 regularization
- 58% accuracy, not too good

In [None]:
l_svm_with_standardize.named_steps['linearsvc'].classes_

array([0., 1., 2., 3., 4.])

In [None]:
pickle.dump(l_svm_with_standardize, open("../models/LinearSVM_Classifier.h5", 'wb'))

# Non-linear SVM Classifier

In [None]:
svm_search = GridSearchCV(
    make_pipeline(
        RBFSampler(),
        LinearSVC(max_iter=10000)
    ),
    param_grid={
        'linearsvc__penalty':['l1','l2'],
        'linearsvc__C':[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5]
    },
    scoring=[
        'accuracy'
    ],
    cv=5,
    refit='accuracy'
)

svm_search.fit(X, y)

In [None]:
svm = svm_search.best_estimator_

cv_results = cross_val_metrics_calculate(svm, X, y, kfold.split(X), metrics=['accuracy','precision','recall','f1'])
print(cv_results)

{'accuracy': 0.34241305652392845, 'precision': 0.14087784677264456, 'recall': 0.20202211874447834, 'f1': 0.11943954258625207}


**With standardize**

In [None]:
svm_search_s = make_pipeline(StandardScaler(), svm_search)

svm_search_s.fit(X, y)

In [None]:
svm_with_standardize = make_pipeline(StandardScaler(), svm_search.best_estimator_)

cv_results = cross_val_metrics_calculate(svm_with_standardize, X, y, kfold.split(X), metrics=['accuracy','precision','recall','f1'])
print(cv_results)

{'accuracy': 0.4962260918805158, 'precision': 0.40052260121915334, 'recall': 0.3503399557646684, 'f1': 0.35116134985525777}


In [None]:
pickle.dump(svm, open("../models/SVM_Classifier.h5", 'wb'))

# Summary on Property type Classifier
- Non-linear SVM performs worse than linear SVM
- Only 58% accuracy