If Dockerfiles have not been modified, connect to the Jupyter server with ```http://localhost:8010/tree?token=train-svr```.  

This notebook provides an interface to perform regression on a labelled dataset via a support vector machine.  
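Each data point is represented by an n×1 matrix stored in its own ```.json``` file, and the labels are supplied separately in a single ```.csv``` file.  
The filename of each data point (without its extension) should match an entry in the first column of the labels file, which is used as the index.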

In [103]:
# Root directory; every other path in this notebook is joined onto it.
target_dir = "data"
# Sub-directory of `target_dir` whose files are loaded as data points (each file is parsed as JSON).
input_dir = "place-pulse-singapore-segmented-point-clouds-combined-encoded"
# Labels CSV, relative to `target_dir`; its first column is used as the index when it is read.
label_filename = "place-pulse-singapore-labels/wealthier.csv"
# Sub-directory of `target_dir` where the fitted scaler and models are written.
output_dir = "segmented-point-clouds-combined-encoded-models"

In [104]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics
import sklearn.model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.svm import NuSVR

import json
import os
from pathlib import Path

In [105]:
# Load the data points: one JSON file per sample, keyed by the filename stem.
input_path = Path(target_dir) / input_dir

# Only the top level of the directory is scanned (no recursion). Entries that
# are not `.json` files (e.g. OS metadata such as `.DS_Store`) are skipped
# instead of being handed to the JSON parser, and the listing is sorted so the
# load order is reproducible. A missing directory raises FileNotFoundError
# here rather than silently producing an empty frame.
xs_path = sorted(
    entry.name
    for entry in input_path.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".json"
)

xs_id = {}
for x_path in xs_path:
    with open(input_path / x_path, 'r', encoding="utf-8") as fp:
        # `Path.stem` strips only the final extension ("a.b.json" -> "a.b").
        xs_id[Path(x_path).stem] = json.load(fp)
xs_df = pd.DataFrame.from_dict(xs_id, orient='index')

# Labels: the first CSV column becomes the index that is matched against the
# data-point filename stems.
ys_df = pd.read_csv(os.path.join(target_dir, label_filename), index_col=0)

# Keep only samples that have both a label and a feature vector.
df = ys_df.join(xs_df, how="inner")
if df.empty:
    # NOTE(review): a likely cause is an index dtype mismatch (filename stems
    # are strings; the CSV index may be parsed as integers) - confirm against
    # the actual labels file.
    raise ValueError(
        f"No overlap between {len(xs_df)} data point(s) in '{input_path}' and "
        f"{len(ys_df)} label(s) in '{label_filename}'."
    )

In [106]:
# Features are every column except the label. The label is selected by NAME so
# that X and y stay consistent with each other regardless of where the
# "trueskill_score" column sits in the labels file.
X = df.drop("trueskill_score", axis=1).values
y = df["trueskill_score"].values

# 70/30 hold-out split with a fixed seed so results are reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=0)

# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler next to the models so it can be reapplied to new
# data before prediction.
Path(os.path.join(target_dir, output_dir)).mkdir(parents=True, exist_ok=True)
with open(os.path.join(target_dir, output_dir, "scaler.pkl"), "wb") as fp:
    joblib.dump(scaler, fp)

In [113]:
# Fit a single RBF-kernel NuSVR with hand-picked hyper-parameters on the
# standardised training split. `fit` returns the estimator itself, so the
# fitted model can be bound in one expression.
svr = NuSVR(
    kernel="rbf",
    nu=0.5,
    C=1,
    gamma=0.0001,
    cache_size=2000,
).fit(X_train, y_train)
svr

In [114]:
# Score the hand-tuned model on the held-out split with the coefficient of
# determination (R^2).
y_pred = svr.predict(X_test)
r2 = sklearn.metrics.r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.011850128672709404


In [115]:
# Persist the hand-tuned NuSVR fitted above. The filename records the model's
# own hyper-parameters (nu, C, gamma, kernel) and its test-set R^2, and the
# object written is that same `svr` - not the grid-search result, which is
# only defined by a later cell.
model_dir = os.path.join(target_dir, output_dir)
Path(model_dir).mkdir(parents=True, exist_ok=True)
model_filename = (
    f"segmentedpc_wealthier_vsvr_nu{svr.nu}_C{svr.C}"
    f"_gamma{svr.gamma}_kernel{svr.kernel}_r2{r2:.3f}.pkl"
)
with open(os.path.join(model_dir, model_filename), 'wb') as fp:
    joblib.dump(svr, fp)

In [110]:
# Exhaustive search over nu, C and gamma for an RBF-kernel NuSVR, scored by
# R^2 with 5-fold cross-validation on the training split.
param_grid = [
    dict(
        nu=[0.01, 0.25, 0.5, 1],
        C=[0.01, 0.1, 1.0, 10, 100],
        gamma=["scale", 1, 0.1, 0.01, 0.001, 0.0001],
        kernel=["rbf"],
    )
]
rbf_svr = GridSearchCV(
    estimator=NuSVR(cache_size=1000),
    param_grid=param_grid,
    scoring="r2",
    n_jobs=1,
    cv=5,
)
rbf_svr.fit(X_train, y_train)
print(rbf_svr.best_params_)

{'C': 1.0, 'gamma': 0.0001, 'kernel': 'rbf', 'nu': 0.25}


In [111]:
# Score the grid search on the held-out split; `predict` is called on the
# search object itself, as in the rest of this notebook.
y_pred = rbf_svr.predict(X_test)
r2 = sklearn.metrics.r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.00614338162550887


In [112]:
# Persist the fitted grid search. The filename records the winning
# hyper-parameters and the test-set R^2 computed for it.
optimal_svr = rbf_svr
best_params = optimal_svr.best_params_

# The name is assembled from adjacent single-line f-strings with single-quoted
# keys so it also parses on Python versions before 3.12, which reject reused
# quotes and line breaks inside f-string replacement fields.
model_filename = (
    f"segmentedpc_wealthier_vsvr_nu{best_params['nu']}"
    f"_C{best_params['C']}"
    f"_gamma{best_params['gamma']}"
    f"_kernel{best_params['kernel']}"
    f"_r2{r2:.3f}.pkl"
)

model_dir = os.path.join(target_dir, output_dir)
Path(model_dir).mkdir(parents=True, exist_ok=True)
with open(os.path.join(model_dir, model_filename), 'wb') as fp:
    joblib.dump(optimal_svr, fp)