In [3]:
import sklearn
import numpy as np
import pandas as pd
from pathlib import Path

In [4]:
def parse_data(data):
    """Split a table into features and target.

    The last column of ``data`` is taken as the target; every other column
    is a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose final column holds the value to predict.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is ``data`` without its last column and ``Y``
        is that last column.
    """
    target_column = data.columns[-1]
    features = data.drop(columns=[target_column])
    target = data[target_column]
    return features, target

# Names of the dataset folders found under BASE_DIR.
NAMES = ["autoprice", "linear", "sawtooth"]
BASE_DIR = Path("../datasets/regressao/")

# One DataFrame per entry of NAMES (same order), each read from
# <BASE_DIR>/<name>/data.csv without treating the first row as a header.
DATASETS = [
    pd.read_csv(str(BASE_DIR / dataset_name / "data.csv"), header=None)
    for dataset_name in NAMES
]

In [5]:
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import warnings
# Install a catch-all "ignore" filter: warnings raised after this point are
# not displayed.
# NOTE(review): this also hides potentially useful diagnostics from the
# fitting cells (e.g. convergence or deprecation warnings) -- confirm that a
# blanket filter is really wanted rather than a targeted one.
warnings.filterwarnings("ignore")

def make_grid_params_rbf():
    """Build the grid-search space for a pipeline step named ``"rbf"``.

    Returns
    -------
    dict
        Maps ``"rbf__C"`` to 5 log-spaced values from 10**-1 to 10**2.1,
        ``"rbf__gamma"`` to 5 log-spaced values from 10**-2 to 10**2.1, and
        ``"rbf__kernel"`` to the kernels ``"rbf"`` and ``"linear"``.
    """
    return {
        "rbf__C": np.logspace(-1, 2.1, 5),
        "rbf__gamma": np.logspace(-2, 2.1, 5),
        # Despite the function name, the linear kernel is searched as well.
        "rbf__kernel": np.array(["rbf", "linear"]),
    }

def make_grid_params_poly():
    """Build the grid-search space for a pipeline step named ``"poly"``.

    Returns
    -------
    dict
        Maps ``"poly__C"`` to 5 log-spaced values from 10**-1 to 10**2.1,
        ``"poly__gamma"`` to 5 log-spaced values from 10**-2 to 10**2.1,
        ``"poly__degree"`` to the degrees 2, 3 and 4, and ``"poly__kernel"``
        to the single kernel ``"poly"``.
    """
    return {
        "poly__C": np.logspace(-1, 2.1, 5),
        "poly__gamma": np.logspace(-2, 2.1, 5),
        "poly__degree": np.array([2, 3, 4]),
        "poly__kernel": np.array(["poly"]),
    }

# SVR

In [4]:
# For each dataset: tune an SVR (inside a scaling pipeline) with a
# cross-validated grid search, then report the mean and standard deviation
# of R² over several independent 70/30 hold-out splits.

# NOTE(review): not referenced anywhere in this cell; kept in case another
# cell relies on it.
plot_colors = "ryb"

# Fixed seed so the splits -- and therefore the printed scores -- are the
# same on every "Restart Kernel -> Run All".
SEED = 42
N_REPEATS = 5

grid = make_grid_params_rbf()
for dataset in DATASETS:

    X, Y = parse_data(dataset)

    # Hyper-parameter search: 5-fold cross-validation on a 70% training split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=SEED
    )
    scaler = StandardScaler()
    pipeline = Pipeline(steps=[("scaler", scaler), ("rbf", SVR())])

    gscv = GridSearchCV(estimator=pipeline, param_grid=grid, cv=5)
    gscv.fit(X_train, y_train)

    # Drop the "<step>__" prefix so the values can be passed directly to SVR().
    best_params = {
        key.split("__")[-1]: value for key, value in gscv.best_params_.items()
    }

    print(best_params)
    r2 = []
    for rep in range(N_REPEATS):
        # One distinct seed per repetition: reusing a single seed would draw
        # the same split every time and report a spread of zero.
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=SEED + 1 + rep
        )

        # TRAINING STEP
        clf = SVR(**best_params)
        # The scaler is re-fitted on the training part of this split only.
        X_train = scaler.fit_transform(X_train)
        clf.fit(X_train, y_train)

        # TEST STEP
        X_test = scaler.transform(X_test)
        Y_pred = clf.predict(X_test)

        r2.append(r2_score(y_test, Y_pred))

    print("R²: {} +/- {}".format(
        np.mean(r2), np.std(r2)
    ))

{'C': 125.89254117941675, 'gamma': 0.01, 'kernel': 'linear'}
R²: 0.7927547622449088 +/- 0.022832853780859218
{'C': 21.134890398366476, 'gamma': 0.01, 'kernel': 'linear'}
R²: 0.999974630052491 +/- 2.7573160542486627e-07
{'C': 21.134890398366476, 'gamma': 11.885022274370177, 'kernel': 'rbf'}
R²: 0.9935337434600393 +/- 0.0008195089253510663


# Simple Regression

In [7]:
# For each dataset: fit an ordinary least-squares LinearRegression on
# standardized features and report the mean and standard deviation of R²
# over several independent 70/30 hold-out splits.

# NOTE(review): not referenced anywhere in this cell; kept in case another
# cell relies on it.
plot_colors = "ryb"

# Fixed seed so the splits -- and therefore the printed scores -- are the
# same on every "Restart Kernel -> Run All".
SEED = 42
N_REPEATS = 5

for dataset in DATASETS:

    X, Y = parse_data(dataset)

    # Created here so the cell does not depend on a `scaler` left behind by
    # another cell (it previously failed with NameError on a fresh kernel
    # unless the SVR cell had been run first).
    scaler = StandardScaler()

    r2 = []
    for rep in range(N_REPEATS):
        # One distinct seed per repetition: reusing a single seed would draw
        # the same split every time and report a spread of zero.
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=SEED + 1 + rep
        )

        # TRAINING STEP
        clf = LinearRegression()
        # The scaler is re-fitted on the training part of this split only.
        X_train = scaler.fit_transform(X_train)
        clf.fit(X_train, y_train)

        # TEST STEP
        X_test = scaler.transform(X_test)
        Y_pred = clf.predict(X_test)

        r2.append(r2_score(y_test, Y_pred))

    print("R²: {} +/- {}".format(
        np.mean(r2), np.std(r2)
    ))

R²: 0.7795065636159733 +/- 0.013172477469909932
R²: 0.9999999998877389 +/- 3.333560819090082e-11
R²: 0.7167538323424262 +/- 0.017895084622443498


# Decision Tree Regression

In [7]:
# For each dataset: tune a DecisionTreeRegressor (inside a scaling pipeline)
# with a cross-validated grid search, then report the mean and standard
# deviation of R² over several independent 70/30 hold-out splits.
from sklearn.tree import DecisionTreeRegressor

# NOTE(review): not referenced anywhere in this cell; kept in case another
# cell relies on it.
plot_colors = "ryb"

# Fixed seed for both the data splits and the tree estimators, so the printed
# parameters and scores are the same on every "Restart Kernel -> Run All".
SEED = 42
N_REPEATS = 5

grid = {
    'tree__max_depth': [2, 4, 6, 8, 10, 12],
    'tree__min_samples_split': [2, 4, 6, 8, 10, 12],
}

for dataset in DATASETS:

    X, Y = parse_data(dataset)

    # Hyper-parameter search: 5-fold cross-validation on a 70% training split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=SEED
    )
    scaler = StandardScaler()
    pipeline = Pipeline(
        steps=[("scaler", scaler), ("tree", DecisionTreeRegressor(random_state=SEED))]
    )

    gscv = GridSearchCV(estimator=pipeline, param_grid=grid, cv=5)
    gscv.fit(X_train, y_train)

    # Drop the "<step>__" prefix so the values can be passed directly to
    # DecisionTreeRegressor().
    best_params = {
        key.split("__")[-1]: value for key, value in gscv.best_params_.items()
    }

    print(best_params)
    r2 = []
    for rep in range(N_REPEATS):
        # One distinct seed per repetition: reusing a single seed would draw
        # the same split every time and report a spread of zero.
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=SEED + 1 + rep
        )

        # TRAINING STEP
        clf = DecisionTreeRegressor(random_state=SEED, **best_params)
        # The scaler is re-fitted on the training part of this split only.
        X_train = scaler.fit_transform(X_train)
        clf.fit(X_train, y_train)

        # TEST STEP
        X_test = scaler.transform(X_test)
        Y_pred = clf.predict(X_test)

        r2.append(r2_score(y_test, Y_pred))

    print("R²: {} +/- {}".format(
        np.mean(r2), np.std(r2)
    ))

{'max_depth': 4, 'min_samples_split': 6}
R²: 0.759677619018564 +/- 0.08776787971205904
{'max_depth': 8, 'min_samples_split': 2}
R²: 0.9995745071950376 +/- 9.317884521575387e-05
{'max_depth': 10, 'min_samples_split': 2}
R²: 0.994692424013075 +/- 0.0013099142629427916
