# Emplear Regresión por Máquina de Vectores de Soporte

## Traer los datos

Obtener los datos de las viviendas.

In [1]:
import os
import pandas as pd

def load_housing_data(housing_path=os.path.join("datasets", "housing")):
    """Load the housing dataset into a DataFrame.

    Parameters
    ----------
    housing_path : str or path-like, optional
        Directory that contains ``housing.csv``. Defaults to the relative
        ``datasets/housing`` directory, so zero-argument calls keep reading
        the same file as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``housing.csv``.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# Read datasets/housing/housing.csv into a DataFrame.
housing = load_housing_data()

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Obtener los conjuntos de training y testing

Generar categorías para el ingreso medio.

In [3]:
import numpy as np

# Bucket the median income into five ordered categories labelled 1 to 5;
# the last bin is open-ended so every income above 6.0 falls into category 5.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"], bins=income_bin_edges, labels=income_bin_labels
)

In [4]:
# Peek at the first rows of the newly created income category column.
housing["income_cat"].head()

0    5
1    5
2    5
3    4
4    3
Name: income_cat, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]

Dividir el conjunto de datos en dos subconjuntos, uno para entrenamiento y otro para pruebas. La división se hace de forma estratificada con base en un atributo importante, de modo que las categorías de ingresos mantengan las mismas proporciones en ambos subconjuntos.

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split so that both subsets keep the income-category
# proportions of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional indices, so rows are selected with .iloc
# (label-based .loc only matches them while the frame keeps a default
# RangeIndex). The helper category column is no longer needed; dropping it
# with a non-in-place drop returns new frames instead of mutating frames
# derived from `housing`, which avoids SettingWithCopyWarning.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

Reasignar a "housing" los datos de entrenamiento.

In [6]:
# Training features: the stratified training set without the target column.
housing = strat_train_set.drop(columns=["median_house_value"])

## Aplicar transformadores para limpieza y desarrollo

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Positional indices of the columns used to build the ratio features.
# NOTE(review): these appear to match total_rooms, total_bedrooms, population
# and households in the numeric housing frame — confirm if the column order
# of the pipeline input ever changes.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Always appends rooms-per-household and population-per-household; when
    ``add_bedrooms_per_room`` is true it also appends bedrooms-per-room.
    Columns are read by the module-level positional indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks the 1-D ratio arrays as new columns next to X.
        return np.c_[(X, *extra_columns)]
        
# Regression target and the numeric-only view of the training features.
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop(columns=["ocean_proximity"])

# Column groups routed to each branch of the preprocessing pipeline.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric branch: fill missing values with the column median, append the
# ratio features, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Full preprocessing: numeric branch plus one-hot encoding of the categorical column.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

## Entrenamiento

### Usando GridSearchCV

El mejor rendimiento (el error más bajo) hasta el momento se logró con RandomForestRegressor. La siguiente tabla muestra los RMSE obtenidos:

|    LinealReg   |     TreeReg    |  RandomForest  |
|     :----:     |     :----:     |    :----:      |
|     68627.87   |     71035.40   |    47293.78    |

Emplear SVR (regresión por máquina de vectores de soporte) para intentar mejorar el rendimiento y bajar el nivel de error. Primero entrenar el modelo usando GridSearchCV.

In [16]:
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: only C for the linear kernel, every C x gamma
# combination for the RBF kernel.
linear_grid = {
    'kernel': ['linear'],
    'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0],
}
rbf_grid = {
    'kernel': ['rbf'],
    'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
    'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
}
param_grid = [linear_grid, rbf_grid]

svm_reg = SVR()

# Exhaustive search with 2-fold cross-validation; scores are negated MSE,
# so values closer to zero are better.
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
)
grid_search.fit(housing_prepared, housing_labels)

Conocer las combinaciones ejecutadas

In [17]:
# Print the cross-validated RMSE of every evaluated parameter combination.
cvres = grid_search.cv_results_
scored_candidates = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_candidates:
    # The scorer reports negated MSE, so flip the sign before the square root.
    rmse = np.sqrt(-mean_score)
    print(rmse, params)

108819.5187052252 {'C': 80, 'gamma': 0.03, 'kernel': 'rbf'}
106456.7875373488 {'C': 80, 'gamma': 0.07, 'kernel': 'rbf'}
106699.560727087 {'C': 80, 'gamma': 0.1, 'kernel': 'rbf'}


Conocer el mejor estimador

In [19]:
# Show the estimator configured with the best parameters found by the grid search.
grid_search.best_estimator_

Calcular el nivel de RMSE con el mejor estimador.

In [20]:
from sklearn.metrics import mean_squared_error

# RMSE of the best grid-search model. The predictions are made on the same
# prepared training data that was passed to grid_search.fit, so this is a
# training-set error rather than a held-out estimate.
best_grid_model = grid_search.best_estimator_
housing_predictions = best_grid_model.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

97878.26901241782

Mejores resultados usando GridSearchCV
| SVR linear C=70 | SVR rbf C=80 gamma=0.07 |
| :----: | :----: |
| 71258.6 | 97878.2 |

### Usando RandomizedSearchCV

Entrenamiento del mejor modelo

In [26]:
from scipy.stats import expon, reciprocal
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Sampling distributions for the randomized search:
#   - kernel: chosen uniformly between 'linear' and 'rbf'
#   - C: log-uniform (reciprocal) between 20 and 200000
#   - gamma: exponential with scale 1.0 (a kernel coefficient; it is still
#     sampled for 'linear' candidates, as the search log shows)
param_distribs = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200000),
    'gamma': expon(scale=1.0),
}

# One estimator instance is enough; the search clones it for every fit.
svm_reg = SVR()

# 50 sampled candidates x 5-fold CV = 250 fits; random_state fixes the
# sampled candidates so the search is reproducible.
random_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                   n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                   verbose=2, random_state=42)
random_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   8.1s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   5.9s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   5.9s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   5.8s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   6.4s
[CV] END C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf; total time=  13.1s


KeyboardInterrupt: 

Conocer el mejor modelo

In [23]:
# Show the estimator configured with the best parameters found by the randomized search.
random_search.best_estimator_

Calcular su nivel de error

In [24]:
from sklearn.metrics import mean_squared_error

# RMSE of the best randomized-search model. The predictions are made on the
# same prepared training data that was passed to random_search.fit, so this
# is a training-set error rather than a held-out estimate.
best_random_model = random_search.best_estimator_
housing_predictions = best_random_model.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

71363.435557401