# Emplear regresión por Máquinas de Vectores de Soporte (SVR)

## Traer los datos

Obtener los datos de las viviendas.

In [1]:
import os
import pandas as pd

def load_housing_data(housing_path=os.path.join("datasets", "housing")):
    """Read the housing dataset into a DataFrame.

    Parameters
    ----------
    housing_path : str or os.PathLike, optional
        Directory that contains ``housing.csv``. Defaults to
        ``datasets/housing`` relative to the current working directory,
        which is the location the notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``housing.csv``.

    Raises
    ------
    FileNotFoundError
        If ``housing.csv`` does not exist inside ``housing_path``.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# Load the full dataset once; the following cells derive the train/test sets from it.
housing = load_housing_data()

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Obtener los conjuntos de training y testing

Generar categorías para el ingreso medio.

In [3]:
import numpy as np

# Bucket median_income into five ordered categories (labels 1-5) so that the
# train/test split can be stratified on income. The bin edges are 0, 1.5, 3.0,
# 4.5, 6.0 and +inf, so every income above 6.0 falls into category 5.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5])

In [4]:
# Peek at the first rows to confirm the income categories were assigned.
housing["income_cat"].head()

0    5
1    5
2    5
3    4
4    3
Name: income_cat, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]

Dividir el conjunto de datos en dos subconjuntos, uno para entrenamiento y otro para pruebas, ambos de forma estratificada con base en un atributo importante: la categoría de ingresos debe mantener la misma proporción en los dos subconjuntos.

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

# Split the data into train and test sets so that both keep the same
# income-category proportions (20% of the rows go to the test set).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional row indices, so rows are selected with .iloc;
# .loc would only give the same rows while the index is the default 0..n-1 range.
# The helper column is no longer needed, so it is dropped here without mutating
# a derived frame in place.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

Reasignar a "housing" los datos de entrenamiento.

In [6]:
# Rebind `housing` to the training features only (target column removed).
# From this cell on, the name no longer refers to the full dataset loaded earlier.
housing = strat_train_set.drop("median_house_value", axis=1)

## Aplicar transformadores para limpieza y desarrollo

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Column positions, inside the 2-D numeric array passed to the transformer, of
# the attributes used to build the ratio features. They are hard-coded, so they
# assume the numeric columns keep the order in which the dataset provides them.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household (and optionally per-room) ratio columns to an array.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third derived column (bedrooms / rooms) is appended
        after the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless transformer: there is nothing to learn, so return self."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must be a 2-D array (not a
        DataFrame). The appended columns are, in order: rooms per household,
        population per household and, if enabled, bedrooms per room.
        """
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)

        # np.c_ receives the same tuple it would get from np.c_[X, col1, col2, ...].
        return np.c_[(X, *derived)]
        
# Training target, copied so it is independent of strat_train_set.
housing_labels = strat_train_set["median_house_value"].copy()

# Numeric view of the features: every column except ocean_proximity.
housing_num = housing.drop(columns="ocean_proximity")
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric branch: fill missing values with the column median, append the
# ratio features, then standardize every column.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Route numeric columns through num_pipeline and one-hot encode ocean_proximity.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

## Entrenamiento

### Utilidades para el entrenamiento
Funciones para guardar en un archivo el modelo entrenado.

In [8]:
import joblib
from enum import Enum
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

class Model(Enum):
    """Identifiers for the regressors that create_model_fit_and_save_it can build."""
    REGRESSION = 1  # built as LinearRegression
    DECISION_TREE_REGRESSION = 2  # built as DecisionTreeRegressor
    RANDOM_FOREST_REGRESSION = 3  # built as RandomForestRegressor

# Directory (relative to the working directory) where fitted objects are cached.
path_models_saved = "models_saved"
# File suffix appended to every cached object's name.
extension = ".joblib"

def get_object_saved_full_filename(model_name):
    """Return the cache path for ``model_name``: ``<path_models_saved>/<model_name><extension>``."""
    file_name = model_name + extension
    return os.path.join(path_models_saved, file_name)

def create_models_directory_if_not_exist():
    """Create the ``path_models_saved`` directory, including missing parents.

    Does nothing when the directory already exists (``exist_ok=True``).
    """
    os.makedirs(path_models_saved, exist_ok=True)

def save_object(object, filename):
    """Serialize ``object`` with joblib to the cache path derived from ``filename``.

    ``filename`` is the bare name, without directory or extension. An existing
    file with the same name is overwritten.
    """
    joblib.dump(object, get_object_saved_full_filename(filename))

def object_is_saved(filename):
    """Return True when a cached file for ``filename`` already exists on disk."""
    return os.path.exists(get_object_saved_full_filename(filename))

def get_object_saved(filename):
    """Load and return the object cached under ``filename``, or None if absent.

    NOTE(review): joblib.load unpickles the file, so only load cache files
    produced by this notebook or another trusted source.
    """
    if not object_is_saved(filename):
        return None
    return joblib.load(get_object_saved_full_filename(filename))
    
def create_model_fit_and_save_it(filename, model_type, train_data, label_data):
    """Build the regressor selected by ``model_type``, fit it and cache it on disk.

    Parameters
    ----------
    filename : str
        Bare cache name (no directory or extension) passed to ``save_object``.
    model_type : Model
        Which estimator to build; see the ``Model`` enum.
    train_data : array-like
        Prepared feature matrix passed to ``fit``.
    label_data : array-like
        Target values passed to ``fit``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported ``Model`` members.
        Previously this case fell through every ``if`` and failed later with
        an ``UnboundLocalError`` on ``new_model``.
    """
    # Every estimator is created with its default hyper-parameters.
    # NOTE(review): no random_state is set, so tree-based models differ between
    # fresh fits; the cached file is what keeps results stable across runs.
    model_classes = {
        Model.REGRESSION: LinearRegression,
        Model.DECISION_TREE_REGRESSION: DecisionTreeRegressor,
        Model.RANDOM_FOREST_REGRESSION: RandomForestRegressor,
    }
    try:
        model_class = model_classes[model_type]
    except (KeyError, TypeError):
        # KeyError: unknown value; TypeError: unhashable value.
        raise ValueError("Unsupported model_type: {!r}".format(model_type)) from None

    new_model = model_class()
    new_model.fit(train_data, label_data)
    save_object(new_model, filename)
    

def create_model_or_get_it_if_is_saved(filename, model_type, train_data, label_data):
    """Return the model cached under ``filename``, fitting and caching it first if needed.

    The returned object is always the one read back from disk, even right
    after a fresh fit.
    """
    already_cached = object_is_saved(filename)
    if not already_cached:
        create_model_fit_and_save_it(filename, model_type, train_data, label_data)
    return get_object_saved(filename)
        
def create_cross_validation_and_save(model, filename, train_data, label_data):
    """Run 10-fold cross-validation for ``model`` and cache the score array.

    Scores use the ``neg_mean_squared_error`` metric, so they are the negated
    MSE of each fold; negate them before taking a square root to get RMSE.
    """
    fold_scores = cross_val_score(
        model,
        train_data,
        label_data,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    save_object(fold_scores, filename)

def create_cross_validation_or_get_it_if_is_saved(model, filename, train_data, label_data):
    """Return the cross-validation scores cached under ``filename``, computing them first if needed.

    The returned array is always the one read back from disk.
    """
    already_cached = object_is_saved(filename)
    if not already_cached:
        create_cross_validation_and_save(model, filename, train_data, label_data)
    return get_object_saved(filename)
    
# Make sure the cache directory exists before any cell tries to save into it.
create_models_directory_if_not_exist()

### Aplicar el modelo RandomForestRegressor

El mejor rendimiento (el error más bajo) hasta el momento se logró con RandomForestRegressor. La siguiente es una tabla de RMSE.
|    LinealReg   |     TreeReg    |  RandomForest  |
|     :----:     |     :----:     |    :----:      |
|     68627.87   |     71035.40   |    47293.78    |

Conocer el nivel de error (RMSE) del RandomForestRegressor sobre el propio conjunto de entrenamiento.

In [11]:
from sklearn.metrics import mean_squared_error

# Fit a random forest, or reload it from the cache if an earlier run saved it.
forest_reg = create_model_or_get_it_if_is_saved(
    filename="housing_random_forest_reg_v1",
    model_type=Model.RANDOM_FOREST_REGRESSION,
    train_data=housing_prepared,
    label_data=housing_labels,
)

# NOTE: predictions are made on the same prepared data that is passed in as
# training data above, so this RMSE is an in-sample figure and is not an
# estimate of the error on unseen data.
housing_predicts_forest = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predicts_forest)
forest_rmse = np.sqrt(forest_mse)

# The bare expression on the last line is what the cell displays.
forest_rmse

18576.97018560798

Estadísticas de ejecución de RandomForestRegressor en local y remoto.

| Local | Remoto |
| :----: | :----: |
| 22s | 18s |

# Hallar mejores parámetros usando GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Hyper-parameter grid for SVR: 8 linear-kernel candidates plus 7 x 6 = 42
# RBF-kernel candidates, i.e. 50 combinations (250 fits with cv=5).
param_grid = [
    {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
     'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
]

svm_reg = SVR()

# The search is expensive, so reuse a previously saved result when one exists,
# the same way the other models in this notebook are cached.
# If param_grid changes, delete the cached file (or use a new name) to force a new search.
grid_search = get_object_saved("grid_search_svr")
if grid_search is None:
    grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
    grid_search.fit(housing_prepared, housing_labels)
    save_object(grid_search, "grid_search_svr")

# Hallar mejores parámetros usando RandomizedSearchCV