## Seleção do Modelo (Regressão Linear Simples) 

$Atividade (2.5):$ Utilizando o conjunto de dados do covid-19 realize as seguintes atividades:
<ol>
    <li>Apresentar o rmse para os diferentes modelos de regressão (LinearRegressor, DecisionTreeRegressor, RandonForestRegressor) e utilizando o GridSearchCV para definir o melhor set de parâmetros para o RandonForest </li>
    <li>Fazer uma função para realizar a seleção das características, conforme a ordem de importância (min(modelo_reg.feature_importances_ )) do melhor regressor e ir removendo as características até impactar no rmse do conjunto de teste</li>
    <li>Investigar utilizar Support Vector Regressor (sklearn.svm.SVR) variando automaticamente os hiperparâmetros (kernel e C) e apresentar o RMSE</li>
</ol>

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import itertools

### Carregando os pickles

In [2]:
def read_pickle(name):
    with (open(name, 'rb')) as openfile:
        while True:
            try:
                one_instance = pickle.load(openfile)
            except EOFError:
                break
    one_instance = np.asanyarray(one_instance)
    return one_instance

In [3]:
X_train = read_pickle('X_train.pickle') # Colunas de treino, como latitude e longitude
X_test = read_pickle('X_test.pickle') # Caracteristica dos 20% para teste e avaliacao
y_train = read_pickle('y_train.pickle') # Label, valor estimado ou alvo
y_test = read_pickle('y_test.pickle') # Valores medios da residencia que sao os alvos, para garantir que o modelo nunca viu esses dados antes

In [4]:
print(y_train.shape, X_train.shape, y_test.shape, X_test.shape)

(180,) (180, 7) (45,) (45, 7)


## Treinamendo do Modelo 

In [5]:
lin_reg = LinearRegression() # Instanciando o modelo
lin_reg.fit(X_train, y_train) # y = ax + b

## Avaliação do modelo

In [6]:
predictions = lin_reg.predict(X_test) # Estimando novos valores para os dados de entrada, sendo o conjunto de teste

In [7]:
print(predictions.shape)
print(predictions)

(45,)
[-1.78261722e+00  2.45440159e+00  1.93809306e+01  4.33417000e+00
  7.03621254e+00  5.85261208e+01  1.56081032e+01  3.04782539e+00
  2.50361601e+01  1.50128695e+01  2.82037207e+00  5.48553192e+00
  1.09519975e+01  1.59979710e+01  2.37379742e+01  8.08585652e+00
  5.31186121e+00  3.20617344e+01  2.13947540e+01  2.98969520e+00
  5.51423101e+00  1.22494845e+01  7.19938843e+01  1.72698777e+01
  5.40412962e+00  2.51390688e+02  6.80034026e+00  5.09491194e+01
  6.52191797e-01  6.53968285e+00  4.19142448e+00  4.58953887e-01
  1.31122554e+02  1.27682860e+02 -4.07258394e-01  2.07442302e+00
  9.97089580e+00  1.33781535e+01 -3.64354237e-02  1.20338949e+00
  2.31708100e+00  1.24377508e+00  3.87195151e+00  4.26383484e+00
  1.68730486e+01]


In [8]:
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

12.219456416381817


## Selecionando o Regressor DecisionTree

In [9]:
from sklearn.tree import DecisionTreeRegressor

In [10]:
dt_reg = DecisionTreeRegressor() # Instanciando o modelo
dt_reg.fit(X_train,y_train) # Modelo treinado e ajustado

In [11]:
dt_reg.get_params() # hiperParametros

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

## Avaliando o Modelo

In [12]:
predictions = dt_reg.predict(X_test)

In [13]:
print(predictions.shape)
print(predictions)

(45,)
[  4.   4.  16.   8.  13.  38.  14.   5.  11.  13.   9.   3.  21.  13.
  24.   8.   9.  28.  13.   3.   7.  14.  66.  21.  14. 182.   9.  50.
   6.   8.   4.   3.  93. 180.   9.   4.  15.  23.   6.   9.  12.   3.
   3.   2.  16.]


In [14]:
dt_mse = mean_squared_error(y_test,predictions)
dt_rmse = np.sqrt(dt_mse)
dt_rmse

14.807655677610372

## Random Forest Regressor - Treinando o Modelo

In [15]:
rf_reg = RandomForestRegressor() # Instanciando o Modelo
rf_reg.fit(X_train,y_train)

In [16]:
rf_predictions = rf_reg.predict(X_test)

In [18]:
rf_mse = mean_squared_error(y_test,rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

26.392487609587356

## Ajuste dos hiperparâmetros do modelo

In [19]:
lin_reg = LinearRegression()

In [20]:
lin_reg.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False}

In [21]:
param_grid = [ 
    {'fit_intercept': [False, True], 'n_jobs': [-1]}
]

In [22]:
lin_reg_best = LinearRegression()

In [23]:
grid_search = GridSearchCV(lin_reg_best, param_grid, cv=5, scoring='neg_root_mean_squared_error')

In [24]:
grid_search.fit(X_train, y_train)

In [25]:
grid_search.best_params_

{'fit_intercept': True, 'n_jobs': -1}

In [26]:
lin_reg_best = LinearRegression(n_jobs=-1)
lin_reg_best.fit(X_train, y_train)

In [27]:
predictions = lin_reg_best.predict(X_test)


In [28]:
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

12.219456416381817

## Função de Seleção das Características

In [29]:
def lSC(rmse, X_train, y_train, X_test, y_test):
    lrb = LinearRegression(n_jobs=-1)
    lrb.fit(X_train, y_train)
    importante = list(lrb.coef_)
    importante = list(map(abs, importante))
    i = importante.index(min(importante))

    X_train_feat_select = np.delete(X_train, i, 1)
    X_test_feat_select = np.delete(X_test, i, 1)

    lrb.fit(X_train_feat_select, y_train)
    predictions = lrb.predict(X_test_feat_select)
    lin_mse = mean_squared_error(y_test, predictions)
    lin_rmse = np.sqrt(lin_mse)

    if(lin_rmse < rmse):
        return lSC(lin_rmse, X_train_feat_select, y_train, X_test_feat_select, y_test)
    else:
        return rmse

In [30]:
print(lSC(lin_rmse, X_train, y_train, X_test, y_test))

11.948646247423797


## Variando automaticamente os hiperparâmetros (kernel e C)

In [31]:
kernels = ["rbf", "linear", "poly"]
c = [0.1, 0.5, 1, 10, 100]

In [32]:
for k, c in itertools.product(kernels, c):
    svr = SVR(kernel=k, C=c)
    prediction = svr.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)
    print(c, k, rmse)

0.1 rbf 39.9167585635048
0.5 rbf 38.84262737405531
1 rbf 38.0477991110566
10 rbf 32.236401909364645
100 rbf 21.064002377731985
0.1 linear 38.07181530108867
0.5 linear 35.41695063213822
1 linear 33.569927352258766
10 linear 11.956078624055468
100 linear 9.198736405062283
0.1 poly 40.27901918413995
0.5 poly 39.87246713418461
1 poly 39.64985121170206
10 poly 37.74287829356917
100 poly 35.57679869396265
