# S04T01: Seleção do Modelo (Regressão Linear Simples) 

## Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Carregando os pickles

In [2]:
def read_pickle(name):
    """Load the last object stored in a pickle file as a NumPy array.

    The file may contain several objects dumped back to back. Each one is
    unpickled in order and only the final one is kept.

    Parameters
    ----------
    name : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    numpy.ndarray
        The last pickled object, converted with ``np.asanyarray``.

    Raises
    ------
    EOFError
        If the file does not contain any pickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only read trusted files.
    found = False
    one_instance = None
    with open(name, 'rb') as openfile:
        while True:
            try:
                one_instance = pickle.load(openfile)
            except EOFError:
                # End of the pickle stream: keep whatever was loaded last.
                break
            found = True
    if not found:
        # Previously this path crashed with an UnboundLocalError.
        raise EOFError(f"no pickled object found in {name!r}")
    return np.asanyarray(one_instance)

In [3]:
# Load the four pre-split arrays (features/targets for train and test),
# in the same order as before.
X_train, X_test, y_train, y_test = map(
    read_pickle,
    ('X_train.pickle', 'X_test.pickle', 'y_train.pickle', 'y_test.pickle'),
)

In [4]:
# Sanity check: shapes of the targets and features for both splits.
print(*(split.shape for split in (y_train, X_train, y_test, X_test)))

(16512,) (16512, 9) (4128,) (4128, 9)


## Treinamento do Modelo

In [5]:
# Fit a linear regression model on the training split.
lin_reg = LinearRegression() 
lin_reg.fit(X_train, y_train) # y = Xw + b, with one coefficient per input column (X has 9 columns)

LinearRegression()

## Avaliação do modelo

In [6]:
# Predict the target for every row of the held-out test features.
predictions = lin_reg.predict(X_test)

In [7]:
# Show how many predictions were produced and a (NumPy-abbreviated) view of them.
print(predictions.shape)
print(predictions)

(4128,)
[240651.85445272 103520.6848257  250366.70728054 ... 280203.73002747
 259111.37376096 158524.35811483]


In [8]:
# Test-set error of the linear model: RMSE is the square root of the MSE,
# which puts the error back in the units of the target.
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

69878.93643370147


![Banana](https://wikimedia.org/api/rest_v1/media/math/render/svg/e258221518869aa1c6561bb75b99476c4734108e)

## Selecionando o Regressor DecisionTree

In [9]:
from sklearn.tree import DecisionTreeRegressor

In [10]:
# Fit a decision tree with default hyperparameters on the training split.
# NOTE(review): no random_state is passed, so the fitted tree (and its RMSE)
# may differ between runs — consider fixing a seed for reproducibility.
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train,y_train)

DecisionTreeRegressor()

In [11]:
# Inspect the hyperparameters in effect (the tree was built with no arguments).
dt_reg.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [12]:
# Predict with the decision tree.
# NOTE: this rebinds `predictions`, discarding the linear-regression predictions.
predictions = dt_reg.predict(X_test)

In [13]:
# One prediction per test row is expected.
predictions.shape

(4128,)

In [14]:
# Mean squared error of the decision tree on the test split.
dt_mse = mean_squared_error(y_test,predictions)

In [15]:
# Convert the tree's MSE to RMSE so it is comparable with lin_rmse.
dt_rmse = np.sqrt(dt_mse)
print(dt_rmse)

69239.65135601227


## Seleção de Características

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
# Fit a random forest with default hyperparameters on all training features.
# NOTE(review): no random_state is passed, so the scores and feature
# importances below may change from run to run.
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train,y_train)

RandomForestRegressor()

In [42]:
# Predict the test targets with the full-feature random forest.
rf_predictions = rf_reg.predict(X_test)

In [43]:
# Test-set RMSE of the full-feature random forest (displayed as the cell output).
rf_mse = mean_squared_error(y_test,rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

49049.266589032275

In [44]:
# Importance score of each input column according to the fitted forest
# (one value per column of X_train).
rf_reg.feature_importances_

array([0.11244913, 0.11224319, 0.05377099, 0.02432834, 0.02182012,
       0.03151603, 0.01847914, 0.51235191, 0.11304116])

In [45]:
# List the column names of the raw dataset. nrows=0 parses only the header,
# so the data rows are not loaded just to read the names.
# NOTE(review): the CSV has 10 columns while X_train has 9 features, so the
# positions in feature_importances_ may not map one-to-one onto this listing —
# confirm against the preprocessing step.
pd.read_csv('housing.csv', nrows=0).columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [46]:
# Smallest importance score, i.e. the weakest feature according to the forest.
rf_reg.feature_importances_.min()

0.018479135431604624

In [47]:
# Drop column 6, the feature with the lowest importance score, from the
# training matrix and confirm the new shape.
X_train_feat_selected = np.delete(X_train, obj=6, axis=1)
X_train_feat_selected.shape

(16512, 8)

In [48]:
# Also drop column 4 (the second-lowest importance score). Both removals are
# rebuilt from the untouched X_train so that re-running this cell cannot strip
# an additional column each time; the result is the same 7-column matrix.
X_train_feat_selected = np.delete(X_train, [4, 6], axis=1)
X_train_feat_selected.shape

(16512, 7)

In [49]:
# Remove the same two columns (original indices 4 and 6) from the test matrix
# so it stays aligned with the reduced training matrix.
X_test_feat_selected = np.delete(X_test, [4, 6], axis=1)

In [50]:
# The test matrix should now have the same number of columns as the reduced training matrix.
X_test_feat_selected.shape

(4128, 7)

In [51]:
# Retrain a default random forest using only the selected (reduced) feature set.
rf_reg_s = RandomForestRegressor()
rf_reg_s.fit(X_train_feat_selected,y_train)

RandomForestRegressor()

In [52]:
# Predict with the reduced-feature forest.
# NOTE: this rebinds `rf_predictions`, replacing the full-feature predictions.
rf_predictions = rf_reg_s.predict(X_test_feat_selected)

In [53]:
# Test-set RMSE of the reduced-feature forest.
# NOTE: `rf_mse` / `rf_rmse` are rebound here, overwriting the full-feature values.
rf_mse = mean_squared_error(y_test,rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

48872.97480425113

## Ajuste dos hiperparâmetros do modelo

In [30]:
# List the forest's current hyperparameters as a starting point for tuning.
rf_reg_s.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
# Search space: 4 values of n_estimators x 3 values of max_features, with
# bootstrap disabled (12 candidate combinations in total).
# NOTE(review): n_jobs is listed as a single-value grid entry, so it also
# shows up in best_params_; it could be set on the estimator instead.
param_grid = [ 
    {'n_estimators': [80, 90, 100, 120], 'max_features': [2, 4, 6], 'bootstrap': [False], 'n_jobs': [-1]}   
]

In [33]:
# Base estimator handed to the grid search; its hyperparameters are set per candidate.
rf_reg_best = RandomForestRegressor()

In [34]:
# 5-fold cross-validated grid search scored by negated RMSE
# (negated so that a higher score means a lower error).
grid_search = GridSearchCV(rf_reg_best, param_grid, cv=5, scoring='neg_root_mean_squared_error')

In [35]:
# Run the search on the reduced training set: 12 candidates x 5 folds = 60 fits.
grid_search.fit(X_train_feat_selected, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'bootstrap': [False], 'max_features': [2, 4, 6],
                          'n_estimators': [80, 90, 100, 120], 'n_jobs': [-1]}],
             scoring='neg_root_mean_squared_error')

In [36]:
# Hyperparameter combination with the best cross-validated score.
grid_search.best_params_

{'bootstrap': False, 'max_features': 2, 'n_estimators': 120, 'n_jobs': -1}

In [37]:
# Build the final forest directly from the search result instead of a
# hand-copied set of values, so this cell stays in sync whenever the grid
# search is re-run and picks a different combination.
rf_reg_best = RandomForestRegressor(**grid_search.best_params_)
rf_reg_best.fit(X_train_feat_selected,y_train)

RandomForestRegressor(bootstrap=False, max_features=2, n_estimators=120,
                      n_jobs=-1)

In [38]:
# Predict the test targets with the tuned forest (reduced feature set).
rf_best_predictions = rf_reg_best.predict(X_test_feat_selected)

In [39]:
# Test-set RMSE of the tuned forest.
# NOTE: `rf_mse` / `rf_rmse` are rebound again, overwriting the previous model's values.
rf_mse = mean_squared_error(y_test,rf_best_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

46901.5583252891

**Atividade (2.5):** Utilizando o conjunto de dados do covid-19, realize as seguintes atividades:
<ol>
    <li>Apresentar o RMSE para os diferentes modelos de regressão (LinearRegression, DecisionTreeRegressor, RandomForestRegressor), utilizando o GridSearchCV para definir o melhor conjunto de parâmetros para o RandomForestRegressor</li>
    <li>Fazer uma função para realizar a seleção das características, conforme a ordem de importância (min(modelo_reg.feature_importances_ )) do melhor regressor e ir removendo as características até impactar no rmse do conjunto de teste</li>
    <li>Investigar utilizar Support Vector Regressor (sklearn.svm.SVR) variando automaticamente os hiperparâmetros (kernel e C) e apresentar o RMSE</li>
</ol>