# S04T01: Seleção do Modelo (Regressão Linear Simples) 

## Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Carregando os pickles

In [2]:
def read_pickle(name):
    with (open(name, 'rb')) as openfile:
        while True:
            try:
                one_instance = pickle.load(openfile)
            except EOFError:
                break
    one_instance = np.asanyarray(one_instance)
    return one_instance

In [3]:
path_pickles = 'C://Users//alexw//Documents//UFPI//Sistemas_Inteligentes//PICKLES//'

X_train = read_pickle(path_pickles+'X_train.pickle')
X_test = read_pickle(path_pickles+'X_test.pickle')
y_train = read_pickle(path_pickles+'y_train.pickle')
y_test = read_pickle(path_pickles+'y_test.pickle')

In [4]:
y_train.shape

(179,)

## Treinamendo do Modelo 

In [5]:
lin_reg = LinearRegression() 
lin_reg.fit(X_train, y_train) # y = ax + b

LinearRegression()

## Avaliação do modelo

In [6]:
predictions = lin_reg.predict(X_test)

In [7]:
print(predictions.shape)
print(predictions)

(45,)
[  1.27587602   7.47657903   6.82486001   5.79448274   1.04305057
   4.09116356   3.88314317   3.25952212   7.17577276   4.03251723
 112.79796594   1.59593978  16.40647002  34.05922369  16.91043529
   7.77940695  -0.83939647   2.20147547   2.69719018  12.48969805
 210.76867577   5.37148249   5.61683275  -1.53414479  40.08233051
   3.90490821   2.49318087   9.29878371  24.63748707   4.38765692
  70.37631775   2.90229765   1.36770825   6.42156577   5.57972969
   4.80468037   1.33640133   2.42593699   3.44226538  -0.67439734
  17.46111977  -0.33124426   2.95403161   5.66761184  11.40059086]


In [8]:
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

2.6621349750888226


![Banana](https://wikimedia.org/api/rest_v1/media/math/render/svg/e258221518869aa1c6561bb75b99476c4734108e)

# Selecionando o Regressor DecisionTree

In [9]:
from sklearn.tree import DecisionTreeRegressor

In [10]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [11]:
dt_reg.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [12]:
predictions = dt_reg.predict(X_test)

In [13]:
predictions.shape

(45,)

In [14]:
dt_mse = mean_squared_error(y_test,predictions)

In [15]:
dt_rmse = np.sqrt(dt_mse)
print(dt_rmse)

17.86803477349053


# Seleção de Características

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [17]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

RandomForestRegressor()

In [18]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

grid_search = GridSearchCV(estimator = rf_reg, param_grid = param_grid, n_jobs = -1, verbose = 2)

In [19]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 300}

In [20]:
best_grid = grid_search.best_params_

In [21]:
rf_reg = RandomForestRegressor(**best_grid)

In [22]:
rf_reg.fit(X_train, y_train)

RandomForestRegressor(max_depth=100, max_features=2, min_samples_leaf=3,
                      min_samples_split=10, n_estimators=300)

In [23]:
rf_predictions = rf_reg.predict(X_test)

In [24]:
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

25.44675972722252

In [25]:
rf_reg.feature_importances_

array([0.25368011, 0.15417682, 0.21022228, 0.20807219, 0.05916081,
       0.00974587, 0.0069694 , 0.02245351, 0.07551901])

In [27]:
pd.read_csv('NOVO_covid-piaui.csv').columns

Index(['Unnamed: 0', 'confirmed', 'order_for_place',
       'estimated_population_2019', 'estimated_population',
       'confirmed_per_100k_inhabitants', 'death_rate', 'longitude', 'latitude',
       'city'],
      dtype='object')

In [28]:
min(rf_reg.feature_importances_)

0.006969396602292145

In [29]:
rf_reg_sort = sorted(rf_reg.feature_importances_, reverse=True)
rf_reg_sort

[0.25368011002325985,
 0.21022228335678092,
 0.20807218848521639,
 0.15417682475428227,
 0.07551900639407291,
 0.05916080842337693,
 0.02245350970676372,
 0.009745872253954794,
 0.006969396602292145]

In [30]:
def delete_col(rf, X_pickel):
    cols = np.delete(X_pickel, len(rf)-1,1)
    rf.pop()
    resp = [rf, cols]
    return resp

In [31]:
rf_reg_sort_train = sorted(rf_reg.feature_importances_, reverse=True)

rf_reg_sort_train, X_train_feat_selected  = delete_col(rf_reg_sort_train, X_train)
rf_reg_sort_train, X_train_feat_selected  = delete_col(rf_reg_sort_train, X_train_feat_selected)
rf_reg_sort_train, X_train_feat_selected  = delete_col(rf_reg_sort_train, X_train_feat_selected)

X_train_feat_selected.shape

(179, 6)

In [32]:
rf_reg_sort

[0.25368011002325985,
 0.21022228335678092,
 0.20807218848521639,
 0.15417682475428227,
 0.07551900639407291,
 0.05916080842337693,
 0.02245350970676372,
 0.009745872253954794,
 0.006969396602292145]

In [33]:
rf_reg_sort_test = sorted(rf_reg.feature_importances_, reverse=True)

rf_reg_sort_test, X_test_feat_selected  = delete_col(rf_reg_sort_test, X_test)
rf_reg_sort_test, X_test_feat_selected  = delete_col(rf_reg_sort_test, X_test_feat_selected)
rf_reg_sort_test, X_test_feat_selected  = delete_col(rf_reg_sort_test, X_test_feat_selected)
X_test_feat_selected.shape

(45, 6)

In [34]:
rf_reg_s = RandomForestRegressor()
rf_reg_s.fit(X_train_feat_selected, y_train)

RandomForestRegressor()

In [35]:
rf_predictions = rf_reg_s.predict(X_test_feat_selected)

In [36]:
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

17.51210368732311

## LINEAR WITH DROP

In [41]:
lin_reg_drop = LinearRegression() 
lin_reg_drop.fit(X_train_feat_selected, y_train) # y = ax + b

LinearRegression()

In [42]:
predicti = lin_reg_drop.predict(X_test_feat_selected)

In [44]:
lin_drop_mse = mean_squared_error(y_test, predicti)
lin_drop_rmse = np.sqrt(lin_drop_mse)
print(lin_drop_rmse)

2.5062282412387034


## Support Vector Regressor SVR

In [5]:
from sklearn.svm import SVR

In [6]:
params_kernel = ['linear', 'poly', 'rbf', 'sigmoid']
params_c = [1.0, 1.5, 2.0, 2.5, 3.0]


for x in range(len(params_c)):
    for y in range(len(params_kernel)):

        rf_svr = SVR(kernel=params_kernel[y], C=params_c[x])

        rf_svr.fit(X_train, y_train)

        rf_predic_svr = rf_svr.predict(X_test)

        rf_mse_svr = mean_squared_error(y_test, rf_predic_svr)
        rf_rmse_svr = np.sqrt(rf_mse_svr)
        print('Kernel: '+str(params_kernel[y])+' \nC: '+str(params_c[x])+' \nRMSE: '+str(rf_rmse_svr)+'\n')


SVR(kernel='linear')

Kernel: linear 
C: 1.0 
RMSE: 15.339658739499784



SVR(kernel='poly')

Kernel: poly 
C: 1.0 
RMSE: 35.03408780991188



SVR()

Kernel: rbf 
C: 1.0 
RMSE: 35.7847417655916



SVR(kernel='sigmoid')

Kernel: sigmoid 
C: 1.0 
RMSE: 34.29789718665084



SVR(C=1.5, kernel='linear')

Kernel: linear 
C: 1.5 
RMSE: 11.211929456489203



SVR(C=1.5, kernel='poly')

Kernel: poly 
C: 1.5 
RMSE: 34.49320122885328



SVR(C=1.5)

Kernel: rbf 
C: 1.5 
RMSE: 35.363263556344



SVR(C=1.5, kernel='sigmoid')

Kernel: sigmoid 
C: 1.5 
RMSE: 33.46925706824658



SVR(C=2.0, kernel='linear')

Kernel: linear 
C: 2.0 
RMSE: 8.10480248418277



SVR(C=2.0, kernel='poly')

Kernel: poly 
C: 2.0 
RMSE: 34.00994887557791



SVR(C=2.0)

Kernel: rbf 
C: 2.0 
RMSE: 34.97956185502932



SVR(C=2.0, kernel='sigmoid')

Kernel: sigmoid 
C: 2.0 
RMSE: 32.7634129205607



SVR(C=2.5, kernel='linear')

Kernel: linear 
C: 2.5 
RMSE: 6.113714239349739



SVR(C=2.5, kernel='poly')

Kernel: poly 
C: 2.5 
RMSE: 33.53719014599477



SVR(C=2.5)

Kernel: rbf 
C: 2.5 
RMSE: 34.59665199655163



SVR(C=2.5, kernel='sigmoid')

Kernel: sigmoid 
C: 2.5 
RMSE: 32.437126583846



SVR(C=3.0, kernel='linear')

Kernel: linear 
C: 3.0 
RMSE: 4.229781956873769



SVR(C=3.0, kernel='poly')

Kernel: poly 
C: 3.0 
RMSE: 33.12207316495794



SVR(C=3.0)

Kernel: rbf 
C: 3.0 
RMSE: 34.19453344771496



SVR(C=3.0, kernel='sigmoid')

Kernel: sigmoid 
C: 3.0 
RMSE: 31.42743687231091

