# Tarefa 2

In [2]:
import pandas as pd
import numpy as np
import random
from scipy.stats import loguniform, uniform

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import LinearSVR


## Leitura dos Dados

In [2]:
df = pd.read_csv("Bias_correction_ucl.csv")
df.head()

Unnamed: 0,station,Date,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,...,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation,Next_Tmax,Next_Tmin
0,1.0,2013-06-30,28.7,21.4,58.255688,91.116364,28.074101,23.006936,6.818887,69.451805,...,0.0,0.0,0.0,37.6046,126.991,212.335,2.785,5992.895996,29.1,21.2
1,2.0,2013-06-30,31.9,21.6,52.263397,90.604721,29.850689,24.035009,5.69189,51.937448,...,0.0,0.0,0.0,37.6046,127.032,44.7624,0.5141,5869.3125,30.5,22.5
2,3.0,2013-06-30,31.6,23.3,48.690479,83.973587,30.091292,24.565633,6.138224,20.57305,...,0.0,0.0,0.0,37.5776,127.058,33.3068,0.2661,5863.555664,31.1,23.9
3,4.0,2013-06-30,32.0,23.4,58.239788,96.483688,29.704629,23.326177,5.65005,65.727144,...,0.0,0.0,0.0,37.645,127.022,45.716,2.5348,5856.964844,31.7,24.3
4,5.0,2013-06-30,31.4,21.9,56.174095,90.155128,29.113934,23.48648,5.735004,107.965535,...,0.0,0.0,0.0,37.5507,127.135,35.038,0.5055,5859.552246,31.2,22.5


### 1) Remove colunas <b>Date</b> e <b>Next_Tmin</b> e linhas com valor faltante

In [3]:
df.drop(['Date','Next_Tmin'], axis = "columns", inplace = True)
df.dropna(axis = "index", inplace = True)
print('Infos do DataFrame:')
print('Linhas:',df.shape[0])
print('Colunas:',df.shape[1])

Infos do DataFrame:
Linhas: 7588
Colunas: 23


In [4]:
# X = atributos de entrada // y = atributos de saída
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

### 2) Centraliza e normaliza cada atributo de entrada

In [5]:
X = StandardScaler().fit_transform(X)

## Regressores

### 1) Regressão Linear

In [6]:
linear_regressor = LinearRegression()
scores = cross_val_score(linear_regressor, X, y, cv = 5, scoring = 'neg_mean_squared_error')
rmse =  np.sqrt(np.mean(np.absolute(scores)))
print('RMSE médio dos 5 folds:', rmse)

RMSE médio dos 5 folds: 1.5809671184346967


### 2) Regressão Linear com regularização L2

In [7]:
param_grid = {'alpha': loguniform.rvs(10**-3, 10**3, size = 10)}

best_rmse = np.Infinity
for alpha_value in param_grid['alpha']:
    ridge = Ridge(alpha = alpha_value)
    scores = cross_val_score(ridge, X, y, cv = 5, scoring = 'neg_mean_squared_error')
    rmse =  np.sqrt(np.mean(np.absolute(scores)))
    if rmse < best_rmse:
        best_alpha = alpha_value
        best_rmse = rmse

print("Resultado da busca de hiperparâmtros: ")
print("Melhor RMSE: {}\nMelhor combinação de hiperparametros: {}".format(
    best_rmse,
    best_alpha))

ridge = Ridge()
scores = cross_val_score(ridge, X, y, cv = 5, scoring = 'neg_mean_squared_error')
rmse =  np.sqrt(np.mean(np.absolute(scores)))
print("\nResultados usando valores default: ", rmse)

Resultado da busca de hiperparâmtros: 
Melhor RMSE: 1.5759356364844723
Melhor combinação de hiperparametros: 236.23595423065996

Resultados usando valores default:  1.5808995831441663


### 3) Regressão Linear com regularização L1

In [8]:
param_grid = {'alpha': loguniform.rvs(10**-3, 10**3, size = 10)}

best_rmse = np.Infinity
for alpha_value in param_grid['alpha']:
    lasso = Lasso(alpha = alpha_value)
    scores = cross_val_score(lasso, X, y, cv = 5, scoring = 'neg_mean_squared_error')
    rmse =  np.sqrt(np.mean(np.absolute(scores)))
    if rmse < best_rmse:
        best_alpha = alpha_value
        best_rmse = rmse

print("Resultado da busca de hiperparâmtros: ")
print("Melhor RMSE: {}\nMelhor combinação de hiperparametros:\n\tapha: {}".format(
    best_rmse,
    best_alpha))

lasso = Lasso()
scores = cross_val_score(lasso, X, y, cv = 5, scoring = 'neg_mean_squared_error')
rmse =  np.sqrt(np.mean(np.absolute(scores)))
print("\nResultados usando valores default: ", rmse)

Resultado da busca de hiperparâmtros: 
Melhor RMSE: 1.5719452194534167
Melhor combinação de hiperparametros:
	apha: 0.022622299684717205

Resultados usando valores default:  2.044148597098836


### 4) SVM Linear

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html?highlight=linearsvr

In [13]:
param_grid = {'C': loguniform.rvs(2**-5, 2**15, size = 10),
              'epsilon': [0.1, 0.3]}

best_rmse = np.Infinity
for c_value in param_grid['C']:
    epsilon_value = random.choice(param_grid['epsilon'])
    svr_linear = LinearSVR(epsilon = epsilon_value, C = c_value, max_iter=3000)
    scores = cross_val_score(svr_linear, X, y, cv = 5, scoring = 'neg_mean_squared_error')
    rmse =  np.sqrt(np.mean(np.absolute(scores)))
    if rmse < best_rmse:
        best_c = c_value
        best_epsilon = epsilon_value
        best_rmse = rmse

print("Resultado da busca de hiperparâmtros: ")
print("Melhor RMSE: {}\nMelhor combinação de hiperparametros:\n\tepsion: {}\n\tC: {}".format(
    best_rmse,
    best_epsilon, best_c))

svr_linear = LinearSVR(max_iter=3000)
scores = cross_val_score(svr_linear, X, y, cv = 5, scoring = 'neg_mean_squared_error')
rmse =  np.sqrt(np.mean(np.absolute(scores)))
print("\nResultados usando valores default:\nMelhor combinação de hiperparametros:\n\tepsion: {}\n\tC: {} ".format(
    best_rmse,
    best_epsilon, best_c))



Resultado da busca de hiperparâmtros: 
Melhor RMSE: 1.5575286492419054
Melhor combinação de hiperparametros:
	epsion: 0.1
	C: 16.850878835183057

Resultados usando valores default:
Melhor combinação de hiperparametros:
	epsion: 1.5575286492419054
	C: 0.1 


### 5) SVM com kernel RBF

In [None]:
param_grid = {'C': loguniform.rvs(2**-5, 2**15, size = 10),
              'gamma': loguniform.rvs(2**-9, 2*3, size = 10),
              'epsilon': [0.1, 0.3]}

### 6) KNN

KNNeighborsRegressor

### 7) MLP

### 8) Árvore de decisão

### 9) Random Forest

### 10) GBM

In [3]:
loguniform.rvs(10**-3, 10**3, size = 10)

array([3.48005660e-03, 1.59225618e+00, 4.23703757e-03, 3.52190099e+01,
       7.03496716e-02, 4.83574252e+01, 8.40457609e-03, 9.25653406e-01,
       1.56677256e-03, 5.22978728e-02])

In [4]:
loguniform(10**-3, 10**3)

<scipy.stats._distn_infrastructure.rv_frozen at 0x7f4dcc399f10>