### Importando as bibliotecas

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import loguniform, uniform

from sklearn.svm import SVR,LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso
from sklearn.model_selection import cross_val_score,RandomizedSearchCV,KFold,train_test_split

### Lendo DF

As informações sobre o DF podem ser encontradas aqui: https://archive.ics.uci.edu/ml/datasets/Bias+correction+of+numerical+prediction+model+temperature+forecast

In [2]:
# Read the dataset from the CSV file and report how many rows/columns it has.
df = pd.read_csv('Bias_correction_ucl.csv')
n_rows, n_cols = df.shape
print('Infos do DataFrame:')
print('Linhas:', n_rows)
print('Colunas:', n_cols)

Infos do DataFrame:
Linhas: 7752
Colunas: 25


### Removendo colunas Date e Next_Tmin
Após retirar estas duas colunas, devemos ficar com 23 colunas

In [3]:
# Remove the 'Date' and 'Next_Tmin' columns in place, then report the new shape.
df.drop(columns=['Date', 'Next_Tmin'], inplace=True)
n_rows, n_cols = df.shape
print('Infos do DataFrame:')
print('Linhas:', n_rows)
print('Colunas:', n_cols)

Infos do DataFrame:
Linhas: 7752
Colunas: 23


### Removendo linhas com valores faltantes
Após retirar os dados faltantes, devemos ficar com 7588 linhas

In [4]:
# Discard, in place, every row that has at least one missing value,
# then report the new shape.
df.dropna(axis='index', how='any', inplace=True)
n_rows, n_cols = df.shape
print('Infos do DataFrame:')
print('Linhas:', n_rows)
print('Colunas:', n_cols)

Infos do DataFrame:
Linhas: 7588
Colunas: 23


In [5]:
# Display the column labels of the DataFrame as it stands after the drops above.
df.columns

Index(['station', 'Present_Tmax', 'Present_Tmin', 'LDAPS_RHmin', 'LDAPS_RHmax',
       'LDAPS_Tmax_lapse', 'LDAPS_Tmin_lapse', 'LDAPS_WS', 'LDAPS_LH',
       'LDAPS_CC1', 'LDAPS_CC2', 'LDAPS_CC3', 'LDAPS_CC4', 'LDAPS_PPT1',
       'LDAPS_PPT2', 'LDAPS_PPT3', 'LDAPS_PPT4', 'lat', 'lon', 'DEM', 'Slope',
       'Solar radiation', 'Next_Tmax'],
      dtype='object')

### Separando valores de entrada (X) e de saída (y)

In [6]:
# Select the target by name instead of by position: with positional slicing,
# any change in column order would silently swap the target for a feature.
# 'Next_Tmax' is the last column of df, so the result matches the old slicing.
y = df['Next_Tmax']
# Every remaining column is used as an input feature, in its original order.
X = df.drop(columns='Next_Tmax')

### Normalizando os dados de entrada (X)

In [7]:
# Standardize the input features; X is replaced by the array returned by
# fit_transform.
# NOTE(review): the scaler is fitted on the full X, before the train/test split
# performed in the following cell, so rows that end up in the test set also
# contribute to the fitted scaling statistics (mild data leakage). Consider
# fitting on the training split only — confirm whether this is intentional.
scaler=StandardScaler()
X = scaler.fit_transform(X)

### Dividindo dados de treino e teste

In [8]:
# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and every cross-validation score computed from X_train/y_train below, is
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

### Regressão Linear

In [9]:
# Baseline: plain linear regression scored with 5-fold cross-validation on the
# training split.
lr = LinearRegression()
cv = cross_val_score(
    lr,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5)

# The scorer returns negated MSE values: take their magnitude, average over the
# folds and then take the square root (i.e. the root of the mean fold MSE).
rmse_simples = np.sqrt(np.abs(cv).mean())
print('Regressão Linear Simples\n')
print('\tRMSE médio dos 5 folds:', rmse_simples)

Regressão Linear Simples

	RMSE médio dos 5 folds: 1.4744093253956094


### Regressão Linear com Regularização L2

In [10]:
# Randomized search over the L2 regularization strength of a Ridge regressor.
ridge = Ridge()

# 5 consecutive (unshuffled) folds, reused by the evaluation cells below.
kfold = KFold(n_splits=5)

# alpha is sampled log-uniformly between 1e-3 and 1e3.
param_grid = {'alpha': loguniform(10**-3, 10**3)}

# random_state fixes the sampled alpha candidates so the selected
# hyperparameter (and the RMSE reported from it) is reproducible between runs.
random_search = RandomizedSearchCV(
    estimator=ridge,
    scoring='neg_mean_squared_error',
    param_distributions=param_grid,
    cv=kfold,
    return_train_score=True,
    random_state=42)

result_ridge = random_search.fit(X_train, y_train)

In [11]:
# Re-score a Ridge model configured with the alpha chosen by the randomized
# search ('alpha' is the only searched parameter, so best_params_ is unpacked).
regressor = Ridge(**result_ridge.best_params_)

cv = cross_val_score(
    regressor, X_train, y_train,
    scoring='neg_mean_squared_error',
    cv=kfold)

# Scores are negated MSEs: magnitude -> mean over folds -> square root.
rmse_l2 = np.sqrt(np.abs(cv).mean())

print('Regressão Linear L2')
print('\tRMSE médio dos 5 folds:', rmse_l2)

Regressão Linear L2
	RMSE médio dos 5 folds: 1.4744043814057808


### Regressão Linear com Regularização L1

In [12]:
# Randomized search over the L1 regularization strength of a Lasso regressor.
lasso = Lasso()

# 5 consecutive (unshuffled) folds, reused by the evaluation cells below.
kfold = KFold(n_splits=5)

# alpha is sampled log-uniformly between 1e-3 and 1e3.
param_grid = {'alpha': loguniform(10**-3, 10**3)}

# random_state fixes the sampled alpha candidates so the selected
# hyperparameter (and the RMSE reported from it) is reproducible between runs.
random_search = RandomizedSearchCV(
    estimator=lasso,
    scoring='neg_mean_squared_error',
    param_distributions=param_grid,
    cv=kfold,
    return_train_score=True,
    random_state=42)

result_lasso = random_search.fit(X_train, y_train)

In [13]:
# Re-score a Lasso model configured with the alpha chosen by the randomized
# search ('alpha' is the only searched parameter, so best_params_ is unpacked).
regressor = Lasso(**result_lasso.best_params_)

cv = cross_val_score(
    regressor, X_train, y_train,
    scoring='neg_mean_squared_error',
    cv=kfold)

# Scores are negated MSEs: magnitude -> mean over folds -> square root.
rmse_l1 = np.sqrt(np.abs(cv).mean())

print('Regressão Linear L1\n')
print('\tRMSE médio dos 5 folds:', rmse_l1)

Regressão Linear L1

	RMSE médio dos 5 folds: 1.474351713139172


### SVM Linear

In [14]:
# Randomized search over C and epsilon for a linear support vector regressor.
# The estimator is seeded so that repeated fits with the same hyperparameters
# give the same model.
svm_lin = LinearSVR(random_state=42)

# C is sampled log-uniformly in [2^-5, 2^15]; epsilon is picked from two values.
param_grid = {'C': loguniform(2**-5, 2**15),
              'epsilon': [0.1, 0.3]}

# random_state fixes which (C, epsilon) candidates are drawn, making the
# selected hyperparameters reproducible between runs.
random_search = RandomizedSearchCV(
    estimator=svm_lin,
    param_distributions=param_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=kfold,
    random_state=42)

result_SVR_lin = random_search.fit(X_train, y_train)



In [15]:
# Re-score a LinearSVR configured with the hyperparameters chosen by the
# randomized search. The estimator is seeded so that the RMSE printed below is
# the same on every run.
regressor = LinearSVR(C=result_SVR_lin.best_params_['C'],
                      epsilon=result_SVR_lin.best_params_['epsilon'],
                      random_state=42)

cv = cross_val_score(
    regressor,
    X_train,
    y_train,
    cv=kfold,
    scoring='neg_mean_squared_error')

# Scores are negated MSEs: magnitude -> mean over folds -> square root.
rmse_svm_lin = np.sqrt(np.mean(np.absolute(cv)))

print('Suport Vector Regresion\n')
print('\tRMSE médio dos 5 folds:', rmse_svm_lin)

Suport Vector Regresion

	RMSE médio dos 5 folds: 1.479031956606943


### SVM com kernel RBF

In [16]:
# Randomized search over C, gamma and epsilon for an RBF-kernel SVR.
svm_rbf = SVR(kernel='rbf')

# C and gamma are sampled log-uniformly; epsilon is picked from two values.
param_grid = {'C': loguniform(2**-5, 2**15),
              'gamma': loguniform(2**-9, 2**3),
              'epsilon': [0.1, 0.3]}

# random_state fixes which candidates are drawn, so the selected
# hyperparameters (and the RMSE reported from them) are reproducible.
random_search = RandomizedSearchCV(
    estimator=svm_rbf,
    param_distributions=param_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=kfold,
    random_state=42)

result_SVR_rbf = random_search.fit(X_train, y_train)

In [17]:
# Re-score an RBF-kernel SVR configured with the hyperparameters chosen by the
# randomized search (best_params_ holds exactly C, gamma and epsilon).
regressor = SVR(kernel='rbf', **result_SVR_rbf.best_params_)

cv = cross_val_score(
    regressor, X_train, y_train,
    scoring='neg_mean_squared_error',
    cv=kfold)

# Scores are negated MSEs: magnitude -> mean over folds -> square root.
rmse_svm_rbf = np.sqrt(np.abs(cv).mean())

print('Suport Vector Regresion\n')
print('\tRMSE médio dos 5 folds:', rmse_svm_rbf)

Suport Vector Regresion

	RMSE médio dos 5 folds: 0.9891921134082128


### KNN

### MLP