**Importação das bibliotecas**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import time
import numpy as np; np.random.seed(42)
import io
from sklearn.preprocessing import label_binarize
from google.colab import files
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
import datetime as dt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import math

**Criação do dataset**

In [None]:
# Call the Colab upload helper (`files` comes from `google.colab`, imported above)
# and keep its return value in `uploaded`.
uploaded = files.upload()

In [None]:
# Load the dataset from the bytes uploaded in this session rather than from a
# hard-coded file name: repeated uploads get renamed ("... (1).csv",
# "... (2).csv", ...), so a fixed name only works by accident.
DEFAULT_CSV_NAME = r"energydata_complete (1).csv"

# `uploaded` is produced by the upload cell; tolerate it being missing or empty.
uploaded_files = globals().get("uploaded") or {}
if uploaded_files:
    # files.upload() maps each uploaded file name to its content as bytes.
    uploaded_name = next(iter(uploaded_files))
    energy = pd.read_csv(io.BytesIO(uploaded_files[uploaded_name]))
else:
    # Nothing uploaded in this session: fall back to the original path on disk.
    energy = pd.read_csv(DEFAULT_CSV_NAME)

**Criação das variáveis extras**

In [None]:
# Derive calendar features as integers from the parsed timestamp. Slicing the
# raw text produced object (str) columns, which break numeric aggregations such
# as groupby(...).mean() and rely on implicit string->float casts later on.
timestamps = pd.to_datetime(energy['date'])
energy['month'] = timestamps.dt.month
energy['day'] = timestamps.dt.day
energy['hour'] = timestamps.dt.hour

# Combined consumption of appliances and lights.
# NOTE: this column is built from 'Appliances', so it carries that column's
# values and must not be used as a feature when 'Appliances' is the target.
energy['total_use'] = energy['Appliances'] + energy['lights']

In [None]:
# Parse the timestamp column once, then derive time-of-day and weekday features.
energy['date'] = pd.to_datetime(energy['date'])
timestamp = energy['date']

# NSM: seconds elapsed since midnight of the same day, stored as float.
since_midnight = timestamp - timestamp.dt.normalize()
energy['NSM'] = (since_midnight / pd.Timedelta('1 second')).astype(float)

# pandas dayofweek convention: Monday=0 ... Sunday=6.
energy['day_of_week'] = timestamp.dt.dayofweek

# week_status is 0 on Saturday/Sunday (5, 6) and 1 on every other day.
is_weekend = energy['day_of_week'].isin([5, 6])
energy['week_status'] = np.where(is_weekend, 0, 1)

# Remove the raw timestamp from the frame; being the last expression of the
# cell, the popped column is also what the cell displays.
energy.pop('date')

**Exploração do dataset e criação de visualizações**

In [None]:
# Row counts per distinct value of the two consumption columns.
# A notebook cell only renders its last expression, so the first table is shown
# explicitly with display(); otherwise it is computed and silently discarded.
display(energy.groupby("Appliances").count())
energy.groupby("lights").count()

In [None]:
# Daily aggregates: one row per (month, day) pair.
# The grouping is built on `energy` only; no other frame exists in this notebook.
by_day = energy.groupby(['month', 'day'])

# numeric_only=True keeps any non-numeric column out of the aggregation:
# recent pandas raises on mean() of object columns, and sum() would
# concatenate strings instead of adding numbers.
energy_mean = by_day.mean(numeric_only=True).reset_index()
energy_sum = by_day.sum(numeric_only=True).reset_index()

In [None]:
# Daily summed consumption of lights and appliances, one point per row of
# energy_sum (i.e. per (month, day) group).
# NOTE(review): the labels say kWh — confirm the unit of the source columns.
plt.figure(figsize=(15, 9))
for column, legend_name in (('lights', 'Lights'), ('Appliances', 'Appliances')):
    plt.plot(energy_sum[column], label=legend_name)
plt.title("Energy Consumption in kWh per day")
plt.xlabel("Days")
plt.ylabel("kWh")
plt.legend();

In [None]:
# Daily mean temperature, one line per plotted column of energy_mean.
# Keys are column names, values are the legend entries (drawn in this order).
room_labels = {
    'T1': 'kitchen ',
    'T2': 'living room',
    'T3': 'laundry ',
    'T4': 'office room',
    'T5': 'bathroom',
    'T7': 'ironing room',
    'T8': 'teenager room',
    'T9': 'parents room',
    'T_out': 'outside',
}

plt.figure(figsize=(25, 15))
for column, legend_name in room_labels.items():
    plt.plot(energy_mean[column], label=legend_name)

# Font settings are applied after the lines are drawn and before the
# title/labels/legend are created; this position is kept on purpose because it
# determines which elements pick the values up. They change global rcParams, so
# later figures are affected as well.
plt.rc('xtick', labelsize=20)
plt.rcParams['legend.fontsize'] = 17
plt.rc('axes', labelsize=20)

plt.title("Mean Day Temperature (Celsius)")
plt.ylabel("Temperature")
plt.xlabel("Days")
plt.legend();

In [None]:
# Box plot of the T1..T9 columns, one box per column.
temperature_columns = [f'T{i}' for i in range(1, 10)]
# reindex(columns=...) yields a new frame restricted to exactly these columns.
data = energy.reindex(columns=temperature_columns)
# seaborn wants long form: melt() produces the 'variable' / 'value' columns.
sns.boxplot(data=data.melt(), x="variable", y="value")
plt.show()

In [None]:
# Box plot of the RH_1..RH_9 columns, one box per column.
humidity_columns = [f'RH_{i}' for i in range(1, 10)]
# reindex(columns=...) yields a new frame restricted to exactly these columns.
data = energy.reindex(columns=humidity_columns)
# seaborn wants long form: melt() produces the 'variable' / 'value' columns.
sns.boxplot(data=data.melt(), x="variable", y="value")
plt.show()

In [None]:
# Distribution of the 'Appliances' column as a single box.
# NOTE(review): the y label says kWh — confirm the unit of the source column.
data = energy.reindex(columns=["Appliances"])
long_form = data.melt()  # columns: 'variable', 'value'
sns.boxplot(data=long_form, x="variable", y="value")
plt.ylabel('kWh')
plt.xlabel('')
plt.show()

In [None]:
# Distribution of the 'lights' column as a single box.
# NOTE(review): the y label says kWh — confirm the unit of the source column.
data = energy.reindex(columns=["lights"])
long_form = data.melt()  # columns: 'variable', 'value'
sns.boxplot(data=long_form, x="variable", y="value")
plt.ylabel('kWh')
plt.xlabel('')
plt.show()

In [None]:
# Pearson correlation heat map over every numeric column of the full dataset.
# `energy` is used rather than `data`: at this point `data` only holds the single
# 'lights' column left over from the box-plot cell, which gives a 1x1 matrix.

# Font sizes go first so that they apply to the figure created below.
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('figure', titlesize=20)

# Restrict to numeric columns so non-numeric ones cannot break corr().
corr_energy = energy.select_dtypes(include='number').corr(method='pearson')

plt.figure(figsize=(40, 30))
sns.heatmap(corr_energy, annot=True)
plt.show()

**Separação dos conjuntos de treino e teste**

In [None]:
# Target: the 'Appliances' column. Features: every other column except
# 'total_use', which is computed as Appliances + lights and would therefore
# leak the target into the model inputs.
TARGET = 'Appliances'
LEAKY_COLUMNS = ['total_use']

y = energy[TARGET]
# drop() returns a new frame, so `energy` is left intact and this cell can be
# re-run (pop() removes the target from `energy` and fails on a second run).
X = energy.drop(columns=[TARGET] + LEAKY_COLUMNS, errors='ignore')

# A fixed random_state makes the split independent of how many random draws
# earlier cells consumed from the global NumPy generator.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

**Gradient Boosting Regressor**

In [None]:
# Pipeline: min-max scaling followed by a gradient-boosting regressor.
# random_state is pinned on the estimator so results do not depend on the
# global NumPy generator (which worker processes do not share).
model = make_pipeline(MinMaxScaler(), GradientBoostingRegressor(random_state=42))

# Search space: tree depth 3..11 and 100..2000 boosting stages in steps of 100.
params = {
    'gradientboostingregressor__max_depth': range(3, 12),
    'gradientboostingregressor__n_estimators': range(100, 2001, 100),
}

# 9 depths x 20 ensemble sizes x 3 folds = 540 fits: spread them over all cores.
grid = GridSearchCV(model, params, cv=3, n_jobs=-1)

Treino e resultados com o conjunto de testes

In [None]:
# Run the grid search on the training split, then evaluate the selected
# pipeline on the held-out split with its own score() method.
grid.fit(X_train, y_train)
best_gb_pipeline = grid.best_estimator_
best_gb_pipeline.score(X_valid, y_valid)

In [None]:
# Show the parameter combination selected by the gradient-boosting grid search.
grid.best_params_

In [None]:
# Root-mean-squared error of the tuned gradient-boosting search on the held-out split.
gb_valid_pred = grid.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=gb_valid_pred)
rmse_valid_gb = math.sqrt(mse)
print(rmse_valid_gb)

Resultados com o conjunto de treino

In [None]:
# R^2 of the tuned gradient-boosting search on the data it was trained on.
gb_train_pred = grid.predict(X_train)
r2_train_gb = r2_score(y_true=y_train, y_pred=gb_train_pred)
r2_train_gb

In [None]:
# Root-mean-squared error of the tuned gradient-boosting search on the training split.
gb_train_pred = grid.predict(X_train)
mse_grid_train = mean_squared_error(y_true=y_train, y_pred=gb_train_pred)
rmse_train_gb = math.sqrt(mse_grid_train)
print(rmse_train_gb)

Importância dos atributos - Gradient Boosting

In [None]:
# Feature importances of the gradient-boosting step inside the selected pipeline.
gb_step = grid.best_estimator_.named_steps['gradientboostingregressor']
variances = gb_step.feature_importances_

# Label each importance with its feature name and sort ascending, so the
# largest bar ends up at the top of the horizontal chart.
series = pd.Series(variances, index=X.columns, name='Explained Variances').sort_values()

plt.figure(figsize=(18, 6))
plt.barh(y=series.index, width=series)
plt.show()

**Random Forest Regressor**

In [None]:
# Pipeline: min-max scaling followed by a random-forest regressor.
# The forest is stochastic, so random_state is pinned on the estimator instead
# of relying on the global NumPy generator (which worker processes do not share
# and which depends on what earlier cells have already drawn).
regr = make_pipeline(MinMaxScaler(), RandomForestRegressor(random_state=42))

# Search space: 100, 200, 300 and 400 trees.
params_regr = {'randomforestregressor__n_estimators': range(100, 500, 100)}

# 4 candidates x 3 folds = 12 fits, run in parallel on all cores.
grid_regr = GridSearchCV(regr, params_regr, cv=3, n_jobs=-1)

Treino e resultados com o conjunto de testes

In [None]:
# Run the random-forest grid search on the training split.
grid_regr.fit(X_train, y_train)

In [None]:
# Evaluate the selected random-forest pipeline on the held-out split via its score() method.
grid_regr.best_estimator_.score(X_valid, y_valid)

In [None]:
# Show the parameter combination selected by the random-forest grid search.
grid_regr.best_params_

In [None]:
# Root-mean-squared error of the tuned random-forest search on the held-out split.
rf_valid_pred = grid_regr.predict(X_valid)
mse_regr = mean_squared_error(y_true=y_valid, y_pred=rf_valid_pred)
rmse_valid_rf = math.sqrt(mse_regr)
print(rmse_valid_rf)

Resultados com o conjunto de treino

In [None]:
# R^2 of the tuned random-forest search on the data it was trained on.
rf_train_pred = grid_regr.predict(X_train)
r2_train_rf = r2_score(y_true=y_train, y_pred=rf_train_pred)
r2_train_rf

In [None]:
# Root-mean-squared error of the tuned random-forest search on the training split.
rf_train_pred = grid_regr.predict(X_train)
mse_regr_train = mean_squared_error(y_true=y_train, y_pred=rf_train_pred)
rmse_train_rf = math.sqrt(mse_regr_train)
print(rmse_train_rf)

Importância dos atributos - Random Forest

In [None]:
# Feature importances of the random-forest step inside the selected pipeline.
rf_step = grid_regr.best_estimator_.named_steps['randomforestregressor']
variances_rf = rf_step.feature_importances_

# Label each importance with its feature name and sort ascending, so the
# largest bar ends up at the top of the horizontal chart.
series_rf = pd.Series(variances_rf, index=X.columns, name='Explained Variances')
series_rf = series_rf.sort_values(ascending=True)

plt.figure(figsize=(18, 6))
# Bar widths must come from series_rf itself; using `series` would pair the
# random-forest feature labels with the gradient-boosting importances.
plt.barh(series_rf.index, series_rf)
plt.show()