In [2]:
import pydot
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [3]:
!apt update && apt install graphviz -y

Hit:1 http://deb.debian.org/debian bullseye InRelease
Hit:2 http://security.debian.org/debian-security bullseye-security InRelease
Hit:3 http://deb.debian.org/debian bullseye-updates InRelease
Reading package lists... Done3m
Building dependency tree... Done
Reading state information... Done
89 packages can be upgraded. Run 'apt list --upgradable' to see them.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
graphviz is already the newest version (2.42.2-5).
0 upgraded, 0 newly installed, 0 to remove and 89 not upgraded.


## Funções

In [16]:
def data(content):
    return "/app/data/" + content

# Gerar previsões
def predict():
    prediction = regressor.predict(test_features)
    error = abs(prediction - test_labels)
    r_sq = regressor.score(features, labels)
    mae = metrics.mean_absolute_error(test_labels, prediction)
    mse = metrics.mean_squared_error(test_labels, prediction)
    print("R²:", r_sq)
    print("MAE:", mae)
    print("MSE:", mse)
    print()

# Verificar quais variáveis serão utilizadas na Árvore
def print_variables():
    importances = list(regressor.feature_importances_)
    feature_importance = [(feature, round(importance,2)) for feature, importance in zip(feature_list, importances)]
    feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
    [print("Feature: {:20} Importance {}".format(*pair)) for pair in feature_importance];
    print()

# Visualização da Árvore de Decisão
def tree_view():
    dot_file = data(prefix + "_tree.dot")
    png_file = data(prefix + "_tree.png")

    tree = regressor.estimators_[5]
    export_graphviz(tree, out_file=dot_file, feature_names=feature_list, rounded=True, precision=1)
    (graph, ) = pydot.graph_from_dot_file(dot_file)
    graph.write_png(png_file)

## Desenvolvimento

In [17]:
file = data('temps.xlsx')
dataframe = pd.read_excel(file)
dataframe.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual
0,2016,1,1,Fri,45,45,45.6,45
1,2016,1,2,Sat,44,45,45.7,44
2,2016,1,3,Sun,45,44,45.8,41
3,2016,1,4,Mon,44,41,45.9,40
4,2016,1,5,Tues,41,40,46.0,44


In [6]:
dataframe.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0


In [7]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   year     348 non-null    int64  
 1   month    348 non-null    int64  
 2   day      348 non-null    int64  
 3   week     348 non-null    object 
 4   temp_2   348 non-null    int64  
 5   temp_1   348 non-null    int64  
 6   average  348 non-null    float64
 7   actual   348 non-null    int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 21.9+ KB


In [8]:
dataframe = pd.get_dummies(dataframe)
dataframe.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,1,0,0,0,0,0,0
1,2016,1,2,44,45,45.7,44,0,0,1,0,0,0,0
2,2016,1,3,45,44,45.8,41,0,0,0,1,0,0,0
3,2016,1,4,44,41,45.9,40,0,1,0,0,0,0,0
4,2016,1,5,41,40,46.0,44,0,0,0,0,0,1,0


In [9]:
labels = np.array(dataframe['actual'])
features = dataframe.drop("actual", axis = 1)
feature_list = list(features.columns)
features = np.array(features)

In [10]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size  =0.25, random_state = 42)

In [11]:
baseline_preds = test_features[:, feature_list.index("average")]
baseline_error = abs(baseline_preds - test_labels)
print("Baseline error average:", round(np.mean(baseline_error),2))

Baseline error average: 5.06


### Random Forest Regressor

In [12]:
regressor = RandomForestRegressor(n_estimators=1000, random_state=42)
regressor.fit(train_features, train_labels);
predict()
print_variables()

R²: 0.932094797587982
MAE: 3.932057471264368
MSE: 26.68358100000001

Feature: temp_1               Importance 0.7
Feature: average              Importance 0.2
Feature: day                  Importance 0.03
Feature: temp_2               Importance 0.03
Feature: month                Importance 0.01
Feature: week_Fri             Importance 0.01
Feature: week_Sat             Importance 0.01
Feature: year                 Importance 0.0
Feature: week_Mon             Importance 0.0
Feature: week_Sun             Importance 0.0
Feature: week_Thurs           Importance 0.0
Feature: week_Tues            Importance 0.0
Feature: week_Wed             Importance 0.0



### AdaBoost Regressor

In [13]:

regressor = AdaBoostRegressor(n_estimators=1000, random_state=42)
regressor.fit(train_features, train_labels);
predict()
print_variables()

R²: 0.880058663428009
MAE: 3.6986729499615043
MSE: 23.466630658730693

Feature: temp_1               Importance 0.48
Feature: average              Importance 0.25
Feature: temp_2               Importance 0.1
Feature: month                Importance 0.07
Feature: day                  Importance 0.04
Feature: week_Mon             Importance 0.04
Feature: week_Fri             Importance 0.01
Feature: week_Sun             Importance 0.01
Feature: year                 Importance 0.0
Feature: week_Sat             Importance 0.0
Feature: week_Thurs           Importance 0.0
Feature: week_Tues            Importance 0.0
Feature: week_Wed             Importance 0.0



### Gradient Boosting Regressor

In [14]:
regressor = GradientBoostingRegressor(n_estimators=1000, random_state=42)
regressor.fit(train_features, train_labels);
predict()
print_variables()

R²: 0.9393909909403808
MAE: 4.34445968243629
MSE: 33.59672049188874

Feature: temp_1               Importance 0.61
Feature: average              Importance 0.3
Feature: day                  Importance 0.03
Feature: temp_2               Importance 0.02
Feature: month                Importance 0.01
Feature: week_Fri             Importance 0.01
Feature: year                 Importance 0.0
Feature: week_Mon             Importance 0.0
Feature: week_Sat             Importance 0.0
Feature: week_Sun             Importance 0.0
Feature: week_Thurs           Importance 0.0
Feature: week_Tues            Importance 0.0
Feature: week_Wed             Importance 0.0



### Visualização da Árvore de Decisão

In [15]:
prefix = "RandomForestRegressor"
regressor = RandomForestRegressor(max_depth=3)
regressor.fit(train_features, train_labels);
tree_view()