In [None]:
"""
(12) Regressão com Árvores de Decisão
o Implemente uma árvore de decisão para regressão no conjunto de dados Bike Sharing.
Utilize a métrica R² e MAE para avaliação do desempenho.
__________________________________________________________________________________________

(12) Regression with Decision Trees
o Implement a decision tree for regression on the Bike Sharing dataset.
Use the R² and MAE metrics to evaluate the model's performance.

DATASET LINK:
https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset

"""

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

In [None]:
daily_bikes_data = pd.read_csv("daily-bike-sharing.csv", encoding="utf-8")

print(daily_bikes_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB
None


In [None]:
print(daily_bikes_data)

     instant      dteday  season  yr  mnth  holiday  weekday  workingday  \
0          1  2011-01-01       1   0     1        0        6           0   
1          2  2011-01-02       1   0     1        0        0           0   
2          3  2011-01-03       1   0     1        0        1           1   
3          4  2011-01-04       1   0     1        0        2           1   
4          5  2011-01-05       1   0     1        0        3           1   
..       ...         ...     ...  ..   ...      ...      ...         ...   
726      727  2012-12-27       1   1    12        0        4           1   
727      728  2012-12-28       1   1    12        0        5           1   
728      729  2012-12-29       1   1    12        0        6           0   
729      730  2012-12-30       1   1    12        0        0           0   
730      731  2012-12-31       1   1    12        0        1           1   

     weathersit      temp     atemp       hum  windspeed  casual  registered  \
0      

In [None]:
# The data int 'dteday' column is in object type, so it needs to be converted to datetime
daily_bikes_data['dteday'] = pd.to_datetime(daily_bikes_data['dteday'], errors='coerce')
print(daily_bikes_data['dteday'].dtype)

datetime64[ns]


In [None]:
# Creating a day column to indicate the day of the month
daily_bikes_data['day'] = daily_bikes_data['dteday'].dt.day

In [None]:
print(daily_bikes_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     731 non-null    int64         
 1   dteday      731 non-null    datetime64[ns]
 2   season      731 non-null    int64         
 3   yr          731 non-null    int64         
 4   mnth        731 non-null    int64         
 5   holiday     731 non-null    int64         
 6   weekday     731 non-null    int64         
 7   workingday  731 non-null    int64         
 8   weathersit  731 non-null    int64         
 9   temp        731 non-null    float64       
 10  atemp       731 non-null    float64       
 11  hum         731 non-null    float64       
 12  windspeed   731 non-null    float64       
 13  casual      731 non-null    int64         
 14  registered  731 non-null    int64         
 15  cnt         731 non-null    int64         
 16  day         731 non-null  

In [None]:
# This can be used to delete the datetime type columns from the dataset
daily_bikes_data.drop(columns=['dteday'], inplace=True)

In [None]:
# Moving the 'day' column ahead of 'cnt'
cols = daily_bikes_data.columns.tolist()
cols.remove('day')
idx = cols.index('cnt')
cols = cols[:idx] + ['day'] + cols[idx:]
daily_bikes_data = daily_bikes_data[cols]

In [None]:
# Checking the columns after the changes
print(daily_bikes_data.columns)
print("\n")
print(daily_bikes_data.info())

Index(['instant', 'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual',
       'registered', 'day', 'cnt'],
      dtype='object')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   season      731 non-null    int64  
 2   yr          731 non-null    int64  
 3   mnth        731 non-null    int64  
 4   holiday     731 non-null    int64  
 5   weekday     731 non-null    int64  
 6   workingday  731 non-null    int64  
 7   weathersit  731 non-null    int64  
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  casual      731 non-null    int64  
 13  registered  731 non-null    int64  
 14  day         731 non-null    

In [None]:
# '.iloc' extracts all the entry data and output from all the lines(:)
# and all the columns(:-1) of the dataset except the last one
x = daily_bikes_data.iloc[:,:-1].values
y = daily_bikes_data.iloc[:,-1].values

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,random_state=42)

print("{0:0.2f}% in the training set".format((len(x_train)/len(daily_bikes_data.index)) * 100))
print("{0:0.2f}% int the validation set".format((len(x_test)/len(daily_bikes_data.index)) * 100))

69.90% in the training set
30.10% int the validation set


In [None]:
model = DecisionTreeRegressor()
model.fit(x_train, y_train)

In [None]:
# 'y_pred' is an array containing the predicted values for the testing/validation set
y_pred = model.predict(x_test)

# Configures numpy to display numbers with a precision of 2 decimal places when printing arrays.
np.set_printoptions(precision=2)
print(y_pred)

[6778. 1538. 3456. 5464. 7733. 7446. 1606. 2169. 7736. 5713. 1749. 2914.
 4839. 5315. 2192. 2252. 2192. 8173. 5538. 2432. 7504. 5515. 5698. 1795.
 1891. 5084. 2115. 7525. 3777. 5572. 2947. 3784. 6133. 7504. 5936. 2918.
 7375. 6304. 2633. 1301. 4010. 3959. 7504. 4966. 6830.  627. 4342. 5515.
 5169. 5010. 2895. 2432. 1027. 4714. 4763. 4608. 2475. 3071. 4758. 7013.
 6370. 7525. 1301. 7641. 6591. 6227. 7429. 4150. 1623. 6889. 3744. 1248.
 3894. 4608. 4375. 2056. 5847. 4803. 1450. 4362. 1944. 1951. 5986.  441.
 4128. 4023. 5892. 1115. 3915. 3641. 1526. 6864. 6203. 4639. 3190. 2947.
 4694. 2376. 2999. 4844. 7504. 3820. 6572. 2252. 2192. 5375. 1098. 2808.
 6869. 1301. 4780. 7328. 3659. 7870. 6093. 4459. 1446. 3376. 5302. 2376.
 5936. 1712. 2927. 1301. 4035. 5115. 4586. 3228. 1996. 7429. 3141. 4128.
 5315. 5501. 5629. 2192. 4717. 7132. 1996. 4304. 4839. 2115. 1623. 4304.
 3068. 5312. 4073. 4068. 1996. 3485. 4459. 7525. 4274. 4586. 1450. 3510.
 4098. 6904. 2416. 4833. 6457. 4274. 4035. 5315. 17

In [None]:
# APPPLYING R² SCORE
r2_score(y_pred,y_test)

0.9887424437814073

In [None]:
# APPLYING MAE
mae = mean_absolute_error(y_test, y_pred)

print(f"MAE: {mae:.2f}")

MAE: 145.52
