# Decision Trees for Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import yfinance as yf
yf.pdr_override()

In [5]:
# input
symbol = 'BAJFINANCE.NS'
start = '2014-01-01'
end = '2020-07-27'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01,156.0,161.5,156.0,157.029999,133.94075,119950
2014-01-02,157.050003,158.005005,155.880005,156.919998,133.846954,69730
2014-01-03,157.179993,160.399994,153.865005,155.550003,132.678375,512820
2014-01-06,157.115005,158.0,153.009995,155.145004,132.332932,476960
2014-01-07,155.524994,157.190002,154.529999,155.404999,132.554718,225390


In [6]:
# Create more data
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,-1)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,-1)
dataset['Return'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()
dataset.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-07-20,3344.949951,3453.949951,3316.5,3441.5,3441.5,10555495,1,1,-1,0.042468
2020-07-21,3500.0,3519.5,3218.649902,3292.449951,3292.449951,25960048,0,-1,-1,-0.04331
2020-07-22,3320.0,3340.0,3193.600098,3253.0,3253.0,17069659,0,-1,1,-0.011982
2020-07-23,3253.0,3320.0,3216.050049,3297.800049,3297.800049,8718035,0,-1,-1,0.013772
2020-07-24,3245.0,3290.0,3220.0,3251.850098,3251.850098,6637683,0,-1,-1,-0.013934


In [7]:
dataset.shape

(1611, 10)

In [8]:
X = dataset.drop(['Adj Close', 'Close'], axis=1)  
y = dataset['Adj Close'] 

In [9]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [10]:
from sklearn.tree import DecisionTreeRegressor  
regressor = DecisionTreeRegressor()  
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [11]:
y_pred = regressor.predict(X_test)

In [17]:
df = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})  
#print(df.head())
print(df.tail())

                 Actual    Predicted
Date                                
2014-11-17   302.666962   300.511017
2017-02-17  1080.449585  1078.813110
2015-11-05   509.265747   507.558990
2020-03-13  3952.550049  4014.522949
2016-09-22  1118.579956  1091.903687


In [13]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  

Mean Absolute Error: 15.559916564185553
Mean Squared Error: 811.1550277520276
Root Mean Squared Error: 28.480783482060804


In [14]:
print(y_test.shape)
print(y_pred.shape)

(323,)
(323,)


In [15]:
from sklearn.model_selection import cross_val_score

dt_fit = regressor.fit(X_train, y_train)
dt_scores = cross_val_score(dt_fit, X_train, y_train, cv = 5)

print("Mean cross validation score: {}".format(np.mean(dt_scores)))
print("Score without cv: {}".format(dt_fit.score(X_train, y_train)))

Mean cross validation score: 0.9991359931328935
Score without cv: 1.0


In [16]:
from sklearn.metrics import r2_score

print('r2 score:', r2_score(y_test, dt_fit.predict(X_test)))
print('Accuracy Score:', dt_fit.score(X_test, y_test))

r2 score: 0.9993590701143477
Accuracy Score: 0.9993590701143477
