# Árboles de Regresión

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("../datasets/boston/Boston.csv")

In [3]:
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
data.shape

(506, 14)

In [5]:
colnames = data.columns.values.tolist()
predictors = colnames[0:13]
target = colnames[13]
X = data[predictors]
Y = data[target]

In [6]:
from sklearn.tree import DecisionTreeRegressor

In [7]:
regtree = DecisionTreeRegressor(min_samples_split=30, min_samples_leaf=10, random_state=0)

In [8]:
regtree.fit(X,Y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=10,
                      min_samples_split=30, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [9]:
preds = regtree.predict(data[predictors])

In [10]:
data["preds"] = preds

#### Cuando queremos comparar dos o varias variables de un mismo dataset ponemos doble corchete data[[.. , ..]]

In [11]:
data[["medv", "preds"]]

Unnamed: 0,medv,preds
0,24.0,22.840000
1,21.6,22.840000
2,34.7,35.247826
3,33.4,35.247826
4,36.2,35.247826
...,...,...
501,22.4,22.840000
502,20.6,20.624138
503,23.9,28.978261
504,22.0,31.170000


In [12]:
from sklearn.tree import export_graphviz
with open("../notebooks/resources/boston_rtree.dot", "w") as dotfile:
    export_graphviz(regtree, out_file=dotfile, feature_names= predictors)
    dotfile.close()
import os
from graphviz import Source
file = open("../notebooks/resources/boston_rtree.dot", "r")
text = file.read()
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
Source(text)

ModuleNotFoundError: No module named 'graphviz'

### Cross-Validation

In [13]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np

In [14]:
cv = KFold(n_splits=10, shuffle=True, random_state=1)

In [15]:
scores = cross_val_score(regtree, X, Y, scoring="neg_mean_squared_error", cv= cv, n_jobs=1)
print(scores)
score = np.mean(scores)
print(score)
## Esto nos puede dar negativo, no pasa nada, se multiplica por -1

[-13.64925886 -17.28987161 -16.98569707 -47.56954086  -9.26202865
 -17.23057023 -15.41541493 -31.33011027 -22.79877067  -9.54180723]
-20.107307036443846


In [16]:
list(zip(predictors, regtree.feature_importances_))

[('crim', 0.03421203230639308),
 ('zn', 0.0),
 ('indus', 0.0011605887788380146),
 ('chas', 0.0),
 ('nox', 0.01856163073811432),
 ('rm', 0.6308568014337028),
 ('age', 0.01725115143448847),
 ('dis', 0.0013745115995791378),
 ('rad', 0.0),
 ('tax', 0.0023698305298803803),
 ('ptratio', 0.009333247332530954),
 ('black', 0.0),
 ('lstat', 0.28488020584647283)]

## Randon Forests 

In [17]:
from sklearn.ensemble import RandomForestRegressor

#### n_jobs es el número de procesos paralelos ocurriendo
#### oob_score=True  Significa un muestreo tomando datos que no hacen parte del Bootstrap (Rama de decision) (OOB: out of bootstrap)
#### n_estimators  Es el número de árboles a hacer en el random forest

In [33]:
forest = RandomForestRegressor(n_jobs=2, oob_score=True, n_estimators= 500)
forest.fit(X,Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2,
                      oob_score=True, random_state=None, verbose=0,
                      warm_start=False)

In [34]:
data["rforest_pred"] = forest.oob_prediction_
data[["rforest_pred", "medv"]]

Unnamed: 0,rforest_pred,medv
0,28.518129,24.0
1,22.580526,21.6
2,34.119149,34.7
3,34.750000,33.4
4,34.215764,36.2
...,...,...
501,24.835260,22.4
502,18.350641,20.6
503,27.559459,23.9
504,26.690576,22.0


In [35]:
data["rforest_error2"] = (data["rforest_pred"] - data["medv"])**2
sum(data["rforest_error2"])/len(data)

10.108928265792573

In [36]:
forest.oob_score_

0.8802537145884499