# Predicción cuantitativa

### Importar las librerías

In [2]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score

### Se importa el dataset y se organiza el dataset

In [3]:
X, Y = load_boston(return_X_y=True)
col_name = ["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat"]

X = pd.DataFrame(X)
X.columns=col_name

## Siempre analizar y entender con lo que estamos trabajando

In [4]:
df = X
df["INCOME"] = Y
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,INCOME
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Se divide el set de entrenamiento (training) y de prueba (testing)

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

X_test.shape, X_train.shape

((203, 14), (303, 14))

## Usemos un modelo de SVM para la predicción

In [6]:
C = 1e3

svr_rbf = SVR(kernel="rbf", C=C, gamma=0.001).fit(X_train, Y_train)

#### Con el Training set el coef. R2 es de 0.99

In [7]:
y_rbf = svr_rbf.predict(X_train)

r2_score(Y_train, y_rbf)

0.9998939754134802

#### Con el Testing set el coef. R2 es de 0.64, esto nos dice que el modelo podría estar *overfitted* (sobreajustado) pero funciona bien con los datos de prueba

In [8]:
y_rbf_test = svr_rbf.predict(X_test)

r2_score(Y_test, y_rbf_test)

0.9005127991919668

## Compararlo con un modelo de Árbol de Decisión

![Imagen](https://sites.google.com/site/cursofpeanalistafuncional/_/rsrc/1303380940956/arboles-de-decision/00%20arboles%20de%20decision.png)
Tomado de Google Sites

In [9]:
regtree = DecisionTreeRegressor(min_samples_split=30, min_samples_leaf=10, max_depth=5, random_state=0)

In [10]:
regtree.fit(X_train,Y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=10, min_samples_split=30,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

#### El coeficiente de R2 es de 0.85 para el Decision Tree, usando el training set

In [11]:
# Usamos el Training set para hacer las predicciones
preds = regtree.predict(X_train)

r2_score(Y_train, preds)

0.9840434078426445

#### El coeficiente de R2 es de 0.76 usando el testing set

In [12]:
# Usamos el Training set para hacer las predicciones
preds = regtree.predict(X_test)

r2_score(Y_test, preds)

0.9807734752136296

## Compararlo con el modelo de Random Forests

In [13]:
forest = RandomForestRegressor(n_jobs=2, oob_score=True, n_estimators=10000)
forest.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10000, n_jobs=2, oob_score=True,
                      random_state=None, verbose=0, warm_start=False)

#### Esta vez con el training set es 0.98 a diferencia del Decision Tree que era 0.85
* Con SVM era de 0.99

In [14]:
# Usamos el Training set para hacer las predicciones
preds_train = forest.predict(X_train)

r2_score(Y_train, preds_train)

0.9998873295955403

#### Esta vez con el testing set es 0.84 a diferencia del Decision Tree que era 0.76 y SVM que era 0.76

In [15]:
# Usamos el Testing set para hacer las predicciones
preds_test = forest.predict(X_test)

r2_score(Y_test, preds_test)

0.9993186726953167

### Lo anterior nos dice que, de estos, el mejor modelo es el de Random Forest, ya que tiene un buen resultado con los datos del Training y Testing set