In [3]:

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualizaciones
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree

# Para realizar la regresión lineal y la evaluación del modelo
# -----------------------------------------------------------------------
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


from sklearn.model_selection import KFold,LeaveOneOut, cross_val_score


from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

import sys
sys.path.append("../../")
from src.soporte_metricas import (
    obtener_metricas,
    comparar_arboles
)
pd.options.display.float_format = "{:,.2f}".format
# Ignorar los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv("../../datos/02_Modelo2/06_autorenew_target_encoded.csv",index_col=0)
df.sample()

Unnamed: 0,offerType,brand,vehicleType,fuelType,gearbox,notRepairedDamage,kilometer,price,yearOfRegistration,powerCV_robust_scaler
56738,6621.23,8910.6,5641.66,5850.27,5308.59,8033.46,8128.94,4700.0,5626.24,9201.07


# Decision Tree
- El árbol de decisiones se usa para modelos no lineales
- Vamos a probarlo a ver si mejora nuestras métricas

In [5]:
X = df.drop(columns="price")
y = df[["price"]]
X_train, X_test, y_train, y_test = train_test_split(X, # Todo menos la Variable Respuesta
                                                    y, # La variable Respuesta
                                                    train_size=0.7, # Como dividir los datos en este caso 80% train, 20% test
                                                    random_state=42, # Semilla, para que lo haga igual para cualquier persona
                                                    shuffle=True 
                                                    )

In [6]:
modelo_arbol = DecisionTreeRegressor()
modelo_arbol.fit(X_train,y_train)
y_pred_test = modelo_arbol.predict(X_test)
y_pred_train = modelo_arbol.predict(X_train)
df_metricas = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricas

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.89,929.26,17810700.37,4220.27
test,0.28,2398.64,293630162.67,17135.64


In [59]:
modelo_arbol.get_depth()

35

# Mejora sustancial! pero...
- Seguimos teniendo un error bastante alto.
- Y aquí tenemos un overfitting de manual
- Vamos a paralizar este mo

Con todos estos params podemos intentar conseguir algo mucho mejor

In [8]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [20, 30, 40, 50],
    'min_samples_leaf': [5, 10, 15, 20],
    'max_leaf_nodes': [10 , 20, 30, 40]
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbol = grid_search_arbol.best_estimator_

In [9]:
modelo_final_arbol.fit(X_train,y_train)
y_pred_test = modelo_final_arbol.predict(X_test)
y_pred_train = modelo_final_arbol.predict(X_train)
df_metricas = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricas

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.54,1864.02,74980139.68,8659.11
test,0.28,2266.8,292357276.56,17098.46


In [10]:
modelo_final_arbol

Ha tardado 43 Segundos

# Que nos dice
La primera iteración dice que el mejor resultado es el siguiente:
- max_depth = 10
- max_leaf_nodes = 40
- min_samples_leaf = 20
- min_samples_split = 20

Esto no significa que sea el mejor aún nos dice que:
- max_depth = Entre 5 y 10 y 15
- max_leaf_nodes = Entre 35 y 40 y 45
- min_samples_leaf = Entre 0 y 7
- min_samples_split = Entre 15 y 20 y 25

Así que toca nueva iteración probando entre esos rangos

In [None]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    'max_leaf_nodes': [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45],
    'min_samples_leaf': [0, 1, 2, 3, 4, 5, 6, 7],
    'min_samples_split': [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv2 = grid_search_arbol.best_estimator_

In [None]:
modelo_final_arbolv2.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv2.predict(X_test)
y_pred_train = modelo_final_arbolv2.predict(X_train)
df_metricasv2 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv2

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.42,3102.95,93294234.93,9658.89
test,0.25,3275.65,307575781.75,17537.84


In [None]:
modelo_final_arbolv2

tardó 25 minutos

# Resultados
- max_depth = 9
- max_leaf_nodes = 45
- min_samples_leaf = 4
- min_samples_split = 21
### Otra iteración para ver si podemos ver el numero de nodes


In [11]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [8, 9, 10],
    'max_leaf_nodes': [45,50,55,60],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [20, 21, 22]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv3 = grid_search_arbol.best_estimator_

In [12]:
modelo_final_arbolv3.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv3.predict(X_test)
y_pred_train = modelo_final_arbolv3.predict(X_train)
df_metricasv3 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv3

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.44,2952.8,91466538.06,9563.81
test,0.25,3135.27,305711121.13,17484.6


In [13]:
modelo_final_arbolv3

# Resultados
- max_depth = 8
- max_leaf_nodes = 60
- min_samples_leaf = 4
- min_samples_split = 21

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [14]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [7, 8, 9],
    'max_leaf_nodes': [60,70,80,90,100],
    'min_samples_leaf': [4],
    'min_samples_split': [21]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv4 = grid_search_arbol.best_estimator_

In [15]:
modelo_final_arbolv4.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv4.predict(X_test)
y_pred_train = modelo_final_arbolv4.predict(X_train)
df_metricasv4 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv4

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.46,2792.96,87732214.85,9366.55
test,0.27,2971.31,299041128.4,17292.81


In [16]:
modelo_final_arbolv4

# Resultados
- max_depth = 9
- max_leaf_nodes = 100 
- min_samples_leaf = 4
- min_samples_split = 21

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [20]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [7, 8, 9],
    'max_leaf_nodes': [100,200,300,400,500],
    'min_samples_leaf': [4],
    'min_samples_split': [21]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv5 = grid_search_arbol.best_estimator_

In [21]:
modelo_final_arbolv5.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv5.predict(X_test)
y_pred_train = modelo_final_arbolv5.predict(X_train)
df_metricasv5 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv5

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.49,2498.77,83385864.38,9131.59
test,0.27,2698.13,297417081.08,17245.78


In [22]:
modelo_final_arbolv5

# Resultados
- max_depth = 9
- max_leaf_nodes = 400 
- min_samples_leaf = 4
- min_samples_split = 21

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [24]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [6, 7, 8, 9, 10, 11, 12, 13],
    'max_leaf_nodes': [350,400,450],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [20,21,22]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv6 = grid_search_arbol.best_estimator_

In [25]:
modelo_final_arbolv6.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv6.predict(X_test)
y_pred_train = modelo_final_arbolv6.predict(X_train)
df_metricasv6 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv6

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.51,2398.11,78748308.2,8874.02
test,0.28,2612.19,294796693.35,17169.64


In [26]:
modelo_final_arbolv6

# Resultados
- max_depth = 13
- max_leaf_nodes = 450
- min_samples_leaf = 4
- min_samples_split = 21

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [27]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [13,20,25, 30],
    'max_leaf_nodes': [420,450,480],
    'min_samples_leaf': [4],
    'min_samples_split': [21]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv7 = grid_search_arbol.best_estimator_

In [28]:
modelo_final_arbolv7.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv7.predict(X_test)
y_pred_train = modelo_final_arbolv7.predict(X_train)
df_metricasv7 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv7

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.52,2383.97,78555378.92,8863.15
test,0.28,2600.29,294610230.83,17164.21


In [29]:
modelo_final_arbolv7

# Resultados
- max_depth = 13
- max_leaf_nodes = 480
- min_samples_leaf = 4
- min_samples_split = 21

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [38]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [12, 13, 14,15,16],
    'max_leaf_nodes': [460,470,480,490,500,510],
    'min_samples_leaf': [4],
    'min_samples_split': [21]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv8 = grid_search_arbol.best_estimator_

In [39]:
modelo_final_arbolv8.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv8.predict(X_test)
y_pred_train = modelo_final_arbolv8.predict(X_train)
df_metricasv8 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv8

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.52,2368.62,78153888.59,8840.47
test,0.28,2586.13,294472449.22,17160.2


In [40]:
modelo_final_arbolv8

# Resultados
- max_depth = 8
- max_leaf_nodes = 80 
- min_samples_leaf = 2
- min_samples_split = 14

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [44]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [12, 13, 14,15, 16,50],
    'max_leaf_nodes': [500,510,520,530,540,550,800,1000],
    'min_samples_leaf': [4],
    'min_samples_split': [21]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv9 = grid_search_arbol.best_estimator_

In [45]:
modelo_final_arbolv9.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv9.predict(X_test)
y_pred_train = modelo_final_arbolv9.predict(X_train)
df_metricasv9 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv9

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.53,2217.94,76142670.77,8725.98
test,0.28,2468.06,293797198.62,17140.51


In [47]:
modelo_final_arbolv9

# Resultados
- max_depth = 8
- max_leaf_nodes = 80 
- min_samples_leaf = 2
- min_samples_split = 14

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [61]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [20,30,40,50],
    'max_leaf_nodes': [6000,7000,8000,9000],
    'min_samples_leaf': [4],
    'min_samples_split': [21]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv10 = grid_search_arbol.best_estimator_

In [62]:
modelo_final_arbolv10.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv10.predict(X_test)
y_pred_train = modelo_final_arbolv10.predict(X_train)
df_metricasv10 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv10

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.55,1839.39,73086541.77,8549.07
test,0.28,2266.96,292941690.1,17115.54


In [64]:
modelo_final_arbolv10

# Resultados
- max_depth = 40
- max_leaf_nodes = 8000
- min_samples_leaf = 4
- min_samples_split = 21

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [None]:
# Parámetros a evaluar
params_arbol = {
    'max_depth': [20,30,40,50],
    'max_leaf_nodes': [7200,7300,7400,8000,8200,8300],
    'min_samples_leaf': [4],
    'min_samples_split': [21]
    
}

grid_search_arbol = GridSearchCV(DecisionTreeRegressor(),
                                 param_grid= params_arbol,
                                 cv = 5, # Cross Validation de 5 grupos solo
                                 scoring="neg_mean_squared_error",
                                 n_jobs=-1)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbolv11 = grid_search_arbol.best_estimator_

In [66]:
modelo_final_arbolv11.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv11.predict(X_test)
y_pred_train = modelo_final_arbolv11.predict(X_train)
df_metricasv11 = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasv11

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.55,1839.39,73086541.77,8549.07
test,0.28,2267.46,292954548.27,17115.92


In [67]:
modelo_final_arbolv11

# Resultados
- max_depth = 40
- max_leaf_nodes = 8000
- min_samples_leaf = 4
- min_samples_split = 21



# Probemos el modelo final

In [129]:
modelo_final_arbolv11.fit(X_train,y_train)
y_pred_test = modelo_final_arbolv11.predict(X_test)
y_pred_train = modelo_final_arbolv11.predict(X_train)
df_metricasvfinal = obtener_metricas(y_train,y_pred_train,y_test,y_pred_test)
df_metricasvfinal

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.55,1839.39,73086541.77,8549.07
test,0.28,2267.44,292947501.97,17115.71


# Demasiado Overfit
- No soy capaz de reducir el árbol
- Hay demasiado overfit
- Veamos importancia columnas

In [130]:
pd.DataFrame(modelo_final_arbol.feature_importances_,columns=["Importancia"], index= X.columns).sort_values(by="Importancia",ascending=False)

Unnamed: 0,Importancia
powerCV_robust_scaler,0.37
kilometer,0.18
brand,0.17
vehicleType,0.1
yearOfRegistration,0.1
gearbox,0.05
fuelType,0.01
notRepairedDamage,0.01
offerType,0.0


# Está mejor distribuida la importancia

# Para el Modelo 3
- Eliminar columna brand
    - Es probable que al ser tantas categorías estemos siendo demasiado específicos
    - Queremos predecir precios en general, no por marca
    
- Manteniendo el cambio del Modelo 2 añadimos el eliminar columna "brand"