In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualizaciones
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree

# Para realizar la regresión lineal y la evaluación del modelo
# -----------------------------------------------------------------------
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


from sklearn.model_selection import KFold,LeaveOneOut, cross_val_score


from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

import sys
sys.path.append("../../")
from src.soporte_metricas import (
    obtener_metricas,
    comparar_arbol,
    comparar_arboles
)
# Render floats with thousands separators and two decimals in every displayed table.
pd.options.display.float_format = "{:,.2f}".format
# Silence warnings
# -----------------------------------------------------------------------
# NOTE(review): a blanket 'ignore' filter hides every warning, including library
# warnings about problems while fitting models — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [34]:
# Load the target-encoded dataset; the first CSV column is used as the index.
df = pd.read_csv(
    "../../datos/01_Modelo1/06_autorenew_target_encoded.csv",
    index_col=0,
)
# Peek at one random row as a sanity check.
df.sample()

Unnamed: 0,offerType,brand,vehicleType,fuelType,gearbox,notRepairedDamage,kilometer,price,yearOfRegistration,powerCV_robust_scaler
218702,6621.41,9871.3,5641.86,5850.35,5308.77,3628.83,4091.7,1590.0,2173.38,6650.76


# Decision Tree
- El árbol de decisiones se usa para modelos no lineales
- Vamos a probarlo a ver si mejora nuestras métricas

In [35]:
# Predictors: every column except the response variable.
X = df.drop(columns="price")
# Response variable, kept as a one-column DataFrame.
y = df[["price"]]

# Hold-out split: 70 % of the rows for training, the remaining 30 % for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,  # fixed seed so everybody gets the same partition
    shuffle=True,
)

In [36]:
# Baseline: a decision tree with default hyperparameters (no growth limits).
# random_state is fixed because scikit-learn randomly permutes the features at
# each split, so ties between equally good splits — and therefore the metrics —
# can change from run to run. The train/test split above is already seeded.
modelo_arbol = DecisionTreeRegressor(random_state=42)
modelo_arbol.fit(X_train, y_train)

# Predict on both partitions so train and test error can be compared.
y_pred_train = modelo_arbol.predict(X_train)
y_pred_test = modelo_arbol.predict(X_test)

df_metricas = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricas

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.9,825.78,18369852.25,4286.01
test,0.62,2165.1,130911837.67,11441.67


# Mejora sustancial! pero...
- Seguimos teniendo un error bastante alto.
- Y aquí tenemos un overfitting de manual
- Vamos a realizar varios Decision Tree con custom parameters, a ver si podemos tener mejores resultados:
    - max_depth: Profundidad máxima del árbol, es decir, el número máximo de niveles de división desde la raíz hasta una hoja.
    - min_samples_split: Mínimo de datos que debe tener un nodo para poder seguir dividiéndose.
    - min_samples_leaf: Mínimo de datos que debe quedar en cada hoja resultante de una división.
    - max_leaf_nodes: Número máximo de nodos finales (hojas).

Con todos estos params podemos intentar conseguir algo mucho mejor

In [33]:
# First pass: coarse hyperparameter grid.
params_arbol = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [20, 30, 40, 50],
    'min_samples_leaf': [5, 10, 15, 20],
    'max_leaf_nodes': [10, 20, 30, 40],
}

# The tree is seeded so that ties between equally good splits are broken the
# same way on every run; otherwise the winning combination can differ run to run.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on all of X_train.
modelo_final_arbolv1 = grid_search_arbol.best_estimator_

In [34]:
# Fit the v1 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbolv1.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbolv1.predict(X_train)
y_pred_test = modelo_final_arbolv1.predict(X_test)

df_metricasv1 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricasv1

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.5,3032.97,93735196.43,9681.69
test,0.46,3054.35,188102431.61,13715.04


In [36]:
# Display the estimator chosen by the first grid search (shows its hyperparameters).
modelo_final_arbolv1

Ha tardado 43 Segundos

# Que nos dice
La primera iteración dice que el mejor resultado es el siguiente:
- max_depth = 10
- max_leaf_nodes = 40
- min_samples_leaf = 5
- min_samples_split = 20

Esto no significa que sea el mejor; solo nos indica en qué rangos seguir buscando:
- max_depth = Entre 5 y 10
- max_leaf_nodes = Entre 30 y 40
- min_samples_leaf = Entre 1 y 5 (el mínimo válido es 1)
- min_samples_split = Entre 10 y 20

Así que toca nueva iteración probando entre esos rangos

In [37]:
# Second pass: finer grid.
# NOTE(review): the ranges for min_samples_split and max_leaf_nodes look swapped
# relative to the ranges derived from the first pass; they are kept as
# originally run so the recorded results still correspond to this cell.
params_arbol = {
    'max_depth': [5, 6, 7, 8, 9, 10],
    'min_samples_split': list(range(30, 41)),
    # 0 was removed: an integer min_samples_leaf must be >= 1, so every candidate
    # with 0 could only fail to fit, wasting one sixth of the search time.
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_leaf_nodes': list(range(10, 21)),
}

# The tree is seeded so that ties between equally good splits are broken the
# same way on every run; otherwise the winning combination can differ run to run.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on all of X_train.
modelo_final_arbolv2 = grid_search_arbol.best_estimator_

In [38]:
# Fit the v2 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbolv2.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbolv2.predict(X_train)
y_pred_test = modelo_final_arbolv2.predict(X_test)

df_metricasv2 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricasv2

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.52,3349.46,90355979.0,9505.58
test,0.71,3340.02,100678838.89,10033.88


In [39]:
# Display the estimator chosen by the second grid search (shows its hyperparameters).
modelo_final_arbolv2

tardó 8 minutos

# Resultados
- max_depth = 6
- max_leaf_nodes = 20
- min_samples_leaf = 2
- min_samples_split = 30
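### Otra iteración para ver si podemos afinar el número de splits
- Aparte, creo que he confundido valores de lugar: en la búsqueda anterior los rangos de min_samples_split (30–40) y max_leaf_nodes (10–20) quedaron intercambiados respecto a lo previsto. Tardando lo que tarda, me quedo con los avances y uso los rangos correctos en esta iteración.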

In [40]:
# Third pass: narrow depth/leaf size, sweep min_samples_split and max_leaf_nodes.
params_arbol = {
    'max_depth': [5, 6, 7],
    'min_samples_split': list(range(10, 21)),
    'min_samples_leaf': [1, 2, 3],
    'max_leaf_nodes': list(range(30, 41)),
}

# The tree is seeded so that ties between equally good splits are broken the
# same way on every run; otherwise the winning combination can differ run to run.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on all of X_train.
modelo_final_arbolv3 = grid_search_arbol.best_estimator_

In [41]:
# Fit the v3 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbolv3.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbolv3.predict(X_train)
y_pred_test = modelo_final_arbolv3.predict(X_test)

df_metricasv3 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricasv3

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.57,2967.89,81755639.08,9041.88
test,0.73,2961.97,93061255.71,9646.83


In [42]:
# Display the estimator chosen by the third grid search (shows its hyperparameters).
modelo_final_arbolv3

# Resultados
- max_depth = 7
- max_leaf_nodes = 40
- min_samples_leaf = 2
- min_samples_split = 14

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [43]:
# Fourth pass: min_samples_leaf pinned to 2; re-check max_depth and max_leaf_nodes.
params_arbol = {
    'max_depth': [5, 6, 7, 8, 9, 10],
    'min_samples_split': [12, 13, 14, 15, 16],
    'min_samples_leaf': [2],
    'max_leaf_nodes': list(range(35, 46)),
}

# The tree is seeded so that ties between equally good splits are broken the
# same way on every run; otherwise the winning combination can differ run to run.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on all of X_train.
modelo_final_arbolv4 = grid_search_arbol.best_estimator_

In [44]:
# Fit the v4 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbolv4.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbolv4.predict(X_train)
y_pred_test = modelo_final_arbolv4.predict(X_test)

df_metricasv4 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricasv4

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.57,2938.97,80055542.75,8947.38
test,0.73,2939.91,92226690.48,9603.47


# Resultados
- max_depth = 8
- max_leaf_nodes = 45 
- min_samples_leaf = 2
- min_samples_split = 16

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [45]:
# Fifth pass: the previous best max_leaf_nodes sat on the grid's upper edge,
# so explore larger values in steps of 10.
params_arbol = {
    'max_depth': [5, 6, 7, 8, 9, 10],
    'min_samples_split': [12, 13, 14, 15, 16],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [50, 60, 70, 80, 90, 100],
}

# The tree is seeded so that ties between equally good splits are broken the
# same way on every run; otherwise the winning combination can differ run to run.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on all of X_train.
modelo_final_arbolv5 = grid_search_arbol.best_estimator_

In [46]:
# Fit the v5 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbolv5.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbolv5.predict(X_train)
y_pred_test = modelo_final_arbolv5.predict(X_test)

df_metricasv5 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricasv5

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2761.5,74874242.51,8652.99
test,0.74,2774.97,90361342.78,9505.86


# Resultados
- max_depth = 8
- max_leaf_nodes = 80 
- min_samples_leaf = 2
- min_samples_split = 14

### Nueva iteración
- Quiero comprobar max_depth y max_leaf_nodes

In [37]:
# Final pass: sweep max_leaf_nodes one by one from 70 to 90 around the previous best.
params_arbol = {
    'max_depth': [6, 7, 8, 9],
    'min_samples_split': [12, 13, 14, 15, 16],
    'min_samples_leaf': [2],
    'max_leaf_nodes': list(range(70, 91)),
}

# The tree is seeded so that ties between equally good splits are broken the
# same way on every run; otherwise the winning combination can differ run to run.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on all of X_train.
modelo_final_arbolvfinal = grid_search_arbol.best_estimator_

In [38]:
# Display the estimator chosen by the last grid search (shows its hyperparameters).
modelo_final_arbolvfinal

# Resultado Final!
- max_depth = 8
- max_leaf_nodes = 81
- min_samples_leaf = 2
- min_samples_split = 14

### Vamos a ver el top 5

In [39]:
# Tabulate the cross-validation results of the last search, best-ranked first.
df_params = (
    pd.DataFrame(grid_search_arbol.cv_results_)
    .sort_values(by="rank_test_score", ascending=True)
)
# Top five candidates.
df_params.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_leaf_nodes,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
267,0.43,0.03,0.01,0.0,8,81,2,14,"{'max_depth': 8, 'max_leaf_nodes': 81, 'min_sa...",-53506760.13,-93385440.29,-135693238.07,-75586265.86,-93650789.89,-90364498.85,27024098.39,1
262,0.41,0.01,0.01,0.0,8,80,2,14,"{'max_depth': 8, 'max_leaf_nodes': 80, 'min_sa...",-53548371.01,-93402651.49,-135654758.04,-75525398.5,-93712819.13,-90368799.63,27008425.62,2
272,0.41,0.01,0.01,0.0,8,82,2,14,"{'max_depth': 8, 'max_leaf_nodes': 82, 'min_sa...",-53465647.07,-93327530.33,-135815938.7,-75645862.62,-93598429.53,-90370681.65,27067451.2,3
263,0.42,0.02,0.01,0.0,8,80,2,15,"{'max_depth': 8, 'max_leaf_nodes': 80, 'min_sa...",-53548371.01,-93402651.49,-135730502.58,-75525398.5,-93712819.13,-90383948.54,27033831.37,4
253,0.41,0.02,0.01,0.0,8,78,2,15,"{'max_depth': 8, 'max_leaf_nodes': 78, 'min_sa...",-53655773.55,-93158324.77,-135701799.83,-75555826.73,-93861156.0,-90386576.18,26990101.08,5


# Probemos el modelo final

In [40]:
# Fit the final-search estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbolvfinal.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbolvfinal.predict(X_train)
y_pred_test = modelo_final_arbolvfinal.predict(X_test)

df_metricasvfinal = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricasvfinal

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2754.78,74819432.18,8649.82
test,0.74,2768.15,90309297.79,9503.12


# Comparemos el modelo Final
Lo comparamos con el resto

# Guardar Resultados Arboles
Para no volver a ejecutarlo

In [41]:
# Persist every metrics table so the grid searches do not have to be re-run.
# The final model's table is stored under the "06_metricasv6.csv" file name.
for _tabla, _ruta in (
    (df_metricasv1, "../../datos/01_Modelo1/BackupDecisionTree/01_metricasv1.csv"),
    (df_metricasv2, "../../datos/01_Modelo1/BackupDecisionTree/02_metricasv2.csv"),
    (df_metricasv3, "../../datos/01_Modelo1/BackupDecisionTree/03_metricasv3.csv"),
    (df_metricasv4, "../../datos/01_Modelo1/BackupDecisionTree/04_metricasv4.csv"),
    (df_metricasv5, "../../datos/01_Modelo1/BackupDecisionTree/05_metricasv5.csv"),
    (df_metricasvfinal, "../../datos/01_Modelo1/BackupDecisionTree/06_metricasv6.csv"),
):
    _tabla.to_csv(_ruta)

Leerlos

In [45]:
# Reload the saved metrics tables; the first CSV column (train/test) is the index.
df_metricasv1 = pd.read_csv(
    "../../datos/01_Modelo1/BackupDecisionTree/01_metricasv1.csv",
    index_col=0,
)
df_metricasv2 = pd.read_csv(
    "../../datos/01_Modelo1/BackupDecisionTree/02_metricasv2.csv",
    index_col=0,
)
df_metricasv3 = pd.read_csv(
    "../../datos/01_Modelo1/BackupDecisionTree/03_metricasv3.csv",
    index_col=0,
)
df_metricasv4 = pd.read_csv(
    "../../datos/01_Modelo1/BackupDecisionTree/04_metricasv4.csv",
    index_col=0,
)
df_metricasv5 = pd.read_csv(
    "../../datos/01_Modelo1/BackupDecisionTree/05_metricasv5.csv",
    index_col=0,
)
# The final model's metrics live in the "v6" file.
df_metricasvfinal = pd.read_csv(
    "../../datos/01_Modelo1/BackupDecisionTree/06_metricasv6.csv",
    index_col=0,
)

In [43]:
# Side-by-side comparison of the first search's metrics against the final model's.
comparar_arboles(
    df_metricasv1,
    df_metricasvfinal,
    lista_previos=False,
    nombre_modelo="modelo 1",
)

Unnamed: 0,modelo,entrenamiento,r2_score,MAE,MSE,RMSE
0,modelo 1,train,0.5,3032.97,93735196.43,9681.69
1,modelo 1,test,0.46,3054.35,188102431.61,13715.04
0,modelo final,train,0.6,2754.78,74819432.18,8649.82
1,modelo final,test,0.74,2768.15,90309297.79,9503.12


In [46]:
# Metrics of every earlier iteration, in the order they were run.
lista_dfs = [
    df_metricasv1,
    df_metricasv2,
    df_metricasv3,
    df_metricasv4,
    df_metricasv5,
]
# Compare all of them against the final model in a single table.
comparar_arboles(
    lista_dfs,
    df_metricasvfinal,
    lista_previos=True,
)

Unnamed: 0,modelo,entrenamiento,r2_score,MAE,MSE,RMSE
0,modelo 0,train,0.5,3032.97,93735196.43,9681.69
1,modelo 0,test,0.46,3054.35,188102431.61,13715.04
0,modelo 1,train,0.52,3349.46,90355979.0,9505.58
1,modelo 1,test,0.71,3340.02,100678838.89,10033.88
0,modelo 2,train,0.57,2967.89,81755639.08,9041.88
1,modelo 2,test,0.73,2961.97,93061255.71,9646.83
0,modelo 3,train,0.57,2938.97,80055542.75,8947.38
1,modelo 3,test,0.73,2939.91,92226690.48,9603.47
0,modelo 4,train,0.6,2761.5,74874242.51,8652.99
1,modelo 4,test,0.74,2774.97,90361342.78,9505.86


# Hay buena mejora
Pero creo que podemos aumentar el número de particiones (folds) de la validación cruzada, a ver si baja el error usando el árbol que hemos visto que es mejor.
Usaremos CV de 10, 50, 100, 500 y 1000 particiones.

- max_depth = 8
- max_leaf_nodes = 81
- min_samples_leaf = 2
- min_samples_split = 14


In [51]:
# Single-candidate grid: the best hyperparameters found so far.
params_arbol = {
    'max_depth': [8],
    'min_samples_split': [14],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [81],
}

# The tree is seeded so repeated runs build exactly the same model.
# NOTE: with one candidate, the fold count cannot change best_estimator_ (it is
# always this candidate refit on all of X_train); cv only affects the
# cross-validated score, available as grid_search_arbol.best_score_.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=10,  # 10-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbol_cv10 = grid_search_arbol.best_estimator_
modelo_final_arbol_cv10

In [57]:
# Fit the cv=10 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbol_cv10.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbol_cv10.predict(X_train)
y_pred_test = modelo_final_arbol_cv10.predict(X_test)

df_metricas_cv10 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricas_cv10

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2754.78,74819432.18,8649.82
test,0.74,2768.15,90309297.79,9503.12


In [52]:
# Single-candidate grid: the best hyperparameters found so far.
params_arbol = {
    'max_depth': [8],
    'min_samples_split': [14],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [81],
}

# The tree is seeded so repeated runs build exactly the same model.
# NOTE: with one candidate, the fold count cannot change best_estimator_ (it is
# always this candidate refit on all of X_train); cv only affects the
# cross-validated score, available as grid_search_arbol.best_score_.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=50,  # 50-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbol_cv50 = grid_search_arbol.best_estimator_
modelo_final_arbol_cv50

In [58]:
# Fit the cv=50 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbol_cv50.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbol_cv50.predict(X_train)
y_pred_test = modelo_final_arbol_cv50.predict(X_test)

df_metricas_cv50 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricas_cv50

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2754.78,74819432.18,8649.82
test,0.74,2768.15,90309297.79,9503.12


In [54]:
# Single-candidate grid: the best hyperparameters found so far.
params_arbol = {
    'max_depth': [8],
    'min_samples_split': [14],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [81],
}

# The tree is seeded so repeated runs build exactly the same model.
# NOTE: with one candidate, the fold count cannot change best_estimator_ (it is
# always this candidate refit on all of X_train); cv only affects the
# cross-validated score, available as grid_search_arbol.best_score_.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=100,  # 100-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbol_cv100 = grid_search_arbol.best_estimator_
modelo_final_arbol_cv100

In [59]:
# Fit the cv=100 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbol_cv100.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbol_cv100.predict(X_train)
y_pred_test = modelo_final_arbol_cv100.predict(X_test)

df_metricas_cv100 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricas_cv100

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2754.78,74819432.18,8649.82
test,0.74,2768.15,90309297.79,9503.12


In [55]:
# Single-candidate grid: the best hyperparameters found so far.
params_arbol = {
    'max_depth': [8],
    'min_samples_split': [14],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [81],
}

# The tree is seeded so repeated runs build exactly the same model.
# NOTE: with one candidate, the fold count cannot change best_estimator_ (it is
# always this candidate refit on all of X_train); cv only affects the
# cross-validated score, available as grid_search_arbol.best_score_.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=500,  # 500-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbol_cv500 = grid_search_arbol.best_estimator_
modelo_final_arbol_cv500

In [60]:
# Fit the cv=500 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbol_cv500.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbol_cv500.predict(X_train)
y_pred_test = modelo_final_arbol_cv500.predict(X_test)

df_metricas_cv500 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricas_cv500

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2754.78,74819432.18,8649.82
test,0.74,2768.15,90309297.79,9503.12


In [56]:
# Single-candidate grid: the best hyperparameters found so far.
params_arbol = {
    'max_depth': [8],
    'min_samples_split': [14],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [81],
}

# The tree is seeded so repeated runs build exactly the same model.
# NOTE: with one candidate, the fold count cannot change best_estimator_ (it is
# always this candidate refit on all of X_train); cv only affects the
# cross-validated score, available as grid_search_arbol.best_score_.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=1000,  # 1000-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
modelo_final_arbol_cv1000 = grid_search_arbol.best_estimator_
modelo_final_arbol_cv1000

In [61]:
# Fit the cv=1000 estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbol_cv1000.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbol_cv1000.predict(X_train)
y_pred_test = modelo_final_arbol_cv1000.predict(X_test)

df_metricas_cv1000 = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricas_cv1000

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2754.78,74819432.18,8649.82
test,0.74,2768.15,90309297.79,9503.12


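# Subir el CV no vale de nada
Las métricas son idénticas en todos los casos, y es lo esperable: con una única combinación de hiperparámetros, GridSearchCV siempre devuelve el mismo árbol reentrenado con todo X_train, así que el número de particiones solo cambia la estimación de validación cruzada (best_score_), no el modelo final.
Así que, nuestro modelo final es el siguiente: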

In [63]:
# Final model: the best hyperparameters found across all search passes.
params_arbol = {
    'max_depth': [8],
    'min_samples_split': [14],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [81],
}

# The tree is seeded so the final model is exactly reproducible: scikit-learn
# permutes features at each split, and ties would otherwise be broken at random.
grid_search_arbol = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params_arbol,
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,  # use every available core
)

grid_search_arbol.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on all of X_train.
modelo_final_arbol = grid_search_arbol.best_estimator_
modelo_final_arbol

In [64]:
# Fit the final estimator on the training partition.
# NOTE(review): GridSearchCV (default refit=True) already returns it fitted on
# this same data, so this call repeats that work.
modelo_final_arbol.fit(X_train, y_train)

# Score both partitions to compare train vs. test error.
y_pred_train = modelo_final_arbol.predict(X_train)
y_pred_test = modelo_final_arbol.predict(X_test)

df_metricas_final = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metricas_final

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.6,2754.78,74819432.18,8649.82
test,0.74,2768.15,90309297.79,9503.12


# Comparemos con el modelo Linear

In [67]:
# Reference model: ordinary linear regression trained on the same split.
modelo_lineal = LinearRegression()
modelo_lineal.fit(X_train, y_train)

# Score both partitions with the same metrics helper used for the trees.
y_pred_train = modelo_lineal.predict(X_train)
y_pred_test = modelo_lineal.predict(X_test)

df_metrica_lineal = obtener_metricas(y_train, y_pred_train, y_test, y_pred_test)
df_metrica_lineal

Unnamed: 0,r2_score,MAE,MSE,RMSE
train,0.34,3402.4,124031407.03,11136.94
test,0.23,3436.13,265812831.86,16303.77


In [68]:
# Linear regression metrics next to the final decision tree's metrics.
comparar_arboles(
    df_metrica_lineal,
    df_metricas_final,
    lista_previos=False,
    nombre_modelo="Regresión Lineal",
)

Unnamed: 0,modelo,entrenamiento,r2_score,MAE,MSE,RMSE
0,Regresión Lineal,train,0.34,3402.4,124031407.03,11136.94
1,Regresión Lineal,test,0.23,3436.13,265812831.86,16303.77
0,modelo final,train,0.6,2754.78,74819432.18,8649.82
1,modelo final,test,0.74,2768.15,90309297.79,9503.12


# Vemos buenas señales
- Hemos aumentado R2 de forma drástica
- Hemos reducido el error bastante también
- Esto indica que el modelo es más preciso
- Sin embargo el error sigue siendo muy amplio
- Investiguemos:

In [73]:
# Feature importances of the final tree, one row per predictor, largest first.
pd.DataFrame(
    modelo_final_arbol.feature_importances_,
    columns=["Importancia"],
    index=X.columns,
).sort_values(by="Importancia", ascending=False)

Unnamed: 0,Importancia
powerCV_robust_scaler,0.56
brand,0.17
kilometer,0.11
yearOfRegistration,0.07
gearbox,0.04
vehicleType,0.03
fuelType,0.01
notRepairedDamage,0.0
offerType,0.0


# Eliminar columnas
Viendo la importancia de las columnas, podemos prescindir de:
- notRepairedDamage
- offerType

El resto aportan valor 

### Posible Nuevo Modelo 1
- Rellenar nulos con Random Forest
- Eliminar las dos columnas previamente mencionadas
### Posible Nuevo Modelo 2
- Gestionar nulos de forma distinta (cargarme menos datos)
- Usar KnnImputer para los nulos
### Posible Nuevo Modelo 3
-
