In [None]:
import numpy as np
import missingno as msno
import pandas as pd
from pandas import Series, DataFrame
from pandas.api.types import is_string_dtype, is_numeric_dtype  
import matplotlib.pyplot as plt
from matplotlib import rcParams
from collections import Counter
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from datetime import datetime
import datetime as dt
from sklearn import metrics
from sklearn.model_selection import cross_val_score


import utils as u

In [None]:
%matplotlib inline 
rcParams['figure.figsize'] = 5,4 

#### Prediccion del valor de automoviles
Se leen los datos iniciales para conocer el numero de registros y variables.

In [None]:
# Read the working dataset from the CSV file located next to the notebook.
df_inicial = pd.read_csv(filepath_or_buffer="datos_para_trabajo.csv")

# Report the (rows, columns) shape, then preview the first three records.
print("Numero de registros y variables: ", df_inicial.shape)
df_inicial.head(n=3)

Datos faltantes: en principio no existen ya que en el encabezado del problema se indico que el conjunto de datos tiene una limpieza previa

In [None]:
# Switch matplotlib to the ggplot style; this stays active for later figures too.
plt.style.use('ggplot')

# Text summary first: number of null entries in every column.
print("Datos faltantes")
print(df_inicial.isna().sum())

# Then the missingno bar chart for the same frame.
msno.bar(df_inicial, color="dodgerblue", figsize=(10, 5), fontsize=12)

Se imprime el tipo de datos de cada variable

In [None]:
# Show the dtype pandas inferred for each column when reading the CSV.
df_inicial.dtypes

Separamos las variables en numericas y categoricas

In [None]:
# Partition the column names by dtype: numeric columns vs. string-typed
# (categorical) columns. A column that is neither ends up in neither list.
num_list = [
    column for column in df_inicial if is_numeric_dtype(df_inicial[column])
]
cat_list = [
    column
    for column in df_inicial
    if not is_numeric_dtype(df_inicial[column])
    and is_string_dtype(df_inicial[column])
]
print('Categoricas: ', cat_list)
print('Numericas: ', num_list)

A continuacion se analizan las correlaciones existentes entre las variables numericas

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap whose
# colour scale is fixed to [-1, 1] and centred on 0.
corr = df_inicial.loc[:, num_list].corr()
ax = sns.heatmap(
    data=corr,
    cmap=sns.diverging_palette(20, 220, n=200),
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
)
# Tilt the x tick labels so the column names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

Se observa que las variables city_mpg y highway_mpg estan muy correlacionadas entre si, se puede seleccionar una y continuar con el analisis o en este caso se va a calcular una nueva variable.
Las variables peak_rpm, compression_ratio, stroke y height son las que menos se correlacionan con la variable objetivo, podrian quitarse del conjunto de datos.

### Feature engineering
Se crea una nueva variable que pondere con el mismo porcentaje el consumo de combustible en ciudad y carretera

In [None]:
# Combined fuel economy: city and highway mpg weighted 50/50 (their plain average).
df_inicial['fuel_mpg'] = 0.50 * (df_inicial['city_mpg'] + df_inicial['highway_mpg'])

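Quitamos de la lista de variables numericas las menos correlacionadas con la variable objetivo: stroke, compression_ratio, height y peak_rpm.

Se retiran tambien city_mpg y highway_mpg, ya que estan muy correlacionadas entre si y quedan representadas por la nueva variable fuel_mpg.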

In [None]:

# Numeric columns left out of the second correlation pass: the four weakly
# correlated ones plus the two mpg columns that fuel_mpg replaces.
columnas_descartadas = {
    'stroke', 'compression_ratio', 'height', 'peak_rpm', 'city_mpg', 'highway_mpg',
}

# Build the list in one step instead of append()/remove() calls, so re-running the
# cell neither raises ValueError (name already removed) nor duplicates 'fuel_mpg'.
num_list = [
    column
    for column in num_list
    if column not in columnas_descartadas and column != 'fuel_mpg'
]
num_list.append('fuel_mpg')

corr = df_inicial[num_list].corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Tilt the x tick labels so the column names do not overlap.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Display the correlation matrix most recently stored in `corr`.
corr

Se analizan las variables categoricas y si existe una correlacion con la variable objetivo

In [None]:
# lnprice distribution for each make; the rows are sorted by lnprice before plotting.
g = sns.boxplot(
    data=df_inicial.sort_values(by='lnprice'),
    x='make',
    y='lnprice',
)
g.set_xlabel('make', size=14)
g.set_ylabel('lnprice', size=14)
# Vertical tick labels: there are many makes on the x axis.
g.set_xticklabels(g.get_xticklabels(), rotation=90);

In [None]:
# Number of cars per make.
Counter(df_inicial['make'])

Dado que algunas marcas tienen muy pocos autos, se pueden agrupar; ademas, en la grafica se observa que un grupo de marcas tiene mayor precio. Se agrupan las marcas en 3 categorias (economico, medio, costoso) segun el lnprice promedio de cada marca.

In [None]:
# Group the makes into three price tiers based on each make's average lnprice.
# NOTE(review): the per-make mean is computed on the full dataset, before the
# train/test split, so this feature carries target information from the test rows.
table = df_inicial.groupby('make')['lnprice'].mean()

# Broadcast each make's mean back onto its rows. Series.map keeps df_inicial's own
# index, so the values line up row by row; the previous merge renumbered the rows
# and only matched when df_inicial had a default 0..n-1 index.
lnprice_por_marca = df_inicial['make'].map(table)

# Half-open intervals [-2, -0.5), [-0.5, 1), [1, 3). A mean outside [-2, 3) is
# left as NaN by pd.cut.
bins = [-2, -0.5, 1, 3]
cars_bin = ['economico', 'medio', 'costoso']
df_inicial['price_range'] = pd.cut(lnprice_por_marca, bins, right=False, labels=cars_bin)
df_inicial.head()

In [None]:
# lnprice distribution for each price tier; rows are sorted by lnprice before plotting.
g = sns.boxplot(x='price_range', y='lnprice', data=df_inicial.sort_values(by=['lnprice']))
g.set_ylabel('lnprice', size=14)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# The x axis shows price_range; the label previously read 'fuel_type' (copy-paste slip).
g.set_xlabel('price_range', size=14);

Se grafican las demas variables, se pondra atencion en como varian las medias en cada categoria y si se pueden calcular nuevas variables

In [None]:
# lnprice distribution for each fuel type; rows are sorted by lnprice before plotting.
g = sns.boxplot(
    data=df_inicial.sort_values(by='lnprice'),
    x='fuel_type',
    y='lnprice',
)
g.set_xlabel('fuel_type', size=14)
g.set_ylabel('lnprice', size=14)
g.set_xticklabels(g.get_xticklabels(), rotation=90);

In [None]:
# lnprice distribution for each aspiration type; rows are sorted by lnprice before plotting.
g = sns.boxplot(
    data=df_inicial.sort_values(by='lnprice'),
    x='aspiration',
    y='lnprice',
)
g.set_xlabel('aspiration', size=14)
g.set_ylabel('lnprice', size=14)
g.set_xticklabels(g.get_xticklabels(), rotation=90);

In [None]:
# lnprice distribution for each body style; rows are sorted by lnprice before plotting.
g = sns.boxplot(
    data=df_inicial.sort_values(by='lnprice'),
    x='body_style',
    y='lnprice',
)
g.set_xlabel('body_style', size=14)
g.set_ylabel('lnprice', size=14)
g.set_xticklabels(g.get_xticklabels(), rotation=90);

In [None]:
# lnprice distribution for each engine location; rows are sorted by lnprice before plotting.
g = sns.boxplot(
    data=df_inicial.sort_values(by='lnprice'),
    x='engine_location',
    y='lnprice',
)
g.set_xlabel('engine_location', size=14)
g.set_ylabel('lnprice', size=14)
g.set_xticklabels(g.get_xticklabels(), rotation=90);

In [None]:
# lnprice distribution for each cylinder count; rows are sorted by lnprice before plotting.
g = sns.boxplot(
    data=df_inicial.sort_values(by='lnprice'),
    x='num_of_cylinders',
    y='lnprice',
)
g.set_xlabel('num_of_cylinders', size=14)
g.set_ylabel('lnprice', size=14)
g.set_xticklabels(g.get_xticklabels(), rotation=90);

In [None]:
# lnprice distribution for each fuel system; rows are sorted by lnprice before plotting.
g = sns.boxplot(x='fuel_system', y='lnprice', data=df_inicial.sort_values(by=['lnprice']))
g.set_ylabel('lnprice', size=14)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# The x axis shows fuel_system; the label previously read 'make' (copy-paste slip).
g.set_xlabel('fuel_system', size=14);

Se crea un dataset con las variables con las que se continua al modelado

In [None]:
# Final variables: drop the columns that do not continue to the modelling stage.
columnas_descartadas = [
    'city_mpg', 'highway_mpg', 'engine_type', 'drive_wheels', 'num_of_doors',
    'stroke', 'compression_ratio', 'height', 'peak_rpm', 'make',
]
# Rebind instead of dropping in place, and ignore names that are already gone, so
# re-running this cell is a no-op rather than a KeyError.
df_inicial = df_inicial.drop(columns=columnas_descartadas, errors='ignore')

In [None]:
# List the columns that remain in df_inicial.
df_inicial.columns

In [None]:
# Modelling frame: the selected predictors plus the target (lnprice).
# .copy() makes df_modelo an independent frame, so later column assignments on it
# do not trigger SettingWithCopyWarning or write into df_inicial.
df_modelo = df_inicial[['fuel_type', 'aspiration', 'body_style', 'engine_location',
       'wheel_base', 'length', 'width', 'curb_weight', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'bore', 'horsepower',
       'fuel_mpg', 'price_range', 'lnprice']].copy()
print("Numero de filas y variables final: " , df_modelo.shape)
df_modelo.head(5)

Se transforman las variables categoricas a codigos enteros

In [None]:
lst_cat = ['fuel_type', 'aspiration', 'body_style', 'engine_location', 'num_of_cylinders', 'fuel_system','price_range']

# Replace every categorical column with its integer category code.
# assign() returns a new frame, so nothing is written into a possible view of
# df_inicial (which is what raised SettingWithCopyWarning in the column-by-column
# version). Missing values get the code -1.
df_modelo = df_modelo.assign(
    **{col: df_modelo[col].astype('category').cat.codes for col in lst_cat}
)

df_modelo.head(3)

#### Modelos

In [None]:
# Target: lnprice. Predictors: every other column of the modelling frame.
y = df_modelo.loc[:, 'lnprice']
X = df_modelo.drop(columns='lnprice')

Se divide en conjunto de train y test (75 - 25)

In [None]:
# 75/25 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2402)

# Compare the target distribution of both splits side by side.
# Both panels are created in a single subplots() call: the earlier
# subplots() + subplot(1, 2, k) mix also created a full-figure Axes, which
# recent Matplotlib versions keep underneath the two panels.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(10, 6))
sns.boxplot(y=y_train, ax=ax_train)
ax_train.set_title('train')
sns.boxplot(y=y_test, ax=ax_test)
ax_test.set_title('test');

### Modelo inicial - linea base

In [None]:
# Baseline model: a seeded random forest with 1000 trees, trained on the training split.
rf = RandomForestRegressor(random_state=2402, n_estimators=1000)
rf.fit(X=X_train, y=y_train)

In [None]:
# Banner, then the baseline forest is passed to the utils reporting helper together
# with both splits.
print('-----------', 'RFR - linea base', '-----------', sep='\n')
u.launch_model('RFR - linea base', rf, X_train, y_train, X_test, y_test);

In [None]:
# Result plots for the baseline forest on the test split (helper defined in utils).
u.graficas_resultados(X_test,y_test, rf)

Mejorando el modelo

In [None]:
from pprint import pprint

# Fresh, unfitted forest used only to list the available hyperparameters and their
# current values. Note that this rebinds `rf`, replacing the fitted baseline model.
rf = RandomForestRegressor(random_state=2402)
print('Parameters currently in use:\n')
pprint(rf.get_params())

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized hyperparameter search.
# Number of trees: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Features considered at each split. 1.0 means "all features", which is what the
# former 'auto' option meant for regressors; 'auto' was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 on.
max_features = [1.0, 'sqrt']
# Maximum tree depth: 10, 20, ..., 110, plus None (unlimited).
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum samples required to split a node / to remain in a leaf.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample or on the whole training set.
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

Se usa una busqueda aleatoria sobre la grilla para encontrar los mejores parametros; se usa cross validation con 3 folds

In [None]:
# Randomized search: 100 sampled parameter combinations, 3-fold cross-validation.
# The estimator is seeded too; seeding only the search (random_state=42) fixes which
# combinations are sampled but not the forests themselves, so the chosen parameters
# would change from run to run.
rf = RandomForestRegressor(random_state=2402)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
# With the default refit=True, the search has already re-trained the winning
# configuration on the whole of X_train, so best_estimator_ is ready to use.
# Fitting it again only repeated that work and, for an unseeded forest, replaced
# the selected model with a different random one.
best_random = rf_random.best_estimator_
best_random

In [None]:
# Banner, then the randomized-search winner is passed to the utils reporting helper
# together with both splits.
print('-----------', 'RFR - best random', '-----------', sep='\n')
u.launch_model('RFR - best random', best_random, X_train, y_train, X_test, y_test);

In [None]:
# Result plots for the randomized-search winner on the test split (helper defined in utils).
u.graficas_resultados(X_test,y_test, best_random)

Se mejora la busqueda en la grilla usando los valores random encontrados

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for the second, narrower search (3-fold cross-validation).
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 40, 50, 60],
    'max_features': [2, 3],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [400, 500, 600, 700]
}
# Seed the forest so that the comparison between grid points, and therefore the
# selected parameters, is reproducible across runs.
rf = RandomForestRegressor(random_state=2402)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

In [None]:
# Run the exhaustive grid search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# With the default refit=True, the search has already re-trained the winning
# configuration on the whole of X_train, so best_estimator_ is ready to use.
# Fitting it again only repeated that work and, for an unseeded forest, replaced
# the selected model with a different random one.
best_grid = grid_search.best_estimator_
best_grid

In [None]:
# Banner, then the grid-search winner is passed to the utils reporting helper
# together with both splits.
print('-----------', 'RFR - best grid', '-----------', sep='\n')
u.launch_model('RFR - best grid', best_grid, X_train, y_train, X_test, y_test);

El mejor modelo usando random forest es best grid. Se consideran dos metricas para esta seleccion.
MSE (Error cuadratico medio) mientras menor sea es mejor
y el R^2 (r-square) mientras mas cercano a uno es mejor. 

Para identificar el underfitting se considera el error entre el conjunto de entrenamiento y prueba, pensando en que el error en el conjunto de train debe ser menor que el de test.

Para identificar overfitting, en cambio, se verifica el valor de R^2: para entrenamiento debe ser mayor que para prueba; sin embargo, se considera aceptable una diferencia de hasta el 5%

SVR Model

In [None]:
from sklearn.svm import SVR


def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R^2 for a set of predictions.

    The cells below call ``print_evaluate``, but the notebook neither defined nor
    imported it, so a fresh kernel stopped with NameError. It is defined here with
    the ``metrics`` module already imported from scikit-learn.
    NOTE(review): if utils already ships an equivalent helper, call that instead.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, in the same order as ``true``.
    """
    mse = metrics.mean_squared_error(true, predicted)
    print('MAE:', metrics.mean_absolute_error(true, predicted))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    print('R2 Square:', metrics.r2_score(true, predicted))


# Support vector regressor with a linear kernel; all other settings are defaults.
svr_lineal = SVR(kernel='linear')

In [None]:
# Train the linear SVR on the training split.
svr_lineal.fit(X_train, y_train)

In [None]:
# Evaluate the linear SVR on the data it was trained on (in-sample predictions).
print("Para training")
print_evaluate(y_train, svr_lineal.predict(X_train))

In [None]:
# Evaluate the linear SVR on the held-out test split.
print("Para testing")
print_evaluate(y_test, svr_lineal.predict(X_test))

In [None]:
from sklearn.model_selection import RepeatedKFold

# Hyperparameter grid for the SVR: kernel type, stopping tolerance and the
# regularization parameter C.
# NOTE(review): SVR is sensitive to feature scale; confirm the predictors in X
# are already standardized.
kernel = ["linear", "rbf", "sigmoid", "poly"]
tolerance = [1e-3, 1e-4, 1e-5, 1e-6]
C = [1, 1.5, 2, 2.5, 3]
grid = {"kernel": kernel, "tol": tolerance, "C": C}

model = SVR()
# 10-fold cross-validation repeated 3 times, seeded so the folds are repeatable.
cvFold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
gridSearch = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring="neg_mean_squared_error",
    cv=cvFold,
    n_jobs=-1,
)
searchResults = gridSearch.fit(X_train, y_train)

In [None]:
# With the default refit=True, the search has already re-trained the winning
# configuration on the whole of X_train, so best_estimator_ is ready to use and
# the extra fit() call was redundant work.
bestModel_SVR = searchResults.best_estimator_
bestModel_SVR

In [None]:
# Evaluate the tuned SVR on the data it was trained on (in-sample predictions).
print("Para training")
print_evaluate(y_train, bestModel_SVR.predict(X_train))

In [None]:
# Evaluate the tuned SVR on the held-out test split.
print("Para testing")
print_evaluate(y_test, bestModel_SVR.predict(X_test))

Seleccion del modelo y analisis de errores

In [None]:
# Collect what the utils helper returns for the two finalists so they can be compared.
# launch_model lives in the utils module (imported as `u`); calling it without the
# prefix raised NameError.
# NOTE(review): the 'SVR - RBF' label is fixed text; the grid search may have
# selected a different kernel (check bestModel_SVR.kernel).
results = []
print ('-----------')
print ('SVR - RBF')
print ('-----------')
results.append(u.launch_model('SVR - RBF', bestModel_SVR, X_train, y_train, X_test, y_test))
print ('-----------')
print ('RF - Best Model')
print ('-----------')
results.append(u.launch_model('RF - Best Model', best_grid, X_train, y_train, X_test, y_test))

In [None]:
# Plot the collected model results.
# NOTE(review): `plot` is neither defined nor imported in the visible cells, so this
# raises NameError on a fresh kernel unless it is defined elsewhere; presumably a
# helper from utils (u.plot?) was intended. Confirm.
plot(results)

In [None]:
from yellowbrick.regressor import ResidualsPlot

# Residuals plot for the tuned SVR.
visualizer = ResidualsPlot(bestModel_SVR)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show() 

In [None]:
import shap
# SHAP values of the tuned random forest on the training split, summarised as a
# bar chart.
explainer = shap.TreeExplainer(best_grid)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")

In [None]:
# Same bar summary, computed on the test split. This overwrites `shap_values`.
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# Recompute the training-split SHAP values (the previous cell overwrote them) and
# draw the default summary plot.
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train)