## Get data and packages

In [1]:
import os

# Later cells import the `src` package and read `data/...` and `utils/...` by
# relative path, so the working directory must be the folder that holds them
# (assumed to be the parent of the notebook's folder).
# Only step up while `src` is not visible from the current directory: an
# unconditional relative chdir would climb one more level on every re-run of
# this cell and break all of those relative lookups.
if not os.path.isdir('src'):
    os.chdir(os.path.join(os.getcwd(), '..'))
import json
import sys
import time

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import shap
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from verstack import LGBMTuner

import src.preprocessing
import src.model_training

# Never truncate displayed DataFrames: None removes the column and row limits.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the training table from the pricing-challenge CSV through the project's
# preprocessing helper. The path is relative to the current working directory.
# NOTE(review): what cleaning / feature engineering get_train_data applies is
# not visible from this notebook.
df = src.preprocessing.get_train_data('data/data_pricing_challenge.csv')

## Model Training

In [3]:
# Column the model is trained to predict.
target = 'price'

# The feature list and the model hyperparameters are kept in files under
# utils/ and read through the project helpers.
variables = src.model_training.txt_load('utils/model_variables.txt')
hyperparameters = src.model_training.json_load('utils/model_hyperparameters.json')

# Feature matrix and a one-column target frame over the full dataset.
X = df[variables]
y = df[[target]]

In [4]:
# Optional hyperparameter search with verstack's LGBMTuner (optimising RMSE),
# left disabled: training in this notebook uses the values loaded from
# utils/model_hyperparameters.json. Uncomment both lines to re-run the search.
# tuner = LGBMTuner(metric = 'rmse', random_state = 123)
# tuner.fit(X, y['price'])

In [5]:
# Split the data into train and test parts with the project helper.
# NOTE(review): 0.4 is presumably the test fraction — confirm against
# src.model_training.split_dataset.
X_train, X_test, y_train, y_test = src.model_training.split_dataset(df, variables, target, 0.4)

# Fit the model on the training part using the hyperparameters loaded from JSON.
model = src.model_training.lightgbm_training(X_train, y_train, hyperparameters)

In [12]:
# Test-split features next to the observed price, the model's prediction and
# the signed error (observed minus predicted). `assign` returns a new frame,
# so X_test itself is left untouched.
results = X_test.assign(
    price_real=y_test,
    price_pred=model.predict(X_test),
    error=lambda frame: frame['price_real'] - frame['price_pred'],
)

# Most negative errors first: the ten rows where the model over-predicts most.
results.sort_values('error').head(10)

Unnamed: 0,mileage,engine_power,antiquity,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,convertible,coupe,estate,hatchback,sedan,subcompact,suv,van,price_real,price_pred,error
4789,97835,190,5.007529,1,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,5400,27452.519907,-22052.519907
4582,61827,160,4.0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,15200,32851.301915,-17651.301915
4524,65918,120,2.009582,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,0,15600,30108.549654,-14508.549654
4711,117393,190,4.991102,1,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,11900,24526.572676,-12626.572676
1997,23755,190,2.984257,1,1,1,0,1,1,1,1,0,0,0,1,0,0,0,0,29200,40997.669788,-11797.669788
1479,174817,135,5.002053,1,1,1,0,1,1,1,1,0,0,1,0,0,0,0,0,4800,15612.565779,-10812.565779
2450,88092,120,5.007529,1,1,1,0,1,0,1,1,0,0,0,1,0,0,0,0,8400,19207.931051,-10807.931051
1612,23811,190,2.989733,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,28900,39610.34122,-10710.34122
113,12634,142,1.990418,1,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,17800,27936.965019,-10136.965019
4825,33300,170,3.003422,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,19200,29073.769981,-9873.769981


In [34]:
# Both axes share the same [0, max_value] window so the y = x line runs corner
# to corner. Points whose real or predicted price exceeds max_value fall
# outside the visible window.
max_value = 70000

fig = px.scatter(
    results,
    x='price_real',
    y='price_pred',
    title='Price Real vs Pred',
    width=700,
    height=700,
)

# Add the y = x reference line in red: points on it are perfect predictions,
# points above it are over-predicted and points below it under-predicted.
fig.add_scatter(
    x=[0, max_value],
    y=[0, max_value],
    mode='lines',
    line=dict(color='red'),
    showlegend=False,
)

fig.update_xaxes(range=[0, max_value])
fig.update_yaxes(range=[0, max_value])

fig.show()

In [11]:
# NOTE: a bare undefined name used to sit in this cell and raised NameError,
# which halted "Restart & Run All" before the analysis cells below. It has been
# removed so the notebook executes end to end.

## Plots and Model Performance Analysis

In [None]:
# Bar chart of the model's feature importances, showing at most 30 features.
lgb.plot_importance(model, max_num_features=30, figsize=(10, 6))
# Render explicitly so the cell shows only the figure, not the repr of the
# Axes object that plot_importance returns.
plt.show()

In [None]:
# Evaluate the fitted model on both the train and the test split via the
# project helper.
# NOTE(review): which metrics or plots model_results produces is not visible
# from this notebook.
src.model_training.model_results(model, X_train, X_test, y_train, y_test)

In [None]:
# Compute SHAP values for the training features through the project helper,
# which returns the explainer together with the values.
explainer, shap_values = src.model_training.calculate_shap_values(model, X_train)

# Visualize the Shapley values for all features in a single summary plot
shap.summary_plot(shap_values, X_train)

In [None]:
# SHAP dependence plot for engine_power. No interaction_index is passed, so
# shap's default choice of colouring feature applies.
shap.dependence_plot("engine_power", shap_values, X_train)

In [None]:
feature_names = list(X_train.columns)

# Features of interest: the first is plotted, the second colours the points.
variable1_name, variable2_name = 'feature_1', 'feature_4'

# Translate the two names into their column positions.
variable1_index, variable2_index = (
    feature_names.index(name) for name in (variable1_name, variable2_name)
)

# Dependence plot of variable 1 coloured by variable 2. show=False defers
# rendering to the explicit plt.show() call below.
shap.dependence_plot(
    variable1_index,
    shap_values,
    X_train,
    feature_names=feature_names,
    interaction_index=variable2_index,
    show=False,
)
plt.show()