Get packages

In [1]:
import os

# The project-local `src` package and the relative `data/`, `utils/` and
# `models/` paths used below are resolved from the parent of the directory the
# notebook starts in. The guard makes this cell idempotent: without it, every
# re-run climbs one more directory and breaks the imports and relative paths.
if not os.path.isdir('src'):
    os.chdir(os.path.join(os.getcwd(), '..'))
import src.preprocessing
import src.model_training
import src.feature_engineering

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import time
import shap
import lightgbm as lgb

import json
import sys
import pickle

from verstack import LGBMTuner
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Lift pandas' display truncation for both columns and rows so that wide
# frames are rendered in full. Always pair this with .head() or similar,
# because max_rows=None prints every row of whatever is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

Get data & model

In [2]:
# Input locations, relative to the working directory set in the first cell.
data_csv_path = 'data/data_pricing_challenge.csv'
variables_txt_path = 'utils/model_variables.txt'

# `df` is the frame used for the analysis below (it must contain `price`);
# `variables` selects the columns that are passed to the model for prediction.
df = src.feature_engineering.get_train_data(data_csv_path)
variables = src.model_training.txt_load(variables_txt_path)


In [3]:
# Location of the serialized model, relative to the working directory.
pkl_file_path = 'models/model'

# Deserialize the previously trained model.
# NOTE: pickle can execute arbitrary code while loading; only load model files
# that come from a trusted source.
with open(pkl_file_path, mode='rb') as model_file:
    model = pickle.load(model_file)

Approach: Selling price is known

In [4]:
# Scenario parameters (see the Conclusions section): only cars currently priced
# at or above MIN_PRICE are considered, and each one is re-priced after adding
# MILEAGE_INCREMENT to its mileage and ANTIQUITY_INCREMENT to its antiquity.
MIN_PRICE = 20000
MILEAGE_INCREMENT = 10000
ANTIQUITY_INCREMENT = 1

# Keep only the cars whose current price meets the threshold.
df = df.loc[df['price'] >= MIN_PRICE]

# Take an explicit copy of the model inputs: the assignments below must modify
# an independent frame, not a slice of `df` (which raises
# SettingWithCopyWarning and has ambiguous write semantics).
df_pred = df[variables].copy()
df_pred['mileage'] = df_pred['mileage'] + MILEAGE_INCREMENT
df_pred['antiquity'] = df_pred['antiquity'] + ANTIQUITY_INCREMENT

# Predict the price for the shifted scenario. This must run before the helper
# columns are added so the model only sees the `variables` columns.
prediction = model.predict(df_pred)

# Attach the predicted future price and the current real price
# (`df['price']` is aligned on the shared index).
df_pred['price_pred_fin'] = prediction
df_pred['price_real_ini'] = df['price']

# Loss in value = current price - predicted future price. A negative value
# means the model predicts a HIGHER price in the shifted scenario.
df_pred['loss_in_value'] = df_pred['price_real_ini'] - df_pred['price_pred_fin']
df_pred = df_pred.sort_values(by='loss_in_value', ascending=True)
df_pred.head()

Unnamed: 0,mileage,engine_power,antiquity,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,estate,hatchback,sedan,suv,avg_mileage,avg_antiquity,avg_engine_power,price_pred_fin,price_real_ini,loss_in_value
4585,63055,140,4.085558,1,1,0,0,1,0,1,1,0,0,0,1,81386.78,3.15,153.91,40054.553629,23800,-16254.553629
3842,97161,140,4.581109,1,1,0,0,0,0,0,1,0,0,0,1,81386.78,3.15,153.91,40190.221593,28200,-11990.221593
3950,102108,120,4.329227,1,1,1,1,1,0,1,0,0,0,0,1,81386.78,3.15,153.91,36954.691023,25600,-11354.691023
4280,100853,140,4.581109,1,1,0,1,1,0,1,1,0,0,0,1,81386.78,3.15,153.91,42594.478223,32200,-10394.478223
3830,105351,140,4.252567,1,1,1,0,0,1,0,1,0,0,0,1,81386.78,3.15,153.91,38881.413502,28600,-10281.413502


In [5]:
# Obtain selected value

# Index label of the car with the smallest loss in value.
idx_sel = df_pred['loss_in_value'].idxmin()

# `df_pred` holds the scenario mileage (original + 10000), so subtract the same
# increment to recover the mileage as it appears in the raw data.
# Single-step .loc[row, col] avoids chained indexing and the dtype upcasting
# that comes from first materializing the whole row as a Series.
mileage_selected = df_pred.loc[idx_sel, 'mileage'] - 10000
price_selected = df_pred.loc[idx_sel, 'price_real_ini']

# Find the selected car in the raw data by (mileage, price): the index labels
# of `df_pred` do not match those of the raw file, so it cannot be looked up by
# index. Use the computed mileage rather than a hard-coded value so the lookup
# stays correct if the data or the model changes.
original_data = src.preprocessing.load_data('data/data_pricing_challenge.csv')
original_data.loc[
    (original_data['mileage'] == mileage_selected)
    & (original_data['price'] == price_selected)
]

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at
4648,BMW,X4,53055,140,07/01/2015,diesel,black,suv,True,True,False,False,True,False,True,True,23800,08/01/2018


## Conclusions

Methodology used:
    1)	Train a model with available data.
    2)	Filter data by current price >= 20000
    3)	Modify available registers:
            mileage = mileage + 10000
            antiquity = antiquity + 1
    4)	Predict price after 1 year and +10000 miles.
    5)	Calculate Loss in value = Current price - predicted price.
    6)	Minimize loss in value --> BMW; X4; 53055; 140; 07/01/2015; diesel; black; suv

As observed, the loss in value for the selected car is negative (-16254.55), which seems counterintuitive: a negative loss means the car is predicted to gain value.
With the data available for training, the model estimates that the selected car's selling price after 1 year and 10000 additional miles will be higher than its current price.