## Answer questions

In [77]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR, MODELS_DIR, RAW_DATA_DIR

In [79]:
data = pd.read_csv(TRANSFORMED_DATA_DIR / 'transformed_data.csv')

In [80]:
data.shape

(4817, 22)

In [81]:
# Parse both date columns into pandas datetimes so the .dt accessors used later work
for date_column in ('registration_date', 'sold_at'):
    data[date_column] = pd.to_datetime(data[date_column])

In [82]:
# Change data types from object to categorical
from src.data import convert_object_columns_to_category

data = convert_object_columns_to_category(data)

In [83]:
from src.data import get_train_test_data

In [84]:
# Read metadata json file from models folder
import json

# Open through a context manager so the file handle is closed as soon as the
# JSON has been parsed, instead of being left for the garbage collector.
with open(MODELS_DIR / 'metadata.json') as metadata_file:
    metadata = json.load(metadata_file)

In [85]:
print(f'Model being used is: {metadata["name"]}')

Model being used is: XGBoost with trees


In [86]:
features = metadata['features']
target = metadata['target']

In [87]:
# Print the features
features

['model_key',
 'mileage',
 'engine_power',
 'fuel',
 'paint_color',
 'car_type',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_8',
 'age_in_months_when_sold',
 'month_sold_at',
 'season_sold_at',
 'model_initial']

In [88]:
X, y, X_train, X_test, y_train, y_test = get_train_test_data(data, features, target)

In [89]:
import joblib

In [90]:
# Load the model
model = joblib.load(MODELS_DIR / 'model.pkl')

In [91]:
# Print the model
model

### Q1

In [92]:
# Read feature importance df from models folder
feature_importance = joblib.load(MODELS_DIR / 'feature_importance_df.pkl')

In [93]:
feature_importance

Unnamed: 0,feature,importance
17,model_initial,0.248121
0,model_key,0.176764
13,feature_8,0.110575
14,age_in_months_when_sold,0.091855
12,feature_7,0.085964
2,engine_power,0.048129
8,feature_3,0.035199
10,feature_5,0.03356
1,mileage,0.032679
7,feature_2,0.029472


### Q2

As found during the data exploration phase:
- Hybrid and electric cars are more expensive on average.
- Electric cars' average prices were stable from winter to summer; none were sold in autumn.
- Diesel and petrol cars had similar average prices, although petrol cars had a drop in average prices starting summer 2018.
- The most expensive car type is, on average, suv, although coupe was most expensive at the start of the year and then dropped below suv also starting in summer.
- Coupe and convertible cars were, on average, more expensive in winter than in summer.
- Vans were more expensive, on average, in spring, summer, and autumn than in winter.
- Subcompact had generally the lowest average prices.
- Paint color does not seem to generally determine or be associated with the average price, except for color green, which consistently had prices much lower than other colors. Maybe not very popular.
- Orange and white cars were sold for more, on average, during summer than during winter and spring.
- Red cars were the opposite, with lower average prices during summer than during winter and spring.

Find similar observations using estimated price instead of real price

In [94]:
pred = model.predict(X)

In [95]:
data_q2 = data.copy()

In [96]:
data_q2['price'] = pred

In [97]:
from src.plots import plot_avg_target_time_series_by_features

In [98]:
# Load car features
car_features = joblib.load(RAW_DATA_DIR / 'car_features.pkl')

In [99]:
plot_avg_target_time_series_by_features(data_q2, car_features)

In [100]:
# Load small cardinality features
small_cardinality_features = joblib.load(RAW_DATA_DIR / 'small_cardinality_features.pkl')

In [101]:
plot_avg_target_time_series_by_features(data_q2, small_cardinality_features)

In [102]:
from src.questions import ttest_mean_price_difference_between_groups_after_filter

In [103]:
# Run the winter-vs-summer t-test helper on the estimated prices for every value
# of every small-cardinality feature, collecting one result row per
# (feature, feature_value) pair.
grouping_column = 'season_sold_at'
group_1 = 'winter'
group_2 = 'summer'

ttest_rows = []
for feature in small_cardinality_features:
    for feature_value in data_q2[feature].unique():
        t_stat, p_val = ttest_mean_price_difference_between_groups_after_filter(
            data_q2, feature, feature_value, grouping_column, group_1, group_2)
        ttest_rows.append((feature, feature_value, t_stat, p_val))

# Explicit column names keep the frame's schema stable even if no rows were collected
ttest_df = pd.DataFrame(ttest_rows, columns=['feature', 'feature_value', 't_stat', 'p_val'])

In [104]:
ttest_df.sort_values(by='p_val')

Unnamed: 0,feature,feature_value,t_stat,p_val
8,car_type,sedan,3.992051,7.5e-05
5,car_type,coupe,3.692988,0.000634
11,car_type,van,-3.400213,0.003406
6,car_type,estate,2.71002,0.006881
14,paint_color,white,-2.377194,0.018163
1,fuel,petrol,1.813184,0.073914
10,car_type,suv,-1.191461,0.233937
20,paint_color,brown,-1.112803,0.267432
12,paint_color,black,1.054728,0.291866
4,car_type,convertible,1.025446,0.317397


### Q3

In [105]:
# "Today" for the Q3 scenario: 1 March 2024, written in ISO 8601 (YYYY-MM-DD) so
# the string cannot be misread as 3 January under day-first parsing.
today_date = '2024-03-01'

In [106]:
data_q3 = data.copy()

In [107]:
# Convert date columns to datetime and mock the sold_at date as today's date.
# NOTE(review): only sold_at is overwritten here. month_sold_at and season_sold_at
# are also in the model's feature list, and the row displays further down still
# show the original sale month/season next to the mocked sold_at. Confirm whether
# they should be recomputed from today_date before predicting.
data_q3['registration_date'] = pd.to_datetime(data_q3['registration_date'])
data_q3['sold_at'] = today_date
data_q3['sold_at'] = pd.to_datetime(data_q3['sold_at'])

In [108]:
# Average mileage accumulated per month of age, based on the original sale record.
# NOTE(review): a row with age_in_months_when_sold == 0 would yield inf/NaN here;
# confirm that no such rows exist in the data.
data_q3['mileage_per_month'] = data_q3['mileage'].div(data_q3['age_in_months_when_sold'])

In [109]:
# Recompute each car's age, in whole calendar months, as of the mocked sold_at date
# and overwrite the original age column in data_q3
sold_period = data_q3['sold_at'].dt.to_period('M')
registration_period = data_q3['registration_date'].dt.to_period('M')
data_q3['age_in_months_when_sold'] = (sold_period - registration_period).map(lambda offset: offset.n)

In [110]:
# Update estimated mileage at today date
data_q3['mileage'] = data_q3['age_in_months_when_sold'] * data_q3['mileage_per_month']

In [111]:
data_q3[['registration_date', 'sold_at', 'age_in_months_when_sold', 'mileage']].head()

Unnamed: 0,registration_date,sold_at,age_in_months_when_sold,mileage
0,2012-02-01,2024-03-01,145,286754.859155
1,2016-04-01,2024-03-01,95,60147.954545
2,2012-04-01,2024-03-01,143,374449.585714
3,2014-07-01,2024-03-01,116,345396.744186
4,2014-12-01,2024-03-01,111,269444.175


In [112]:
X_q3, y_q3, X_train_q3, X_test_q3, y_train_q3, y_test_q3 = get_train_test_data(data_q3, features, target)

In [113]:
# Get estimated prices today
pred_q3 = model.predict(X_q3)

In [114]:
# Add 1 year to the age_in_months_when_sold
X_q3['age_in_months_when_sold'] = X_q3['age_in_months_when_sold'] + 12

In [115]:
# Add 1 year worth of mileage
X_q3['mileage'] = X_q3['mileage'] + 12 * data_q3['mileage_per_month']

In [116]:
# Get estimated prices 1 year later
pred_q3_one_year_later = model.predict(X_q3)

In [117]:
X_q3['price_today'] = pred_q3
X_q3['price_one_year_later'] = pred_q3_one_year_later
X_q3['loss'] = X_q3['price_today'] - X_q3['price_one_year_later'] 

In [118]:
price_today_threshold = 20000
loss_threshold = 2000

In [119]:
# Identify cars that are candidates for buying: estimated price today at or above
# the price threshold, and estimated one-year loss at or below the loss threshold
is_valuable_today = X_q3['price_today'] >= price_today_threshold
has_small_loss = X_q3['loss'] <= loss_threshold
candidate_cars = X_q3.loc[is_valuable_today & has_small_loss]

In [120]:
# Show top 10 candidate cars with lowest loss
candidate_cars.sort_values('loss').head(10)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,...,feature_6,feature_7,feature_8,age_in_months_when_sold,month_sold_at,season_sold_at,model_initial,price_today,price_one_year_later,loss
4223,X4,101785.6,140,diesel,white,suv,False,True,False,False,...,False,True,False,112,5,spring,X,38054.914062,40720.117188,-2665.203125
4106,X6,169312.064516,230,diesel,grey,suv,True,True,True,False,...,False,True,True,114,4,spring,X,21883.425781,22955.607422,-1072.181641
3073,M3,146533.333333,317,petrol,black,sedan,True,True,False,False,...,True,True,True,112,5,spring,M,25658.361328,26528.835938,-870.474609
4148,X4,191877.766667,140,diesel,black,suv,True,True,True,True,...,False,True,True,113,4,spring,X,20928.089844,21619.753906,-691.664062
4261,X4,99720.466667,120,diesel,black,suv,True,True,False,True,...,False,True,True,109,8,summer,X,31389.205078,32042.566406,-653.361328
4210,X6,224153.090909,230,diesel,black,suv,True,True,True,True,...,True,True,True,104,5,spring,X,20133.105469,20453.537109,-320.431641
3565,530,53503.157895,195,diesel,grey,sedan,True,True,True,True,...,True,True,True,97,9,autumn,5,27270.511719,27452.873047,-182.361328
67,M3,66996.268657,309,petrol,silver,coupe,True,True,False,False,...,True,True,True,150,4,spring,M,24615.210938,24761.449219,-146.238281
2625,640 Gran Coupé,365512.484848,230,diesel,grey,sedan,True,True,True,True,...,False,True,True,116,4,spring,6,24002.169922,23848.236328,153.933594
4275,X4,189215.0625,120,diesel,black,suv,True,True,False,True,...,False,True,True,114,5,spring,X,21336.669922,21153.660156,183.009766


In [121]:
# Count candidate cars per model_key when that column is present; otherwise fall
# back to counting per model_initial.
model_related_column = 'model_key' if 'model_key' in candidate_cars.columns else 'model_initial'
model_related_value_counts = candidate_cars[model_related_column].value_counts()

In [122]:
model_related_value_counts

model_key
X4                  15
X6                   6
M3                   3
640 Gran Coupé       3
530                  3
                    ..
520                  0
520 Gran Turismo     0
523                  0
525                  0
114                  0
Name: count, Length: 75, dtype: int64

In [123]:
# Describe numerical features of candidate cars
candidate_cars.describe()

Unnamed: 0,mileage,engine_power,age_in_months_when_sold,month_sold_at,price_today,price_one_year_later,loss
count,42.0,42.0,42.0,42.0,42.0,42.0,42.0
mean,155819.045917,190.142857,114.761905,5.333333,25321.476562,24747.996094,573.477966
std,99535.800837,58.271263,11.231075,2.436177,4880.483398,5105.255371,912.649231
min,1213.333333,120.0,92.0,1.0,20133.105469,18356.115234,-2665.203125
25%,72922.915778,140.0,110.25,4.0,21360.869141,20724.953125,231.875
50%,162803.463187,186.5,113.0,5.0,24308.69043,23289.792969,670.676758
75%,212187.163462,230.0,118.0,8.0,27894.376465,27412.60791,1180.303223
max,430294.090909,317.0,150.0,9.0,38054.914062,40720.117188,1938.529297


In [124]:
# Show the car with the lowest loss
car_index = candidate_cars.sort_values('loss').index[0]
data_q3.loc[car_index]

maker_key                                  BMW
model_key                                   X4
mileage                                90880.0
engine_power                               140
registration_date          2015-11-01 00:00:00
fuel                                    diesel
paint_color                              white
car_type                                   suv
feature_1                                False
feature_2                                 True
feature_3                                False
feature_4                                False
feature_5                                False
feature_6                                False
feature_7                                 True
feature_8                                False
price                                    35300
sold_at                    2024-03-01 00:00:00
age_in_months_when_sold                    100
month_sold_at                                5
season_sold_at                          spring
model_initial

In [125]:
data.loc[car_index]

maker_key                                  BMW
model_key                                   X4
mileage                                  27264
engine_power                               140
registration_date          2015-11-01 00:00:00
fuel                                    diesel
paint_color                              white
car_type                                   suv
feature_1                                False
feature_2                                 True
feature_3                                False
feature_4                                False
feature_5                                False
feature_6                                False
feature_7                                 True
feature_8                                False
price                                    35300
sold_at                    2018-05-01 00:00:00
age_in_months_when_sold                     30
month_sold_at                                5
season_sold_at                          spring
model_initial

In [126]:
X_q3.loc[car_index]

model_key                            X4
mileage                        101785.6
engine_power                        140
fuel                             diesel
paint_color                       white
car_type                            suv
feature_1                         False
feature_2                          True
feature_3                         False
feature_4                         False
feature_5                         False
feature_6                         False
feature_7                          True
feature_8                         False
age_in_months_when_sold             112
month_sold_at                         5
season_sold_at                   spring
model_initial                         X
price_today                38054.914062
price_one_year_later       40720.117188
loss                       -2665.203125
Name: 4223, dtype: object

In [127]:
# Check cars with the same model_key as the selected car, most expensive first
reference_model_key = data.loc[car_index, 'model_key']
same_model_cars = data[data['model_key'] == reference_model_key]
same_model_cars.sort_values('price', ascending=False)[['model_key', 'price', 'mileage', 'age_in_months_when_sold']]

Unnamed: 0,model_key,price,mileage,age_in_months_when_sold
4658,X4,142800,103222,48
4067,X4,47400,56981,35
3908,X4,38100,41099,37
4065,X4,37200,39061,15
3903,X4,36900,50003,37
4534,X4,36300,118606,37
4153,X4,35900,62592,39
4223,X4,35300,27264,30
4275,X4,35200,53113,32
4152,X4,35000,112720,44


### Q3 assuming today is the latest sold-at date in the data

In [128]:
today_date = data['sold_at'].max()

In [129]:
data_q3 = data.copy()

In [130]:
# Convert date columns to datetime and mock the sold_at date as today's date.
# NOTE(review): only sold_at is overwritten here. month_sold_at and season_sold_at
# are also in the model's feature list and keep their original values for every
# row. Confirm whether they should be recomputed from today_date before predicting.
data_q3['registration_date'] = pd.to_datetime(data_q3['registration_date'])
data_q3['sold_at'] = today_date
data_q3['sold_at'] = pd.to_datetime(data_q3['sold_at'])

In [131]:
# Average mileage accumulated per month of age, based on the original sale record.
# NOTE(review): a row with age_in_months_when_sold == 0 would yield inf/NaN here;
# confirm that no such rows exist in the data.
data_q3['mileage_per_month'] = data_q3['mileage'].div(data_q3['age_in_months_when_sold'])

In [132]:
# Recompute each car's age, in whole calendar months, as of the mocked sold_at date
# and overwrite the original age column in data_q3
sold_period = data_q3['sold_at'].dt.to_period('M')
registration_period = data_q3['registration_date'].dt.to_period('M')
data_q3['age_in_months_when_sold'] = (sold_period - registration_period).map(lambda offset: offset.n)

In [133]:
# Update estimated mileage at today date
data_q3['mileage'] = data_q3['age_in_months_when_sold'] * data_q3['mileage_per_month']

In [134]:
data_q3[['registration_date', 'sold_at', 'age_in_months_when_sold', 'mileage']].head()

Unnamed: 0,registration_date,sold_at,age_in_months_when_sold,mileage
0,2012-02-01,2018-09-01,79,156231.957746
1,2016-04-01,2018-09-01,29,18360.954545
2,2012-04-01,2018-09-01,77,201626.7
3,2014-07-01,2018-09-01,50,148877.906977
4,2014-12-01,2018-09-01,45,109234.125


In [135]:
X_q3, y_q3, X_train_q3, X_test_q3, y_train_q3, y_test_q3 = get_train_test_data(data_q3, features, target)

In [136]:
# Get estimated prices today
pred_q3 = model.predict(X_q3)

In [137]:
# Add 1 year to the age_in_months_when_sold
X_q3['age_in_months_when_sold'] = X_q3['age_in_months_when_sold'] + 12

In [138]:
# Add 1 year worth of mileage
X_q3['mileage'] = X_q3['mileage'] + 12 * data_q3['mileage_per_month']

In [139]:
# Get estimated prices 1 year later
pred_q3_one_year_later = model.predict(X_q3)

In [140]:
X_q3['price_today'] = pred_q3
X_q3['price_one_year_later'] = pred_q3_one_year_later
X_q3['loss'] = X_q3['price_today'] - X_q3['price_one_year_later'] 

In [141]:
# Identify cars that are candidates for buying: estimated price today at or above
# the price threshold, and estimated one-year loss at or below the loss threshold
is_valuable_today = X_q3['price_today'] >= price_today_threshold
has_small_loss = X_q3['loss'] <= loss_threshold
candidate_cars = X_q3.loc[is_valuable_today & has_small_loss]

In [142]:
# Show top 10 candidate cars with lowest loss
candidate_cars.sort_values('loss').head(10)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,...,feature_6,feature_7,feature_8,age_in_months_when_sold,month_sold_at,season_sold_at,model_initial,price_today,price_one_year_later,loss
4622,X4,71695.945946,140,diesel,black,suv,True,True,False,False,...,False,True,True,50,8,summer,X,26153.548828,59674.882812,-33521.335938
4285,X3,61562.959184,135,diesel,silver,suv,True,True,False,False,...,False,False,True,65,5,spring,X,25348.0,53132.0625,-27784.0625
4546,X4,75029.27027,140,diesel,white,suv,False,False,False,False,...,False,False,False,51,7,summer,X,33014.90625,60576.886719,-27561.980469
2540,530 Gran Turismo,69131.25,190,diesel,black,sedan,True,True,False,True,...,False,True,True,65,8,summer,5,26501.138672,51391.515625,-24890.376953
4117,X4,140770.341463,140,diesel,grey,suv,True,True,False,False,...,False,True,True,56,6,summer,X,26283.736328,49729.574219,-23445.837891
4424,X4,101363.625,140,diesel,grey,suv,True,True,True,True,...,False,True,True,55,6,summer,X,31112.990234,54548.339844,-23435.349609
3763,X3,72054.818182,135,diesel,silver,suv,False,False,False,False,...,False,False,True,69,7,summer,X,26510.46875,47754.265625,-21243.796875
4731,X4,78519.4,140,diesel,grey,suv,False,False,False,False,...,False,False,True,57,9,autumn,X,28916.439453,49046.511719,-20130.072266
2924,520,57133.645833,135,diesel,grey,sedan,True,True,False,False,...,False,True,True,65,4,spring,5,21534.236328,40544.101562,-19009.865234
4618,X4,72836.052632,140,diesel,blue,suv,True,False,False,True,...,False,True,True,51,8,summer,X,33325.957031,50393.453125,-17067.496094


In [143]:
# Count candidate cars per model_key when that column is present; otherwise fall
# back to counting per model_initial.
model_related_column = 'model_key' if 'model_key' in candidate_cars.columns else 'model_initial'
model_related_value_counts = candidate_cars[model_related_column].value_counts()

In [144]:
model_related_value_counts

model_key
X3                40
X4                32
X5                21
420 Gran Coupé    20
420               20
                  ..
430                0
116                0
523                0
630                0
i8                 0
Name: count, Length: 75, dtype: int64

In [145]:
# Describe numerical features of candidate cars
candidate_cars.describe()

Unnamed: 0,mileage,engine_power,age_in_months_when_sold,month_sold_at,price_today,price_one_year_later,loss
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,120055.93268,164.211111,63.203704,5.122222,25674.070312,26576.017578,-901.944763
std,67234.666023,43.39406,15.096197,2.2227,6001.734863,8822.40918,5387.13623
min,597.333333,75.0,26.0,1.0,20015.189453,18235.335938,-33521.335938
25%,70157.668367,135.0,54.0,3.0,21369.412109,20338.068848,-290.216797
50%,102113.982311,140.0,62.0,5.0,23474.064453,22868.022461,852.999023
75%,168862.191837,190.0,68.0,7.0,28362.908203,29286.517578,1494.930664
max,347105.37037,317.0,132.0,9.0,54905.984375,60576.886719,1983.90625


In [146]:
# Show the car with the lowest loss
car_index = candidate_cars.sort_values('loss').index[0]
data_q3.loc[car_index]

maker_key                                  BMW
model_key                                   X4
mileage                           54488.918919
engine_power                               140
registration_date          2015-07-01 00:00:00
fuel                                    diesel
paint_color                              black
car_type                                   suv
feature_1                                 True
feature_2                                 True
feature_3                                False
feature_4                                False
feature_5                                 True
feature_6                                False
feature_7                                 True
feature_8                                 True
price                                    23800
sold_at                    2018-09-01 00:00:00
age_in_months_when_sold                     38
month_sold_at                                8
season_sold_at                          summer
model_initial

In [147]:
data.loc[car_index]

maker_key                                  BMW
model_key                                   X4
mileage                                  53055
engine_power                               140
registration_date          2015-07-01 00:00:00
fuel                                    diesel
paint_color                              black
car_type                                   suv
feature_1                                 True
feature_2                                 True
feature_3                                False
feature_4                                False
feature_5                                 True
feature_6                                False
feature_7                                 True
feature_8                                 True
price                                    23800
sold_at                    2018-08-01 00:00:00
age_in_months_when_sold                     37
month_sold_at                                8
season_sold_at                          summer
model_initial

In [148]:
X_q3.loc[car_index]

model_key                            X4
mileage                    71695.945946
engine_power                        140
fuel                             diesel
paint_color                       black
car_type                            suv
feature_1                          True
feature_2                          True
feature_3                         False
feature_4                         False
feature_5                          True
feature_6                         False
feature_7                          True
feature_8                          True
age_in_months_when_sold              50
month_sold_at                         8
season_sold_at                   summer
model_initial                         X
price_today                26153.548828
price_one_year_later       59674.882812
loss                      -33521.335938
Name: 4622, dtype: object

In [149]:
# Check cars with the same model_key as the selected car, most expensive first
reference_model_key = data.loc[car_index, 'model_key']
same_model_cars = data[data['model_key'] == reference_model_key]
same_model_cars.sort_values('price', ascending=False)[['model_key', 'price', 'mileage', 'age_in_months_when_sold']]

Unnamed: 0,model_key,price,mileage,age_in_months_when_sold
4658,X4,142800,103222,48
4067,X4,47400,56981,35
3908,X4,38100,41099,37
4065,X4,37200,39061,15
3903,X4,36900,50003,37
4534,X4,36300,118606,37
4153,X4,35900,62592,39
4223,X4,35300,27264,30
4275,X4,35200,53113,32
4152,X4,35000,112720,44


### Q4

In [150]:
print(f'Test MSE: {metadata["mse_test"]:.2f}')
print(f'Test RMSE: {metadata["rmse_test"]:.2f}')
print(f'Test MAE: {metadata["mae_test"]:.2f}')
print(f'Test R2: {metadata["r2_test"]:.2f}')

Test MSE: 9333397.73
Test RMSE: 3055.06
Test MAE: 1896.94
Test R2: 0.86


### Q5

Add other findings from data exploration