## Answer questions

In [135]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [136]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR, MODELS_DIR, RAW_DATA_DIR

In [137]:
data = pd.read_csv(TRANSFORMED_DATA_DIR / 'transformed_data.csv')

In [138]:
data.shape

(4817, 22)

In [139]:
# Convert date columns to datetime
data['registration_date'] = pd.to_datetime(data['registration_date'])
data['sold_at'] = pd.to_datetime(data['sold_at'])

In [140]:
# Change data types from object to categorical
from src.data import convert_object_columns_to_category

data = convert_object_columns_to_category(data)

In [141]:
from src.data import get_train_test_data

In [142]:
import joblib

In [143]:
features = joblib.load(MODELS_DIR / 'features.pkl')
target = joblib.load(MODELS_DIR / 'target.pkl')

In [144]:
# Print the features
features

['mileage',
 'engine_power',
 'fuel',
 'paint_color',
 'car_type',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_8',
 'age_in_months_when_sold',
 'month_sold_at',
 'season_sold_at',
 'model_initial']

In [145]:
X, y, X_train, X_test, y_train, y_test = get_train_test_data(data, features, target)

In [146]:
# Load the model
model = joblib.load(MODELS_DIR / 'model.pkl')

In [147]:
# Print the model
model

### Q1

In [148]:
# Read feature importance df from models folder
feature_importance = joblib.load(MODELS_DIR / 'feature_importance_df.pkl')

In [149]:
feature_importance

Unnamed: 0,feature,importance
1,engine_power,0.300262
12,feature_8,0.145404
16,model_initial,0.124069
13,age_in_months_when_sold,0.113622
0,mileage,0.057269
9,feature_5,0.047976
5,feature_1,0.030138
3,paint_color,0.02926
8,feature_4,0.02918
4,car_type,0.018999


### Q2

As found during the data exploration phase:
- Hybrid and electrical cars are more expensive on average.
- Electrical cars average prices were stable from winter to summer, and were not sold in autumn.
- Diesel and petrol cars had similar average prices, although petrol cars had a drop in average prices starting summer 2018.
- The most expensive car type is, on average, suv, although coupe was most expensive at the start of the year and then dropped below suv also starting in summer.
- Coupe and convertible cars were, on average, more expensive in winter than in summer.
- Vans where more expensive, on average, in spring, summer, and autumn, than in winter.
- Subcompact had generally the lowest average prices.
- Paint color does not seem to generally determine or be associated with the average price, except for color green, which consistently had prices much lower than other colors. Maybe not very popular.
- Orange and white cars were sold for more, on average, during summer than during winter and spring.
- Red cars were the opposite, with lower average prices during summer than during winter and spring.

Find similar observations using estimated price instead of real price

In [150]:
pred = model.predict(X)

In [151]:
data_q2 = data.copy()

In [152]:
data_q2['price'] = pred

In [153]:
from src.plots import plot_avg_target_time_series_by_features

In [154]:
# Load car features
car_features = joblib.load(RAW_DATA_DIR / 'car_features.pkl')

In [155]:
plot_avg_target_time_series_by_features(data_q2, car_features)

In [156]:
# Load small cardinality features
small_cardinality_features = joblib.load(RAW_DATA_DIR / 'small_cardinality_features.pkl')

In [157]:
plot_avg_target_time_series_by_features(data_q2, small_cardinality_features)

In [158]:
from src.questions import ttest_mean_price_difference_between_groups_after_filter

In [159]:
grouping_column = 'season_sold_at'
group_1 = 'winter'
group_2 = 'summer'
for feature in small_cardinality_features:
    for feature_value in data_q2[feature].unique():
        t_stat, p_val = ttest_mean_price_difference_between_groups_after_filter(data_q2, feature, feature_value, grouping_column, group_1, group_2)
        print(f'Feature: {feature}, Feature Value: {feature_value}, T-stat: {t_stat:.4f}, P-value: {p_val:.4f}') 

Feature: fuel, Feature Value: diesel, T-stat: -0.4993, P-value: 0.6176
Feature: fuel, Feature Value: petrol, T-stat: 2.1326, P-value: 0.0363
Feature: fuel, Feature Value: hybrid_petrol, T-stat: nan, P-value: nan
Feature: fuel, Feature Value: electro, T-stat: 0.1195, P-value: 0.9243
Feature: car_type, Feature Value: convertible, T-stat: 1.1531, P-value: 0.2625
Feature: car_type, Feature Value: coupe, T-stat: 3.5146, P-value: 0.0011
Feature: car_type, Feature Value: estate, T-stat: 2.7952, P-value: 0.0053
Feature: car_type, Feature Value: hatchback, T-stat: -0.3526, P-value: 0.7246
Feature: car_type, Feature Value: sedan, T-stat: 4.3080, P-value: 0.0000
Feature: car_type, Feature Value: subcompact, T-stat: 0.7040, P-value: 0.4852
Feature: car_type, Feature Value: suv, T-stat: -0.9809, P-value: 0.3270
Feature: car_type, Feature Value: van, T-stat: -2.1605, P-value: 0.0453
Feature: paint_color, Feature Value: black, T-stat: 1.1852, P-value: 0.2363
Feature: paint_color, Feature Value: grey,

### Q3

In [160]:
today_date = '3/1/2024'

In [161]:
data_q3 = data.copy()

In [162]:
# Convert date columns to datetime and mock the sold_at date as today's date
data_q3['registration_date'] = pd.to_datetime(data_q3['registration_date'])
data_q3['sold_at'] = today_date
data_q3['sold_at'] = pd.to_datetime(data_q3['sold_at'])

In [163]:
# Calculate mileage per month
data_q3['mileage_per_month'] = data_q3['mileage'] / data_q3['age_in_months_when_sold']

In [164]:
# Calculate age in month at today date and replace in data_q3
data_q3['age_in_months_when_sold'] = (data_q3['sold_at'].dt.to_period('M') - data_q3['registration_date'].dt.to_period('M')).apply(lambda x: x.n)

In [165]:
# Update estimated mileage at today date
data_q3['mileage'] = data_q3['age_in_months_when_sold'] * data_q3['mileage_per_month']

In [166]:
data_q3[['registration_date', 'sold_at', 'age_in_months_when_sold', 'mileage']].head()

Unnamed: 0,registration_date,sold_at,age_in_months_when_sold,mileage
0,2012-02-01,2024-03-01,145,286754.859155
1,2016-04-01,2024-03-01,95,60147.954545
2,2012-04-01,2024-03-01,143,374449.585714
3,2014-07-01,2024-03-01,116,345396.744186
4,2014-12-01,2024-03-01,111,269444.175


In [167]:
X_q3, y_q3, X_train_q3, X_test_q3, y_train_q3, y_test_q3 = get_train_test_data(data_q3, features, target)

In [168]:
# Get estimated prices today
pred_q3 = model.predict(X_q3)

In [169]:
# Add 1 year to the age_in_months_when_sold
X_q3['age_in_months_when_sold'] = X_q3['age_in_months_when_sold'] + 12

In [170]:
# Add 1 year worth of mileage
X_q3['mileage'] = X_q3['mileage'] + 12 * data_q3['mileage_per_month']

In [171]:
# Get estimated prices 1 year later
pred_q3_one_year_later = model.predict(X_q3)

In [172]:
X_q3['price_today'] = pred_q3
X_q3['price_one_year_later'] = pred_q3_one_year_later
X_q3['loss'] = X_q3['price_today'] - X_q3['price_one_year_later'] 

In [173]:
# Indentify cars that are candidates for buying
candidate_cars = X_q3[(X_q3['price_today'] >= 20000) & (X_q3['loss'] <= 1000)]

In [174]:
# Show top 10 candidate cars with lowest loss
candidate_cars.sort_values('loss').head(10)

Unnamed: 0,mileage,engine_power,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,age_in_months_when_sold,month_sold_at,season_sold_at,model_initial,price_today,price_one_year_later,loss
4615,103518.040816,180,diesel,black,suv,True,True,False,True,True,False,True,True,128,8,summer,X,22284.501953,33537.890625,-11253.388672
67,66996.268657,309,petrol,silver,coupe,True,True,False,False,True,True,True,True,150,4,spring,M,31947.21875,32150.009766,-202.791016
3910,9278.857143,155,diesel,black,suv,True,True,False,True,True,True,True,True,92,2,winter,X,24734.441406,24621.988281,112.453125
4563,148566.018868,280,diesel,grey,suv,True,True,False,False,True,False,True,True,133,7,summer,X,22155.429688,22030.566406,124.863281
113,64433.4,142,diesel,grey,coupe,True,True,False,False,True,False,True,True,102,5,spring,4,22225.837891,21977.050781,248.787109
3272,166284.324324,230,diesel,blue,sedan,True,True,True,False,False,True,False,True,118,6,summer,4,20743.990234,20383.414062,360.576172
3908,128850.918919,230,diesel,red,suv,False,False,False,False,False,False,False,True,116,8,summer,X,22815.355469,22438.533203,376.822266
57,146814.769231,240,petrol,grey,coupe,False,True,True,False,True,True,True,True,112,1,winter,M,22907.533203,22474.998047,432.535156
4106,169312.064516,230,diesel,grey,suv,True,True,True,False,True,False,True,True,114,4,spring,X,21125.960938,20583.814453,542.146484
60,85229.783784,225,petrol,blue,coupe,True,True,False,True,True,True,True,True,121,3,spring,4,22047.244141,21498.25,548.994141


In [175]:
# Find the value counts for model_key in candidate_cars if exists. If not try model key initial.
try:
    model_related_value_counts = candidate_cars['model_key'].value_counts()
except KeyError:
    model_related_value_counts = candidate_cars['model_initial'].value_counts()

In [176]:
model_related_value_counts

model_initial
X    6
4    3
M    2
5    1
i    1
1    0
2    0
3    0
6    0
7    0
A    0
Z    0
Name: count, dtype: int64

In [177]:
# Describe numerical features of candidate cars
candidate_cars.describe()

Unnamed: 0,mileage,engine_power,age_in_months_when_sold,month_sold_at,price_today,price_one_year_later,loss
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,104521.330604,216.615385,116.076923,5.307692,24579.886719,25070.482422,-490.597961
std,53248.430802,47.636713,15.294125,2.56205,5557.169434,6121.775391,3247.756592
min,9278.857143,142.0,92.0,1.0,20743.990234,20383.414062,-11253.388672
25%,64433.4,180.0,110.0,4.0,22047.244141,21498.25,124.863281
50%,103518.040816,230.0,116.0,5.0,22284.501953,22438.533203,376.822266
75%,148566.018868,230.0,121.0,8.0,24734.441406,24621.988281,548.994141
max,169312.064516,309.0,150.0,9.0,40392.964844,39813.746094,944.212891


In [178]:
# Show the car with the lowest loss
car_index = candidate_cars.sort_values('loss').index[0]
data_q3.loc[car_index]

maker_key                                  BMW
model_key                                   X6
mileage                            93813.22449
engine_power                               180
registration_date          2014-07-01 00:00:00
fuel                                    diesel
paint_color                              black
car_type                                   suv
feature_1                                 True
feature_2                                 True
feature_3                                False
feature_4                                 True
feature_5                                 True
feature_6                                False
feature_7                                 True
feature_8                                 True
price                                    43600
sold_at                    2024-03-01 00:00:00
age_in_months_when_sold                    116
month_sold_at                                8
season_sold_at                          summer
model_initial

In [179]:
data.loc[car_index]

maker_key                                  BMW
model_key                                   X6
mileage                                  39628
engine_power                               180
registration_date          2014-07-01 00:00:00
fuel                                    diesel
paint_color                              black
car_type                                   suv
feature_1                                 True
feature_2                                 True
feature_3                                False
feature_4                                 True
feature_5                                 True
feature_6                                False
feature_7                                 True
feature_8                                 True
price                                    43600
sold_at                    2018-08-01 00:00:00
age_in_months_when_sold                     49
month_sold_at                                8
season_sold_at                          summer
model_initial

In [180]:
X_q3.loc[car_index]

mileage                    103518.040816
engine_power                         180
fuel                              diesel
paint_color                        black
car_type                             suv
feature_1                           True
feature_2                           True
feature_3                          False
feature_4                           True
feature_5                           True
feature_6                          False
feature_7                           True
feature_8                           True
age_in_months_when_sold              128
month_sold_at                          8
season_sold_at                    summer
model_initial                          X
price_today                 22284.501953
price_one_year_later        33537.890625
loss                       -11253.388672
Name: 4615, dtype: object

### Q3 assumming today is a month after latest sold at date in the data

In [181]:
today_date = data['sold_at'].max()

In [182]:
data_q3 = data.copy()

In [183]:
# Convert date columns to datetime and mock the sold_at date as today's date
data_q3['registration_date'] = pd.to_datetime(data_q3['registration_date'])
data_q3['sold_at'] = today_date
data_q3['sold_at'] = pd.to_datetime(data_q3['sold_at'])

In [184]:
# Calculate mileage per month
data_q3['mileage_per_month'] = data_q3['mileage'] / data_q3['age_in_months_when_sold']

In [185]:
# Calculate age in month at today date and replace in data_q3
data_q3['age_in_months_when_sold'] = (data_q3['sold_at'].dt.to_period('M') - data_q3['registration_date'].dt.to_period('M')).apply(lambda x: x.n)

In [186]:
# Update estimated mileage at today date
data_q3['mileage'] = data_q3['age_in_months_when_sold'] * data_q3['mileage_per_month']

In [187]:
data_q3[['registration_date', 'sold_at', 'age_in_months_when_sold', 'mileage']].head()

Unnamed: 0,registration_date,sold_at,age_in_months_when_sold,mileage
0,2012-02-01,2018-09-01,79,156231.957746
1,2016-04-01,2018-09-01,29,18360.954545
2,2012-04-01,2018-09-01,77,201626.7
3,2014-07-01,2018-09-01,50,148877.906977
4,2014-12-01,2018-09-01,45,109234.125


In [188]:
X_q3, y_q3, X_train_q3, X_test_q3, y_train_q3, y_test_q3 = get_train_test_data(data_q3, features, target)

In [189]:
# Get estimated prices today
pred_q3 = model.predict(X_q3)

In [190]:
# Add 1 year to the age_in_months_when_sold
X_q3['age_in_months_when_sold'] = X_q3['age_in_months_when_sold'] + 12

In [191]:
# Add 1 year worth of mileage
X_q3['mileage'] = X_q3['mileage'] + 12 * data_q3['mileage_per_month']

In [192]:
# Get estimated prices 1 year later
pred_q3_one_year_later = model.predict(X_q3)

In [193]:
X_q3['price_today'] = pred_q3
X_q3['price_one_year_later'] = pred_q3_one_year_later
X_q3['loss'] = X_q3['price_today'] - X_q3['price_one_year_later'] 

In [194]:
# Indentify cars that are candidates for buying
candidate_cars = X_q3[(X_q3['price_today'] >= 20000) & (X_q3['loss'] <= 1000)]

In [195]:
# Show top 10 candidate cars with lowest loss
candidate_cars.sort_values('loss').head(10)

Unnamed: 0,mileage,engine_power,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,age_in_months_when_sold,month_sold_at,season_sold_at,model_initial,price_today,price_one_year_later,loss
4499,103077.391304,140,diesel,grey,suv,True,True,True,True,True,False,True,True,60,7,summer,X,31356.259766,48322.757812,-16966.498047
105,45031.734694,135,petrol,grey,coupe,False,True,False,False,True,True,True,True,65,5,spring,4,26176.636719,35504.261719,-9327.625
131,61018.431373,120,diesel,blue,coupe,True,True,False,False,True,True,True,True,65,7,summer,4,25067.146484,34247.511719,-9180.365234
123,60717.8,120,diesel,blue,coupe,True,True,False,False,True,True,True,True,65,6,summer,4,25067.146484,34247.511719,-9180.365234
48,66621.020408,135,diesel,grey,coupe,False,False,False,False,False,False,False,True,65,5,spring,4,25542.998047,34323.164062,-8780.166016
4285,61562.959184,135,diesel,silver,suv,True,True,False,False,False,False,False,True,65,5,spring,X,24696.689453,33400.625,-8703.935547
109,54856.020408,135,diesel,black,coupe,False,False,False,False,True,True,True,False,65,5,spring,4,22132.576172,29999.287109,-7866.710938
74,58191.595745,135,diesel,black,coupe,False,False,False,False,True,True,True,False,65,3,spring,4,22132.576172,29999.287109,-7866.710938
94,65583.645833,135,diesel,grey,coupe,True,True,False,True,True,True,True,False,65,4,spring,4,22682.480469,29954.042969,-7271.5625
3033,56074.98,135,diesel,black,sedan,True,True,False,False,True,True,True,True,67,4,spring,5,22966.826172,30166.919922,-7200.09375


In [196]:
# Find the value counts for model_key in candidate_cars if exists. If not try model key initial.
try:
    model_related_value_counts = candidate_cars['model_key'].value_counts()
except KeyError:
    model_related_value_counts = candidate_cars['model_initial'].value_counts()

In [197]:
model_related_value_counts

model_initial
X    36
5    19
4    15
2     6
M     6
6     5
7     5
3     2
i     2
A     1
1     0
Z     0
Name: count, dtype: int64

In [198]:
# Describe numerical features of candidate cars
candidate_cars.describe()

Unnamed: 0,mileage,engine_power,age_in_months_when_sold,month_sold_at,price_today,price_one_year_later,loss
count,97.0,97.0,97.0,97.0,97.0,97.0,97.0
mean,94594.103741,163.680412,61.216495,5.402062,25451.914062,26555.289062,-1103.375
std,61006.640969,44.74035,16.572585,2.008626,4902.523438,5917.231934,3240.264893
min,2622.285714,75.0,26.0,1.0,20060.949219,19444.675781,-16966.498047
25%,57133.645833,135.0,50.0,4.0,21962.951172,21898.197266,-937.896484
50%,84378.8,140.0,59.0,5.0,23752.552734,24388.894531,233.806641
75%,114748.757576,190.0,74.0,7.0,27070.324219,30552.521484,661.884766
max,327917.692308,280.0,95.0,9.0,40239.789062,48322.757812,989.498047


In [199]:
# Show the car with the lowest loss
car_index = candidate_cars.sort_values('loss').index[0]
data_q3.loc[car_index]

maker_key                                  BMW
model_key                                   X3
mileage                           82461.913043
engine_power                               140
registration_date          2014-09-01 00:00:00
fuel                                    diesel
paint_color                               grey
car_type                                   suv
feature_1                                 True
feature_2                                 True
feature_3                                 True
feature_4                                 True
feature_5                                 True
feature_6                                False
feature_7                                 True
feature_8                                 True
price                                    25900
sold_at                    2018-09-01 00:00:00
age_in_months_when_sold                     48
month_sold_at                                7
season_sold_at                          summer
model_initial

In [200]:
data.loc[car_index]

maker_key                                  BMW
model_key                                   X3
mileage                                  79026
engine_power                               140
registration_date          2014-09-01 00:00:00
fuel                                    diesel
paint_color                               grey
car_type                                   suv
feature_1                                 True
feature_2                                 True
feature_3                                 True
feature_4                                 True
feature_5                                 True
feature_6                                False
feature_7                                 True
feature_8                                 True
price                                    25900
sold_at                    2018-07-01 00:00:00
age_in_months_when_sold                     46
month_sold_at                                7
season_sold_at                          summer
model_initial

In [201]:
X_q3.loc[car_index]

mileage                    103077.391304
engine_power                         140
fuel                              diesel
paint_color                         grey
car_type                             suv
feature_1                           True
feature_2                           True
feature_3                           True
feature_4                           True
feature_5                           True
feature_6                          False
feature_7                           True
feature_8                           True
age_in_months_when_sold               60
month_sold_at                          7
season_sold_at                    summer
model_initial                          X
price_today                 31356.259766
price_one_year_later        48322.757812
loss                       -16966.498047
Name: 4499, dtype: object

### Q4

Check training notebook

### Q5

Add other findings from data exploration