Rusty Bargain used car sales service is developing an app to attract new customers. In that app, you can quickly find out the market value of your car. You have access to historical data: technical specifications, trim versions, and prices. You need to build a model that determines this value.

Rusty Bargain is interested in:

- the quality of the prediction;
- the speed of the prediction;
- the time required for training.

## Data preparation

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load the raw listings and print a first overview: shape, sample rows,
# column dtypes, summary statistics, missing values and duplicate count.
df = pd.read_csv('/datasets/car_data.csv')

separator = "=" * 50
print(separator)
print("DATASET INFORMATION")
print(separator)
print(f"Shape: {df.shape}")

# Each overview section is a heading plus the call that renders it.
overview_sections = (
    ("\nFirst rows:", lambda: display(df.head())),
    ("\nDataset info:", df.info),
    ("\nDescriptive statistics:", lambda: display(df.describe())),
    ("\nMissing values:", lambda: display(df.isna().sum())),
)
for heading, render in overview_sections:
    print(heading)
    render()

# Rows that repeat an earlier row in every column.
duplicate_count = df.duplicated().sum()
print("\nDuplicated rows:")
print(f"Total duplicates: {duplicate_count}")

DATASET INFORMATION
Shape: (354369, 16)

First rows:


Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Mileage,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,24/03/2016 11:52,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,31/03/2016 00:00,0,60437,06/04/2016 10:17



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Mileage            354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  NotRepaired        283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(9)

Unnamed: 0,Price,RegistrationYear,Power,Mileage,RegistrationMonth,NumberOfPictures,PostalCode
count,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0
mean,4416.656776,2004.234448,110.094337,128211.172535,5.714645,0.0,50508.689087
std,4514.158514,90.227958,189.850405,37905.34153,3.726421,0.0,25783.096248
min,0.0,1000.0,0.0,5000.0,0.0,0.0,1067.0
25%,1050.0,1999.0,69.0,125000.0,3.0,0.0,30165.0
50%,2700.0,2003.0,105.0,150000.0,6.0,0.0,49413.0
75%,6400.0,2008.0,143.0,150000.0,9.0,0.0,71083.0
max,20000.0,9999.0,20000.0,150000.0,12.0,0.0,99998.0



Missing values:


DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox              19833
Power                    0
Model                19705
Mileage                  0
RegistrationMonth        0
FuelType             32895
Brand                    0
NotRepaired          71154
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64


Duplicated rows:
Total duplicates: 262


## Model training

In [17]:
# Data preparation: clean the raw listings, split them into train /
# validation / test (70 / 15 / 15) and integer-encode the categorical columns.

# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Drop the crawl/listing timestamps, the picture count and the postal code.
# (NumberOfPictures is 0 in every row according to describe().)
df_clean = df_clean.drop(
    ['DateCrawled', 'DateCreated', 'LastSeen', 'NumberOfPictures', 'PostalCode'],
    axis=1,
)

# Keep only rows with plausible values. The raw data contains prices of 0,
# registration years from 1000 to 9999 and engine power from 0 to 20000.
# The Price < 100000 and Mileage >= 0 bounds do not remove anything from this
# dataset (max price is 20000, min mileage is 5000) and act as safety checks.
# NOTE(review): the sample rows were crawled in 2016, so registration years
# above that look implausible - confirm the intended upper bound.
# NOTE(review): the exploration cell reports 262 fully duplicated rows that are
# not removed here - consider dropping them before the split.
plausible_rows = (
    df_clean['Price'].gt(0)
    & df_clean['Price'].lt(100000)
    & df_clean['RegistrationYear'].between(1950, 2020)
    & df_clean['Power'].gt(0)
    & df_clean['Power'].lt(500)
    & df_clean['Mileage'].ge(0)
)
df_clean = df_clean[plausible_rows]

# Missing categorical values become their own 'unknown' category.
# A column -> value mapping is used instead of `df[col].fillna(..., inplace=True)`:
# the chained in-place form is deprecated and has no effect under pandas
# Copy-on-Write, which would silently leave the NaNs in place.
columns_with_gaps = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'NotRepaired']
df_clean = df_clean.fillna(dict.fromkeys(columns_with_gaps, 'unknown'))

print(f"\nCleaned dataset shape: {df_clean.shape}")
print("Missing values after cleaning:")
display(df_clean.isnull().sum())

X = df_clean.drop('Price', axis=1)
y = df_clean['Price']

# 70 % train, then the remaining 30 % is halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"\nTrain set: {X_train.shape}")
print(f"Validation set: {X_valid.shape}")
print(f"Test set: {X_test.shape}")

categorical_features = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'NotRepaired']

X_train_encoded = X_train.copy()
X_valid_encoded = X_valid.copy()
X_test_encoded = X_test.copy()

# Encoders are fitted on the training split only, so nothing leaks from the
# validation/test splits. Codes are looked up through the fitted classes with
# Index.get_indexer: known labels get exactly the code LabelEncoder.transform
# would assign, while a category absent from the training split maps to -1
# instead of raising ValueError.
label_encoders = {}
split_pairs = (
    (X_train_encoded, X_train),
    (X_valid_encoded, X_valid),
    (X_test_encoded, X_test),
)
for col in categorical_features:
    le = LabelEncoder().fit(X_train[col].astype(str))
    known_labels = pd.Index(le.classes_)
    for encoded_split, raw_split in split_pairs:
        encoded_split[col] = known_labels.get_indexer(raw_split[col].astype(str))
    label_encoders[col] = le

print("\nData preparation completed!")


Cleaned dataset shape: (306700, 11)
Missing values after cleaning:


Price                0
VehicleType          0
RegistrationYear     0
Gearbox              0
Power                0
Model                0
Mileage              0
RegistrationMonth    0
FuelType             0
Brand                0
NotRepaired          0
dtype: int64


Train set: (214690, 10)
Validation set: (46005, 10)
Test set: (46005, 10)

Data preparation completed!


In [None]:

import time

# Column names of the per-model summary, in the order they are unpacked below.
_METRIC_KEYS = ('Training Time (s)', 'Prediction Time (s)', 'Validation RMSE', 'Test RMSE')


def _fit_and_score(label, model):
    """Fit ``model`` on the training split, time it and report its RMSE.

    Reads the already prepared globals ``X_train_encoded``, ``X_valid_encoded``,
    ``X_test_encoded``, ``y_train``, ``y_valid`` and ``y_test``.

    Args:
        label: Human-readable model name used in the printed report.
        model: Unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        A tuple ``(summary, pred_valid, pred_test)`` where ``summary`` is a dict
        with the model name, training time, validation prediction time and the
        validation/test RMSE, and the other two items are the predictions.
    """
    # perf_counter is the clock intended for measuring short intervals.
    fit_started = time.perf_counter()
    model.fit(X_train_encoded, y_train)
    train_time = time.perf_counter() - fit_started

    # Prediction speed is measured on the validation split only.
    predict_started = time.perf_counter()
    pred_valid = model.predict(X_valid_encoded)
    pred_time = time.perf_counter() - predict_started

    pred_test = model.predict(X_test_encoded)
    rmse_valid = np.sqrt(mean_squared_error(y_valid, pred_valid))
    rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))

    print(f"{label} - Training time: {train_time:.2f}s")
    print(f"{label} - Prediction time: {pred_time:.4f}s")
    print(f"{label} - Validation RMSE: {rmse_valid:.2f}")
    print(f"{label} - Test RMSE: {rmse_test:.2f}")

    summary = {
        'Model': label,
        'Training Time (s)': train_time,
        'Prediction Time (s)': pred_time,
        'Validation RMSE': rmse_valid,
        'Test RMSE': rmse_test,
    }
    return summary, pred_valid, pred_test


print("=" * 50)
print("MODEL TRAINING")
print("=" * 50)

# One summary dict per trained model, in training order.
results = []

# Linear Regression is the baseline the other models are compared against.
print("\n1. Training Linear Regression...")
lr_model = LinearRegression()
lr_summary, lr_pred_valid, lr_pred_test = _fit_and_score("Linear Regression", lr_model)
lr_train_time, lr_pred_time, lr_rmse_valid, lr_rmse_test = (lr_summary[key] for key in _METRIC_KEYS)
results.append(lr_summary)

print("\n2. Training Decision Tree...")
dt_model = DecisionTreeRegressor(max_depth=10, random_state=42)
dt_summary, dt_pred_valid, dt_pred_test = _fit_and_score("Decision Tree", dt_model)
dt_train_time, dt_pred_time, dt_rmse_valid, dt_rmse_test = (dt_summary[key] for key in _METRIC_KEYS)
results.append(dt_summary)

print("\n3. Training Random Forest...")
rf_model = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_summary, rf_pred_valid, rf_pred_test = _fit_and_score("Random Forest", rf_model)
rf_train_time, rf_pred_time, rf_rmse_valid, rf_rmse_test = (rf_summary[key] for key in _METRIC_KEYS)
results.append(rf_summary)

print("\n4. Training LightGBM...")
lgb_model = lgb.LGBMRegressor(n_estimators=100, max_depth=15, learning_rate=0.1, random_state=42, verbosity=-1)
lgb_summary, lgb_pred_valid, lgb_pred_test = _fit_and_score("LightGBM", lgb_model)
lgb_train_time, lgb_pred_time, lgb_rmse_valid, lgb_rmse_test = (lgb_summary[key] for key in _METRIC_KEYS)
results.append(lgb_summary)

print("\nModel training completed!")


MODEL TRAINING

1. Training Linear Regression...
Linear Regression - Training time: 0.03s
Linear Regression - Prediction time: 0.0034s
Linear Regression - Validation RMSE: 2938.10
Linear Regression - Test RMSE: 2938.37

2. Training Decision Tree...
Decision Tree - Training time: 0.48s
Decision Tree - Prediction time: 0.0054s
Decision Tree - Validation RMSE: 1998.35
Decision Tree - Test RMSE: 1992.26

3. Training Random Forest...
Random Forest - Training time: 16.84s
Random Forest - Prediction time: 0.3365s
Random Forest - Validation RMSE: 1645.78
Random Forest - Test RMSE: 1640.56

4. Training LightGBM...


## Model analysis

In [None]:

# Model analysis: tabulate timing and RMSE for the four trained models,
# plot the comparisons and print the key findings.
print("=" * 50)
print("MODEL ANALYSIS")
print("=" * 50)

results_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'LightGBM'],
    'Training Time (s)': [lr_train_time, dt_train_time, rf_train_time, lgb_train_time],
    'Prediction Time (s)': [lr_pred_time, dt_pred_time, rf_pred_time, lgb_pred_time],
    'Validation RMSE': [lr_rmse_valid, dt_rmse_valid, rf_rmse_valid, lgb_rmse_valid],
    'Test RMSE': [lr_rmse_test, dt_rmse_test, rf_rmse_test, lgb_rmse_test]
})

print("\nModel Comparison Results:")
display(results_df)

results_sorted = results_df.sort_values('Test RMSE')
print("\nModels ranked by Test RMSE (best to worst):")
display(results_sorted)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One numeric x position per model, shared by the three bar charts.
x = np.arange(len(results_df['Model']))
width = 0.35

# Top-left: validation and test RMSE side by side for each model.
ax1 = axes[0, 0]
ax1.bar(x - width/2, results_df['Validation RMSE'], width, label='Validation RMSE', alpha=0.8)
ax1.bar(x + width/2, results_df['Test RMSE'], width, label='Test RMSE', alpha=0.8)
ax1.set_xlabel('Model')
ax1.set_ylabel('RMSE')
ax1.set_title('RMSE Comparison - Validation vs Test')
ax1.set_xticks(x)
ax1.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Top-right and bottom-left: training and prediction time per model.
# Tick positions are fixed with set_xticks before the labels are assigned,
# matching the RMSE panel, so the labels cannot drift from their bars.
ax2 = axes[0, 1]
ax3 = axes[1, 0]
time_panels = (
    (ax2, 'Training Time (s)', 'skyblue', 'Training Time Comparison'),
    (ax3, 'Prediction Time (s)', 'lightcoral', 'Prediction Time Comparison'),
)
for panel, column, color, title in time_panels:
    panel.bar(x, results_df[column], color=color, alpha=0.8)
    panel.set_xlabel('Model')
    panel.set_ylabel('Time (seconds)')
    panel.set_title(title)
    panel.set_xticks(x)
    panel.set_xticklabels(results_df['Model'], rotation=45, ha='right')
    panel.grid(True, alpha=0.3)

# Bottom-right: prediction time against test RMSE, one labelled point per model.
ax4 = axes[1, 1]
ax4.scatter(results_df['Prediction Time (s)'], results_df['Test RMSE'], s=200, alpha=0.6, c=range(len(results_df)), cmap='viridis')
for model, pred_time, test_rmse in zip(
    results_df['Model'], results_df['Prediction Time (s)'], results_df['Test RMSE']
):
    ax4.annotate(model, (pred_time, test_rmse), fontsize=9, ha='center')
ax4.set_xlabel('Prediction Time (s)')
ax4.set_ylabel('Test RMSE')
ax4.set_title('Quality vs Speed Trade-off')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "=" * 50)
print("KEY FINDINGS")
print("=" * 50)

# results_sorted is ascending by Test RMSE, so its first row is the best model.
best_quality = results_sorted.iloc[0]
fastest_train = results_df.loc[results_df['Training Time (s)'].idxmin()]
fastest_predict = results_df.loc[results_df['Prediction Time (s)'].idxmin()]

print("\nBest Quality (lowest RMSE):")
print(f"  Model: {best_quality['Model']}")
print(f"  Test RMSE: {best_quality['Test RMSE']:.2f}")
print(f"  Training Time: {best_quality['Training Time (s)']:.2f}s")
print(f"  Prediction Time: {best_quality['Prediction Time (s)']:.4f}s")

print("\nFastest Training:")
print(f"  Model: {fastest_train['Model']}")
print(f"  Training Time: {fastest_train['Training Time (s)']:.2f}s")
print(f"  Test RMSE: {fastest_train['Test RMSE']:.2f}")

print("\nFastest Prediction:")
print(f"  Model: {fastest_predict['Model']}")
print(f"  Prediction Time: {fastest_predict['Prediction Time (s)']:.4f}s")
print(f"  Test RMSE: {fastest_predict['Test RMSE']:.2f}")

print("\n" + "=" * 50)
print("CONCLUSIONS")
print("=" * 50)

# Sanity check: every other model should have a lower test RMSE than the
# Linear Regression baseline. The verdict is computed from the results
# rather than asserted unconditionally.
is_baseline = results_df['Model'] == 'Linear Regression'
baseline_rmse = results_df.loc[is_baseline, 'Test RMSE'].iloc[0]
beats_baseline = (results_df.loc[~is_baseline, 'Test RMSE'] < baseline_rmse).all()
if beats_baseline:
    print("\n1. All models perform better than Linear Regression (sanity check passed)")
else:
    print("\n1. Not every model performs better than Linear Regression (sanity check FAILED)")
print(f"2. {best_quality['Model']} provides the best prediction quality")
print(f"3. {fastest_predict['Model']} offers the fastest prediction speed")
print("4. Trade-off between quality and speed depends on business requirements")

## Checklist

In [None]:
# Final self-review: print every project deliverable with its tick mark.
banner = "=" * 50
print(banner)
print("PROJECT CHECKLIST")
print(banner)

# Every item is marked done ("x"); insertion order is the print order.
checklist = dict.fromkeys(
    [
        "Jupyter Notebook is open",
        "Code is error free",
        "The cells with the code have been arranged in order of execution",
        "The data has been downloaded and prepared",
        "The models have been trained",
        "The analysis of speed and quality of the models has been performed",
    ],
    "x",
)

print("\n".join(f"[{status}] {item}" for item, status in checklist.items()))

print("\n" + banner)
print("PROJECT COMPLETED SUCCESSFULLY!")
print(banner)