# 03 - Feature Engineering for Car Price Prediction

This notebook creates derived features to improve model performance:
- Car age and age-related features
- Price and mileage ratios
- Categorical binning for better patterns
- Brand/model popularity metrics

## 1. Setup and Data Loading

In [1]:
# Core libraries
import polars as pl
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Path handling
from pathlib import Path
import os
import sys

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Add project root to path
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir
sys.path.insert(0, str(project_root))

from src.config import DATA_PATH, PROCESSED_DATA_PATH, MODELS_PATH
from src import data_processing

print("‚úÖ Libraries loaded successfully")

‚úÖ Libraries loaded successfully


In [2]:
data_dir = Path(os.path.join(DATA_PATH, "le_boncoin_13_oct_2025"))
data = data_processing.load_car_data(data_dir)
data.head(2)

(732427, 35)


url,first_publication_date,index_date,price,marque,modele,annee_modele,kilometrage,energie,boite_de_vitesse,nombre_de_portes,nombre_de_place_s,version_constructeur,date_de_premiere_mise_en_circulation,type_de_vehicule,couleur,crit_air,puissance_fiscale,puissance_din,permis,reference,duree_de_disponibilite_des_pieces_detachees,pays,id_region,region,id_departement,departement,ville_affichee,ville,code_postal,latitude,longitude,source,fournisseur,forme_existante
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""https://www.leboncoin.fr/ad/vo‚Ä¶","""2025-09-01 15:37:26""","""2025-10-08 18:28:10""","""24900 ‚Ç¨""","""DELAGE""","""D4""","""1960""","""15000 km""","""Essence""","""Manuelle""","""4""","""4""",,"""01/1935""","""Berline""","""Beige""",,"""8 Cv""",,"""Avec permis""",,,"""FR""","""3""","""Auvergne""","""63""","""Puy-de-D√¥me""","""Romagnat 63540 Opme""","""Romagnat""","""63540""","""45.70758""","""3.08908""","""city""","""here""","""True"""
"""https://www.leboncoin.fr/ad/vo‚Ä¶","""2025-10-06 19:02:42""","""2025-10-06 19:02:42""","""39000 ‚Ç¨""","""DELAGE""","""DI""","""1960""","""43000 km""","""Essence""","""Manuelle""","""4""","""6""",,"""01/1924""","""Cabriolet""",,,"""11 Cv""","""35 Ch""","""Avec permis""",,,"""FR""","""17""","""Nord-Pas-de-Calais""","""62""","""Pas-de-Calais""","""La Couture 62136""","""La Couture""","""62136""","""50.58426""","""2.70473""","""city""","""here""","""True"""


In [3]:
# clean apply initial preprocessing

df = data_processing.clean_car_data(data)

üßπ Starting data cleaning pipeline...

1Ô∏è‚É£ Converting data types...
   Original: 732,427 rows
   After conversion: 732,426 rows
   Removed (invalid price): 1

2Ô∏è‚É£ Filtering and grouping brands...
   Removed 63 brands with <50 cars
   Grouped 38 brands into price tiers

3Ô∏è‚É£ Removing antique cars (pre-1990)...
   Removed 14,075 antique cars

4Ô∏è‚É£ Removing 'autre' entries...
   Original: 732,427 rows
   After conversion: 732,426 rows
   Removed (invalid price): 1

2Ô∏è‚É£ Filtering and grouping brands...
   Removed 63 brands with <50 cars
   Grouped 38 brands into price tiers

3Ô∏è‚É£ Removing antique cars (pre-1990)...
   Removed 14,075 antique cars

4Ô∏è‚É£ Removing 'autre' entries...
   Removed 4,050 'autre' entries

5Ô∏è‚É£ Removing outliers (IQR 1.5√ó for price, 1.5√ó for km)...
   Before: 713,413 rows
   After: 691,115 rows
   Removed: 22,298 (3.1%)

‚úÖ Data cleaning completed!
Final dataset: 691,115 rows √ó 5 columns
   Removed 4,050 'autre' entries

5Ô∏è‚É£ Remov

## Analysis on brands/models

Extract the list of all brands and models to find typos and potential issues in the assigment of a models to a brand.

In [None]:
#extract all combinations of cars and models to ask chatgt if any model might be an error

combinations_with_count = (
    df
    .filter(~pl.col('brand').str.contains('other'))
    .group_by(['brand', 'model'])
    .agg(pl.len().alias('count'))
    .sort(['brand', 'model'])
)

combinations_with_count.write_excel("/Users/brunobrumbrum/Documents/data/car_price_prediction/outputs/brand_model_combinations.xlsx")

<xlsxwriter.workbook.Workbook at 0x382942270>

In [17]:
print("üîé VALIDATING BRAND-MODEL COMBINATIONS FOR ERRORS")
print("=" * 70)

combinations_with_count = (
    data
    .filter(~pl.col('brand').str.contains('other'))
    .group_by(['brand', 'model'])
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
    .to_pandas()
)

print(f"\nTotal combinations: {len(combinations_with_count):,}")
print(f"\nüìä POTENTIAL ISSUES TO CHECK:")
print("-" * 70)

# 1. Check for single listing anomalies
single_listing = combinations_with_count[combinations_with_count['count'] == 1]
if len(single_listing) > 0:
    print(f"\nüü° {len(single_listing):,} brand-model combinations with ONLY 1 listing")
    print("   These might be data entry errors. Examples:")
    for idx, row in single_listing.head(10).iterrows():
        print(f"     ‚Ä¢ {row['brand']} {row['model']}")
    if len(single_listing) > 10:
        print(f"     ... and {len(single_listing) - 10} more")

# 2. Check for duplicate-like models (typos, case sensitivity)
print(f"\nüü° CHECKING FOR POTENTIAL TYPOS/DUPLICATES:")
for brand in combinations_with_count['brand'].unique():
    brand_models = combinations_with_count[combinations_with_count['brand'] == brand]['model'].tolist()
    
    # Check for models with similar names (potential typos)
    if len(brand_models) > 1:
        for i, model1 in enumerate(brand_models):
            for model2 in brand_models[i+1:]:
                # Simple check: similar length and similar starting letters
                if model1 and model2:
                    model1_lower = str(model1).lower().strip()
                    model2_lower = str(model2).lower().strip()
                    
                    # Check if very similar (potential typo)
                    if (model1_lower[:3] == model2_lower[:3] and 
                        abs(len(model1_lower) - len(model2_lower)) <= 2):
                        count1 = combinations_with_count[
                            (combinations_with_count['brand'] == brand) & 
                            (combinations_with_count['model'] == model1)
                        ]['count'].values[0]
                        count2 = combinations_with_count[
                            (combinations_with_count['brand'] == brand) & 
                            (combinations_with_count['model'] == model2)
                        ]['count'].values[0]
                        
                        print(f"   ‚ö†Ô∏è  {brand}: '{model1}' ({count1:,}) vs '{model2}' ({count2:,})")

# 3. Check for empty or very short model names
print(f"\nüü° CHECKING FOR EMPTY/SHORT MODEL NAMES:")
short_models = combinations_with_count[
    combinations_with_count['model'].astype(str).str.len() < 2
]
if len(short_models) > 0:
    print(f"   Found {len(short_models)} model names with <2 characters:")
    for idx, row in short_models.iterrows():
        print(f"     ‚Ä¢ {row['brand']}: '{row['model']}' ({row['count']:,} listings)")

# 4. Summary statistics
print(f"\nüìà STATISTICS:")
print("-" * 70)
print(f"Models per brand:")
models_per_brand = combinations_with_count.groupby('brand').size()
print(f"  Min: {models_per_brand.min()}")
print(f"  Max: {models_per_brand.max()}")
print(f"  Mean: {models_per_brand.mean():.1f}")
print(f"  Median: {models_per_brand.median():.0f}")

print(f"\nListings per combination:")
print(f"  Min: {combinations_with_count['count'].min():,}")
print(f"  Max: {combinations_with_count['count'].max():,}")
print(f"  Mean: {combinations_with_count['count'].mean():.0f}")
print(f"  Median: {combinations_with_count['count'].median():.0f}")

# 5. Show top combinations
print(f"\nüèÜ TOP 20 BRAND-MODEL COMBINATIONS:")
print("-" * 70)
print(combinations_with_count.head(20).to_string(index=False))

print(f"\nüí° NEXT STEPS:")
print("   1. Review the CSV export: brand_model_combinations.csv")
print("   2. Check for typos (especially single-listing combinations)")
print("   3. Verify brand names are spelled consistently")
print("   4. Look for data entry errors or misclassifications")
print("   5. Consider if any models should be grouped or renamed")

üîé VALIDATING BRAND-MODEL COMBINATIONS FOR ERRORS

Total combinations: 1,084

üìä POTENTIAL ISSUES TO CHECK:
----------------------------------------------------------------------

üü° 95 brand-model combinations with ONLY 1 listing
   These might be data entry errors. Examples:
     ‚Ä¢ JAGUAR Type E
     ‚Ä¢ JAGUAR XJ-SC
     ‚Ä¢ MASERATI 430
     ‚Ä¢ CHRYSLER Neon
     ‚Ä¢ HONDA Stream
     ‚Ä¢ FIAT Palio
     ‚Ä¢ CITROEN Traction/15
     ‚Ä¢ LIGIER Be Up
     ‚Ä¢ MITSUBISHI Pick-up
     ‚Ä¢ PEUGEOT 304
     ... and 85 more

üü° CHECKING FOR POTENTIAL TYPOS/DUPLICATES:
   ‚ö†Ô∏è  RENAULT: 'Twingo' (8,184) vs 'Twizy' (264)
   ‚ö†Ô∏è  RENAULT: 'Grand Scenic' (2,601) vs 'Grand Modus' (386)
   ‚ö†Ô∏è  RENAULT: 'Grand Scenic' (2,601) vs 'Grand Espace' (164)
   ‚ö†Ô∏è  RENAULT: 'Grand Modus' (386) vs 'Grand Espace' (164)
   ‚ö†Ô∏è  RENAULT: 'Master' (273) vs 'Mascott' (7)
   ‚ö†Ô∏è  PEUGEOT: '4008' (99) vs '4007' (93)
   ‚ö†Ô∏è  CITROEN: 'C4 Picasso' (5,565) vs 'C4 Cactus' (1,911)
  

## 2. Time-Based Features

Create features related to vehicle age and depreciation.

In [5]:
CURRENT_YEAR = 2025

# Create age-related features
df_features = df.with_columns([
    # Basic age
    (CURRENT_YEAR - pl.col('year')).alias('car_age'),
    
    # Age categories
    pl.when(pl.col('year') >= 2023)
        .then(pl.lit('new_0-2yr'))
    .when(pl.col('year') >= 2020)
        .then(pl.lit('recent_3-5yr'))
    .when(pl.col('year') >= 2015)
        .then(pl.lit('mid_age_6-10yr'))
    .when(pl.col('year') >= 2010)
        .then(pl.lit('older_11-15yr'))
    .otherwise(pl.lit('very_old_16+yr'))
    .alias('age_category'),
    
    # Is the car almost new?
    (pl.col('year') >= 2024).alias('is_almost_new'),
    
    # Decade of manufacture
    ((pl.col('year') // 10) * 10).alias('decade')
])

print("‚úÖ Time-based features created:")
print("  ‚Ä¢ car_age: Years since manufacture")
print("  ‚Ä¢ age_category: Categorical age grouping")
print("  ‚Ä¢ is_almost_new: Boolean for cars 2024+")
print("  ‚Ä¢ decade: Decade of manufacture")

# Show age distribution
age_dist = df_features.group_by('age_category').len().sort('len', descending=True)
print(f"\nüìä Age category distribution:")
for row in age_dist.iter_rows():
    cat, count = row
    pct = (count / df_features.height) * 100
    print(f"  {cat}: {count:,} ({pct:.1f}%)")

‚úÖ Time-based features created:
  ‚Ä¢ car_age: Years since manufacture
  ‚Ä¢ age_category: Categorical age grouping
  ‚Ä¢ is_almost_new: Boolean for cars 2024+
  ‚Ä¢ decade: Decade of manufacture

üìä Age category distribution:
  mid_age_6-10yr: 184,019 (26.6%)
  recent_3-5yr: 151,635 (21.9%)
  new_0-2yr: 131,579 (19.0%)
  older_11-15yr: 118,144 (17.1%)
  very_old_16+yr: 105,738 (15.3%)


## 3. Mileage-Based Features

Create features related to vehicle usage and mileage patterns.

In [6]:
# Calculate percentiles for mileage
km_p25 = df_features['km'].quantile(0.25)
km_p50 = df_features['km'].quantile(0.50)
km_p75 = df_features['km'].quantile(0.75)
km_p90 = df_features['km'].quantile(0.90)

print(f"Mileage percentiles:")
print(f"  25th: {km_p25:,.0f} km")
print(f"  50th: {km_p50:,.0f} km")
print(f"  75th: {km_p75:,.0f} km")
print(f"  90th: {km_p90:,.0f} km")

df_features = df_features.with_columns([
    # Kilometers per year (avoid division by zero)
    pl.when(pl.col('car_age') > 0)
        .then(pl.col('km') / pl.col('car_age'))
        .otherwise(pl.col('km'))
        .alias('km_per_year'),
    
    # Mileage categories
    pl.when(pl.col('km') < km_p25)
        .then(pl.lit('very_low'))
    .when(pl.col('km') < km_p50)
        .then(pl.lit('low'))
    .when(pl.col('km') < km_p75)
        .then(pl.lit('medium'))
    .when(pl.col('km') < km_p90)
        .then(pl.lit('high'))
    .otherwise(pl.lit('very_high'))
    .alias('mileage_category'),
    
    # Boolean flags
    (pl.col('km') < 50000).alias('is_low_mileage'),
    (pl.col('km') > km_p75).alias('is_high_mileage'),
    (pl.col('km') < 10000).alias('is_nearly_new_mileage')
])

print("\n‚úÖ Mileage features created:")
print("  ‚Ä¢ km_per_year: Average annual usage")
print("  ‚Ä¢ mileage_category: Low/Medium/High grouping")
print("  ‚Ä¢ is_low_mileage: <50k km")
print("  ‚Ä¢ is_high_mileage: >75th percentile")
print("  ‚Ä¢ is_nearly_new_mileage: <10k km")

# Show mileage category distribution
mileage_dist = df_features.group_by('mileage_category').len().sort('len', descending=True)
print(f"\nüìä Mileage category distribution:")
for row in mileage_dist.iter_rows():
    cat, count = row
    pct = (count / df_features.height) * 100
    print(f"  {cat}: {count:,} ({pct:.1f}%)")

Mileage percentiles:
  25th: 40,000 km
  50th: 92,500 km
  75th: 156,600 km
  90th: 213,000 km

‚úÖ Mileage features created:
  ‚Ä¢ km_per_year: Average annual usage
  ‚Ä¢ mileage_category: Low/Medium/High grouping
  ‚Ä¢ is_low_mileage: <50k km
  ‚Ä¢ is_high_mileage: >75th percentile
  ‚Ä¢ is_nearly_new_mileage: <10k km

üìä Mileage category distribution:
  low: 173,210 (25.1%)
  medium: 173,047 (25.0%)
  very_low: 172,072 (24.9%)
  high: 103,475 (15.0%)
  very_high: 69,311 (10.0%)


## 4. Brand and Model Features

Create aggregated features based on brand and model popularity.

In [10]:
print("üè∑Ô∏è  CREATING BRAND/MODEL FEATURES")
print("="*50)

# Calculate brand statistics
brand_stats = df_features.group_by('brand').agg([
    pl.len().alias('brand_count'),
    pl.col('price').mean().alias('brand_avg_price'),
    pl.col('price').median().alias('brand_median_price'),
    pl.col('price').std().alias('brand_price_std'),
    pl.col('km').mean().alias('brand_avg_km'),
    pl.col('car_age').mean().alias('brand_avg_age')
])

# Join brand stats back to main dataframe
df_features = df_features.join(brand_stats, on='brand', how='left')

print("‚úÖ Brand features created:")
print("  ‚Ä¢ brand_count: Number of listings for this brand")
print("  ‚Ä¢ brand_avg_price: Average price for this brand")
print("  ‚Ä¢ brand_median_price: Median price for this brand")
print("  ‚Ä¢ brand_price_std: Price std deviation for this brand")
print("  ‚Ä¢ brand_avg_km: Average mileage for this brand")
print("  ‚Ä¢ brand_avg_age: Average age for this brand")

# Calculate model statistics (within brand)
model_stats = df_features.group_by(['brand', 'model']).agg([
    pl.len().alias('model_count'),
    pl.col('price').mean().alias('model_avg_price'),
    pl.col('price').median().alias('model_median_price')
])

# Join model stats
df_features = df_features.join(model_stats, on=['brand', 'model'], how='left')

print("\n‚úÖ Model features created:")
print("  ‚Ä¢ model_count: Number of listings for this model")
print("  ‚Ä¢ model_avg_price: Average price for this model")
print("  ‚Ä¢ model_median_price: Median price for this model")

# Create relative price features
df_features = df_features.with_columns([
    
    # Is this model popular for the brand?
    (pl.col('model_count') / pl.col('brand_count')).alias('model_popularity_ratio')
])

print("\n‚úÖ Relative price features created:")
print("  ‚Ä¢ model_popularity_ratio: Model popularity within brand")

# Show top brands by count
top_brands = brand_stats.sort('brand_count', descending=True).head(10)
print(f"\nüìä Top 10 brands by listing count:")
print(f"{'Brand':<15} {'Count':>10} {'Avg Price':>12} {'Avg Age':>10}")
print("-" * 50)
for row in top_brands.iter_rows():
    brand, count, avg_price, med_price, std, avg_km, avg_age = row
    print(f"{brand:<15} {count:>10,} {avg_price:>11,.0f} {avg_age:>9.1f}yr")

üè∑Ô∏è  CREATING BRAND/MODEL FEATURES
‚úÖ Brand features created:
  ‚Ä¢ brand_count: Number of listings for this brand
  ‚Ä¢ brand_avg_price: Average price for this brand
  ‚Ä¢ brand_median_price: Median price for this brand
  ‚Ä¢ brand_price_std: Price std deviation for this brand
  ‚Ä¢ brand_avg_km: Average mileage for this brand
  ‚Ä¢ brand_avg_age: Average age for this brand

‚úÖ Model features created:
  ‚Ä¢ model_count: Number of listings for this model
  ‚Ä¢ model_avg_price: Average price for this model
  ‚Ä¢ model_median_price: Median price for this model

‚úÖ Relative price features created:
  ‚Ä¢ model_popularity_ratio: Model popularity within brand

üìä Top 10 brands by listing count:
Brand                Count    Avg Price    Avg Age
--------------------------------------------------
PEUGEOT            106,139      12,990       8.3yr
RENAULT             93,014      12,941       8.6yr
CITROEN             59,402      10,383       9.2yr
VOLKSWAGEN          53,637      19,142

## 5. Interaction Features

Create features that capture interactions between variables.

In [13]:
df_features = df_features.with_columns([
    # Age √ó Mileage interaction (heavily used old cars)
    (pl.col('car_age') * pl.col('km') / 1000).alias('age_km_interaction'),
    
    # Is this a low-mileage recent car? (barely used)
    ((pl.col('car_age') < 5) & (pl.col('km') < 50000)).alias('is_low_use_recent'),
    
    # Unusual combinations
    ((pl.col('car_age') < 3) & (pl.col('km') > 150000)).alias('is_high_use_new'),
    ((pl.col('car_age') > 15) & (pl.col('km') < 50000)).alias('is_garage_queen'),

])

print("‚úÖ Interaction features created:")
print("  ‚Ä¢ age_km_interaction: Age √ó Mileage combined")
print("  ‚Ä¢ is_low_use_recent: Recent car, low mileage")
print("  ‚Ä¢ is_high_use_new: New car, very high mileage")
print("  ‚Ä¢ is_garage_queen: Old car, very low mileage")

# Count unusual patterns
low_use_recent = df_features['is_low_use_recent'].sum()
high_use_new = df_features['is_high_use_new'].sum()
garage_queen = df_features['is_garage_queen'].sum()

print(f"\nüìä Unusual pattern counts:")
print(f"  Low-use recent cars: {low_use_recent:,}")
print(f"  High-use new cars: {high_use_new:,}")
print(f"  Garage queens (old, low mileage): {garage_queen:,}")

‚úÖ Interaction features created:
  ‚Ä¢ age_km_interaction: Age √ó Mileage combined
  ‚Ä¢ is_low_use_recent: Recent car, low mileage
  ‚Ä¢ is_high_use_new: New car, very high mileage
  ‚Ä¢ is_garage_queen: Old car, very low mileage

üìä Unusual pattern counts:
  Low-use recent cars: 174,892
  High-use new cars: 104
  Garage queens (old, low mileage): 2,852


## 8. Feature Summary and Data Quality Check

In [15]:
print("üìä FEATURE ENGINEERING SUMMARY")
print("="*70)

print(f"\nOriginal features: 5")
print(f"New features created: {len(df_features.columns) - 5}")
print(f"Total features: {len(df_features.columns)}")

print(f"\nüìã All features:")
print("-" * 70)

# Categorize features
original_features = ['price', 'year', 'km', 'brand', 'model']
time_features = ['car_age', 'age_category', 'is_almost_new', 'decade']
mileage_features = ['km_per_year', 'mileage_category', 'is_low_mileage', 'is_high_mileage', 'is_nearly_new_mileage']
brand_features = ['brand_count', 'brand_avg_price', 'brand_median_price', 'brand_price_std', 'brand_avg_km', 'brand_avg_age']
model_features = ['model_count', 'model_avg_price', 'model_median_price']
relative_features = ['model_popularity_ratio']
interaction_features = ['age_km_interaction', 'is_low_use_recent', 'is_high_use_new', 'is_garage_queen']

print("\nüîπ Original features (5):")
for f in original_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Time-based features (4):")
for f in time_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Mileage-based features (5):")
for f in mileage_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Brand aggregate features (6):")
for f in brand_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Model aggregate features (3):")
for f in model_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Relative price features (3):")
for f in relative_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Interaction features (6):")
for f in interaction_features:
    print(f"  ‚Ä¢ {f}")

# Check for missing values in new features
print(f"\nüîç DATA QUALITY CHECK:")
print("-" * 70)

missing_counts = {}
for col in df_features.columns:
    null_count = df_features[col].null_count()
    if null_count > 0:
        missing_counts[col] = null_count

if missing_counts:
    print("‚ö†Ô∏è  Features with missing values:")
    for col, count in missing_counts.items():
        pct = (count / df_features.height) * 100
        print(f"  ‚Ä¢ {col}: {count:,} ({pct:.2f}%)")
else:
    print("‚úÖ No missing values in any feature!")

# Check for infinite values
print(f"\nChecking for infinite values...")
inf_found = False
for col in df_features.columns:
    if df_features[col].dtype in [pl.Float64, pl.Float32]:
        inf_count = df_features.filter(pl.col(col).is_infinite()).height
        if inf_count > 0:
            print(f"  ‚ö†Ô∏è  {col}: {inf_count:,} infinite values")
            inf_found = True

if not inf_found:
    print("‚úÖ No infinite values found!")

print(f"\n{'='*70}")
print("‚úÖ FEATURE ENGINEERING COMPLETE")
print(f"{'='*70}")

üìä FEATURE ENGINEERING SUMMARY

Original features: 5
New features created: 32
Total features: 37

üìã All features:
----------------------------------------------------------------------

üîπ Original features (5):
  ‚Ä¢ price
  ‚Ä¢ year
  ‚Ä¢ km
  ‚Ä¢ brand
  ‚Ä¢ model

üîπ Time-based features (4):
  ‚Ä¢ car_age
  ‚Ä¢ age_category
  ‚Ä¢ is_almost_new
  ‚Ä¢ decade

üîπ Mileage-based features (5):
  ‚Ä¢ km_per_year
  ‚Ä¢ mileage_category
  ‚Ä¢ is_low_mileage
  ‚Ä¢ is_high_mileage
  ‚Ä¢ is_nearly_new_mileage

üîπ Brand aggregate features (6):
  ‚Ä¢ brand_count
  ‚Ä¢ brand_avg_price
  ‚Ä¢ brand_median_price
  ‚Ä¢ brand_price_std
  ‚Ä¢ brand_avg_km
  ‚Ä¢ brand_avg_age

üîπ Model aggregate features (3):
  ‚Ä¢ model_count
  ‚Ä¢ model_avg_price
  ‚Ä¢ model_median_price

üîπ Relative price features (3):
  ‚Ä¢ model_popularity_ratio

üîπ Interaction features (6):
  ‚Ä¢ age_km_interaction
  ‚Ä¢ is_low_use_recent
  ‚Ä¢ is_high_use_new
  ‚Ä¢ is_garage_queen

üîç DATA QUALITY CHECK:
----

In [None]:
# keep in midn that features that use price tehy need to be computed after separating in the CV process otherwise there will be a data leakage issue
# would it make more sense to create the price related features and maybe km converting first to log scale?

In [20]:
2015//10*10

2010