# 03 - Feature Engineering for Car Price Prediction

This notebook creates derived features to improve model performance:
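- Car age and age-related features (age bucket, decade of manufacture)
- Price and mileage ratios (per year of age, per 1,000 km)
- Categorical binning of age, mileage, and price into labelled groups
- Brand/model popularity metrics and price aggregates
- Interaction features that combine age, mileage, and price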

## 1. Setup and Data Loading

In [None]:
# Core libraries
import polars as pl
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Path handling
from pathlib import Path
import os
import sys

# Plot defaults applied to every figure drawn in this notebook.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Make the project's `src` package importable: when the working directory is
# named "notebooks" the project root is its parent, otherwise the working
# directory itself is used as the root.
current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
else:
    project_root = current_dir
sys.path.insert(0, str(project_root))

# Project-local imports; they rely on the sys.path entry added just above.
from src.config import DATA_PATH, PROCESSED_DATA_PATH
from src.data_processing import CarDataProcessor, load_car_data

print("‚úÖ Libraries loaded successfully")

In [None]:
# Load the raw listings and run them through the project's cleaning pipeline.
print("üìÇ Loading and cleaning raw data...")
print("=" * 60)

# Directory holding the raw files for this snapshot, located under DATA_PATH.
# Path() accepts either a str or a Path, so this works for both config styles.
data_dir = Path(DATA_PATH) / "le_boncoin_13_oct_2025"

# Load raw data.
# NOTE(review): infer_schema_length=0 presumably disables dtype inference in
# the underlying reader (every column read as text) -- confirm in
# src.data_processing.load_car_data.
df_raw = load_car_data(data_dir, infer_schema_length=0)

# Cleaning thresholds are passed through to the project class unchanged.
processor = CarDataProcessor(
    min_brand_threshold=50,
    rare_brand_threshold=600,
    price_iqr_multiplier=1.5,
    km_iqr_multiplier=1.5,
    min_year=1990,
    verbose=True
)

# Clean the data; `df` is the frame every later cell builds on.
df = processor.clean_data(df_raw)

print("\n" + "=" * 60)
print("‚úÖ Data cleaned and ready for feature engineering!")
print(f"   Final dataset: {df.height:,} rows √ó {df.width} columns")
print(f"\nüìä Columns: {df.columns}")
print("\nüìà Data types:")
for col, dtype in zip(df.columns, df.dtypes):
    print(f"  ‚Ä¢ {col}: {dtype}")

# Show cleaning summary: one section per cleaning step, one line per statistic.
print("\nüìã Cleaning Summary:")
print("=" * 60)
summary = processor.get_cleaning_summary()
for step, stats in summary.items():
    print(f"\n{step.upper().replace('_', ' ')}:")
    for key, value in stats.items():
        if isinstance(value, float):
            rendered = f"{value:.2f}"
        else:
            # The thousands-separator format only works for numbers; fall
            # back to plain text for strings, lists, None, etc. instead of
            # aborting the whole cell with a formatting error.
            try:
                rendered = f"{value:,}"
            except (TypeError, ValueError):
                rendered = str(value)
        print(f"  ‚Ä¢ {key}: {rendered}")

# Last expression of the cell: rendered by the notebook as a preview.
df.head(3)

## 2. Time-Based Features

Create features related to vehicle age and depreciation.

In [None]:
print("‚è∞ CREATING TIME-BASED FEATURES")
print("="*50)

# Reference year used for every age computation. It is a fixed constant
# (matching the year in the raw data directory name, "le_boncoin_13_oct_2025")
# rather than today's date.
CURRENT_YEAR = 2025

# Oldest model year that still falls into each age bucket. Deriving these
# from CURRENT_YEAR keeps the bucket labels ("0-2yr", "3-5yr", ...) in step
# with `car_age` if the reference year is ever changed.
NEW_MIN_YEAR = CURRENT_YEAR - 2        # ages 0-2
RECENT_MIN_YEAR = CURRENT_YEAR - 5     # ages 3-5
MID_AGE_MIN_YEAR = CURRENT_YEAR - 10   # ages 6-10
OLDER_MIN_YEAR = CURRENT_YEAR - 15     # ages 11-15

# Create age-related features
df_features = df.with_columns([
    # Basic age in whole years
    (CURRENT_YEAR - pl.col('year')).alias('car_age'),

    # Age categories; branches are evaluated top-down, so each cut-off only
    # needs its lower bound. Anything older than OLDER_MIN_YEAR falls through
    # to the final bucket.
    pl.when(pl.col('year') >= NEW_MIN_YEAR)
        .then(pl.lit('new_0-2yr'))
    .when(pl.col('year') >= RECENT_MIN_YEAR)
        .then(pl.lit('recent_3-5yr'))
    .when(pl.col('year') >= MID_AGE_MIN_YEAR)
        .then(pl.lit('mid_age_6-10yr'))
    .when(pl.col('year') >= OLDER_MIN_YEAR)
        .then(pl.lit('older_11-15yr'))
    .otherwise(pl.lit('very_old_16+yr'))
    .alias('age_category'),

    # Is the car almost new? (same cut-off as the 'new_0-2yr' bucket)
    (pl.col('year') >= NEW_MIN_YEAR).alias('is_almost_new'),

    # Decade of manufacture, e.g. 2017 -> 2010
    ((pl.col('year') // 10) * 10).alias('decade')
])

print("‚úÖ Time-based features created:")
print("  ‚Ä¢ car_age: Years since manufacture")
print("  ‚Ä¢ age_category: Categorical age grouping")
print(f"  ‚Ä¢ is_almost_new: Boolean for cars {NEW_MIN_YEAR}+")
print("  ‚Ä¢ decade: Decade of manufacture")

# Show age distribution, most frequent bucket first
age_dist = df_features.group_by('age_category').len().sort('len', descending=True)
print(f"\nüìä Age category distribution:")
for cat, count in age_dist.iter_rows():
    pct = (count / df_features.height) * 100
    print(f"  {cat}: {count:,} ({pct:.1f}%)")

## 3. Mileage-Based Features

Create features related to vehicle usage and mileage patterns.

In [None]:
print("üöó CREATING MILEAGE-BASED FEATURES")
print("="*50)

# Calculate percentiles for mileage
km_p25 = df_features['km'].quantile(0.25)
km_p50 = df_features['km'].quantile(0.50)
km_p75 = df_features['km'].quantile(0.75)
km_p90 = df_features['km'].quantile(0.90)

print(f"Mileage percentiles:")
print(f"  25th: {km_p25:,.0f} km")
print(f"  50th: {km_p50:,.0f} km")
print(f"  75th: {km_p75:,.0f} km")
print(f"  90th: {km_p90:,.0f} km")

df_features = df_features.with_columns([
    # Kilometers per year (avoid division by zero)
    pl.when(pl.col('car_age') > 0)
        .then(pl.col('km') / pl.col('car_age'))
        .otherwise(pl.col('km'))
        .alias('km_per_year'),
    
    # Mileage categories
    pl.when(pl.col('km') < km_p25)
        .then(pl.lit('very_low'))
    .when(pl.col('km') < km_p50)
        .then(pl.lit('low'))
    .when(pl.col('km') < km_p75)
        .then(pl.lit('medium'))
    .when(pl.col('km') < km_p90)
        .then(pl.lit('high'))
    .otherwise(pl.lit('very_high'))
    .alias('mileage_category'),
    
    # Boolean flags
    (pl.col('km') < 50000).alias('is_low_mileage'),
    (pl.col('km') > km_p75).alias('is_high_mileage'),
    (pl.col('km') < 10000).alias('is_nearly_new_mileage')
])

print("\n‚úÖ Mileage features created:")
print("  ‚Ä¢ km_per_year: Average annual usage")
print("  ‚Ä¢ mileage_category: Low/Medium/High grouping")
print("  ‚Ä¢ is_low_mileage: <50k km")
print("  ‚Ä¢ is_high_mileage: >75th percentile")
print("  ‚Ä¢ is_nearly_new_mileage: <10k km")

# Show mileage category distribution
mileage_dist = df_features.group_by('mileage_category').len().sort('len', descending=True)
print(f"\nüìä Mileage category distribution:")
for row in mileage_dist.iter_rows():
    cat, count = row
    pct = (count / df_features.height) * 100
    print(f"  {cat}: {count:,} ({pct:.1f}%)")

## 4. Price-Related Features

Create derived price features and categorizations.

In [None]:
print("üí∞ CREATING PRICE-RELATED FEATURES")
print("="*50)

# Calculate price percentiles
price_p20 = df_features['price'].quantile(0.20)
price_p40 = df_features['price'].quantile(0.40)
price_p60 = df_features['price'].quantile(0.60)
price_p80 = df_features['price'].quantile(0.80)

print(f"Price percentiles:")
print(f"  20th: ‚Ç¨{price_p20:,.0f}")
print(f"  40th: ‚Ç¨{price_p40:,.0f}")
print(f"  60th: ‚Ç¨{price_p60:,.0f}")
print(f"  80th: ‚Ç¨{price_p80:,.0f}")

df_features = df_features.with_columns([
    # Price per year of age (depreciation indicator)
    pl.when(pl.col('car_age') > 0)
        .then(pl.col('price') / pl.col('car_age'))
        .otherwise(pl.col('price'))
        .alias('price_per_year'),
    
    # Price per 1000 km
    pl.when(pl.col('km') > 0)
        .then(pl.col('price') / (pl.col('km') / 1000))
        .otherwise(pl.col('price'))
        .alias('price_per_1000km'),
    
    # Log price (for modeling)
    (pl.col('price') + 1).log().alias('log_price'),
    
    # Price categories
    pl.when(pl.col('price') < price_p20)
        .then(pl.lit('budget'))
    .when(pl.col('price') < price_p40)
        .then(pl.lit('economy'))
    .when(pl.col('price') < price_p60)
        .then(pl.lit('mid_range'))
    .when(pl.col('price') < price_p80)
        .then(pl.lit('premium'))
    .otherwise(pl.lit('luxury'))
    .alias('price_category')
])

print("\n‚úÖ Price features created:")
print("  ‚Ä¢ price_per_year: Price divided by age")
print("  ‚Ä¢ price_per_1000km: Price per 1000 km")
print("  ‚Ä¢ log_price: Log-transformed price")
print("  ‚Ä¢ price_category: Budget/Economy/Mid/Premium/Luxury")

# Show price category distribution
price_cat_dist = df_features.group_by('price_category').agg([
    pl.len().alias('count'),
    pl.col('price').mean().alias('avg_price'),
    pl.col('price').min().alias('min_price'),
    pl.col('price').max().alias('max_price')
]).sort('avg_price')

print(f"\nüìä Price category statistics:")
print(f"{'Category':<12} {'Count':>10} {'Avg Price':>12} {'Min':>10} {'Max':>10}")
print("-" * 60)
for row in price_cat_dist.iter_rows():
    cat, count, avg, min_p, max_p = row
    print(f"{cat:<12} {count:>10,} ‚Ç¨{avg:>11,.0f} ‚Ç¨{min_p:>9,.0f} ‚Ç¨{max_p:>9,.0f}")

## 5. Brand and Model Features

Create per-brand and per-model aggregate features (listing counts and price statistics), plus ratios that compare each car with its brand and model averages.

In [None]:
print("üè∑Ô∏è  CREATING BRAND/MODEL FEATURES")
print("="*50)

# Calculate brand statistics
brand_stats = df_features.group_by('brand').agg([
    pl.len().alias('brand_count'),
    pl.col('price').mean().alias('brand_avg_price'),
    pl.col('price').median().alias('brand_median_price'),
    pl.col('price').std().alias('brand_price_std'),
    pl.col('km').mean().alias('brand_avg_km'),
    pl.col('car_age').mean().alias('brand_avg_age')
])

# Join brand stats back to main dataframe
df_features = df_features.join(brand_stats, on='brand', how='left')

print("‚úÖ Brand features created:")
print("  ‚Ä¢ brand_count: Number of listings for this brand")
print("  ‚Ä¢ brand_avg_price: Average price for this brand")
print("  ‚Ä¢ brand_median_price: Median price for this brand")
print("  ‚Ä¢ brand_price_std: Price std deviation for this brand")
print("  ‚Ä¢ brand_avg_km: Average mileage for this brand")
print("  ‚Ä¢ brand_avg_age: Average age for this brand")

# Calculate model statistics (within brand)
model_stats = df_features.group_by(['brand', 'model']).agg([
    pl.len().alias('model_count'),
    pl.col('price').mean().alias('model_avg_price'),
    pl.col('price').median().alias('model_median_price')
])

# Join model stats
df_features = df_features.join(model_stats, on=['brand', 'model'], how='left')

print("\n‚úÖ Model features created:")
print("  ‚Ä¢ model_count: Number of listings for this model")
print("  ‚Ä¢ model_avg_price: Average price for this model")
print("  ‚Ä¢ model_median_price: Median price for this model")

# Create relative price features
df_features = df_features.with_columns([
    # How does this car's price compare to brand average?
    (pl.col('price') / pl.col('brand_avg_price')).alias('price_vs_brand_avg'),
    
    # How does this car's price compare to model average?
    (pl.col('price') / pl.col('model_avg_price')).alias('price_vs_model_avg'),
    
    # Is this model popular for the brand?
    (pl.col('model_count') / pl.col('brand_count')).alias('model_popularity_ratio')
])

print("\n‚úÖ Relative price features created:")
print("  ‚Ä¢ price_vs_brand_avg: Price ratio vs brand average")
print("  ‚Ä¢ price_vs_model_avg: Price ratio vs model average")
print("  ‚Ä¢ model_popularity_ratio: Model popularity within brand")

# Show top brands by count
top_brands = brand_stats.sort('brand_count', descending=True).head(10)
print(f"\nüìä Top 10 brands by listing count:")
print(f"{'Brand':<15} {'Count':>10} {'Avg Price':>12} {'Avg Age':>10}")
print("-" * 50)
for row in top_brands.iter_rows():
    brand, count, avg_price, med_price, std, avg_km, avg_age = row
    print(f"{brand:<15} {count:>10,} ‚Ç¨{avg_price:>11,.0f} {avg_age:>9.1f}yr")

## 6. Interaction Features

Create features that capture interactions between variables.

In [None]:
print("üîÑ CREATING INTERACTION FEATURES")
print("="*50)

df_features = df_features.with_columns([
    # Age √ó Mileage interaction (heavily used old cars)
    (pl.col('car_age') * pl.col('km') / 1000).alias('age_km_interaction'),
    
    # Is this a high-value old car? (potential classic/rare)
    ((pl.col('car_age') > 10) & (pl.col('price') > price_p80)).alias('is_valuable_old_car'),
    
    # Is this a low-mileage recent car? (barely used)
    ((pl.col('car_age') < 5) & (pl.col('km') < 50000)).alias('is_low_use_recent'),
    
    # Unusual combinations
    ((pl.col('car_age') < 3) & (pl.col('km') > 150000)).alias('is_high_use_new'),
    ((pl.col('car_age') > 15) & (pl.col('km') < 50000)).alias('is_garage_queen'),
    
    # Value for money indicator
    pl.when(pl.col('km_per_year') > 0)
        .then(pl.col('price') / (pl.col('km_per_year') / 1000))
        .otherwise(pl.col('price'))
        .alias('value_score')
])

print("‚úÖ Interaction features created:")
print("  ‚Ä¢ age_km_interaction: Age √ó Mileage combined")
print("  ‚Ä¢ is_valuable_old_car: Old but expensive (potential classic)")
print("  ‚Ä¢ is_low_use_recent: Recent car, low mileage")
print("  ‚Ä¢ is_high_use_new: New car, very high mileage")
print("  ‚Ä¢ is_garage_queen: Old car, very low mileage")
print("  ‚Ä¢ value_score: Price relative to annual usage")

# Count unusual patterns
valuable_old = df_features['is_valuable_old_car'].sum()
low_use_recent = df_features['is_low_use_recent'].sum()
high_use_new = df_features['is_high_use_new'].sum()
garage_queen = df_features['is_garage_queen'].sum()

print(f"\nüìä Unusual pattern counts:")
print(f"  Valuable old cars: {valuable_old:,}")
print(f"  Low-use recent cars: {low_use_recent:,}")
print(f"  High-use new cars: {high_use_new:,}")
print(f"  Garage queens (old, low mileage): {garage_queen:,}")

## 7. Feature Summary and Data Quality Check

In [None]:
print("üìä FEATURE ENGINEERING SUMMARY")
print("="*70)

print(f"\nOriginal features: 5")
print(f"New features created: {len(df_features.columns) - 5}")
print(f"Total features: {len(df_features.columns)}")

print(f"\nüìã All features:")
print("-" * 70)

# Categorize features
original_features = ['price', 'year', 'km', 'brand', 'model']
time_features = ['car_age', 'age_category', 'is_almost_new', 'decade']
mileage_features = ['km_per_year', 'mileage_category', 'is_low_mileage', 'is_high_mileage', 'is_nearly_new_mileage']
price_features = ['price_per_year', 'price_per_1000km', 'log_price', 'price_category']
brand_features = ['brand_count', 'brand_avg_price', 'brand_median_price', 'brand_price_std', 'brand_avg_km', 'brand_avg_age']
model_features = ['model_count', 'model_avg_price', 'model_median_price']
relative_features = ['price_vs_brand_avg', 'price_vs_model_avg', 'model_popularity_ratio']
interaction_features = ['age_km_interaction', 'is_valuable_old_car', 'is_low_use_recent', 'is_high_use_new', 'is_garage_queen', 'value_score']

print("\nüîπ Original features (5):")
for f in original_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Time-based features (4):")
for f in time_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Mileage-based features (5):")
for f in mileage_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Price-related features (4):")
for f in price_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Brand aggregate features (6):")
for f in brand_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Model aggregate features (3):")
for f in model_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Relative price features (3):")
for f in relative_features:
    print(f"  ‚Ä¢ {f}")

print("\nüîπ Interaction features (6):")
for f in interaction_features:
    print(f"  ‚Ä¢ {f}")

# Check for missing values in new features
print(f"\nüîç DATA QUALITY CHECK:")
print("-" * 70)

missing_counts = {}
for col in df_features.columns:
    null_count = df_features[col].null_count()
    if null_count > 0:
        missing_counts[col] = null_count

if missing_counts:
    print("‚ö†Ô∏è  Features with missing values:")
    for col, count in missing_counts.items():
        pct = (count / df_features.height) * 100
        print(f"  ‚Ä¢ {col}: {count:,} ({pct:.2f}%)")
else:
    print("‚úÖ No missing values in any feature!")

# Check for infinite values
print(f"\nChecking for infinite values...")
inf_found = False
for col in df_features.columns:
    if df_features[col].dtype in [pl.Float64, pl.Float32]:
        inf_count = df_features.filter(pl.col(col).is_infinite()).height
        if inf_count > 0:
            print(f"  ‚ö†Ô∏è  {col}: {inf_count:,} infinite values")
            inf_found = True

if not inf_found:
    print("‚úÖ No infinite values found!")

print(f"\n{'='*70}")
print("‚úÖ FEATURE ENGINEERING COMPLETE")
print(f"{'='*70}")

## 8. Save Enhanced Dataset

In [None]:
print("üíæ SAVING ENHANCED DATASET")
print("="*50)

# Save with features
output_path = PROCESSED_DATA_PATH / "car_data_with_features.csv"
df_features.write_csv(output_path)

print(f"‚úÖ Enhanced dataset saved to:")
print(f"   {output_path}")
print(f"\nüìä Dataset info:")
print(f"  ‚Ä¢ Rows: {df_features.height:,}")
print(f"  ‚Ä¢ Columns: {df_features.width}")
print(f"  ‚Ä¢ File size: {output_path.stat().st_size / (1024**2):.2f} MB")

print(f"\nüéØ Next steps:")
print("  1. Notebook 04: Post-cleaning EDA on enhanced features")
print("  2. Notebook 05: Model preparation (encoding, scaling, splitting)")
print("  3. Notebook 06: Model training and evaluation")

# Display sample
print(f"\nüìã Sample of enhanced data:")
df_features.head(3)