## Notes

We are likely overfitting here due to low sample size. For now, we keep this example for future reference as the procedure is valid.

### Imports

In [3]:
import logging
import sys
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Add the parent directory to the path so we can import the modules
# note that in /notebooks, use sys.path.append(os.path.abspath(".."))
sys.path.append(os.path.abspath("../.."))

from scripts import load_config, setup_logging, get_session, load_data_to_db
from models import CarPriceDataset


### Config and Logging Setup

In [4]:
try:
    config = load_config()
    setup_logging(config['paths']['log_path'])
    logging.info("Starting the data analysis project.")
except Exception as e:
    logging.error(f"Failed to load config or setup logging: {e}")
    raise

INFO:root:Starting the data analysis project.
2025-02-22 17:15:17,532 - INFO - Starting the data analysis project.


### Database Session

In [5]:
try:
    session = get_session()
    logging.info("Database session created successfully.")
except Exception as e:
    logging.error(f"Failed to create database session: {e}")
    raise

INFO:root:Database session created successfully.
2025-02-22 17:15:17,583 - INFO - Database session created successfully.


### Query Car Price Data

In [6]:
try:
    car_price_data = session.query(CarPriceDataset).all()
    data = [car.__dict__ for car in car_price_data]
    for item in data:
        item.pop('_sa_instance_state', None)
    
    car_price_df = pd.DataFrame(data)
    print(car_price_df.head())
except Exception as e:
    logging.error(f"Failed to query car price data: {e}")
    raise


       brand  engine_size transmission  doors  price   model  year fuel_type  \
0       Audi          2.1       Manual      4   9704      Q5  2017    Petrol   
1       Audi          2.0       Manual      4  10535      A3  2021  Electric   
2        Kia          1.6       Manual      5   3774     Rio  2005    Petrol   
3        BMW          3.1       Manual      2  10072      X5  2011    Petrol   
4  Chevrolet          2.0    Automatic      2  12092  Malibu  2012    Hybrid   

   mileage  owner_count  
0    79782            1  
1   193207            2  
2   171257            1  
3    21375            4  
4     5356            3  


### Feature Engineering and Mutual Information

In [7]:
# Group less frequent categories in 'model' and 'brand'
model_counts = car_price_df['model'].value_counts()
less_frequent_models = model_counts[model_counts < 10].index
car_price_df['model'] = car_price_df['model'].apply(lambda x: 'Other' if x in less_frequent_models else x)

brand_counts = car_price_df['brand'].value_counts()
less_frequent_brands = brand_counts[brand_counts < 10].index
car_price_df['brand'] = car_price_df['brand'].apply(lambda x: 'Other' if x in less_frequent_brands else x)

# Create 'car_age' feature
from datetime import datetime
car_price_df['car_age'] = datetime.now().year - car_price_df['year']

# One-hot encode categorical variables
categorical_cols = ['fuel_type', 'model', 'brand', 'transmission']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = encoder.fit_transform(car_price_df[categorical_cols])

# Create DataFrame for encoded categorical variables
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))

# Combine encoded categorical and numerical variables
numerical_cols = car_price_df.drop(columns=categorical_cols + ['price', 'year'])
X_encoded = pd.concat([numerical_cols.reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)], axis=1)

# Target variable
y = car_price_df['price']

# Calculate mutual information
mi = mutual_info_regression(X_encoded, y)

# Create and display mutual information DataFrame
mi_df = pd.DataFrame({'Feature': X_encoded.columns, 'Mutual Information': mi})
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
print(mi_df)

                       Feature  Mutual Information
4                      car_age            0.097957
7             fuel_type_Petrol            0.081093
0                  engine_size            0.067667
5           fuel_type_Electric            0.041760
8          transmission_Manual            0.034774
9  transmission_Semi-Automatic            0.002760
1                        doors            0.000000
2                      mileage            0.000000
3                  owner_count            0.000000
6             fuel_type_Hybrid            0.000000


### Variance Inflation Factor (VIF) Calculation

In [8]:
# Check for constant numerical columns
constant_columns = [col for col in numerical_cols.columns if car_price_df[col].nunique() == 1]
print("Constant columns:", constant_columns)

# Drop constant columns if any
X_encoded_reduced = X_encoded.drop(columns=constant_columns)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X_encoded_reduced.columns
vif_data["VIF"] = [variance_inflation_factor(X_encoded_reduced.values, i) for i in range(len(X_encoded_reduced.columns))]

print(vif_data)

Constant columns: []
                       Feature        VIF
0                  engine_size  10.754249
1                        doors   7.829674
2                      mileage   5.383627
3                  owner_count   6.342139
4                      car_age  10.987574
5           fuel_type_Electric   3.201536
6             fuel_type_Hybrid   3.254450
7             fuel_type_Petrol   3.649158
8          transmission_Manual   2.832613
9  transmission_Semi-Automatic   1.852904


### Ridge Regression Model

In [9]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded_reduced)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define a range of alpha values for Ridge Regression
alphas = np.logspace(-4, 4, 50)  # Test alpha values from 10^-4 to 10^4

# Use RidgeCV to find the best alpha
ridge_cv = RidgeCV(alphas=alphas, store_cv_results=True)
ridge_cv.fit(X_train, y_train)

# Best alpha value
print(f"Best alpha: {ridge_cv.alpha_}")

# Train Ridge Regression model with the best alpha
ridge_model = Ridge(alpha=ridge_cv.alpha_)
ridge_model.fit(X_train, y_train)

# Make predictions
y_pred = ridge_model.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Best alpha: 0.00021209508879201905
Mean Squared Error: 0.0652019557374877
R^2 Score: 0.9999999863417236


### Debugging and Model Validation

In [10]:
# Check the shape of the feature set and target variable
print(f"Shape of X_encoded_reduced: {X_encoded_reduced.shape}")
print(f"Shape of y: {y.shape}")

# Inspect the target variable
print(y.describe())

# Check for data leakage
print("First few rows of X_encoded_reduced:")
print(X_encoded_reduced.head())
print("First few rows of y:")
print(y.head())

# Inspect feature importance
feature_importance = pd.DataFrame({
    'Feature': X_encoded_reduced.columns,
    'Coefficient': ridge_model.coef_
})
print("Feature Importance:")
print(feature_importance.sort_values(by='Coefficient', ascending=False))

# Cross-validation
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(ridge_model, X_scaled, y, cv=5, scoring='r2')
print(f"Cross-Validation R² Scores: {cv_scores}")
print(f"Mean Cross-Validation R² Score: {cv_scores.mean()}")

Shape of X_encoded_reduced: (30, 10)
Shape of y: (30,)
count       30.000000
mean      8558.100000
std       2639.424158
min       2867.000000
25%       6733.500000
50%       9238.500000
75%      10617.500000
max      13374.000000
Name: price, dtype: float64
First few rows of X_encoded_reduced:
   engine_size  doors  mileage  owner_count  car_age  fuel_type_Electric  \
0          2.1      4    79782            1        8                 0.0   
1          2.0      4   193207            2        4                 1.0   
2          1.6      5   171257            1       20                 0.0   
3          3.1      2    21375            4       14                 0.0   
4          2.0      2     5356            3       13                 0.0   

   fuel_type_Hybrid  fuel_type_Petrol  transmission_Manual  \
0               0.0               1.0                  1.0   
1               0.0               0.0                  1.0   
2               0.0               1.0                  1.0   