### Imports

In [None]:
import logging
import sys
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

sys.path.append(os.path.abspath("../.."))

from scripts.database import get_session, load_data_to_db
from scripts.utils import load_config, setup_logging
from models import CarPriceDataset


### Config and Logging Setup

In [None]:
# Load the project configuration and pass its configured log path to
# setup_logging. Any failure is logged and re-raised so the notebook stops
# at this cell.
try:
    config = load_config()
    setup_logging(config["paths"]["log_path"])
    logging.info("Starting the data analysis project.")
except Exception as exc:
    logging.error(f"Failed to load config or setup logging: {exc}")
    raise

### Database Session

In [None]:
# Obtain the session object that the query cell below reads from.
# A failure is logged and re-raised; nothing downstream can run without it.
try:
    session = get_session()
    logging.info("Database session created successfully.")
except Exception as exc:
    logging.error(f"Failed to create database session: {exc}")
    raise

### Query Car Price Data

In [None]:
# Fetch every CarPriceDataset row and flatten the result into a DataFrame
# with one row per record and one column per loaded attribute.
try:
    car_price_data = session.query(CarPriceDataset).all()

    # car.__dict__ is the live attribute dict of each mapped instance, so
    # popping '_sa_instance_state' from it would strip SQLAlchemy's state
    # attribute off the objects still held in car_price_data. Build a
    # filtered copy per row instead and leave the instances untouched.
    data = [
        {
            key: value
            for key, value in vars(car).items()
            if key != '_sa_instance_state'
        }
        for car in car_price_data
    ]

    car_price_df = pd.DataFrame(data)
    print(car_price_df.head())
except Exception as e:
    logging.error(f"Failed to query car price data: {e}")
    raise


### Feature Engineering and Mutual Information

In [None]:
from datetime import datetime

# Categories seen fewer than this many times are pooled into 'Other' so the
# one-hot encoding below does not create a column for every rare value.
MIN_CATEGORY_COUNT = 10

# Group less frequent categories in 'model' and 'brand'.
# isin/where does the same replacement as a per-row membership test, in one
# vectorised pass; missing values are left as they are.
model_counts = car_price_df['model'].value_counts()
less_frequent_models = model_counts[model_counts < MIN_CATEGORY_COUNT].index
car_price_df['model'] = car_price_df['model'].where(
    ~car_price_df['model'].isin(less_frequent_models), 'Other'
)

brand_counts = car_price_df['brand'].value_counts()
less_frequent_brands = brand_counts[brand_counts < MIN_CATEGORY_COUNT].index
car_price_df['brand'] = car_price_df['brand'].where(
    ~car_price_df['brand'].isin(less_frequent_brands), 'Other'
)

# Create 'car_age' feature.
# NOTE: this is measured against the current calendar year, so the values
# shift when the notebook is re-run in a later year.
car_price_df['car_age'] = datetime.now().year - car_price_df['year']

# One-hot encode categorical variables (first level of each dropped)
categorical_cols = ['fuel_type', 'model', 'brand', 'transmission']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = encoder.fit_transform(car_price_df[categorical_cols])

# Create DataFrame for encoded categorical variables
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Combine encoded categorical and numerical variables.
# 'year' is dropped because 'car_age' is derived directly from it.
# NOTE(review): every other remaining column is treated as a numeric
# feature - confirm that no identifier/key column is left in the frame.
numerical_cols = car_price_df.drop(columns=categorical_cols + ['price', 'year'])
X_encoded = pd.concat(
    [numerical_cols.reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)],
    axis=1,
)

# Target variable
y = car_price_df['price']

# Calculate mutual information.
# With a dense matrix the default discrete_features='auto' treats every
# column as continuous, so the 0/1 dummy columns are flagged explicitly.
# random_state pins the estimator's randomness so the ranking is repeatable.
discrete_feature_mask = X_encoded.columns.isin(encoded_categorical_df.columns)
mi = mutual_info_regression(
    X_encoded,
    y,
    discrete_features=discrete_feature_mask,
    random_state=42,
)

# Create and display mutual information DataFrame, strongest features first
mi_df = pd.DataFrame({'Feature': X_encoded.columns, 'Mutual Information': mi})
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
print(mi_df)

### Variance Inflation Factor (VIF) Calculation

In [None]:
# Check for constant numerical columns (a single distinct value carries no
# information and makes the VIF regressions degenerate)
constant_columns = [col for col in numerical_cols.columns if car_price_df[col].nunique() == 1]
print("Constant columns:", constant_columns)

# Drop constant columns if any
X_encoded_reduced = X_encoded.drop(columns=constant_columns)

# variance_inflation_factor regresses each column on the remaining columns
# exactly as supplied and does not add an intercept itself. This design has
# no constant term (drop='first' dummies plus raw numeric columns), so a
# column of ones is prepended; without it the R^2 values are uncentered and
# the VIFs are inflated for any feature with a non-zero mean.
feature_matrix = X_encoded_reduced.to_numpy(dtype=float)
vif_design = np.column_stack([np.ones(len(feature_matrix)), feature_matrix])

# Calculate VIF for each feature. Column 0 of vif_design is the intercept,
# so feature k of X_encoded_reduced sits at column k + 1.
vif_data = pd.DataFrame({
    "Feature": X_encoded_reduced.columns,
    "VIF": [
        variance_inflation_factor(vif_design, column_idx)
        for column_idx in range(1, vif_design.shape[1])
    ],
})

print(vif_data)

### Ridge Regression Model

In [None]:
# Split data into training and testing sets BEFORE scaling, so the test rows
# cannot influence the scaler's mean/variance (fitting the scaler on the
# full dataset leaks test-set statistics into training). The same
# random_state on the same number of rows gives the same row partition as
# splitting the scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded_reduced, y, test_size=0.2, random_state=42
)

# Standardize the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix transformed with the training-set scaler; kept under
# this name in case a later cell refers to X_scaled.
X_scaled = scaler.transform(X_encoded_reduced)

# Define a range of alpha values for Ridge Regression
alphas = np.logspace(-4, 4, 50)  # Test alpha values from 10^-4 to 10^4

# Use RidgeCV to find the best alpha (cross-validated on the training set)
ridge_cv = RidgeCV(alphas=alphas, store_cv_results=True)
ridge_cv.fit(X_train, y_train)

# Best alpha value
print(f"Best alpha: {ridge_cv.alpha_}")

# Train Ridge Regression model with the best alpha
ridge_model = Ridge(alpha=ridge_cv.alpha_)
ridge_model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = ridge_model.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")