### Imports

In [None]:
import logging
import sys
import os
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Add the directory two levels above the current working directory ("../..")
# to the import path so the project-local imports below can be resolved.
# Note: for a notebook located directly in /notebooks, use
# sys.path.append(os.path.abspath("..")) instead.
sys.path.append(os.path.abspath("../.."))

from scripts import get_session, load_config, setup_logging
from orm_models import CarPriceDataset


### Config and Logging Setup

In [None]:
# Load the project configuration and initialise logging before any other cell
# runs; the error is re-raised so the notebook stops if either step fails.
try:
    config = load_config()
    setup_logging(config['paths']['log_path'])
    logging.info("Starting the data analysis project.")
except Exception as e:
    # logging.exception logs at ERROR level and appends the traceback, so the
    # root cause is preserved in the log output and not only in the notebook.
    # NOTE(review): if the failure happens before setup_logging() has run, the
    # record is emitted through whatever root-logger configuration is active.
    logging.exception("Failed to load config or setup logging: %s", e)
    raise

### Database Session

In [None]:
# Open the database session used by the query cells below; the error is
# re-raised so later cells never run without a session.
try:
    session = get_session()
    logging.info("Database session created successfully.")
except Exception as e:
    # logging.exception keeps the traceback in the log next to the message.
    logging.exception("Failed to create database session: %s", e)
    raise

### Query Car Price Data

In [None]:
# Fetch every CarPriceDataset row and flatten the ORM objects into a DataFrame
# with one column per attribute stored on the instances.
try:
    car_price_data = session.query(CarPriceDataset).all()
    # Build fresh dicts rather than popping from each instance's own __dict__:
    # removing '_sa_instance_state' in place would strip SQLAlchemy's
    # bookkeeping from the live objects still referenced by car_price_data.
    data = [
        {key: value for key, value in vars(car).items() if key != '_sa_instance_state'}
        for car in car_price_data
    ]

    car_price_df = pd.DataFrame(data)
    print(car_price_df.head())
except Exception as e:
    # logging.exception keeps the traceback in the log next to the message.
    logging.exception("Failed to query car price data: %s", e)
    raise


### Feature Engineering and Mutual Information

In [None]:
# One-hot encode the categorical columns; drop='first' omits one level per
# column so each group of dummy columns is not redundant.
categorical_cols = ['fuel_type', 'model', 'brand', 'transmission']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = encoder.fit_transform(car_price_df[categorical_cols])

# Wrap the encoded matrix in a DataFrame labelled with the generated names
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Every column that is neither categorical nor the target is used unchanged.
# NOTE(review): this assumes all remaining columns are numeric features (no
# identifier or text columns) - confirm against the CarPriceDataset schema.
numerical_cols = car_price_df.drop(columns=categorical_cols + ['price'])
X_encoded = pd.concat(
    [numerical_cols.reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)],
    axis=1,
)

# Target variable
y = car_price_df['price']

# The one-hot columns are discrete; with the default discrete_features='auto'
# a dense matrix is treated as fully continuous, which distorts their scores.
discrete_mask = X_encoded.columns.isin(encoded_categorical_df.columns)

# The estimator is randomised, so pin the seed (the same value the train/test
# split uses) to make the ranking reproducible between runs.
mi = mutual_info_regression(
    X_encoded,
    y,
    discrete_features=discrete_mask,
    random_state=42,
)

# Rank the features from most to least informative about the price
mi_df = pd.DataFrame({'Feature': X_encoded.columns, 'Mutual Information': mi})
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
print(mi_df)

### Create New Dataset

In [None]:
# Keep only the feature subset that the VIF analysis below works with
new_dataset = car_price_df[
    ['mileage', 'doors', 'engine_size', 'fuel_type']
]
print(new_dataset.head(n=5))

### Variance Inflation Factor (VIF) Calculation

In [None]:
# One-hot encode 'fuel_type' in the new dataset
encoded_fuel_type = encoder.fit_transform(new_dataset[['fuel_type']])
encoded_fuel_type_df = pd.DataFrame(encoded_fuel_type, columns=encoder.get_feature_names_out(['fuel_type']))

# Combine numerical and encoded categorical features
X_new = pd.concat([new_dataset.drop(columns=['fuel_type'])\
    .reset_index(drop=True), encoded_fuel_type_df\
    .reset_index(drop=True)], axis=1)

# Calculate VIF for each feature
vif_data_new = pd.DataFrame()
vif_data_new["Feature"] = X_new.columns
vif_data_new["VIF"] = [variance_inflation_factor(X_new.values, i) for i in range(len(X_new.columns))]

print(vif_data_new)

### Linear Regression Model

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")