## Set up

### Imports

In [1]:
import logging
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(".."))

from scripts.database import get_session, load_data_to_db
from scripts.data_cleaning import clean_data
from scripts.utils import load_config, setup_logging
from models import CarPriceDataset


### Config

In [2]:
try:
    config = load_config('../config/config.yaml')
    setup_logging(config['paths']['log_path'])
    logging.info("Starting the data analysis project.")
except Exception as e:
    logging.error(f"Failed to load config or setup logging: {e}")
    raise

INFO:root:Starting the data analysis project.
Starting the data analysis project.


### Create session

In [3]:
try:
    session = get_session()
    logging.info("Database session created successfully.")
except Exception as e:
    logging.error(f"Failed to create database session: {e}")
    raise

INFO:root:Database session created successfully.
Database session created successfully.


### Query the car price data set

In [4]:
try:
    car_price_data = session.query(CarPriceDataset).all()
    data = [car.__dict__ for car in car_price_data]
    for item in data:
        item.pop('_sa_instance_state', None)
    
    car_price_df = pd.DataFrame(data)
    print(car_price_df.head())
except Exception as e:
    logging.error(f"Failed to query car price data: {e}")
    raise


  fuel_type   model  year  mileage  owner_count       brand  engine_size  \
0    Diesel     Rio  2020   289944            5         Kia          4.2   
1    Hybrid  Malibu  2012     5356            3   Chevrolet          2.0   
2    Diesel     GLA  2020   231440            2    Mercedes          4.2   
3  Electric      Q5  2023   160971            1        Audi          2.0   
4    Hybrid    Golf  2003   286618            3  Volkswagen          2.6   

     transmission  doors  price  
0          Manual      3   8501  
1       Automatic      2  12092  
2       Automatic      4  11171  
3          Manual      2  11780  
4  Semi-Automatic      3   2867  


In [10]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical variables
categorical_cols = ['fuel_type', 'model', 'brand', 'transmission']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = encoder.fit_transform(car_price_df[categorical_cols])

# Create a DataFrame with the encoded categorical variables
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))

# Combine the encoded categorical variables with the numerical variables
numerical_cols = car_price_df.drop(columns=categorical_cols + ['price'])
X_encoded = pd.concat([numerical_cols.reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)], axis=1)

# Assuming 'price' is the target variable in car_price_df
y = car_price_df['price']

# Calculate mutual information
mi = mutual_info_regression(X_encoded, y)

# Create a DataFrame for better visualization
mi_df = pd.DataFrame({'Feature': X_encoded.columns, 'Mutual Information': mi})
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
print(mi_df)

                        Feature  Mutual Information
0                          year        9.152854e-02
3                   engine_size        8.357429e-02
4                         doors        6.789114e-02
1                       mileage        6.081609e-02
5            fuel_type_Electric        5.695204e-02
45             brand_Volkswagen        3.341848e-02
6              fuel_type_Hybrid        3.323307e-02
24                   model_Golf        2.222222e-02
37                    brand_BMW        2.107280e-02
38              brand_Chevrolet        1.432677e-02
28                 model_Passat        1.111111e-02
10                     model_A4        1.111111e-02
12                model_C-Class        1.111111e-02
36                     model_X5        1.111111e-02
18                model_Elantra        1.111111e-02
19                model_Equinox        1.111111e-02
29                     model_Q5        1.111111e-02
9                      model_A3        1.111111e-02
14          

In [16]:
new_dataset = car_price_df[['mileage', 'doors', 'engine_size', 'fuel_type']]
print(new_dataset.head())

   mileage  doors  engine_size fuel_type
0   289944      3          4.2    Diesel
1     5356      2          2.0    Hybrid
2   231440      4          4.2    Diesel
3   160971      2          2.0  Electric
4   286618      3          2.6    Hybrid


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One-hot encode the 'fuel_type' column in the new dataset
encoded_fuel_type = encoder.fit_transform(new_dataset[['fuel_type']])

# Create a DataFrame with the encoded 'fuel_type' column
encoded_fuel_type_df = pd.DataFrame(encoded_fuel_type, columns=encoder.get_feature_names_out(['fuel_type']))

# Combine numerical and encoded categorical features
X_new = pd.concat([new_dataset.drop(columns=['fuel_type']).reset_index(drop=True), encoded_fuel_type_df.reset_index(drop=True)], axis=1)

# Calculate VIF for each feature
vif_data_new = pd.DataFrame()
vif_data_new["Feature"] = X_new.columns
vif_data_new["VIF"] = [variance_inflation_factor(X_new.values, i) for i in range(len(X_new.columns))]

print(X_new.head())

   mileage  doors  engine_size  fuel_type_Electric  fuel_type_Hybrid  \
0   289944      3          4.2                 0.0               0.0   
1     5356      2          2.0                 0.0               1.0   
2   231440      4          4.2                 0.0               0.0   
3   160971      2          2.0                 1.0               0.0   
4   286618      3          2.6                 0.0               1.0   

   fuel_type_Petrol  
0               0.0  
1               0.0  
2               0.0  
3               0.0  
4               0.0  


In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate mean squared error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 386239.2338095221
R^2 Score: 0.9635099717124912
