# Initial Data Exploration: Vehicle CO2 Emissions

This notebook explores the relationship between vehicle characteristics and CO2 emissions using linear regression models.

In [None]:
import sys
from pathlib import Path

sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_2025_data
from src.models import train_basic_models
from src.visualization import plot_regression_models, plot_correlation_matrix

%matplotlib inline

## Load and Explore the Dataset

In [None]:
# Load the 2025 dataset
df = load_2025_data()

# Fail fast with a readable message if a column that later cells plot is absent,
# instead of hitting a bare KeyError in the middle of a plotting cell.
required_columns = {'ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB', 'CO2EMISSIONS'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Dataset is missing expected columns: {sorted(missing_columns)}"

df.head()

In [None]:
# Summary statistics as produced by describe() with its default settings
summary_stats = df.describe()
summary_stats

In [None]:
# Count the missing entries in every column
missing_mask = df.isnull()
missing_mask.sum()

In [None]:
# Distribution of features: one histogram with a KDE overlay per column, on a 2x2 grid.
# Each entry pairs the column to plot with the title of its panel.
panels = [
    ('ENGINESIZE', 'Engine Size Distribution'),
    ('CYLINDERS', 'Cylinders Distribution'),
    ('FUELCONSUMPTION_COMB', 'Fuel Consumption Distribution'),
    ('CO2EMISSIONS', 'CO2 Emissions Distribution'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# axes.flat walks the grid row by row, so the panels fill left-to-right, top-to-bottom
# in the same order as the list above.
for ax, (column, title) in zip(axes.flat, panels):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Plot correlation matrix
correlation_fig = plot_correlation_matrix(df)
plt.show()

# Save the correlation matrix.
# savefig does not create missing folders, so make sure the output directory exists
# first; otherwise a fresh checkout fails here with FileNotFoundError.
correlation_path = Path('../visualizations/correlation_matrix_2025.png')
correlation_path.parent.mkdir(parents=True, exist_ok=True)
correlation_fig.savefig(correlation_path)

## Train Basic Regression Models

In [None]:
# Fit the basic models on the loaded data
results = train_basic_models(df)

# Report R², RMSE and MAE for every entry under results['metrics'],
# one block per model followed by a blank separator line
metrics_by_model = results['metrics']
for model_name in metrics_by_model:
    metrics = metrics_by_model[model_name]
    print(f"{model_name.capitalize()} Model:")
    print(f"R² Score: {metrics['r2']:.4f}")
    print(f"RMSE: {metrics['rmse']:.4f}")
    print(f"MAE: {metrics['mae']:.4f}")
    print("")

In [None]:
# Plot regression models using the splits and models returned by train_basic_models
fig = plot_regression_models(
    results['X_train'],
    results['X_test'],
    results['y_train'],
    results['y_test'],
    results['models']
)
plt.show()

# Save the figure.
# savefig does not create missing folders, so make sure the output directory exists
# first; otherwise a fresh checkout fails here with FileNotFoundError.
regression_path = Path('../visualizations/regression_models_2025.png')
regression_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(regression_path)

## Key Observations

1. Fuel consumption has the strongest correlation with CO2 emissions
2. Engine size and cylinder count show a moderate correlation with emissions
3. The multiple regression model using both engine size and fuel consumption performs similarly to the fuel consumption model alone
4. This suggests that once fuel consumption is known, engine size provides little additional predictive power