# Comparing Vehicle Emissions Models: 2014 vs 2025

This notebook compares how vehicle characteristics relate to CO2 emissions in the 2014 and 2025 datasets.

In [None]:
import os
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_2014_data, load_2025_data
from src.models import train_basic_models, train_advanced_models
from src.visualization import plot_regression_models, plot_correlation_matrix

%matplotlib inline

## Load Both Datasets

In [None]:
# Read the two model-year datasets through the project loaders.
df_2014 = load_2014_data()
df_2025 = load_2025_data()

# Report the shape of each dataset so size differences are visible up front.
for heading, frame in (
    ("2014 Dataset Shape:", df_2014),
    ("2025 Dataset Shape:", df_2025),
):
    print(heading, frame.shape)

## Compare Basic Statistics

In [None]:
# Print describe() for each dataset, 2014 first. The second heading carries a
# leading newline so the two tables are separated by a blank line.
for heading, frame in (
    ("2014 Dataset Statistics:", df_2014),
    ("\n2025 Dataset Statistics:", df_2025),
):
    print(heading, frame.describe(), sep="\n")

## Train Models on Both Datasets

In [None]:
# Fit the basic models once per dataset (2014 first, then 2025). The comparison
# cells below read results_*['metrics'][<model>]['r2'] from these objects.
results_2014, results_2025 = (
    train_basic_models(frame) for frame in (df_2014, df_2025)
)

## Compare Model Performance

In [None]:
# Build one row per basic model: R² in each year and the 2014 -> 2025 change.
model_names = ['engine', 'fuel', 'cylinders', 'multiple']

metrics_2014 = results_2014['metrics']
metrics_2025 = results_2025['metrics']

comparison_data = []
for model in model_names:
    r2_2014 = metrics_2014[model]['r2']
    r2_2025 = metrics_2025[model]['r2']
    row = {
        'Model': model.capitalize(),
        '2014 R²': r2_2014,
        '2025 R²': r2_2025,
        # Positive values mean the model fits the 2025 data better.
        'Change in R²': r2_2025 - r2_2014,
    }
    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)
comparison_df

In [None]:
# Grouped bar chart: for each basic model, the 2014 R² bar sits next to the
# 2025 R² bar so the change in fit is visible at a glance.
bar_width = 0.35
index = np.arange(len(model_names))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(index, comparison_df['2014 R²'], bar_width, label='2014')
# Shift the 2025 bars right by one bar width so each pair is adjacent.
ax.bar(index + bar_width, comparison_df['2025 R²'], bar_width, label='2025')

ax.set_xlabel('Model Type')
ax.set_ylabel('R² Score')
ax.set_title('Model Performance Comparison: 2014 vs 2025')
# Centre each tick label under its pair of bars.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels([name.capitalize() for name in model_names])
ax.legend()
fig.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError here.
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/model_comparison_2014_vs_2025.png')
plt.show()

## Compare Correlation Matrices

In [None]:
# Side-by-side correlation heatmaps: 2014 on the left, 2025 on the right.
fig, (ax_2014, ax_2025) = plt.subplots(1, 2, figsize=(12, 5))

# Both panels share a fixed [-1, 1] colour scale. Without it each heatmap is
# scaled to its own min/max, so the same colour can mean different correlation
# values in the two panels and the visual comparison is misleading.
heatmap_kwargs = dict(annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)

# numeric_only=True: since pandas 2.0, DataFrame.corr() raises if the frame
# contains non-numeric columns instead of silently dropping them. Whether the
# loaders return such columns is not visible here, so restrict explicitly.
sns.heatmap(df_2014.corr(numeric_only=True), ax=ax_2014, **heatmap_kwargs)
ax_2014.set_title('2014 Correlation Matrix')

sns.heatmap(df_2025.corr(numeric_only=True), ax=ax_2025, **heatmap_kwargs)
ax_2025.set_title('2025 Correlation Matrix')

fig.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError here.
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/correlation_comparison.png')
plt.show()

## Train Advanced Models on Both Datasets

In [None]:
# Fit the advanced models on each dataset.
advanced_2014 = train_advanced_models(df_2014)
advanced_2025 = train_advanced_models(df_2025)

# Pull the Random Forest feature-importance mappings out of each result.
rf_2014 = advanced_2014['random_forest']['feature_importance']
rf_2025 = advanced_2025['random_forest']['feature_importance']

# The 2014 mapping defines the feature list and its order; the 2025 values are
# looked up by the same names (assumes both mappings share keys — TODO confirm).
features = list(rf_2014.keys())
importance_2014 = []
importance_2025 = []
for feature_name in features:
    importance_2014.append(rf_2014[feature_name])
    importance_2025.append(rf_2025[feature_name])

# One row per feature, one importance column per year.
importance_columns = {
    'Feature': features,
    '2014 Importance': importance_2014,
    '2025 Importance': importance_2025,
}
feature_importance_df = pd.DataFrame(importance_columns)

feature_importance_df

In [None]:
# Grouped bar chart: for each feature, the 2014 Random Forest importance sits
# next to the 2025 importance.
bar_width = 0.35
index = np.arange(len(features))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(index, feature_importance_df['2014 Importance'], bar_width, label='2014')
# Shift the 2025 bars right by one bar width so each pair is adjacent.
ax.bar(index + bar_width, feature_importance_df['2025 Importance'], bar_width, label='2025')

ax.set_xlabel('Feature')
ax.set_ylabel('Importance Score')
ax.set_title('Random Forest Feature Importance: 2014 vs 2025')
# Centre each tick label under its pair of bars.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(features)
ax.legend()
fig.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError here.
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/feature_importance_comparison.png')
plt.show()

## Key Findings

1. In 2014, all vehicle characteristics (engine size, cylinders, fuel consumption) were meaningful predictors of CO2 emissions

2. By 2025, fuel consumption has become the dominant predictor, with engine size and cylinders showing much weaker relationships

3. This suggests a technological evolution in the automotive industry in which emissions have been decoupled from the engine's physical characteristics

4. In 2025, fuel consumption explains almost all of the variance in CO2 emissions (R² = 0.95), which suggests a near-deterministic relationship, possibly due to standardization of emissions measurement or calculation methods

5. These findings have important implications for emissions policy and vehicle design, suggesting that focusing on fuel efficiency rather than engine size is more effective for reducing emissions