# In [None]:
## Regression Decision Tree Algorithm for predicting pollutant NO2
## Developer: Akanksha Upadhyay
## Date: 12/02/2025


import pandas as pd
import numpy as np

## Load the data
data = pd.read_csv(r"data.csv")
print(data.head())
print(data.describe())

## Remove rows with missing values
# Show the per-column null counts first so the effect of dropna is visible.
print(data.isnull().sum())
data.dropna(inplace=True)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
data.info()



# Separate the input features (used for the tree's split decisions) from the target (the value predicted at the leaves)
from sklearn.tree import DecisionTreeRegressor, plot_tree
import matplotlib.pyplot as plt

# Predictor columns go into X; the NO2 column is the regression target y
X = data[
    [
        'traffic_count',
        'temp',
        'wind_speed',
        'wind_direction',
        'rel_humidity',
        'air_pressure',
    ]
]
y = data['NO2']



# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



## Measure model performance for different max_depth and max_features settings
# Report the R² score together with MAE and RMSE

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Function to evaluate performance for different max_depth and max_features
def evaluate_decision_tree(X_train, y_train, max_depths, max_features_list,
                           *, X_val=None, y_val=None):
    """Fit one DecisionTreeRegressor per (max_depth, max_features) pair and score it.

    Every combination of ``max_depths`` x ``max_features_list`` is trained on
    ``(X_train, y_train)`` with ``random_state=42`` and scored on that same
    training data (R², MAE, RMSE). Training-set scores alone tend to favour
    deeper trees, so an optional held-out set can be supplied to obtain
    validation metrics next to the training ones.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        max_depths: Iterable of ``max_depth`` values to try (``None`` allowed).
        max_features_list: Iterable of ``max_features`` values to try.
        X_val: Optional held-out feature matrix (keyword-only).
        y_val: Optional held-out target values (keyword-only).

    Returns:
        A DataFrame with one row per combination and the columns
        ``max_depth``, ``max_features``, ``R2``, ``MAE``, ``RMSE``. When a
        validation set is given, ``Val_R2``, ``Val_MAE`` and ``Val_RMSE`` are
        appended.

    Raises:
        ValueError: If only one of ``X_val`` / ``y_val`` is provided.
    """
    # Compare against None explicitly: DataFrames/arrays have no usable truth value.
    if (X_val is None) != (y_val is None):
        raise ValueError("X_val and y_val must be provided together")
    has_validation = X_val is not None

    # Variance of the training target gives a scale for judging the error metrics
    variance_y = np.var(y_train)
    print(f"Variance of NO2 (Training Data): {variance_y:.3f}")

    results = []
    for max_depth in max_depths:
        for max_features in max_features_list:
            # Fixed random_state so the feature sub-sampling is reproducible
            tree_model = DecisionTreeRegressor(max_depth=max_depth, max_features=max_features, random_state=42)
            tree_model.fit(X_train, y_train)

            # Metrics on the data the tree was fitted on
            y_pred_train = tree_model.predict(X_train)
            r2 = r2_score(y_train, y_pred_train)
            mae = mean_absolute_error(y_train, y_pred_train)
            rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))

            row = [max_depth, max_features, r2, mae, rmse]
            message = (
                f"max_depth={max_depth}, max_features={max_features} -> "
                f"Train R²: {r2:.3f}, Train MAE: {mae:.3f}, Train RMSE: {rmse:.3f}"
            )

            if has_validation:
                # Same metrics on data the tree has not seen during fitting
                y_pred_val = tree_model.predict(X_val)
                val_r2 = r2_score(y_val, y_pred_val)
                val_mae = mean_absolute_error(y_val, y_pred_val)
                val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
                row.extend([val_r2, val_mae, val_rmse])
                message += (
                    f", Val R²: {val_r2:.3f}, Val MAE: {val_mae:.3f}, "
                    f"Val RMSE: {val_rmse:.3f}"
                )

            results.append(tuple(row))
            print(message)

    columns = ['max_depth', 'max_features', 'R2', 'MAE', 'RMSE']
    if has_validation:
        columns += ['Val_R2', 'Val_MAE', 'Val_RMSE']

    # Return as a DataFrame for easy handling
    return pd.DataFrame(results, columns=columns)


# Hyper-parameter grid: depth limits (None = no depth limit) and the
# number of features considered at each split
max_depths = [3, 5, 10, None]
max_features_list = [1, 3, 6]

# Fit and score one tree for every combination in the grid
results_df = evaluate_decision_tree(
    X_train,
    y_train,
    max_depths=max_depths,
    max_features_list=max_features_list,
)


# Subplots for R2, MAE, and RMSE
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True)  # 1 row, 3 columns

# Metrics to plot
metrics = ['R2', 'MAE', 'RMSE']
titles = ["R² Score vs max_depth", "MAE vs max_depth", "RMSE vs max_depth"]

# max_depth=None ends up as a missing value in results_df, and matplotlib
# does not draw points whose x value is missing. Plot on evenly spaced
# categorical positions labelled with the depth so the "None" (unlimited
# depth) results are shown as well.
depth_labels = [
    "None" if pd.isna(depth) else str(int(depth))
    for depth in results_df['max_depth']
]
tick_labels = list(dict.fromkeys(depth_labels))  # unique labels, first-seen order
label_to_pos = {label: pos for pos, label in enumerate(tick_labels)}
x_positions = pd.Series(
    [label_to_pos[label] for label in depth_labels],
    index=results_df.index,
)

# Plot each metric in a separate subplot, one line per max_features value
for i, metric in enumerate(metrics):
    ax = axes[i]
    for max_features in results_df['max_features'].unique():
        mask = results_df['max_features'] == max_features
        subset = results_df[mask]
        ax.plot(x_positions[mask], subset[metric], marker="o", label=f'max_features={max_features}')

    ax.set_title(titles[i])
    ax.set_xlabel("max_depth")
    ax.set_ylabel(metric)
    ax.set_xticks(range(len(tick_labels)))
    ax.set_xticklabels(tick_labels)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()



# Final model: the chosen combination max_depth=10, max_features=6.
# criterion='squared_error' is the scikit-learn default.
# min_samples_split=7 and min_samples_leaf=5 are set above the library
# defaults (2 and 1) -- presumably to limit overfitting; confirm intent.
# Other criteria such as 'friedman_mse', 'poisson' and 'absolute_error'
# (called 'mae' in older scikit-learn releases) can be used depending on
# the target variable.
tree_model = DecisionTreeRegressor(
    max_depth=10,
    max_features=6,
    criterion='squared_error',
    min_samples_split=7,
    min_samples_leaf=5,
    random_state=42,
)
tree_model.fit(X_train, y_train)  # Train the tree

# Plot the decision tree
plt.figure(figsize=(15, 8))
# Pass a plain list: some scikit-learn releases reject a pandas Index for
# feature_names during parameter validation.
plot_tree(tree_model, feature_names=list(X.columns), filled=True, rounded=True, fontsize=10)
plt.title("Decision Tree for Predicting NO2")
plt.show()

# Predict on the test data
y_pred_test = tree_model.predict(X_test)
print(y_pred_test)



# Score the held-out test predictions: RMSE, MAE and R²
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

# One metric per output line, three decimals each
print(
    f"Test R² Score: {r2_test:.3f}",
    f"Test MAE: {mae_test:.3f}",
    f"Test RMSE: {rmse_test:.3f}",
    sep="\n",
)


# Overlay the observed and the predicted NO2 values of the test samples
cmp_fig, cmp_ax = plt.subplots(figsize=(10, 6))
cmp_ax.plot(y_test.values, label='Actual Values', color='blue', linewidth=2)
cmp_ax.plot(y_pred_test, label='Predicted Values', color='orange', linestyle='--', linewidth=2)
cmp_ax.set_xlabel('Index')
cmp_ax.set_ylabel('NO2 Values')
cmp_ax.set_title('Actual vs Predicted NO2 Values')
cmp_ax.legend()
cmp_ax.grid(True)
plt.show()




## Feature importances reported by the fitted decision tree

feature_importances = tree_model.feature_importances_

# Text summary: one "name: score" line per feature column
for idx, name in enumerate(X.columns):
    print(f"{name}: {feature_importances[idx]:.3f}")

import matplotlib.pyplot as plt

# Horizontal bar chart of the same scores
imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
imp_ax.barh(X.columns, feature_importances, color="green")
imp_ax.set_xlabel("Feature Importance")
imp_ax.set_ylabel("Feature")
imp_ax.set_title("Feature Importance for Decision Tree")
plt.show()