Load packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV

Load the data

In [None]:
# Read the raw air-quality measurements from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer='air_quality_data.csv')

Handle missing values if any

In [None]:
# Forward-fill gaps: every missing cell takes the last valid value above it
# in the same column.
# `fillna(method='ffill')` is deprecated since pandas 2.1; `ffill()` is the
# supported replacement and yields the same result.
# NOTE(review): cells that are missing before the first valid observation of a
# column have nothing to copy from and are still NaN after this step.
data = data.ffill()

Convert date and time to datetime object

In [None]:
# Combine the separate 'Date' and 'Time' text columns into one timestamp,
# make it the row index, and discard the two source columns.
data = (
    data
    .assign(DateTime=pd.to_datetime(data['Date'] + ' ' + data['Time']))
    .set_index('DateTime')
    .drop(columns=['Date', 'Time'])
)

Extract features and target variables

In [None]:
# Assume we're predicting PM2.5: that column is the target,
# every remaining column is a model input.
target = data['PM2.5']
features = data.drop(columns=['PM2.5'])

Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows - confirm that is intended for this
# DateTime-indexed data.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

Normalize the data

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so the test set never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Initialize the model

In [None]:
# Baseline forest: 100 trees, seeded so repeated runs build the same model.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

Train the model

In [None]:
# Fit the baseline forest on the scaled training features.
rf_model.fit(X=X_train, y=y_train)

Make predictions

In [None]:
# Predict PM2.5 for the held-out rows with the baseline forest.
y_pred = rf_model.predict(X=X_test)

Evaluate the model

In [None]:
# Score the baseline predictions against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f'Mean Squared Error: {mse}',
    f'R-squared: {r2}',
    sep='\n',
)

Define categorization function

In [None]:
def categorize_pm25(pm25_value):
    """Map a single PM2.5 reading to an air-quality category label.

    Parameters
    ----------
    pm25_value : float or None
        One PM2.5 value (a scalar, not an array).
        NOTE(review): the cut-offs look like US EPA AQI breakpoints in ug/m3 -
        confirm the unit of the 'PM2.5' column and which revision of the
        breakpoints is wanted.

    Returns
    -------
    str
        'Good' (<= 12.0), 'Moderate' (<= 35.4),
        'Unhealthy for Sensitive Groups' (<= 55.4), 'Unhealthy' (<= 150.4),
        'Very Unhealthy' (<= 250.4), 'Hazardous' (anything larger), or
        'Unknown' when the value is missing (None / NaN).
    """
    # A missing value fails every `<=` comparison, so without this guard NaN
    # would fall through to the final branch and be reported as 'Hazardous'
    # (and None would raise TypeError).
    if pd.isna(pm25_value):
        return 'Unknown'

    # (inclusive upper bound, label) pairs in ascending order; the first
    # bound that the reading does not exceed decides the category.
    breakpoints = (
        (12.0, 'Good'),
        (35.4, 'Moderate'),
        (55.4, 'Unhealthy for Sensitive Groups'),
        (150.4, 'Unhealthy'),
        (250.4, 'Very Unhealthy'),
    )
    for upper_bound, label in breakpoints:
        if pm25_value <= upper_bound:
            return label

    # Above the highest bound.
    return 'Hazardous'

Apply categorization to predictions

In [None]:
# Label every baseline prediction with its air-quality category.
predicted_levels = list(map(categorize_pm25, y_pred))

Add predictions and levels to a DataFrame for better visualization

In [None]:
# Put each predicted value next to its category label so the two can be
# read side by side, then show the first rows.
results = pd.DataFrame.from_dict(
    {'Predicted PM2.5': y_pred, 'Air Quality Level': predicted_levels}
)

print(results.head(5))

Get feature importance

In [None]:
# Read the per-feature importance scores from the baseline forest `rf_model`.
importance = rf_model.feature_importances_

Plot feature importance

In [None]:
# Column labels of the model inputs ('PM2.5' is the target, so it is excluded).
# A separate name is used on purpose: rebinding `features` here would replace
# the input DataFrame with a column Index, and re-running the split cell
# afterwards would then receive labels instead of data.
feature_names = data.columns.drop('PM2.5')

# Positions of the features sorted from most to least important.
indices = np.argsort(importance)[::-1]
bar_positions = range(len(indices))

plt.figure(figsize=(12, 6))
plt.title("Feature Importance")
plt.bar(bar_positions, importance[indices], align="center")
plt.xticks(bar_positions, [feature_names[i] for i in indices], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()

Define parameter grid

In [None]:
# Candidate values for each random-forest hyper-parameter; the grid search
# tries every combination (3 * 3 * 3 * 3 = 81 settings).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

Initialize GridSearchCV

In [None]:
# Search over `param_grid` with 3-fold cross-validation, using `rf_model`
# as the template estimator. n_jobs=-1 requests parallel execution and
# verbose=2 prints progress while the search runs.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

Fit GridSearchCV

In [None]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X=X_train, y=y_train)

Best parameters and best score

In [None]:
# Pull the winning hyper-parameter combination and its cross-validation
# score out of the finished search, then report both.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(
    f'Best Parameters: {best_params}',
    f'Best CV Score: {best_score}',
    sep='\n',
)

Use best parameters to predict

In [None]:
# Take the best estimator found by the search and predict the held-out rows.
# Note that `y_pred` now holds the tuned model's predictions and no longer
# the baseline ones.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X=X_test)

Evaluate the model with best parameters

In [None]:
# Score the tuned model's predictions against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f'Mean Squared Error with best parameters: {mse}',
    f'R-squared with best parameters: {r2}',
    sep='\n',
)

Apply categorization to predictions with best parameters

In [None]:
# Label every tuned-model prediction with its air-quality category.
predicted_levels = list(map(categorize_pm25, y_pred))

Add predictions and levels to a DataFrame for better visualization

In [None]:
# Put each tuned-model prediction next to its category label, then show the
# first rows.
results = pd.DataFrame.from_dict(
    {'Predicted PM2.5': y_pred, 'Air Quality Level': predicted_levels}
)

print(results.head(5))