In [None]:
import pandas as pd

consumer_data = pd.read_csv('C:/airqual-project/air-quality-Kafka-pipeline/consumed_air_quality_data.csv', parse_dates=['Datetime'])
consumer_data.set_index('Datetime', inplace=True) # Setting as index for convenience

# Time-based features (hour, day, month, year)
consumer_data['Hour'] = consumer_data.index.hour
consumer_data['Day'] = consumer_data.index.day
consumer_data['Month'] = consumer_data.index.month
consumer_data['Year'] = consumer_data.index.year

# Lagged features 
consumer_data['co_lag_1'] = consumer_data['co_gt'].shift(1) # Lagged by 1 hour for recent trends
consumer_data['co_lag_24'] = consumer_data['co_gt'].shift(24) # Lagged by 24 hours (1 day) for daily trends
consumer_data['co_lag_168'] = consumer_data['co_gt'].shift(168) # Lagged by 168 hours (7 days) for weekly trends

consumer_data['nox_lag_1'] = consumer_data['nox_gt'].shift(1)
consumer_data['nox_lag_24'] = consumer_data['nox_gt'].shift(24)
consumer_data['nox_lag_168'] = consumer_data['nox_gt'].shift(168)

consumer_data['benzene_lag_1'] = consumer_data['c6h6_gt'].shift(1)
consumer_data['benzene_lag_24'] = consumer_data['c6h6_gt'].shift(24)
consumer_data['benzene_lag_168'] = consumer_data['c6h6_gt'].shift(168)

cols = ['co_lag_1', 'co_lag_24', 'co_lag_168',
        'nox_lag_1', 'nox_lag_24', 'nox_lag_168',
        'benzene_lag_1', 'benzene_lag_24', 'benzene_lag_168']
consumer_data.dropna(subset=cols, inplace=True)
# print(consumer_data[cols + ['co_gt', 'nox_gt', 'c6h6_gt']].head())

# Rolling features 
cols = ['co_gt', 'nox_gt', 'c6h6_gt']
windows = [3, 24, 168]

for col in cols:
    for window in windows:
        consumer_data[f'{col}_roll_mean_{window}'] = consumer_data[col].rolling(window=window).mean()
        consumer_data[f'{col}_roll_std_{window}'] = consumer_data[col].rolling(window=window).std()
        # print(consumer_data[f'{col}_roll_mean_{window}'].head(5))
        # print(consumer_data[f'{col}_roll_std_{window}'].head(5))

consumer_data.dropna(subset=[f'{col}_roll_mean_{window}', f'{col}_roll_std_{window}'], inplace=True)

# Final features set
feature_cols = [col for col in consumer_data.columns if 'lag' in col or 'roll' in col or 
                col in ['Hour', 'Day', 'Month', 'Year']]

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

# Baseline model
x = consumer_data[feature_cols]
y = consumer_data[['co_gt', 'nox_gt', 'c6h6_gt']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) # Train-test split

lr_model = LinearRegression()
lr_model.fit(x_train, y_train) 
y_pred_lr = lr_model.predict(x_test)

rmse_lr = root_mean_squared_error(y_test, y_pred_lr) 
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print(rmse_lr, mae_lr)

0.5693509803530304 0.3992155771083714


In [None]:
# Random Forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

rf_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(rmse_rf, mae_rf)

# Save model
import joblib
joblib.dump(rf_model, 'C:/airqual-project/phase_3_model_prediction/air_quality_model.pkl')
joblib.dump(feature_cols, 'C:/airqual-project/phase_3_model_prediction/model_features.pkl')

0.3633006064267687 0.22377030728662053


['C:/airqual-project/phase_3_model_prediction/model_features.pkl']

In [34]:
# SARIMA model (co_gt only)
from statsmodels.tsa.statespace.sarimax import SARIMAX

ts = consumer_data['co_gt'].asfreq('H')
sarima_model = SARIMAX(ts, order=(2, 1, 2), seasonal_order=(1, 1, 1, 24))
sarima_model_fit = sarima_model.fit()
y_pred_sarima = sarima_model_fit.forecast(steps=len(x_test))
sarima_rmse = root_mean_squared_error(y_test['co_gt'], y_pred_sarima)
sarima_mae = mean_absolute_error(y_test['co_gt'], y_pred_sarima)
print(sarima_rmse, sarima_mae)

  ts = consumer_data['co_gt'].asfreq('H')


1.2318626089302391 0.9087319026262947
