In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [29]:
data = pd.read_csv('AirQualityUCI.csv')
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,3/10/2004,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,3/10/2004,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,
2,3/10/2004,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,3/10/2004,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,3/10/2004,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,


In [30]:
# Rename the columns to be more descriptive
data.rename(columns={
    'Date': 'Measurement_Date',
    'Time': 'Measurement_Time',
    'CO(GT)': 'Carbon_Monoxide_Concentration',
    'PT08.S1(CO)': 'CO_Sensor_Measurement',
    'NMHC(GT)': 'Non_Methane_Hydrocarbons_Concentration',
    'C6H6(GT)': 'Benzene_Concentration',
    'PT08.S2(NMHC)': 'NMHC_Sensor_Measurement',
    'NOx(GT)': 'Nitrogen_Oxides_Concentration',
    'PT08.S3(NOx)': 'NOx_Sensor_Measurement',
    'NO2(GT)': 'Nitrogen_Dioxide_Concentration',
    'T': 'Temperature_Celsius',
    'RH': 'Relative_Humidity',
    'AH': 'Absolute_Humidity'
}, inplace=True)

In [31]:
# Visualize the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Measurement_Date                        9357 non-null   object 
 1   Measurement_Time                        9357 non-null   object 
 2   Carbon_Monoxide_Concentration           9357 non-null   float64
 3   CO_Sensor_Measurement                   9357 non-null   float64
 4   Non_Methane_Hydrocarbons_Concentration  9357 non-null   float64
 5   Benzene_Concentration                   9357 non-null   float64
 6   NMHC_Sensor_Measurement                 9357 non-null   float64
 7   Nitrogen_Oxides_Concentration           9357 non-null   float64
 8   NOx_Sensor_Measurement                  9357 non-null   float64
 9   Nitrogen_Dioxide_Concentration          9357 non-null   float64
 10  PT08.S4(NO2)                            9357 non-null   floa

In [32]:
# Remove weird last two columns
data.drop(data.columns[15:17], axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 15 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Measurement_Date                        9357 non-null   object 
 1   Measurement_Time                        9357 non-null   object 
 2   Carbon_Monoxide_Concentration           9357 non-null   float64
 3   CO_Sensor_Measurement                   9357 non-null   float64
 4   Non_Methane_Hydrocarbons_Concentration  9357 non-null   float64
 5   Benzene_Concentration                   9357 non-null   float64
 6   NMHC_Sensor_Measurement                 9357 non-null   float64
 7   Nitrogen_Oxides_Concentration           9357 non-null   float64
 8   NOx_Sensor_Measurement                  9357 non-null   float64
 9   Nitrogen_Dioxide_Concentration          9357 non-null   float64
 10  PT08.S4(NO2)                            9357 non-null   floa

In [33]:
# List columns with missing data
missing_data = data.isna().any()
columns_with_missing_data = missing_data[missing_data].index.tolist()
print(columns_with_missing_data)

['Measurement_Date', 'Measurement_Time', 'Carbon_Monoxide_Concentration', 'CO_Sensor_Measurement', 'Non_Methane_Hydrocarbons_Concentration', 'Benzene_Concentration', 'NMHC_Sensor_Measurement', 'Nitrogen_Oxides_Concentration', 'NOx_Sensor_Measurement', 'Nitrogen_Dioxide_Concentration', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'Temperature_Celsius', 'Relative_Humidity', 'Absolute_Humidity']


In [34]:
# After visual inspection, it looks like there are only 9358 entries in the csv file. Not sure why data.info() is registering 9471 
# data entries? Because of this, I will only "keep" the 9358 rows.
data = data[:9357]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Measurement_Date                        9357 non-null   object 
 1   Measurement_Time                        9357 non-null   object 
 2   Carbon_Monoxide_Concentration           9357 non-null   float64
 3   CO_Sensor_Measurement                   9357 non-null   float64
 4   Non_Methane_Hydrocarbons_Concentration  9357 non-null   float64
 5   Benzene_Concentration                   9357 non-null   float64
 6   NMHC_Sensor_Measurement                 9357 non-null   float64
 7   Nitrogen_Oxides_Concentration           9357 non-null   float64
 8   NOx_Sensor_Measurement                  9357 non-null   float64
 9   Nitrogen_Dioxide_Concentration          9357 non-null   float64
 10  PT08.S4(NO2)                            9357 non-null   floa

In [35]:
# Calculate the correlation matrix with the numerical data
numerical_data = data.iloc[:, 2:14]
correlation_matrix = numerical_data.corr()
print(correlation_matrix)

                                        Carbon_Monoxide_Concentration  \
Carbon_Monoxide_Concentration                                1.000000   
CO_Sensor_Measurement                                        0.041411   
Non_Methane_Hydrocarbons_Concentration                       0.128351   
Benzene_Concentration                                       -0.031378   
NMHC_Sensor_Measurement                                      0.029926   
Nitrogen_Oxides_Concentration                                0.526451   
NOx_Sensor_Measurement                                      -0.089981   
Nitrogen_Dioxide_Concentration                               0.671127   
PT08.S4(NO2)                                                -0.073724   
PT08.S5(O3)                                                  0.080310   
Temperature_Celsius                                         -0.068939   
Relative_Humidity                                           -0.048227   

                                        CO_Sensor_

In [36]:
# The correlation matrix has some interesting highly correlated variables (carbon monoxide sensor measurement and benzene concentration) 
# and lowly correlated variables (carbon monoxide concentration and CO sensor measurement). Will keep that in mind later on.

In [45]:
#Split the dataset into a train and test set using Nitrogen_Dioxide_Concentration as the target label
train_set, test_set = train_test_split(numerical_data, test_size=0.2, random_state=42)

#Begin preparation of data for ML algorithms with fresh copy of training data
train_copy = train_set.copy()

#Separate the predictors and the label
train_data = train_copy.drop('Nitrogen_Dioxide_Concentration', axis=1)
train_labels = train_copy['Nitrogen_Dioxide_Concentration'].copy()

In [54]:
#Create pipelines for numerical attributes
#Imputer with median strategy for any missing values and then a standard scaler is used on the numerical data
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# Select columns with numerical attributes
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
)

# Create the complete pipeline
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),  # Step for preprocessing
    ('randomforestregressor', RandomForestRegressor(random_state=42))  # RandomForestRegressor step
])

In [56]:
#Create pipeline with preprocessing and a RandomForestClassifier with 10 cross validation folds
forest_regressor = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))

# Define the hyperparameter grid to search
param_grid = {
    'randomforestregressor__n_estimators': [50, 100, 150, 200, 300],
    'randomforestregressor__max_depth': [5, 10, 20, 30],
    'randomforestregressor__min_samples_split': list(range(2, 21)),  # Min samples to split a node
    'randomforestregressor__min_samples_leaf': list(range(1, 11)),  # Min samples at a leaf node
}
# Set up the randomized search with 10 cross-validation folds, 100 combinations to try, and the negative mean squared error.
random_search = RandomizedSearchCV(
    estimator=forest_regressor,
    param_distributions=param_grid,
    n_iter=100,
    scoring='neg_mean_squared_error',  # Use appropriate regression metric
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit on the training data
random_search.fit(train_data, train_labels)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits




[CV] END randomforestregressor__max_depth=10, randomforestregressor__min_samples_leaf=1, randomforestregressor__min_samples_split=12, randomforestregressor__n_estimators=200; total time=   5.5s
[CV] END randomforestregressor__max_depth=5, randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=18, randomforestregressor__n_estimators=100; total time=   0.9s
[CV] END randomforestregressor__max_depth=5, randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=18, randomforestregressor__n_estimators=100; total time=   0.9s
[CV] END randomforestregressor__max_depth=5, randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=18, randomforestregressor__n_estimators=100; total time=   1.7s
[CV] END randomforestregressor__max_depth=30, randomforestregressor__min_samples_leaf=3, randomforestregressor__min_samples_split=12, randomforestregressor__n_estimators=100; total time=   2.6s
[CV] END randomforestregressor__m

  _data = np.array(data, dtype=dtype, copy=copy,


In [57]:
#Evaluate the model on the test set
test_data = test_set.drop("Nitrogen_Dioxide_Concentration", axis=1)
test_labels = test_set["Nitrogen_Dioxide_Concentration"].copy()

final_predictions = random_search.best_estimator_.predict(test_data)

In [63]:
# Calculate metrics
mse = mean_squared_error(test_labels, final_predictions)
rmse = root_mean_squared_error(test_labels, final_predictions)  # RMSE
r2 = r2_score(test_labels, final_predictions)

# Print results
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Mean Squared Error: 261.94176464180157
Root Mean Squared Error: 16.184615060043953
R² Score: 0.9830091101853512


In [65]:
numerical_data["Nitrogen_Dioxide_Concentration"].max() - numerical_data["Nitrogen_Dioxide_Concentration"].min()

540.0

In [None]:
# RMSE suggests that the model's predictions deviate from actual values by about 16.18 µg/m³ on average. Relative
# to the range of 540, this is approximately 3%, which is pretty good.