# Air Quality Analysis

This notebook presents a comprehensive analysis of air quality data, including advanced feature engineering, predictive modeling, clustering, and anomaly detection techniques.

## Import Libraries

In [None]:
# Basic data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA

# Machine learning - preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score

# Machine learning - regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Machine learning - classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

# System utilities
import os
import warnings
warnings.filterwarnings('ignore')

### Creating Output Directories

Setting up directory structure for organizing analysis outputs.

In [None]:
# Make sure every output folder exists before any cell writes into it.
# exist_ok=True keeps the cell safe to re-run.
for output_dir in (
    'Matplotlib_Plots/data',
    'Matplotlib_Plots/exploratory_data_analysis',
    'Matplotlib_Plots/preprocessing',
    'Matplotlib_Plots/feature_engineering',
    'Matplotlib_Plots/correlation_analysis',
    'Matplotlib_Plots/time_series_analysis',
    'Matplotlib_Plots/modelling_analysis_results',
):
    os.makedirs(output_dir, exist_ok=True)

# Phase 1: Data Loading and Initial Examination

In this phase, we load the air quality dataset and perform initial examination to understand its structure, variables, and potential issues.

### 1.1 Loading the Dataset

Loading the Air Quality UCI dataset from an Excel file.

In [None]:
print("Phase 1: Data Loading and Initial Examination")

In [None]:
# Load the dataset
airquality = pd.read_excel('Dataset/AirQualityUCI.xlsx')

### 1.2 Previewing the Dataset

Examining the first few rows to understand the data structure.

In [None]:
airquality.head()

### 1.3 Dataset Dimensions

Checking the number of rows and columns in the dataset.

In [None]:
print(f"Dataset shape: {airquality.shape}")

### 1.4 Column Names

Listing all variables in the dataset.

In [None]:
print("\nColumn names:")
print(airquality.columns.tolist())

### 1.5 Data Information

Examining data types and non-null counts for each column.

In [None]:
airquality.info()

### 1.6 Data Types

Checking the data type of each column.

In [None]:
print("\nData types:")
print(airquality.dtypes)

### 1.7 Summary Statistics

Calculating descriptive statistics for numerical variables.

In [None]:
airquality.describe()

### 1.8 Missing Values Analysis

Checking for null values in the dataset.

In [None]:
airquality.isnull().sum()

### 1.9 Special Missing Values (-200)

Identifying columns with -200 values, which represent missing data in this dataset.

In [None]:
# Check for missing values (represented as -200)
# Columns are selected by dtype instead of testing isinstance() on the minimum:
# pandas returns NumPy scalars, and np.int64 is not an instance of Python's int,
# so integer columns containing the -200 sentinel used to be skipped silently.
print("\nChecking for -200 values (missing data):")
for col in airquality.select_dtypes(include='number').columns:
    sentinel_count = (airquality[col] == -200).sum()
    if sentinel_count > 0:
        print(f"{col} has -200 values: {sentinel_count} ({sentinel_count/len(airquality)*100:.2f}%)")

### 1.10 Replacing Special Missing Values

Converting -200 values to NaN for proper handling of missing data.

In [None]:
# Swap the -200 sentinel for NaN in every column that is neither a datetime
# nor an object column, working on a copy of the loaded frame.
airquality = airquality.copy()
sentinel_candidates = [
    name
    for name, column_dtype in airquality.dtypes.items()
    if column_dtype != 'datetime64[ns]' and column_dtype != 'object'
]
for name in sentinel_candidates:
    airquality[name] = airquality[name].replace(-200, np.nan)

In [None]:
# Verify that no -200 sentinel values remain after the replacement with NaN.
# Columns are selected by dtype: an isinstance() test on the column minimum
# misses integer columns, because np.int64 is not an instance of Python's int.
print("\nChecking for -200 values (missing data):")
remaining_sentinels = 0
for col in airquality.select_dtypes(include='number').columns:
    sentinel_count = (airquality[col] == -200).sum()
    if sentinel_count > 0:
        remaining_sentinels += sentinel_count
        print(f"{col} has -200 values: {sentinel_count} ({sentinel_count/len(airquality)*100:.2f}%)")
if remaining_sentinels == 0:
    print("No -200 values remain.")

### 1.11 Checking for Duplicates

Identifying duplicate rows in the dataset.

In [None]:
# Check for duplicates
duplicates = airquality.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

### 1.12 Verifying Missing Values

Rechecking missing values after replacing -200 with NaN.

In [None]:
airquality.isnull().sum()

In [None]:
# Build the plain-text description of the dataset, then write it out in one go.

# Work on a copy with any -200 sentinel turned into NaN so the statistics
# are not distorted by the placeholder value.
df_clean = airquality.copy()
for col in df_clean.columns:
    col_dtype = df_clean[col].dtype
    if col_dtype == 'datetime64[ns]' or col_dtype == 'object':
        continue
    df_clean.loc[df_clean[col] == -200, col] = np.nan

# Per-column missing counts and percentages
missing_counts = df_clean.isna().sum()
missing_percent = (missing_counts / len(df_clean)) * 100
missing_data = pd.DataFrame({'Missing Count': missing_counts, 'Missing Percent': missing_percent})

report_sections = [
    '# Air Quality Dataset - Exploratory Data Analysis\n\n',
    '## Dataset Overview\n',
    f'Number of observations: {airquality.shape[0]}\n',
    f'Number of variables: {airquality.shape[1]}\n\n',
    '## Variable Types\n',
    str(airquality.dtypes) + '\n\n',
    '## Summary Statistics (after replacing -200 with NaN)\n',
    str(df_clean.describe()) + '\n\n',
    '## Missing Values (counting -200 as missing)\n',
    str(missing_data) + '\n\n',
]

with open('Matplotlib_Plots/data/data_description.txt', 'w') as f:
    f.writelines(report_sections)

print("Initial data examination completed. Results saved to Matplotlib_Plots/data/data_description.txt")

In [None]:
# One-page overview figure: title panel, missing-data bars, variable-category
# pie chart and a time-coverage strip, saved as dataset_description.png.
plt.figure(figsize=(12, 10))

# Create a multi-panel figure to display dataset information
gs = plt.GridSpec(3, 2, height_ratios=[1, 1.5, 1.5])

# Dataset overview panel
plt.subplot(gs[0, :])
plt.axis('off')
plt.text(0.5, 0.9, 'Air Quality UCI Dataset', fontsize=24, ha='center', weight='bold')
plt.text(0.5, 0.6, 'Hourly measurements from an Italian city (March 2004 - February 2005)', 
         fontsize=16, ha='center')
# Take the dimensions from the loaded frame instead of hard-coding them, so the
# caption cannot drift from the data (e.g. after rows or columns are dropped).
n_records, n_variables = airquality.shape
plt.text(0.5, 0.3, f'{n_records:,} hourly records, {n_variables} variables', fontsize=14, ha='center')

# Data completeness
plt.subplot(gs[1, 0])
missing_data = airquality.isnull().sum().sort_values(ascending=False)
missing_percent = missing_data / len(airquality) * 100
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percent': missing_percent})
missing_df = missing_df[missing_df['Missing Count'] > 0]
if missing_df.empty:
    # Nothing to draw as bars; show a note so the panel is not left blank
    plt.text(0.5, 0.5, 'No missing values', ha='center', va='center')
    plt.xticks([])
    plt.yticks([])
else:
    missing_df['Missing Percent'].sort_values().plot(kind='barh', color='steelblue')
plt.title('Missing Data (%)')
plt.xlabel('Percentage')
plt.tight_layout()

# Variables and measurement types
plt.subplot(gs[1, 1])
categories = ['Air pollutants', 'Sensor readings', 'Environmental']
counts = [5, 5, 3]  # CO, NOx, NO2, NMHC, C6H6 | PT08.S1-5 | T, RH, AH
colors = ['#ff9999', '#66b3ff', '#99ff99']
plt.pie(counts, labels=categories, colors=colors, autopct='%1.1f%%', startangle=90)
plt.axis('equal')
plt.title('Variable Categories')

# Time span visualization: one tick mark per record along the date axis
plt.subplot(gs[2, :])
date_range = pd.to_datetime(airquality['Date'])
plt.plot(date_range, np.ones(len(date_range)), '|', color='blue', markersize=10)
plt.xticks(rotation=45)
plt.title('Dataset Time Coverage')
plt.yticks([])
plt.grid(axis='x')
plt.tight_layout()

# Add notes and source
plt.figtext(0.5, 0.02, 
           "Source: UCI Machine Learning Repository\nSavona, Italy - De Vito et al. (2008)\nFeatures include ground truth measurements (GT) and sensor responses", 
           ha="center", fontsize=12, bbox={"facecolor":"lightgray", "alpha":0.5, "pad":5})

plt.tight_layout(rect=[0, 0.03, 1, 0.97])  # Adjust layout to make room for the note at bottom
plt.savefig('Matplotlib_Plots/data/dataset_description.png', dpi=300, bbox_inches='tight')
plt.show()

# Phase 2: Exploratory Data Analysis (EDA)

In this phase, we perform a deeper investigation of the dataset through visualizations and statistical analyses to understand distributions, patterns, and relationships.

## Phase 1 Summary

In this initial phase, we successfully loaded the air quality dataset and performed a preliminary examination to understand its structure, variables, and potential issues. The dataset contains hourly measurements of various pollutants and environmental factors over a one-year period, providing a rich source of information for analysis.

Key findings from this phase include:

- The dataset contains 9,357 hourly records across 15 columns: `Date`, `Time`, and 13 measured variables (pollutant concentrations, sensor responses, temperature, and humidity).
- Several variables have missing values, particularly NMHC(GT) with approximately 90% missing data.
- Missing values are represented as -200 in the original dataset and have been converted to NaN for proper handling.
- No duplicate rows were found in the dataset.
- The dataset includes both ground truth measurements (GT) and sensor responses (PT08.S1 to PT08.S5).

This initial examination provides the foundation for the more detailed analyses in subsequent phases. The identified data quality issues, particularly the missing values, will need to be addressed in the preprocessing phase.

# Phase 2: Exploratory Data Analysis (EDA)

### 2.1 Missing Values Visualization

Creating a bar chart to visualize the percentage of missing values in each column.

In [None]:
print("\nPhase 2: Exploratory Data Analysis")

In [None]:
df_clean = airquality.copy()

In [None]:
# Share of missing values per column, largest first
missing_percent = df_clean.isna().mean().sort_values(ascending=False) * 100

# Bar chart on a dedicated figure
plt.figure(figsize=(14, 6))
ax = missing_percent.plot(kind='bar', color='steelblue')
ax.set_title('Percentage of Missing Values by Column', fontsize=14)
ax.set_xlabel('Columns', fontsize=12)
ax.set_ylabel('Missing Values (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Annotate only the bars that actually have missing values
for bar_position, pct in enumerate(missing_percent):
    if not pct > 0:
        continue
    ax.text(bar_position, pct + 0.5, f'{pct:.1f}%', ha='center', fontweight='bold')

ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('Matplotlib_Plots/exploratory_data_analysis/missing_values_percentage.png')
plt.show()
plt.close()

### 2.2 Distribution Analysis

Examining the distribution of key variables using histograms.

In [None]:
# Histogram (with KDE overlay) for every float64/int64 column, laid out on a
# grid that is three panels wide.
numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns
n_cols = 3
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division
plt.figure(figsize=(15, n_rows * 4))
for panel, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, n_cols, panel)
    observed = df_clean[col].dropna()
    sns.histplot(observed, kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/exploratory_data_analysis/histograms.png')
plt.show()
plt.close()

### 2.3 Outlier Analysis

Identifying outliers in key variables using box plots.

In [None]:
# Box plots of all numeric columns on a shared axis.
# The frame is reshaped to long form (one row per variable/value pair) first.
plt.figure(figsize=(15, 10))
df_clean_melt = df_clean[numeric_cols].melt()
sns.boxplot(data=df_clean_melt, x='variable', y='value')
plt.xticks(rotation=90)
plt.title('Box Plots of Numeric Variables')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/exploratory_data_analysis/boxplots.png')
plt.show()
plt.close()

### 2.4 Temporal Patterns

Analyzing how variables change over time to identify trends and patterns.

In [None]:
# Prepare for the time series plots: combine the Date and Time columns into a
# single timestamp and use it as the index.
timestamp_text = df_clean['Date'].astype(str) + ' ' + df_clean['Time'].astype(str)
df_clean = (
    df_clean
    .assign(DateTime=pd.to_datetime(timestamp_text))
    .set_index('DateTime')
)

In [None]:
# Daily-mean time series for CO, NOx, and NO2, one panel per pollutant
pollutants = ['CO(GT)', 'NOx(GT)', 'NO2(GT)']
plt.figure(figsize=(15, 12))
for panel, pollutant in enumerate(pollutants, start=1):
    plt.subplot(3, 1, panel)
    daily_mean = df_clean[pollutant].resample('D').mean()
    daily_mean.plot()
    plt.title(f'Daily Average {pollutant}')
    plt.ylabel('Concentration')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/exploratory_data_analysis/time_series_pollutants.png')
plt.show()
plt.close()

In [None]:
# Daily-mean time series for the environmental variables, one panel each
env_vars = ['T', 'RH', 'AH']
plt.figure(figsize=(15, 12))
for panel, var in enumerate(env_vars, start=1):
    plt.subplot(3, 1, panel)
    daily_mean = df_clean[var].resample('D').mean()
    daily_mean.plot()
    plt.title(f'Daily Average {var}')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/exploratory_data_analysis/time_series_env.png')
plt.show()
plt.close()

In [None]:
# Create a pair plot for key variables
key_vars = ['CO(GT)', 'NOx(GT)', 'NO2(GT)', 'T', 'RH']
complete_rows = df_clean[key_vars].dropna()

# Sample to speed up plotting. The sample size is capped at the number of
# complete rows (sample() raises when asked for more rows than exist) and the
# seed is fixed so the figure is identical on every run.
pairplot_sample = complete_rows.sample(n=min(1000, len(complete_rows)), random_state=42)

# sns.pairplot creates its own figure, so no plt.figure() call precedes it;
# calling one would only leave an empty extra figure behind.
sns.pairplot(pairplot_sample)
plt.savefig('Matplotlib_Plots/exploratory_data_analysis/pairplot.png')
plt.show()
plt.close()

## EDA Summary: Key Findings and Observations

### Data Description and Patterns
- The dataset contains 9,357 hourly records of air quality and meteorological variables from an Italian city.
- Key variables include concentrations of CO, NOx, NO2, C6H6, and sensor responses, as well as temperature (T), relative humidity (RH), and absolute humidity (AH).
- There are significant missing values in some variables, especially NMHC(GT) (~90% missing), and moderate missingness in CO(GT), NOx(GT), and NO2(GT) (~18%).
- The summary statistics show a wide range of values for pollutants, with some variables (e.g., CO(GT), NOx(GT)) having outliers and skewed distributions.

### Visual Patterns and Anomalies
- Histograms reveal that many pollutant concentrations are right-skewed, with a majority of values clustered at the lower end and a long tail of higher values.
- Box plots confirm the presence of outliers, especially for CO(GT), NOx(GT), and C6H6(GT).
- Time series plots show clear daily and seasonal trends in pollutant concentrations and meteorological variables. For example, CO and NOx levels tend to be higher in colder months.
- Pair plots (scatter plots) indicate positive correlations between some pollutants (e.g., CO and NOx), and relationships between temperature/humidity and pollutant levels.

### Interesting Observations
- The high proportion of missing data in NMHC(GT) may require imputation or exclusion from some analyses.
- Outliers and non-normal distributions suggest the need for robust statistical methods or data transformation in further modeling.
- The data's temporal structure (hourly, with date and time) enables time series analysis and investigation of diurnal/seasonal cycles.

### Next Steps
- Address missing values and outliers in preprocessing.
- Explore feature engineering and correlation analysis for predictive modeling.
- Consider stratified or time-based data splitting for model validation.

In [None]:
print("Exploratory data analysis completed. Visualizations saved to Matplotlib_Plots/exploratory_data_analysis/ directory")

## Phase 2 Summary

The exploratory data analysis phase provided valuable insights into the distribution, patterns, and relationships within the air quality dataset. Through various visualizations and statistical analyses, we gained a deeper understanding of the data characteristics and identified important features for further investigation.

Key findings from this phase include:

- Pollutant concentrations (CO, NOx, NO2, C6H6) showed right-skewed distributions, with most values clustered at the lower end and a long tail of higher values.
- Temperature (T) followed a bimodal distribution, reflecting seasonal variations throughout the year.
- Clear temporal patterns were observed in pollution levels, with distinct daily, weekly, and seasonal variations.
- Strong correlations were found between related pollutants (e.g., NOx and NO2), and between pollutants and their corresponding sensor readings.
- Environmental factors like temperature and humidity showed significant relationships with pollution levels, with lower temperatures often associated with higher pollution concentrations.

These insights inform our approach to data preprocessing, feature engineering, and modeling in subsequent phases. The identified patterns and relationships will guide our selection of relevant features and appropriate modeling techniques.

# Phase 3: Data Preprocessing

### 3.1 Data Cleaning

Handling missing values, removing outliers, and preparing data for analysis.

In [None]:
print("\n--- Phase 3.1: Initial Overview, Duplicates, and Missing Values ---")

In [None]:
# Initial overview
print("Initial Dataset Overview:")
print(f"Number of observations: {airquality.shape[0]}")
print(f"Number of variables: {airquality.shape[1]}")

In [None]:
# Count fully duplicated rows and drop them if any exist
duplicates = airquality.duplicated().sum()
print(f"\nDuplicate Records: {duplicates}")
if duplicates <= 0:
    print("No duplicate records found.")
else:
    print("Removing duplicate records...")
    airquality = airquality.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {airquality.shape}")

### 3.2 Missing Value Treatment

Reporting missing values per column, stating the treatment strategy for each, and dropping NMHC(GT) when more than 80% of its values are missing.

In [None]:
# Handle -200 as missing values
# `airquality` already had its -200 sentinels replaced by NaN earlier in the
# notebook, so testing for -200 alone finds nothing at this point. NaN values
# are counted as well so the "before treatment" report shows the real amount
# of missing data.
airquality_clean = airquality.copy()
print("\nMissing Values Before Treatment:")
for col in airquality_clean.columns:
    if airquality_clean[col].dtype != 'datetime64[ns]' and airquality_clean[col].dtype != 'object':
        sentinel_mask = airquality_clean[col] == -200
        missing_count = (sentinel_mask | airquality_clean[col].isna()).sum()
        if missing_count > 0:
            print(f"{col}: {missing_count} missing values ({missing_count/len(airquality_clean)*100:.2f}%)")
        if sentinel_mask.any():
            # replace() converts the column to float when needed; assigning NaN
            # through .loc into an integer column is deprecated in recent pandas.
            airquality_clean[col] = airquality_clean[col].replace(-200, np.nan)

# Describe, per column, which imputation route its share of missing data implies
print("\nMissing Values Treatment Strategy:")
for col in airquality_clean.columns:
    if col in ['Date', 'Time']:
        continue
    n_missing = airquality_clean[col].isna().sum()
    if n_missing == 0:
        continue
    missing_pct = n_missing / len(airquality_clean) * 100
    if missing_pct > 80:
        print(f"{col}: {missing_pct:.2f}% missing - Column will be dropped")
    elif missing_pct > 30:
        print(f"{col}: {missing_pct:.2f}% missing - Sensor correlations will be used for imputation")
    else:
        print(f"{col}: {missing_pct:.2f}% missing - Forward fill with rolling mean")

# Drop high-missing column
if 'NMHC(GT)' in airquality_clean.columns and airquality_clean['NMHC(GT)'].isna().sum() / len(airquality_clean) > 0.8:
    print("Dropping NMHC(GT) due to excessive missing values")
    airquality_clean = airquality_clean.drop(columns=['NMHC(GT)'])

In [None]:
print("\n--- Phase 3.2: Imputation and Outlier Handling ---")

In [None]:
# Build a timestamp from the Date and Time columns, index the frame by it,
# and put the rows in chronological order.
timestamps = pd.to_datetime(
    airquality_clean['Date'].astype(str) + ' ' + airquality_clean['Time'].astype(str)
)
airquality_clean = (
    airquality_clean
    .assign(DateTime=timestamps)
    .set_index('DateTime')
    .sort_index()
)

In [None]:
# Sensor-based imputation
# For each (reference pollutant, sensor) pair, fit a one-feature linear
# regression on the rows where both are present and use it to fill pollutant
# gaps wherever the sensor reading exists. The regression is only used when
# the absolute correlation between the two exceeds 0.5.
pollutant_sensor_pairs = [('CO(GT)', 'PT08.S1(CO)'), ('NOx(GT)', 'PT08.S3(NOx)'), ('NO2(GT)', 'PT08.S4(NO2)')]
for pollutant, sensor in pollutant_sensor_pairs:
    if pollutant not in airquality_clean.columns or sensor not in airquality_clean.columns:
        continue
    if airquality_clean[pollutant].isna().sum() == 0:
        continue  # nothing to impute

    valid_data = airquality_clean[[pollutant, sensor]].dropna()
    if len(valid_data) == 0:
        continue  # no overlapping observations to learn from

    correlation = valid_data[pollutant].corr(valid_data[sensor])
    print(f"Correlation between {pollutant} and {sensor}: {correlation:.4f}")
    # Written as "not >" so that a NaN correlation is also rejected
    if not abs(correlation) > 0.5:
        continue

    # Rows where the pollutant is missing but the sensor value is available
    predict_indices = airquality_clean[pollutant].isna() & ~airquality_clean[sensor].isna()
    if not predict_indices.any():
        # Predicting on an empty frame raises in scikit-learn, so skip explicitly
        print(f"Used regression model to impute 0 values in {pollutant}")
        continue

    model = LinearRegression()
    model.fit(valid_data[[sensor]], valid_data[pollutant])
    airquality_clean.loc[predict_indices, pollutant] = model.predict(airquality_clean.loc[predict_indices, [sensor]])
    print(f"Used regression model to impute {predict_indices.sum()} values in {pollutant}")

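### 3.3 Imputation, Outlier Handling, and Standardization

Filling the remaining gaps with a 24-hour rolling mean (with forward fill, backward fill, and the column mean as fallbacks), capping outliers with the IQR rule, and standardizing the numeric features.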

In [None]:
# Fill what is still missing, column by column, in four passes:
# 24-row trailing rolling mean, forward fill, backward fill, then the
# column mean for anything left over.
for col in airquality_clean.columns:
    if col in ['Date', 'Time']:
        continue
    missing_before = airquality_clean[col].isna().sum()
    if missing_before <= 0:
        continue
    trailing_mean = airquality_clean[col].rolling(window=24, min_periods=1).mean()
    filled = airquality_clean[col].fillna(trailing_mean).ffill().bfill()
    airquality_clean[col] = filled.fillna(filled.mean())
    print(f"{col}: Imputed {missing_before} missing values")

print(f"\nRemaining missing values: {airquality_clean.isna().sum().sum()}")

In [None]:
# Outlier handling: list the float64/int64 columns to treat, leaving out
# Date and Time should they ever be numeric.
numeric_cols = [
    col
    for col in airquality_clean.select_dtypes(include=['float64', 'int64']).columns
    if col not in ['Date', 'Time']
]

In [None]:
# Box plots of the numeric columns before any outlier capping
plt.figure(figsize=(15, 10))
numeric_long = pd.melt(airquality_clean.reset_index()[numeric_cols])
sns.boxplot(data=numeric_long, x='variable', y='value')
plt.xticks(rotation=90)
plt.title('Box Plots Before Outlier Treatment')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/preprocessing/boxplots_before_treatment.png')
plt.show()
plt.close()

In [None]:
# Flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in each numeric column and
# clip them to those bounds.
for col in numeric_cols:
    values = airquality_clean[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    outliers = ((values < lower) | (values > upper)).sum()
    print(f"{col}: {outliers} outliers detected")
    if outliers <= 0:
        continue
    airquality_clean[col] = values.clip(lower, upper)
    print(f"  - Outliers capped between {lower:.2f} and {upper:.2f}")

In [None]:
# Box plots of the numeric columns once the outliers have been capped
plt.figure(figsize=(15, 10))
numeric_long = pd.melt(airquality_clean.reset_index()[numeric_cols])
sns.boxplot(data=numeric_long, x='variable', y='value')
plt.xticks(rotation=90)
plt.title('Box Plots After Outlier Treatment')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/preprocessing/boxplots_after_treatment.png')
plt.show()
plt.close()

In [None]:
print("\n--- Phase 3.3: Data Transformation and Finalization ---")

In [None]:
# Standardization
# All numeric columns are scaled in a single fit_transform call. StandardScaler
# works column by column, so the values match a per-column loop, but the fitted
# scaler now keeps the mean and scale of every column (a loop that refits per
# column leaves it holding only the last column's statistics), which makes
# scaler.inverse_transform usable later.
print("Standardizing numeric features...")
airquality_standardized = airquality_clean.copy()
scaler = StandardScaler()
if len(numeric_cols) > 0:  # fit_transform rejects an empty column selection
    airquality_standardized[numeric_cols] = scaler.fit_transform(airquality_standardized[numeric_cols])

In [None]:
# Write the cleaned frame and its standardized counterpart to CSV
for frame, csv_path in (
    (airquality_clean, 'Matplotlib_Plots/preprocessing/preprocessed_data.csv'),
    (airquality_standardized, 'Matplotlib_Plots/preprocessing/standardized_data.csv'),
):
    frame.to_csv(csv_path)

# Summarize what was saved
print(
    f"\nFinal Preprocessed Dataset Shape: {airquality_clean.shape}",
    f"Columns: {list(airquality_clean.columns)}",
    "Preprocessed data saved to 'Matplotlib_Plots/preprocessing/preprocessed_data.csv'",
    "Standardized data saved to 'Matplotlib_Plots/preprocessing/standardized_data.csv'",
    sep="\n",
)

In [None]:
print("Data preprocessing completed. Results saved to preprocessing/ directory")

# Phase 3.4: Feature Engineering & Advanced Preprocessing

In this section, we will create new features to enhance our analysis and modeling capabilities. These engineered features will help capture temporal patterns, environmental conditions, and other factors that may influence air quality.

In [None]:
print("\n--- Phase 3.4: Feature Engineering & Advanced Preprocessing ---")

# Reload the preprocessed data from disk; the first CSV column becomes the
# index and is parsed as dates.
df = pd.read_csv(
    'Matplotlib_Plots/preprocessing/preprocessed_data.csv',
    parse_dates=True,
    index_col=0,
)
df.head()

## Rush Hour Indicator

Creating a feature to indicate whether a timestamp falls within typical rush hours: hours 7–9 and 17–20 inclusive (07:00–09:59 and 17:00–20:59).

In [None]:
# Hour of day taken from the datetime index
df['hour'] = df.index.hour

# Rush hour flag (0/1): hours 7-9 or 17-20, both ranges inclusive
morning_rush = df['hour'].between(7, 9)
evening_rush = df['hour'].between(17, 20)
df['is_rush_hour'] = (morning_rush | evening_rush).astype(int)

# Compare pollutant levels inside and outside rush hours, one panel per pollutant
plt.figure(figsize=(15, 10))
for panel, pollutant in enumerate(['CO(GT)', 'NOx(GT)', 'NO2(GT)'], start=1):
    plt.subplot(3, 1, panel)
    sns.boxplot(data=df, x='is_rush_hour', y=pollutant)
    plt.title(f'{pollutant} Levels: Rush Hour vs. Non-Rush Hour')
    plt.xlabel('Rush Hour (1) vs. Non-Rush Hour (0)')
    plt.ylabel('Concentration')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/feature_engineering/rush_hour_comparison.png')
plt.show()
plt.close()

### Justification for Rush Hour Indicator

The rush hour indicator is valuable for several reasons:

1. **Traffic Patterns**: Rush hours typically coincide with increased traffic volume, which is a major source of urban air pollution.
2. **Predictive Power**: This feature can help models identify and predict pollution spikes associated with commuting patterns.
3. **Policy Relevance**: Understanding pollution patterns during rush hours can inform traffic management policies and public health advisories.
4. **Temporal Context**: It provides important temporal context that raw timestamp data doesn't explicitly capture.

## Weekend vs. Weekday Feature

Creating a binary feature to distinguish between weekdays and weekends.

In [None]:
# Day of week from the datetime index (0=Monday ... 6=Sunday)
df['day_of_week'] = df.index.dayofweek

# Weekend flag (0/1): Saturday (5) and Sunday (6) are the only values >= 5
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# Compare pollutant levels on weekends and weekdays, one panel per pollutant
plt.figure(figsize=(15, 10))
for panel, pollutant in enumerate(['CO(GT)', 'NOx(GT)', 'NO2(GT)'], start=1):
    plt.subplot(3, 1, panel)
    sns.boxplot(data=df, x='is_weekend', y=pollutant)
    plt.title(f'{pollutant} Levels: Weekend vs. Weekday')
    plt.xlabel('Weekend (1) vs. Weekday (0)')
    plt.ylabel('Concentration')
plt.tight_layout()
plt.savefig('Matplotlib_Plots/feature_engineering/weekend_comparison.png')
plt.show()
plt.close()

### Justification for Weekend vs. Weekday Feature

The weekend/weekday distinction is important for these reasons:

1. **Activity Patterns**: Human activity patterns differ significantly between weekdays and weekends, affecting emission sources.
2. **Industrial Operations**: Many industrial facilities operate on different schedules during weekends.
3. **Traffic Volumes**: Traffic patterns and volumes typically differ between weekdays and weekends.
4. **Model Accuracy**: Including this feature can help models account for weekly cyclical patterns in pollution levels.

## Season Feature

Creating a categorical feature to represent seasons, which can significantly affect pollution patterns.

In [None]:
# Extract month
df['month'] = df.index.month

# Create season feature (Northern Hemisphere)
def get_season(month):
    """Map a month number to a Northern Hemisphere season name.

    Parameters
    ----------
    month : int
        Calendar month number; 12, 1, 2 -> 'Winter', 3-5 -> 'Spring',
        6-8 -> 'Summer'.

    Returns
    -------
    str
        One of 'Winter', 'Spring', 'Summer' or 'Fall'. 'Fall' is the
        catch-all: it is returned for 9, 10, 11 and for any other value
        that matches none of the groups above.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    # 9, 10, 11 (and anything not matched above)
    return 'Fall'

# Label every row with its season, derived element-wise from the month column
df['season'] = df['month'].map(get_season)

# Box plots of each pollutant grouped by season, in calendar order
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
season_fig = plt.figure(figsize=(15, 12))
for panel_no, pollutant in enumerate(['CO(GT)', 'NOx(GT)', 'NO2(GT)'], start=1):
    season_ax = season_fig.add_subplot(3, 1, panel_no)
    sns.boxplot(x='season', y=pollutant, data=df, order=season_order, ax=season_ax)
    season_ax.set_title(f'{pollutant} Levels by Season')
    season_ax.set_xlabel('Season')
    season_ax.set_ylabel('Concentration')
season_fig.tight_layout()
season_fig.savefig('Matplotlib_Plots/feature_engineering/season_comparison.png')
plt.show()
plt.close(season_fig)

### Justification for Season Feature

The season feature is valuable for these reasons:

1. **Meteorological Conditions**: Seasons bring different weather patterns that affect pollution dispersion and chemistry.
2. **Emission Sources**: Seasonal activities like heating in winter or increased air conditioning in summer affect emissions.
3. **Photochemical Reactions**: Seasonal variations in sunlight affect photochemical reactions that create secondary pollutants.
4. **Long-term Patterns**: This feature helps models capture long-term cyclical patterns in the data.

## Time-Based Features

Creating additional time-based features to capture temporal patterns at different scales.

In [None]:
# Encode each periodic time column as a (sin, cos) pair so that the end of a
# cycle sits next to its start (hour 23 is close to hour 0).
# Each entry: (source column, cycle length, prefix of the new columns)
cyclical_specs = (
    ('hour', 24, 'hour'),        # hour of day
    ('day_of_week', 7, 'day'),   # day of week
    ('month', 12, 'month'),      # month of year
)
for source_col, cycle_length, col_prefix in cyclical_specs:
    cycle_angle = 2 * np.pi * df[source_col] / cycle_length
    df[f'{col_prefix}_sin'] = np.sin(cycle_angle)
    df[f'{col_prefix}_cos'] = np.cos(cycle_angle)

# Top panel: mean CO(GT) per hour of day.
# Bottom panel: the hours placed on the unit circle by the sin/cos encoding.
cyclical_fig = plt.figure(figsize=(12, 8))

hourly_mean_ax = cyclical_fig.add_subplot(2, 1, 1)
hourly_mean_ax.plot(df.groupby('hour')['CO(GT)'].mean())
hourly_mean_ax.set_title('Average CO(GT) by Hour of Day')
hourly_mean_ax.set_xlabel('Hour')
hourly_mean_ax.set_ylabel('CO(GT)')

hour_circle_ax = cyclical_fig.add_subplot(2, 1, 2)
hour_points = hour_circle_ax.scatter(df['hour_sin'], df['hour_cos'], c=df['hour'], cmap='hsv')
hour_circle_ax.set_title('Cyclical Representation of Hours')
hour_circle_ax.set_xlabel('sin(hour)')
hour_circle_ax.set_ylabel('cos(hour)')
cyclical_fig.colorbar(hour_points, ax=hour_circle_ax, label='Hour of Day')

cyclical_fig.tight_layout()
cyclical_fig.savefig('Matplotlib_Plots/feature_engineering/cyclical_time_features.png')
plt.show()
plt.close(cyclical_fig)

### Justification for Cyclical Time Features

Cyclical time features offer several advantages:

1. **Preserving Cyclical Nature**: They maintain the cyclical relationship between time periods (e.g., hour 23 is close to hour 0).
2. **Model Compatibility**: These transformations make time features more suitable for machine learning models.
3. **Capturing Periodicity**: They help models identify periodic patterns at different time scales (daily, weekly, monthly).
4. **Feature Importance**: They often provide more predictive power than raw time values.

## Lag Features

Creating lag features to capture the relationship between current and past pollution levels.

In [None]:
lag_source_pollutants = ['CO(GT)', 'NOx(GT)', 'NO2(GT)']

# Lagged copies of each key pollutant: 1 row back, 24 rows back (same time
# yesterday for hourly data) and 168 rows back (same time last week)
for pollutant in lag_source_pollutants:
    for lag_rows in (1, 24, 168):
        df[f'{pollutant}_lag{lag_rows}'] = df[pollutant].shift(lag_rows)

# Rolling means over the trailing 24 rows (one day) and 168 rows (seven days).
# NOTE(review): each window ends at, and includes, the current row. If one of
# these pollutants is later used as a prediction target, its own rolling mean
# leaks the target value — confirm how the modelling phase selects features.
for pollutant in lag_source_pollutants:
    for window_rows in (24, 168):
        df[f'{pollutant}_rolling{window_rows}'] = df[pollutant].rolling(window=window_rows).mean()

# Rows at the start of the series have no history for the longest lag/window;
# keep only complete rows for the visual check below
df_lag = df.dropna()

# Scatter current CO(GT) against each lagged version, annotated with Pearson r
lag_fig = plt.figure(figsize=(15, 5))
for panel_no, lag in enumerate(['lag1', 'lag24', 'lag168'], start=1):
    lagged_col = f'CO(GT)_{lag}'
    lag_ax = lag_fig.add_subplot(1, 3, panel_no)
    lag_ax.scatter(df_lag['CO(GT)'], df_lag[lagged_col], alpha=0.5)
    lag_ax.set_title(f'CO(GT) vs CO(GT)_{lag}')
    lag_ax.set_xlabel('CO(GT)')
    lag_ax.set_ylabel(f'CO(GT)_{lag}')
    lag_r = df_lag['CO(GT)'].corr(df_lag[lagged_col])
    lag_ax.annotate(f'r = {lag_r:.3f}', xy=(0.05, 0.95), xycoords='axes fraction')
lag_fig.tight_layout()
lag_fig.savefig('Matplotlib_Plots/feature_engineering/lag_features_correlation.png')
plt.show()
plt.close(lag_fig)

### Justification for Lag and Rolling Features

Lag and rolling features are particularly valuable for time series data:

1. **Temporal Dependency**: They capture the autocorrelation in pollution levels over time.
2. **Predictive Power**: Previous pollution levels are often strong predictors of current levels.
3. **Trend Capture**: Rolling averages help capture longer-term trends and smooth out noise.
4. **Cyclical Patterns**: Lag features at specific intervals (24 hours, 168 hours) capture daily and weekly patterns.

## Interaction Features

Creating interaction features to capture relationships between different variables.

In [None]:
# Pollutant x environment products: first every pollutant with temperature
# (T), then every pollutant with relative humidity (RH)
for env_col in ('T', 'RH'):
    for pollutant in ['CO(GT)', 'NOx(GT)', 'NO2(GT)']:
        df[f'{pollutant}_{env_col}_interaction'] = df[pollutant] * df[env_col]

# Product of the two 0/1 flags: 1 only for rush-hour rows that fall on a weekend
df['rush_hour_weekend'] = df['is_rush_hour'] * df['is_weekend']

# Temperature against CO(GT), coloured by the value of their product
interaction_fig = plt.figure(figsize=(10, 6))
interaction_ax = interaction_fig.add_subplot(111)
interaction_points = interaction_ax.scatter(
    df['T'], df['CO(GT)'], c=df['CO(GT)_T_interaction'], cmap='viridis', alpha=0.6
)
interaction_fig.colorbar(interaction_points, ax=interaction_ax, label='CO(GT) × Temperature Interaction')
interaction_ax.set_title('Temperature vs CO(GT) Colored by Their Interaction')
interaction_ax.set_xlabel('Temperature')
interaction_ax.set_ylabel('CO(GT)')
interaction_fig.tight_layout()
interaction_fig.savefig('Matplotlib_Plots/feature_engineering/interaction_features.png')
plt.show()
plt.close(interaction_fig)

### Justification for Interaction Features

Interaction features capture complex relationships between variables:

1. **Non-linear Relationships**: They help models capture non-linear relationships between variables.
2. **Environmental Chemistry**: Pollutant behavior often depends on interactions with environmental factors like temperature and humidity.
3. **Compound Effects**: Some effects only manifest when multiple factors coincide (e.g., rush hour on weekdays vs. weekends).
4. **Model Flexibility**: They give linear models the ability to capture more complex patterns.

In [None]:
# Persist the frame, index included, with every engineered column added so far
df.to_csv('Matplotlib_Plots/feature_engineering/data_with_engineered_features.csv')

# Names of the engineered columns, grouped by how they were built
summary_pollutants = ['CO(GT)', 'NOx(GT)', 'NO2(GT)']
temporal_feature_names = [
    'is_rush_hour', 'is_weekend', 'season',
    'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos',
]
history_feature_names = [
    f'{pollutant}_{suffix}'
    for pollutant in summary_pollutants
    for suffix in ('lag1', 'lag24', 'lag168', 'rolling24', 'rolling168')
]
interaction_feature_names = [
    f'{pollutant}_{env_col}_interaction'
    for env_col in ('T', 'RH')
    for pollutant in summary_pollutants
]
engineered_features = (
    temporal_feature_names
    + history_feature_names
    + interaction_feature_names
    + ['rush_hour_weekend']
)

# Report feature counts; `airquality_clean` is expected to exist from an
# earlier cell of the notebook
print("\nEngineered Features Summary:")
print(f"Number of original features: {len(airquality_clean.columns)}")
print(f"Number of engineered features: {len(engineered_features)}")
print(f"Total number of features: {len(df.columns)}")
print("\nEngineered features saved to 'Matplotlib_Plots/feature_engineering/data_with_engineered_features.csv'")

## Feature Engineering Summary

We have created several categories of engineered features to enhance our analysis and modeling capabilities:

1. **Temporal Features**:
   - Rush hour indicator
   - Weekend/weekday indicator
   - Season categorization
   - Cyclical time representations (hour, day, month)

2. **Lag and Rolling Features**:
   - 1-hour, 24-hour, and 168-hour lags for key pollutants
   - 24-hour and 7-day rolling averages

3. **Interaction Features**:
   - Pollutant × Temperature interactions
   - Pollutant × Humidity interactions
   - Rush hour × Weekend interaction

These engineered features capture important temporal patterns, environmental relationships, and complex interactions that can significantly improve model performance and provide deeper insights into air quality dynamics.

## Phase 3 Summary

In the data preprocessing and feature engineering phase, we prepared the dataset for advanced analysis and modeling by addressing data quality issues and creating new features to enhance predictive power. This phase built upon the insights gained from the exploratory data analysis to develop a clean, feature-rich dataset for subsequent modeling.

Key accomplishments in this phase include:

- Handling missing values using appropriate imputation techniques based on the nature and extent of missingness in each variable.
- Identifying and treating outliers to minimize their impact on analysis results.
- Creating temporal features (hour of day, day of week, month, season) to capture cyclical patterns in pollution levels.
- Developing lag features and rolling averages to incorporate temporal dependencies in the data.
- Normalizing and scaling variables to ensure comparability and improve model performance.

The resulting preprocessed dataset provides a solid foundation for the correlation analysis, time series analysis, and classification modeling in the following phases. The engineered features capture important temporal patterns and relationships that will enhance our ability to understand and predict air quality variations.

# Phase 4: Correlation Analysis

In this section, we will:
- Calculate and visualize the correlation matrix for all numeric variables.
- Identify and discuss significant correlations (strong positive/negative).
- Visualize key relationships with scatter plots.
- Analyze correlations between pollutants, environmental factors, and sensor performance.

In [None]:
# Re-read the preprocessed dataset from disk; the first CSV column becomes the
# index and is parsed as dates. This rebinds `df` for the correlation phase.
correlation_source_path = 'Matplotlib_Plots/preprocessing/preprocessed_data.csv'
df = pd.read_csv(correlation_source_path, index_col=0, parse_dates=True)
df.head()

### Correlation Matrix Calculation
We calculate the correlation matrix for all numeric variables in the dataset.

In [None]:
# Restrict to float64/int64 columns, then compute their pairwise correlations
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = df.loc[:, numeric_cols].corr()

# Keep a CSV copy of the matrix alongside the other correlation outputs
corr_matrix.to_csv('Matplotlib_Plots/correlation_analysis/correlation_matrix.csv')

In [None]:
# Display the correlation matrix as the cell's output
corr_matrix

### Correlation Matrix Heatmap
Visualize the correlation matrix as a heatmap to better understand relationships between variables.

In [None]:
# Annotated heatmap of the full correlation matrix (values shown to 2 decimals)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix Heatmap')
heatmap_fig.tight_layout()
heatmap_fig.savefig('Matplotlib_Plots/correlation_analysis/correlation_heatmap.png')
plt.show()
plt.close(heatmap_fig)

### Significant Correlations
Identify strong positive (r > 0.7) and strong negative (r < -0.7) correlations.

In [None]:
# Mask everything except the cells strictly above the diagonal. The matrix is
# symmetric, so this keeps each variable pair exactly once and drops the
# self-correlations on the diagonal.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Flatten the unmasked cells into (row_variable, column_variable, r) tuples.
# Scanning the full matrix instead would list every pair twice (A-B and B-A),
# and the scatter-plot cell that consumes these lists would draw each twice.
unique_pairs = [
    (row_var, col_var, r_value)
    for row_var, row_values in upper_tri.iterrows()
    for col_var, r_value in row_values.items()
    if pd.notna(r_value)
]

# Strong positive correlations, strongest first
strong_pos = sorted(
    (pair for pair in unique_pairs if pair[2] > 0.7),
    key=lambda pair: pair[2],
    reverse=True,
)

# Strong negative correlations, most negative first
strong_neg = sorted(
    (pair for pair in unique_pairs if pair[2] < -0.7),
    key=lambda pair: pair[2],
)

print('Strong Positive Correlations (r > 0.7):')
if strong_pos:
    for i, j, corr in strong_pos:
        print(f'{i} and {j}: r = {corr:.4f}')
else:
    print('No strong positive correlations found (r > 0.7)')

print('\nStrong Negative Correlations (r < -0.7):')
if strong_neg:
    for i, j, corr in strong_neg:
        print(f'{i} and {j}: r = {corr:.4f}')
else:
    print('No strong negative correlations found (r < -0.7)')

### Scatter Plots for Top Correlations
Visualize the strongest positive and negative correlations with scatter plots.

In [None]:
# Merge both lists and order them by correlation strength, strongest first
all_strong = sorted(strong_pos + strong_neg, key=lambda pair: abs(pair[2]), reverse=True)

# One scatter plot, saved to its own file, for every strong correlation found
for var1, var2, corr in all_strong:
    pair_fig, pair_ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=df[var1], y=df[var2], alpha=0.5, ax=pair_ax)
    pair_ax.set_title(f'Correlation between {var1} and {var2} (r = {corr:.4f})')
    pair_ax.set_xlabel(var1)
    pair_ax.set_ylabel(var2)
    pair_fig.tight_layout()
    pair_fig.savefig(f'Matplotlib_Plots/correlation_analysis/scatter_{var1}_{var2}.png')
    plt.show()
    plt.close(pair_fig)

### Pollutant and Environmental Factor Correlations
Analyze correlations between pollutants and environmental factors.

In [None]:
# Correlations restricted to reference pollutant readings and weather variables
pollutant_cols = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']
env_cols = ['T', 'RH', 'AH']
pollutant_env_cols = [*pollutant_cols, *env_cols]
subset_corr = df.loc[:, pollutant_env_cols].corr()

pollutant_env_fig, pollutant_env_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(subset_corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=pollutant_env_ax)
pollutant_env_ax.set_title('Correlation Between Pollutants and Environmental Factors')
pollutant_env_fig.tight_layout()
pollutant_env_fig.savefig('Matplotlib_Plots/correlation_analysis/pollutant_env_correlation.png')
plt.show()
plt.close(pollutant_env_fig)

# Cell output: only the pollutant (rows) x environment (columns) block
subset_corr.loc[pollutant_cols, env_cols]

### Sensor Performance Analysis
Analyze the correlation between ground truth pollutant measurements and corresponding sensor readings.

In [None]:
# (reference measurement column, sensor response column) pairs to compare
sensor_pairs = [
    ('CO(GT)', 'PT08.S1(CO)'),
    ('NOx(GT)', 'PT08.S3(NOx)'),
    ('NO2(GT)', 'PT08.S4(NO2)')
]

# For each pair: print r from the matrix computed earlier, then plot and save
# the scatter of reference value against sensor reading
for gt_col, sensor_col in sensor_pairs:
    corr_val = corr_matrix.loc[gt_col, sensor_col]
    print(f'{gt_col} and {sensor_col}: r = {corr_val:.4f}')

    sensor_fig, sensor_ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=df[gt_col], y=df[sensor_col], alpha=0.5, ax=sensor_ax)
    sensor_ax.set_title(f'Correlation between {gt_col} and {sensor_col} (r = {corr_val:.4f})')
    sensor_ax.set_xlabel(gt_col)
    sensor_ax.set_ylabel(sensor_col)
    sensor_fig.tight_layout()
    sensor_fig.savefig(f'Matplotlib_Plots/correlation_analysis/sensor_{gt_col}_{sensor_col}.png')
    plt.show()
    plt.close(sensor_fig)

### Correlation Analysis Summary
- The strongest correlations in the dataset are highlighted above.
- Pollutant and environmental factor correlations reveal how weather conditions may influence pollution levels.
- Sensor performance analysis shows the relationship between sensor readings and ground truth measurements.
- These insights can guide feature selection and further modeling.

In [None]:
# Completion marker for Phase 4. The path is the directory this phase's CSV
# and PNG outputs are written to.
print("Correlation analysis completed. Results saved to 'Matplotlib_Plots/correlation_analysis/' directory")

# Phase 5: Time Series Analysis

This phase delves into the temporal characteristics of the air quality data. We will perform time series decomposition to identify trend, seasonality, and residuals. Stationarity tests will be conducted, followed by ACF/PACF analysis to inform ARIMA modeling. Finally, an ARIMA model will be developed for forecasting key pollutant concentrations.

### 5.1 Loading Data for Time Series Analysis

Loading the preprocessed data and selecting key pollutants for the time series analysis.

In [None]:
# Read the preprocessed data with the 'DateTime' column as a parsed date index
tsa_source_path = 'Matplotlib_Plots/preprocessing/preprocessed_data.csv'
df_tsa = pd.read_csv(tsa_source_path, index_col='DateTime', parse_dates=True)
print(f"Data for Time Series Analysis loaded. Shape: {df_tsa.shape}")
tsa_first_stamp = df_tsa.index.min()
tsa_last_stamp = df_tsa.index.max()
print(f"Time range: {tsa_first_stamp} to {tsa_last_stamp}")

# Key pollutants for the time series analysis, limited to the columns that
# are actually present in the loaded frame
tsa_candidate_pollutants = ['CO(GT)', 'NOx(GT)', 'NO2(GT)', 'C6H6(GT)']
pollutants_tsa = [name for name in tsa_candidate_pollutants if name in df_tsa.columns]
print(f"Selected pollutants for TSA: {pollutants_tsa}")

### 5.2 Visualizing Temporal Patterns

Plotting daily and monthly average concentrations to observe overall trends and seasonal patterns.

In [None]:
tsa_panel_count = len(pollutants_tsa)

# Daily means: one stacked line plot per selected pollutant
df_daily_tsa = df_tsa[pollutants_tsa].resample('D').mean()

daily_fig = plt.figure(figsize=(15, 10))
for panel_no, pollutant in enumerate(pollutants_tsa, start=1):
    daily_ax = daily_fig.add_subplot(tsa_panel_count, 1, panel_no)
    df_daily_tsa[pollutant].plot(ax=daily_ax)
    daily_ax.set_title(f'Daily Average {pollutant}')
    daily_ax.set_ylabel('Concentration')
daily_fig.tight_layout()
daily_fig.savefig('Matplotlib_Plots/time_series_analysis/daily_pollutants_tsa.png')
plt.show()
plt.close(daily_fig)

# Monthly means for seasonal patterns.
# NOTE(review): 'M' is the month-end alias; recent pandas releases deprecate
# it in favour of 'ME' — confirm against the pinned pandas version.
df_monthly_tsa = df_tsa[pollutants_tsa].resample('M').mean()

monthly_fig = plt.figure(figsize=(15, 10))
for panel_no, pollutant in enumerate(pollutants_tsa, start=1):
    monthly_ax = monthly_fig.add_subplot(tsa_panel_count, 1, panel_no)
    df_monthly_tsa[pollutant].plot(ax=monthly_ax)
    monthly_ax.set_title(f'Monthly Average {pollutant}')
    monthly_ax.set_ylabel('Concentration')
monthly_fig.tight_layout()
monthly_fig.savefig('Matplotlib_Plots/time_series_analysis/monthly_pollutants_tsa.png')
plt.show()
plt.close(monthly_fig)
print("Daily and monthly average pollutant concentrations plotted and saved.")

Analyzing and plotting average hourly and weekly patterns for key pollutants to identify diurnal and weekly cycles.

In [None]:
pattern_panel_count = len(pollutants_tsa)

# Diurnal cycle: mean of each pollutant grouped by hour of day
df_tsa_hourly = df_tsa.assign(hour=df_tsa.index.hour)
hourly_patterns_tsa = df_tsa_hourly.groupby('hour')[pollutants_tsa].mean()

hourly_fig = plt.figure(figsize=(15, 10))
for panel_no, pollutant in enumerate(pollutants_tsa, start=1):
    hourly_ax = hourly_fig.add_subplot(pattern_panel_count, 1, panel_no)
    hourly_patterns_tsa[pollutant].plot(ax=hourly_ax)
    hourly_ax.set_title(f'Average {pollutant} by Hour of Day')
    hourly_ax.set_ylabel('Concentration')
    hourly_ax.set_xlabel('Hour of Day')
    hourly_ax.set_xticks(range(0, 24, 2))
hourly_fig.tight_layout()
hourly_fig.savefig('Matplotlib_Plots/time_series_analysis/hourly_patterns_tsa.png')
plt.show()
plt.close(hourly_fig)

# Weekly cycle: mean of each pollutant grouped by day of week (Monday=0, Sunday=6)
df_tsa_weekly = df_tsa.assign(day_of_week=df_tsa.index.dayofweek)
weekly_patterns_tsa = df_tsa_weekly.groupby('day_of_week')[pollutants_tsa].mean()
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

weekly_fig = plt.figure(figsize=(15, 10))
for panel_no, pollutant in enumerate(pollutants_tsa, start=1):
    weekly_ax = weekly_fig.add_subplot(pattern_panel_count, 1, panel_no)
    weekly_patterns_tsa[pollutant].plot(kind='bar', ax=weekly_ax)
    weekly_ax.set_title(f'Average {pollutant} by Day of Week')
    weekly_ax.set_ylabel('Concentration')
    weekly_ax.set_xlabel('Day of Week')
    # Replace the numeric bar positions with day names
    weekly_ax.set_xticks(range(7))
    weekly_ax.set_xticklabels(days, rotation=45)
weekly_fig.tight_layout()
weekly_fig.savefig('Matplotlib_Plots/time_series_analysis/weekly_patterns_tsa.png')
plt.show()
plt.close(weekly_fig)
print("Hourly and weekly average pollutant patterns plotted and saved.")

### 5.3 Time Series Decomposition

Decomposing the CO(GT) time series into trend, seasonal, and residual components.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Select CO(GT) for detailed decomposition analysis from daily data
target_pollutant_decomp = 'CO(GT)'
if target_pollutant_decomp in df_daily_tsa.columns:
    # Fill gaps in the daily means: forward-fill first, then back-fill whatever
    # is still missing at the start. `.ffill().bfill()` is the non-deprecated
    # equivalent of `fillna(method='ffill').fillna(method='bfill')`.
    ts_decomp = df_daily_tsa[target_pollutant_decomp].ffill().bfill()

    if not ts_decomp.empty and len(ts_decomp.dropna()) >= 2 * 30: # Ensure enough data for period 30
        decomposition = seasonal_decompose(ts_decomp.dropna(), model='additive', period=30)
        # `decomposition.plot()` creates and returns its own figure, so the
        # size has to be set on that figure. Opening a sized figure beforehand
        # only leaves an extra blank figure behind.
        decomp_fig = decomposition.plot()
        decomp_fig.set_size_inches(12, 10)
        decomp_fig.suptitle(f'Time Series Decomposition of Daily {target_pollutant_decomp}', y=1.02)
        decomp_fig.tight_layout()
        # The title sits above the canvas (y=1.02); a tight bounding box keeps
        # it inside the saved image.
        decomp_fig.savefig('Matplotlib_Plots/time_series_analysis/decomposition_tsa.png', bbox_inches='tight')
        plt.show()
        plt.close(decomp_fig)
        print(f"Time series decomposition for {target_pollutant_decomp} completed and saved.")
    else:
        print(f"Not enough data points for decomposition of {target_pollutant_decomp} with period 30 after dropping NaNs. Length: {len(ts_decomp.dropna())}")
else:
    print(f"'{target_pollutant_decomp}' not found in daily resampled data for decomposition.")

### 5.4 Stationarity Analysis

Performing the Augmented Dickey-Fuller (ADF) test to check for stationarity in the CO(GT) time series.

In [None]:
from statsmodels.tsa.stattools import adfuller

# Run the Augmented Dickey-Fuller test on the gap-filled daily series built
# by the decomposition cell. The membership checks come first so the series
# is only touched when it exists.
if (
    target_pollutant_decomp not in df_daily_tsa.columns
    or 'ts_decomp' not in locals()
    or ts_decomp.empty
):
    print(f"'{target_pollutant_decomp}' or its time series 'ts_decomp' not available for ADF test.")
else:
    ts_adf = ts_decomp.dropna()
    if ts_adf.empty:
        print(f"Time series for {target_pollutant_decomp} is empty after dropping NaNs for ADF test.")
    else:
        print(f"--- Stationarity Analysis for {target_pollutant_decomp} ---")
        result_adf = adfuller(ts_adf)
        # Fields read from the result tuple: [0] test statistic, [1] p-value,
        # [4] mapping of significance level -> critical value
        adf_statistic = result_adf[0]
        adf_p_value = result_adf[1]
        adf_critical_values = result_adf[4]

        print(f'ADF Statistic: {adf_statistic}')
        print(f'p-value: {adf_p_value}')
        print('Critical Values:')
        for level, critical_value in adf_critical_values.items():
            print(f'  {level}: {critical_value}')

        # H0 of the ADF test is non-stationarity (unit root); reject at the 5% level
        if adf_p_value <= 0.05:
            print(f"Conclusion: The time series for {target_pollutant_decomp} is stationary (reject H0).")
        else:
            print(f"Conclusion: The time series for {target_pollutant_decomp} is not stationary (fail to reject H0). Differencing may be needed.")

### 5.5 Autocorrelation and Partial Autocorrelation Analysis

Generating ACF and PACF plots for the CO(GT) time series to help determine ARIMA model parameters.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Side-by-side ACF and PACF (40 lags each) of the series prepared for the ADF test
if 'ts_adf' not in locals() or ts_adf.empty:
    print(f"Time series 'ts_adf' for {target_pollutant_decomp} not available for ACF/PACF plots.")
else:
    acf_pacf_fig = plt.figure(figsize=(12, 6))

    acf_ax = acf_pacf_fig.add_subplot(1, 2, 1)
    plot_acf(ts_adf, ax=acf_ax, lags=40)
    acf_ax.set_title(f'ACF for {target_pollutant_decomp}')

    pacf_ax = acf_pacf_fig.add_subplot(1, 2, 2)
    plot_pacf(ts_adf, ax=pacf_ax, lags=40)
    pacf_ax.set_title(f'PACF for {target_pollutant_decomp}')

    acf_pacf_fig.tight_layout()
    acf_pacf_fig.savefig('Matplotlib_Plots/time_series_analysis/acf_pacf_tsa.png')
    plt.show()
    plt.close(acf_pacf_fig)
    print(f"ACF and PACF plots for {target_pollutant_decomp} generated and saved.")

### 5.6 ARIMA Modeling and Forecasting

Developing an ARIMA model to forecast CO(GT) concentrations, evaluating its performance, and generating future predictions.

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit ARIMA on the first 80% of the daily series, score it on the remaining
# 20%, then refit on the whole series to project 30 days ahead.
if 'ts_adf' not in locals() or ts_adf.empty:
    print(f"Time series 'ts_adf' for {target_pollutant_decomp} not available for ARIMA modeling.")
else:
    # Chronological split: no shuffling, the test block is the tail of the series
    ts_arima = ts_adf
    train_size = int(len(ts_arima) * 0.8)
    train_arima = ts_arima[:train_size]
    test_arima = ts_arima[train_size:]
    print(f"Training data size: {len(train_arima)}, Test data size: {len(test_arima)}")

    # Any failure during fitting, forecasting or plotting is reported in the
    # except branch instead of stopping the notebook
    try:
        order = (1,1,1) # Example order, may need adjustment based on ACF/PACF and stationarity
        print(f"Attempting ARIMA with order={order}")
        model_arima = ARIMA(train_arima, order=order)
        model_fit_arima = model_arima.fit()
        print(model_fit_arima.summary())

        # Forecast exactly as many steps as there are held-out observations
        forecast_steps = len(test_arima)
        forecast_arima = model_fit_arima.forecast(steps=forecast_steps)

        holdout_fig = plt.figure(figsize=(12, 6))
        holdout_ax = holdout_fig.add_subplot(111)
        holdout_ax.plot(train_arima.index, train_arima, label='Training Data')
        holdout_ax.plot(test_arima.index, test_arima, label='Actual Test Data')
        holdout_ax.plot(test_arima.index, forecast_arima, label='ARIMA Forecast', color='red')
        holdout_ax.set_title(f'ARIMA {order} Forecast for {target_pollutant_decomp}')
        holdout_ax.set_xlabel('Date')
        holdout_ax.set_ylabel('Concentration')
        holdout_ax.legend()
        holdout_fig.tight_layout()
        holdout_fig.savefig('Matplotlib_Plots/time_series_analysis/arima_forecast_tsa.png')
        plt.show()
        plt.close(holdout_fig)

        # Hold-out error of the forecast
        mse_arima = mean_squared_error(test_arima, forecast_arima)
        rmse_arima = np.sqrt(mse_arima)
        print(f"ARIMA Model Results for {target_pollutant_decomp}:")
        print(f"  Mean Squared Error (MSE): {mse_arima:.4f}")
        print(f"  Root Mean Squared Error (RMSE): {rmse_arima:.4f}")

        # Refit on the complete series and forecast beyond the last observation
        future_steps_forecast = 30
        full_model_arima = ARIMA(ts_arima, order=order)
        full_model_fit_arima = full_model_arima.fit()
        future_forecast_values = full_model_fit_arima.forecast(steps=future_steps_forecast)
        last_date = ts_arima.index[-1]
        first_future_date = last_date + pd.Timedelta(days=1)
        future_dates = pd.date_range(start=first_future_date, periods=future_steps_forecast, freq='D')

        future_fig = plt.figure(figsize=(12, 6))
        future_ax = future_fig.add_subplot(111)
        future_ax.plot(ts_arima.index[-90:], ts_arima.iloc[-90:], label='Historical Data (Last 90 days)')
        future_ax.plot(future_dates, future_forecast_values, label=f'{future_steps_forecast}-Day Future Forecast', color='red')
        future_ax.set_title(f'{future_steps_forecast}-Day Future Forecast for {target_pollutant_decomp} (ARIMA {order})')
        future_ax.set_xlabel('Date')
        future_ax.set_ylabel('Concentration')
        future_ax.legend()
        future_fig.tight_layout()
        future_fig.savefig('Matplotlib_Plots/time_series_analysis/future_forecast_tsa.png')
        plt.show()
        plt.close(future_fig)
        print(f"{future_steps_forecast}-day future forecast generated and saved.")

    except Exception as e:
        print(f"Error during ARIMA modeling for {target_pollutant_decomp}: {e}")

## Phase 5 Summary: Time Series Analysis

In this phase, we conducted a comprehensive time series analysis of the air quality data, focusing primarily on CO(GT) concentrations as an example pollutant. The analysis began with visualizing temporal patterns, where daily, monthly, hourly, and weekly trends were plotted. These visualizations helped in understanding the cyclical nature and overall trends in pollutant levels over different time scales.

Subsequently, time series decomposition was performed on the daily CO(GT) data. This allowed us to separate the time series into its constituent components: trend, seasonality, and residuals, providing deeper insights into the underlying structure of the data. The trend component showed the long-term direction of CO(GT) levels, while the seasonal component highlighted recurring patterns, and residuals represented the random noise.

Stationarity is a key assumption for many time series models. Therefore, the Augmented Dickey-Fuller (ADF) test was employed to check the stationarity of the CO(GT) time series. The results of this test informed whether differencing would be necessary for subsequent modeling, i.e. the choice of the differencing order d. Following the stationarity assessment, Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots were generated. These plots are instrumental in identifying the appropriate autoregressive and moving-average orders (p and q) for an ARIMA model.

Finally, an ARIMA model was developed and fitted to the CO(GT) time series. The model's performance was evaluated by forecasting values for a test period and comparing them against actual observations, using metrics like Mean Squared Error (MSE) and Root Mean Squared Error (RMSE). Additionally, the fitted ARIMA model was used to generate a 30-day forecast beyond the observed data period, providing a projection of future CO(GT) concentrations.

Overall, this phase provided valuable insights into the temporal dynamics of air pollution and demonstrated the application of time series modeling techniques for analysis and forecasting. The generated plots and model results are saved in the 'Matplotlib_Plots/time_series_analysis/' directory.

# Phase 6: Advanced Modeling - Classification

This phase focuses on advanced classification modeling techniques for air quality data analysis. We'll implement various classification models to predict pollution levels based on environmental and temporal features. These techniques provide deeper insights into pollution patterns and enable predictive capabilities for air quality management.

## 6.1 Classification Modeling Setup

In this section, we'll set up the environment for classification modeling and prepare the data.

In [None]:
print("=== CLASSIFICATION MODELING ===")

# Import necessary libraries for classification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os

# Every figure and CSV produced in this phase is written to the 'classification'
# sub-directory. Neither plt.savefig() nor DataFrame.to_csv() creates missing
# folders, so make sure it exists up front (exist_ok=True keeps the cell re-runnable).
os.makedirs('Matplotlib_Plots/modelling_analysis_results/classification', exist_ok=True)

### 6.1.1 Loading Data with Engineered Features

Loading the dataset with engineered features for more effective modeling.

In [None]:
# Read the modelling dataset: prefer the feature-engineered CSV and fall back to the
# preprocessed CSV when the engineered file is missing.
print("Loading data with engineered features...")
engineered_csv = 'Matplotlib_Plots/feature_engineering/data_with_engineered_features.csv'
preprocessed_csv = 'Matplotlib_Plots/preprocessing/preprocessed_data.csv'

try:
    df_model = pd.read_csv(engineered_csv, index_col=0, parse_dates=True)
except FileNotFoundError:
    print("Feature-engineered data file not found. Using preprocessed data instead.")
    df_model = pd.read_csv(preprocessed_csv, index_col=0, parse_dates=True)
else:
    print("Successfully loaded data with engineered features.")

n_rows, n_cols = df_model.shape
print(f"Loaded dataset with {n_rows} rows and {n_cols} columns")

# Keep only complete rows; df_model itself is left untouched
df_model_clean = df_model.dropna()
print(f"Dataset after dropping NaN values: {df_model_clean.shape[0]} rows")
df_model_clean.head()

### 6.1.2 Creating Binary Target for Classification

Defining a binary target variable for high pollution events based on CO(GT) concentration.

In [None]:
# Define target for classification (high pollution vs. normal)
target_regression = 'CO(GT)'
if target_regression in df_model_clean.columns:
    # Using the 75th percentile of CO(GT) as threshold for high pollution.
    # NOTE(review): the percentile is computed over the whole cleaned dataset, i.e.
    # before the train/test split performed in the next section.
    threshold = df_model_clean[target_regression].quantile(0.75)
    print(f"Classification threshold ({target_regression} 75th percentile): {threshold:.4f}")

    # Build the label with .assign() so a new frame is produced instead of writing a
    # column into the result of dropna() (a chained-assignment hazard that the global
    # warnings filter would otherwise hide). Re-running the cell gives the same frame.
    df_model_clean = df_model_clean.assign(
        high_pollution=(df_model_clean[target_regression] > threshold).astype(int)
    )
    print(f"Class distribution: {df_model_clean['high_pollution'].value_counts(normalize=True)}")

    # savefig() does not create missing folders, so make sure the target exists
    os.makedirs('Matplotlib_Plots/modelling_analysis_results/classification', exist_ok=True)

    # Visualize the threshold and class distribution
    plt.figure(figsize=(12, 6))
    plt.hist(df_model_clean[target_regression], bins=50, alpha=0.7)
    plt.axvline(x=threshold, color='red', linestyle='--', label=f'Threshold (75th percentile): {threshold:.4f}')
    plt.title(f'Distribution of {target_regression} with High Pollution Threshold')
    plt.xlabel(f'{target_regression} Concentration')
    plt.ylabel('Frequency')
    plt.legend()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/high_pollution_threshold.png')
    plt.show()
    plt.close()
else:
    print(f"Target variable {target_regression} not found in dataset. Cannot proceed with classification.")

### 6.1.3 Feature Selection and Data Splitting

Selecting relevant features and splitting the data into training and testing sets.

In [None]:
if 'high_pollution' in df_model_clean.columns:
    # Define target and features for classification
    target_classification = 'high_pollution'

    # Exclude other ground truth pollutants and the target from features.
    # Only float64/int64 columns are kept as model inputs.
    # NOTE(review): any engineered column derived from CO(GT) would still pass this
    # filter - confirm against the feature-engineering phase that none leaks the target.
    exclude_cols = ['NOx(GT)', 'NO2(GT)', 'C6H6(GT)', target_regression, target_classification]
    feature_cols = [col for col in df_model_clean.columns if col not in exclude_cols
                    and df_model_clean[col].dtype in ['float64', 'int64']]

    # Print feature information. Show at most five names and only mention a remainder
    # when there is one (the previous message printed a negative count for < 5 features).
    print(f"Number of features: {len(feature_cols)}")
    feature_preview = feature_cols[:5]
    remaining_features = len(feature_cols) - len(feature_preview)
    if remaining_features > 0:
        print(f"Features: {feature_preview}... (and {remaining_features} more)")
    else:
        print(f"Features: {feature_preview}")

    # Split the data: 80/20, stratified on the label so both parts keep the class ratio.
    # NOTE(review): this is a shuffled split of time-indexed rows - confirm that is intended.
    X = df_model_clean[feature_cols]
    y = df_model_clean[target_classification]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"Training set: {X_train.shape[0]} samples, Test set: {X_test.shape[0]} samples")
    print(f"Class balance in training set: {pd.Series(y_train).value_counts(normalize=True)}")

    # Scale the features for models that require it (scaler is fitted on the training rows only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
else:
    print("Target variable 'high_pollution' not found. Cannot proceed with classification.")

## 6.2 Logistic Regression Model

Implementing a logistic regression model for binary classification of high pollution events.

In [None]:
if 'X_train' in locals() and 'y_train' in locals():
    print("1. Logistic Regression")

    # Scaling lives inside the pipeline, so the scaler is fitted on X_train only
    lr_clf_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)),
    ])
    lr_clf_pipeline.fit(X_train, y_train)

    # Hard labels feed the report; positive-class probabilities feed the ROC curve
    y_pred_lr_clf = lr_clf_pipeline.predict(X_test)
    y_prob_lr_clf = lr_clf_pipeline.predict_proba(X_test)[:, 1]

    # Headline metrics on the held-out test set
    accuracy_lr = accuracy_score(y_test, y_pred_lr_clf)
    print(f"Logistic Regression Results:")
    print(f"  Accuracy: {accuracy_lr:.4f}")
    print(classification_report(y_test, y_pred_lr_clf))

    # Confusion matrix heatmap (y-axis = actual class, x-axis = predicted class)
    cm_lr = confusion_matrix(y_test, y_pred_lr_clf)
    class_names = ['Normal', 'High Pollution']
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_title('Logistic Regression: Confusion Matrix')
    cm_fig.tight_layout()
    cm_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/lr_confusion_matrix.png')
    plt.show()
    plt.close(cm_fig)

    # ROC curve and its area; the dashed diagonal is the chance line
    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr_clf)
    roc_auc_lr = auc(fpr_lr, tpr_lr)

    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {roc_auc_lr:.4f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Logistic Regression: ROC Curve')
    roc_ax.legend(loc='lower right')
    roc_fig.tight_layout()
    roc_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/lr_roc_curve.png')
    plt.show()
    plt.close(roc_fig)
else:
    print("Training data not available. Cannot proceed with Logistic Regression.")

## 6.3 Decision Tree Classifier

Implementing a Decision Tree classifier with hyperparameter tuning for improved performance.

In [None]:
if 'X_train' in locals() and 'y_train' in locals():
    print("2. Decision Tree Classifier")

    # Single-step pipeline (no scaler); the step name 'classifier' is what the
    # 'classifier__*' keys of the parameter grid refer to
    dt_clf_pipeline = Pipeline(steps=[
        ('classifier', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ])

    # Hyperparameter search space
    param_grid = {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
    }

    # 3-fold cross-validated grid search, scored on F1, using all available cores
    grid_search = GridSearchCV(
        estimator=dt_clf_pipeline,
        param_grid=param_grid,
        scoring='f1',
        cv=3,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV F1 Score: {grid_search.best_score_:.4f}")

    # best_estimator_ is the winning pipeline refitted by GridSearchCV
    dt_clf_best = grid_search.best_estimator_

    # Hard labels feed the report; positive-class probabilities feed the ROC curve
    y_pred_dt_clf = dt_clf_best.predict(X_test)
    y_prob_dt_clf = dt_clf_best.predict_proba(X_test)[:, 1]

    # Headline metrics on the held-out test set
    accuracy_dt = accuracy_score(y_test, y_pred_dt_clf)
    print(f"Decision Tree Classifier Results:")
    print(f"  Accuracy: {accuracy_dt:.4f}")
    print(classification_report(y_test, y_pred_dt_clf))

    # Confusion matrix heatmap (y-axis = actual class, x-axis = predicted class)
    cm_dt = confusion_matrix(y_test, y_pred_dt_clf)
    class_names = ['Normal', 'High Pollution']
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_title('Decision Tree: Confusion Matrix')
    cm_fig.tight_layout()
    cm_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/dt_confusion_matrix.png')
    plt.show()
    plt.close(cm_fig)

    # ROC curve and its area; the dashed diagonal is the chance line
    fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt_clf)
    roc_auc_dt = auc(fpr_dt, tpr_dt)

    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr_dt, tpr_dt, label=f'Decision Tree (AUC = {roc_auc_dt:.4f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Decision Tree: ROC Curve')
    roc_ax.legend(loc='lower right')
    roc_fig.tight_layout()
    roc_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/dt_roc_curve.png')
    plt.show()
    plt.close(roc_fig)

    # Feature importance: rank the inputs by the fitted tree's importances, plot the top 15
    dt_estimator = dt_clf_best.named_steps['classifier']
    if hasattr(dt_estimator, 'feature_importances_'):
        dt_features = dt_estimator.feature_importances_
        feature_importance = pd.DataFrame(
            {'Feature': feature_cols, 'Importance': dt_features}
        ).sort_values('Importance', ascending=False)

        imp_fig, imp_ax = plt.subplots(figsize=(12, 8))
        sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15), ax=imp_ax)
        imp_ax.set_title('Decision Tree: Top 15 Feature Importance')
        imp_fig.tight_layout()
        imp_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/dt_feature_importance.png')
        plt.show()
        plt.close(imp_fig)
else:
    print("Training data not available. Cannot proceed with Decision Tree Classifier.")

## 6.4 Random Forest Classifier

Implementing a Random Forest classifier with hyperparameter tuning for improved performance.

In [None]:
if 'X_train' in locals() and 'y_train' in locals():
    print("3. Random Forest Classifier")

    # Scaler + forest in one pipeline; the step name 'classifier' is what the
    # 'classifier__*' keys of the parameter grid refer to
    rf_clf_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ])

    # Hyperparameter search space
    param_grid = {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [None, 10, 20],
    }

    # 3-fold cross-validated grid search, scored on F1, using all available cores
    grid_search = GridSearchCV(
        estimator=rf_clf_pipeline,
        param_grid=param_grid,
        scoring='f1',
        cv=3,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV F1 Score: {grid_search.best_score_:.4f}")

    # best_estimator_ is the winning pipeline refitted by GridSearchCV
    rf_clf_best = grid_search.best_estimator_

    # Hard labels feed the report; positive-class probabilities feed the ROC curve
    y_pred_rf_clf = rf_clf_best.predict(X_test)
    y_prob_rf_clf = rf_clf_best.predict_proba(X_test)[:, 1]

    # Headline metrics on the held-out test set
    accuracy_rf = accuracy_score(y_test, y_pred_rf_clf)
    print(f"Random Forest Classifier Results:")
    print(f"  Accuracy: {accuracy_rf:.4f}")
    print(classification_report(y_test, y_pred_rf_clf))

    # Confusion matrix heatmap (y-axis = actual class, x-axis = predicted class)
    cm_rf = confusion_matrix(y_test, y_pred_rf_clf)
    class_names = ['Normal', 'High Pollution']
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_title('Random Forest: Confusion Matrix')
    cm_fig.tight_layout()
    cm_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/rf_confusion_matrix.png')
    plt.show()
    plt.close(cm_fig)

    # ROC curve and its area; the dashed diagonal is the chance line
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf_clf)
    roc_auc_rf = auc(fpr_rf, tpr_rf)

    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {roc_auc_rf:.4f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Random Forest: ROC Curve')
    roc_ax.legend(loc='lower right')
    roc_fig.tight_layout()
    roc_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/rf_roc_curve.png')
    plt.show()
    plt.close(roc_fig)

    # Feature importance: rank the inputs by the fitted forest's importances, plot the top 15.
    # The resulting `feature_importance` table is also read by the KNN cell to pick two features.
    rf_estimator = rf_clf_best.named_steps['classifier']
    if hasattr(rf_estimator, 'feature_importances_'):
        rf_features = rf_estimator.feature_importances_
        feature_importance = pd.DataFrame(
            {'Feature': feature_cols, 'Importance': rf_features}
        ).sort_values('Importance', ascending=False)

        imp_fig, imp_ax = plt.subplots(figsize=(12, 8))
        sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15), ax=imp_ax)
        imp_ax.set_title('Random Forest: Top 15 Feature Importance')
        imp_fig.tight_layout()
        imp_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/rf_feature_importance.png')
        plt.show()
        plt.close(imp_fig)
else:
    print("Training data not available. Cannot proceed with Random Forest Classifier.")

## 6.5 K-Nearest Neighbors (KNN) Classifier

Implementing a K-Nearest Neighbors classifier with hyperparameter tuning for optimal performance.

In [None]:
if 'X_train' in locals() and 'y_train' in locals():
    print("4. K-Nearest Neighbors (KNN) Classifier")

    # Scaler + KNN in one pipeline; the step name 'classifier' is what the
    # 'classifier__*' keys of the parameter grid refer to
    knn_clf_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier()),
    ])

    # Hyperparameter search space
    param_grid = {
        'classifier__n_neighbors': [3, 5, 7, 9, 11],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__p': [1, 2],  # p=1 for Manhattan, p=2 for Euclidean
    }

    # 3-fold cross-validated grid search, scored on F1, using all available cores
    grid_search = GridSearchCV(
        estimator=knn_clf_pipeline,
        param_grid=param_grid,
        scoring='f1',
        cv=3,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV F1 Score: {grid_search.best_score_:.4f}")

    # best_estimator_ is the winning pipeline refitted by GridSearchCV
    knn_clf_best = grid_search.best_estimator_

    # Hard labels feed the report; positive-class probabilities feed the ROC curve
    y_pred_knn_clf = knn_clf_best.predict(X_test)
    y_prob_knn_clf = knn_clf_best.predict_proba(X_test)[:, 1]

    # Headline metrics on the held-out test set
    accuracy_knn = accuracy_score(y_test, y_pred_knn_clf)
    print(f"KNN Classifier Results:")
    print(f"  Accuracy: {accuracy_knn:.4f}")
    print(classification_report(y_test, y_pred_knn_clf))

    # Confusion matrix heatmap (y-axis = actual class, x-axis = predicted class)
    cm_knn = confusion_matrix(y_test, y_pred_knn_clf)
    class_names = ['Normal', 'High Pollution']
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_knn, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_title('KNN: Confusion Matrix')
    cm_fig.tight_layout()
    cm_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/knn_confusion_matrix.png')
    plt.show()
    plt.close(cm_fig)

    # ROC curve and its area; the dashed diagonal is the chance line
    fpr_knn, tpr_knn, _ = roc_curve(y_test, y_prob_knn_clf)
    roc_auc_knn = auc(fpr_knn, tpr_knn)

    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr_knn, tpr_knn, label=f'KNN (AUC = {roc_auc_knn:.4f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('KNN: ROC Curve')
    roc_ax.legend(loc='lower right')
    roc_fig.tight_layout()
    roc_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/knn_roc_curve.png')
    plt.show()
    plt.close(roc_fig)

    # Decision boundary, drawn in the plane of two features
    if len(feature_cols) >= 2:
        # Use the two top rows of whatever `feature_importance` table an earlier cell
        # left behind; otherwise fall back to the first two feature columns
        have_ranking = 'feature_importance' in locals() and not feature_importance.empty
        top_features = (feature_importance.head(2)['Feature'].values
                        if have_ranking else feature_cols[:2])

        # Two-column views of the train/test sets
        X_train_2d = X_train[top_features]
        X_test_2d = X_test[top_features]

        # Standardise with a scaler fitted on the 2-D training data only
        scaler_2d = StandardScaler()
        X_train_2d_scaled = scaler_2d.fit_transform(X_train_2d)
        X_test_2d_scaled = scaler_2d.transform(X_test_2d)

        # Refit a KNN on the two features, reusing the hyperparameters chosen above
        best_params = grid_search.best_params_
        knn_2d = KNeighborsClassifier(
            n_neighbors=best_params['classifier__n_neighbors'],
            weights=best_params['classifier__weights'],
            p=best_params['classifier__p'],
        )
        knn_2d.fit(X_train_2d_scaled, y_train)

        # Regular grid covering the scaled training data plus a margin of 1 on each side
        h = 0.02  # step size in the mesh
        x_min, x_max = X_train_2d_scaled[:, 0].min() - 1, X_train_2d_scaled[:, 0].max() + 1
        y_min, y_max = X_train_2d_scaled[:, 1].min() - 1, X_train_2d_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

        # Classify every grid node, then fold the labels back into the grid shape
        mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
        Z = knn_2d.predict(mesh_points).reshape(xx.shape)

        # Filled regions show the predicted class; points are the training samples
        boundary_fig, boundary_ax = plt.subplots(figsize=(10, 8))
        boundary_ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
        scatter = boundary_ax.scatter(X_train_2d_scaled[:, 0], X_train_2d_scaled[:, 1], c=y_train,
                                      edgecolor='k', s=50, cmap='coolwarm', alpha=0.7)
        boundary_ax.set_xlabel(f'Scaled {top_features[0]}')
        boundary_ax.set_ylabel(f'Scaled {top_features[1]}')
        boundary_ax.set_title(f'KNN Decision Boundary (n_neighbors={best_params["classifier__n_neighbors"]})')
        boundary_fig.colorbar(scatter, ax=boundary_ax, label='Class')
        boundary_fig.tight_layout()
        boundary_fig.savefig('Matplotlib_Plots/modelling_analysis_results/classification/knn_decision_boundary.png')
        plt.show()
        plt.close(boundary_fig)
else:
    print("Training data not available. Cannot proceed with KNN Classifier.")

## 6.6 Support Vector Machine (SVM) Classifier

Implementing a Support Vector Machine classifier with hyperparameter tuning for optimal performance.

In [None]:
if 'X_train' in locals() and 'y_train' in locals():
    print("5. Support Vector Machine (SVM) Classifier")

    # Create pipeline with standardization. probability=True is required for the
    # predict_proba() call that feeds the ROC curve below.
    svm_clf_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC(random_state=42, probability=True, class_weight='balanced'))
    ])

    # Define parameter grid for tuning
    param_grid = {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto']
    }

    # Grid search with 3-fold cross-validation, scored on F1
    grid_search = GridSearchCV(svm_clf_pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best parameters and score
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV F1 Score: {grid_search.best_score_:.4f}")

    # Use the best model (the winning pipeline refitted by GridSearchCV)
    svm_clf_best = grid_search.best_estimator_

    # Make predictions: hard labels plus positive-class probabilities
    y_pred_svm_clf = svm_clf_best.predict(X_test)
    y_prob_svm_clf = svm_clf_best.predict_proba(X_test)[:, 1]

    # Evaluate
    accuracy_svm = accuracy_score(y_test, y_pred_svm_clf)
    print(f"SVM Classifier Results:")
    print(f"  Accuracy: {accuracy_svm:.4f}")
    print(classification_report(y_test, y_pred_svm_clf))

    # Confusion Matrix
    cm_svm = confusion_matrix(y_test, y_pred_svm_clf)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'High Pollution'],
                yticklabels=['Normal', 'High Pollution'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('SVM: Confusion Matrix')
    plt.tight_layout()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/svm_confusion_matrix.png')
    plt.show()
    plt.close()

    # ROC Curve
    fpr_svm, tpr_svm, _ = roc_curve(y_test, y_prob_svm_clf)
    roc_auc_svm = auc(fpr_svm, tpr_svm)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr_svm, tpr_svm, label=f'SVM (AUC = {roc_auc_svm:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('SVM: ROC Curve')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/svm_roc_curve.png')
    plt.show()
    plt.close()

    # Coefficient ranking and 2-D decision boundary. Both rely on coef_, so they are
    # only produced when the grid search selected the linear kernel.
    if len(feature_cols) >= 2 and grid_search.best_params_['classifier__kernel'] == 'linear':
        # For linear kernel, we can visualize feature coefficients
        if hasattr(svm_clf_best.named_steps['classifier'], 'coef_'):
            svm_coef = svm_clf_best.named_steps['classifier'].coef_[0]
            # NOTE: this reuses the notebook-level name `feature_importance`, but with a
            # 'Coefficient' column instead of the tree models' 'Importance' column.
            feature_importance = pd.DataFrame({'Feature': feature_cols, 'Coefficient': np.abs(svm_coef)})
            feature_importance = feature_importance.sort_values('Coefficient', ascending=False)

            # Plot feature coefficients
            plt.figure(figsize=(12, 8))
            sns.barplot(x='Coefficient', y='Feature', data=feature_importance.head(15))
            plt.title('SVM: Top 15 Feature Coefficients (Absolute Value)')
            plt.tight_layout()
            plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/svm_feature_coefficients.png')
            plt.show()
            plt.close()

            # Select two important features for visualization
            top_features = feature_importance.head(2)['Feature'].values

            # Train an SVM model on just these two features
            X_train_2d = X_train[top_features]
            X_test_2d = X_test[top_features]

            # Scale the data (scaler fitted on the 2-D training data only)
            scaler_2d = StandardScaler()
            X_train_2d_scaled = scaler_2d.fit_transform(X_train_2d)
            X_test_2d_scaled = scaler_2d.transform(X_test_2d)

            # Train SVM with best parameters. class_weight='balanced' is passed as well
            # so the 2-D refit uses the same configuration as the tuned pipeline;
            # without it the plotted boundary would come from a differently weighted model.
            best_params = grid_search.best_params_
            svm_2d = SVC(
                C=best_params['classifier__C'],
                kernel=best_params['classifier__kernel'],
                gamma=best_params['classifier__gamma'],
                class_weight='balanced',
                probability=True,
                random_state=42
            )
            svm_2d.fit(X_train_2d_scaled, y_train)

            # Create a mesh grid for decision boundary visualization
            h = 0.02  # step size in the mesh
            x_min, x_max = X_train_2d_scaled[:, 0].min() - 1, X_train_2d_scaled[:, 0].max() + 1
            y_min, y_max = X_train_2d_scaled[:, 1].min() - 1, X_train_2d_scaled[:, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

            # Predict class for each point in the mesh
            Z = svm_2d.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)

            # Plot the decision boundary
            plt.figure(figsize=(10, 8))
            plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')

            # Plot the training points
            scatter = plt.scatter(X_train_2d_scaled[:, 0], X_train_2d_scaled[:, 1], c=y_train,
                       edgecolor='k', s=50, cmap='coolwarm', alpha=0.7)
            plt.xlabel(f'Scaled {top_features[0]}')
            plt.ylabel(f'Scaled {top_features[1]}')
            plt.title(f'SVM Decision Boundary (C={best_params["classifier__C"]}, kernel={best_params["classifier__kernel"]})')
            plt.colorbar(scatter, label='Class')
            plt.tight_layout()
            plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/svm_decision_boundary.png')
            plt.show()
            plt.close()
    else:
        # Say so explicitly instead of silently producing no plots
        print("Skipping SVM coefficient and decision-boundary plots: "
              "they require a linear kernel and at least two features.")
else:
    print("Training data not available. Cannot proceed with SVM Classifier.")

## 6.7 Recurrent Neural Network (RNN) Classifier

Implementing a Recurrent Neural Network (LSTM) classifier for time series classification of pollution events.

In [None]:
if 'X_train' in locals() and 'y_train' in locals():
    print("6. Recurrent Neural Network (RNN) Classifier")

    # Fix the random seeds so that weight initialisation, dropout and batch shuffling
    # are repeatable when the cell (or the whole notebook) is re-run. Without this the
    # RNN row of the model comparison changes from run to run.
    if hasattr(tf.keras.utils, 'set_random_seed'):
        tf.keras.utils.set_random_seed(42)  # seeds Python, NumPy and TensorFlow together
    else:
        tf.random.set_seed(42)  # fallback for TensorFlow releases without the helper

    # For RNN, we need to reshape the data to have a time dimension
    # We'll use a simple approach: reshape each sample as a sequence of features
    # This is a simplified approach for demonstration purposes.
    # NOTE: the "time" axis here is the feature index of one row, not consecutive
    # observations, so the LSTM does not see the temporal order of the measurements.

    # Scale the data (scaler fitted on the training rows only)
    scaler_rnn = MinMaxScaler()  # MinMaxScaler works better for neural networks
    X_train_scaled_rnn = scaler_rnn.fit_transform(X_train)
    X_test_scaled_rnn = scaler_rnn.transform(X_test)

    # Reshape for RNN: [samples, time steps, features]
    # We'll treat each feature as a time step for simplicity
    n_features = X_train.shape[1]
    X_train_rnn = X_train_scaled_rnn.reshape(X_train_scaled_rnn.shape[0], n_features, 1)
    X_test_rnn = X_test_scaled_rnn.reshape(X_test_scaled_rnn.shape[0], n_features, 1)

    print(f"RNN input shape: {X_train_rnn.shape} (samples, time steps, features)")

    # Build the RNN model: two stacked LSTM layers with dropout, a small dense layer,
    # and a sigmoid unit that outputs the probability of the high-pollution class
    rnn_model = Sequential([
        LSTM(64, input_shape=(n_features, 1), return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Print model summary
    rnn_model.summary()

    # Early stopping to prevent overfitting: stop after 5 epochs without a better
    # validation loss and restore the weights of the best epoch
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model; 20% of the training data is held out for validation
    history = rnn_model.fit(
        X_train_rnn, y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )

    # Plot training history (loss on the left, accuracy on the right)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('RNN: Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('RNN: Training and Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/rnn_training_history.png')
    plt.show()
    plt.close()

    # Evaluate on test set
    loss, accuracy = rnn_model.evaluate(X_test_rnn, y_test, verbose=0)
    print(f"RNN Test Loss: {loss:.4f}")
    print(f"RNN Test Accuracy: {accuracy:.4f}")

    # Make predictions: the sigmoid output has shape (n_samples, 1); threshold at 0.5
    y_pred_proba_rnn = rnn_model.predict(X_test_rnn, verbose=0)
    y_pred_rnn = (y_pred_proba_rnn > 0.5).astype(int).flatten()

    # Evaluate
    accuracy_rnn = accuracy_score(y_test, y_pred_rnn)
    print(f"RNN Classifier Results:")
    print(f"  Accuracy: {accuracy_rnn:.4f}")
    print(classification_report(y_test, y_pred_rnn))

    # Confusion Matrix
    cm_rnn = confusion_matrix(y_test, y_pred_rnn)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_rnn, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'High Pollution'],
                yticklabels=['Normal', 'High Pollution'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('RNN: Confusion Matrix')
    plt.tight_layout()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/rnn_confusion_matrix.png')
    plt.show()
    plt.close()

    # ROC Curve: pass the scores as a flat 1-D vector, matching the other models
    fpr_rnn, tpr_rnn, _ = roc_curve(y_test, y_pred_proba_rnn.ravel())
    roc_auc_rnn = auc(fpr_rnn, tpr_rnn)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr_rnn, tpr_rnn, label=f'RNN (AUC = {roc_auc_rnn:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('RNN: ROC Curve')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/rnn_roc_curve.png')
    plt.show()
    plt.close()
else:
    print("Training data not available. Cannot proceed with RNN Classifier.")

## 6.8 Model Comparison

Comparing the performance of all classification models to identify the best approach.

In [None]:
print("\nClassification Model Comparison")

# Display name and variable-name suffix of each model trained in Sections 6.2-6.7.
# Every model cell stores its results as accuracy_<suffix>, roc_auc_<suffix>,
# fpr_<suffix> and tpr_<suffix>.
model_suffixes = [
    ('Logistic Regression', 'lr'),
    ('Decision Tree', 'dt'),
    ('Random Forest', 'rf'),
    ('KNN', 'knn'),
    ('SVM', 'svm'),
    ('RNN', 'rnn'),
]

# Collect results only for the models whose cells actually ran. The model cells are
# guarded and may have been skipped, so referencing their variables directly would
# raise NameError here.
notebook_vars = globals()
models = []
accuracy_values = []
auc_values = []
roc_curves = []  # (model name, AUC, fpr, tpr) for the combined ROC plot
for model_name, suffix in model_suffixes:
    result_names = [f'{prefix}_{suffix}' for prefix in ('accuracy', 'roc_auc', 'fpr', 'tpr')]
    if all(name in notebook_vars for name in result_names):
        model_accuracy, model_auc, model_fpr, model_tpr = (notebook_vars[name] for name in result_names)
        models.append(model_name)
        accuracy_values.append(model_accuracy)
        auc_values.append(model_auc)
        roc_curves.append((model_name, model_auc, model_fpr, model_tpr))
    else:
        print(f"Skipping {model_name}: results not available.")

if models:
    # Create comparison dataframe
    comparison_df = pd.DataFrame({
        'Model': models,
        'Accuracy': accuracy_values,
        'AUC': auc_values
    })

    print(comparison_df)

    # Save comparison to CSV
    comparison_df.to_csv('Matplotlib_Plots/modelling_analysis_results/classification/model_comparison.csv', index=False)

    # Plot comparison: one bar chart per metric, side by side
    plt.figure(figsize=(12, 6))
    for panel_idx, metric in enumerate(['Accuracy', 'AUC'], start=1):
        plt.subplot(1, 2, panel_idx)
        sns.barplot(x='Model', y=metric, data=comparison_df)
        plt.title(f'{metric} Comparison')
        # Keep the zoomed-in 0.7 floor when every score is above it, but lower the
        # axis when a model scores less so that its bar is not cut off entirely
        metric_floor = min(0.7, max(0.0, comparison_df[metric].min() - 0.05))
        plt.ylim(metric_floor, 1.0)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/model_comparison.png')
    plt.show()
    plt.close()

    # Combined ROC curves; the dashed diagonal is the chance line
    plt.figure(figsize=(10, 8))
    for model_name, model_auc, model_fpr, model_tpr in roc_curves:
        plt.plot(model_fpr, model_tpr, label=f'{model_name} (AUC = {model_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for All Classification Models')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig('Matplotlib_Plots/modelling_analysis_results/classification/combined_roc_curves.png')
    plt.show()
    plt.close()

    # Identify the best model (highest AUC among the models that ran)
    best_model_idx = np.argmax(auc_values)
    best_model = models[best_model_idx]
    best_auc = auc_values[best_model_idx]
    best_accuracy = accuracy_values[best_model_idx]

    print(f"\nBest performing model based on AUC: {best_model}")
    print(f"  AUC: {best_auc:.4f}")
    print(f"  Accuracy: {best_accuracy:.4f}")
else:
    print("No classification results available. Cannot compare models.")

## Phase 6 Summary: Advanced Modeling - Classification

In Phase 6, we applied advanced classification modeling techniques to predict high pollution events based on environmental and temporal features. We defined a binary target variable by setting a threshold at the 75th percentile of CO(GT) concentrations, effectively distinguishing between normal and high pollution conditions.

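Six different classification models were implemented and compared: Logistic Regression, Decision Tree, Random Forest, K-Nearest Neighbors (KNN), Support Vector Machine (SVM), and Recurrent Neural Network (RNN). The Decision Tree, Random Forest, KNN, and SVM models were tuned using grid search with 3-fold cross-validation, selecting hyperparameters by F1 score. Logistic Regression was trained with fixed hyperparameters, and the RNN was trained with a fixed architecture using early stopping on a validation split.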

The models were evaluated using multiple metrics including accuracy, precision, recall, F1-score, and AUC-ROC. Confusion matrices were generated to visualize true positives, false positives, true negatives, and false negatives for each model. ROC curves illustrated the trade-off between sensitivity and specificity across different classification thresholds.

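For tree-based models (Decision Tree and Random Forest), feature importance analysis revealed which variables were most influential in predicting high pollution events. For KNN, a decision boundary was visualized on two selected features to provide insight into how the model classifies the data in feature space; for SVM, the feature-coefficient and decision-boundary plots are produced only when the grid search selects a linear kernel. The RNN (LSTM) model illustrates a deep learning approach to the same task; note that in this simplified setup each sample's features are fed to the network as a sequence, so it does not model temporal dependencies between consecutive observations.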

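The model comparison in Section 6.8 selects the best-performing model by AUC and prints its name, AUC, and accuracy; the full comparison table is saved to `model_comparison.csv` in the classification results directory. **TODO:** after running the notebook, record the best model and its AUC and accuracy here, together with insights about model performance and characteristics.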

These classification models provide valuable predictive capabilities for air quality management, enabling the forecasting of high pollution events based on measurable environmental and temporal factors. Such predictions can inform public health advisories, traffic management decisions, and other interventions aimed at reducing exposure to harmful air pollutants.