In [None]:
import os
import warnings

import hopsworks
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.seasonal import seasonal_decompose

# Install a blanket "ignore" filter so no warning category is shown in cell output.
warnings.simplefilter('ignore')
# Switch consulted by the data-loading cell before it tries the feature store.
HOPSWORKS_AVAILABLE = True

In [None]:
# CONFIG
# load_dotenv() runs before the API-key lookup so values it loads are visible
# to the environment read below.
load_dotenv()
HOPSWORKS_PROJECT_NAME = "jurjanji_AQI"
# None when the variable is not set in the environment.
HOPSWORKS_API_KEY = os.environ.get("hopsworks_api_key")
FEATURE_GROUP_NAME = 'air_quality_data'
FEATURE_GROUP_VERSION = 1

# Utility: show a pretty dataframe head
def show(df, n=5):
    """Render the first ``n`` rows of ``df`` through ``display``.

    Args:
        df: Object exposing ``head(n)`` (a pandas DataFrame in this notebook).
        n: Number of leading rows to render; defaults to 5.
    """
    preview = df.head(n)
    display(preview)

In [None]:
# 1) LOAD DATA from the Hopsworks Feature Store.
# There is no local-CSV fallback in this notebook, so every failure path raises
# here with a clear message instead of leaving `df` undefined (or stale from a
# previous run) for the cells that follow.
#
# Both the API key and the project name must be set (and non-empty); the project
# name is a constant, so an `or` between the two would never reject a missing key.
if HOPSWORKS_AVAILABLE and HOPSWORKS_API_KEY and HOPSWORKS_PROJECT_NAME:
    print('Attempting to load from Hopsworks Feature Store...')
    project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY, project=HOPSWORKS_PROJECT_NAME)
    fs = project.get_feature_store()
    try:
        fg = fs.get_feature_group(name=FEATURE_GROUP_NAME, version=FEATURE_GROUP_VERSION)
        df = fg.read()
    except Exception as e:
        print('Could not read feature group from Hopsworks:', e)
        # Re-raise with context: nothing downstream can run without `df`.
        raise RuntimeError(
            f"Failed to load feature group '{FEATURE_GROUP_NAME}' "
            f"(version {FEATURE_GROUP_VERSION}) from Hopsworks"
        ) from e
    print('Loaded from Hopsworks:', df.shape)
else:
    print('Hopsworks not available or API key/project not set')
    raise RuntimeError(
        "Cannot load data: set 'hopsworks_api_key' in the environment or .env file "
        "and make sure the Hopsworks project name is configured"
    )

In [None]:
# Validate the timestamp column up front, then parse it and order rows by it.
if 'timestamp_utc' not in df.columns:
    raise ValueError('No timestamp column found. Ensure dataset includes a timestamp column.')

df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'])
# ignore_index=True renumbers the rows 0..n-1 after sorting.
df = df.sort_values('timestamp_utc', ignore_index=True)

print('Data shape:', df.shape)

In [None]:
# Structural overview: info() prints its report directly.
df.info()
# Only the final expression of a cell is rendered automatically, so the
# intermediate summary table has to be passed to display() to show up.
display(df.describe().T.head(10))
# Ten columns with the most missing values (rendered as the cell's result).
df.isna().sum().sort_values(ascending=False).head(10)

In [None]:
# QUICK OVERVIEW
print('Columns:')
print(df.columns.tolist())
print('\nBasic info:')
# info() writes its report itself and returns None; wrapping it in print()
# would add a stray "None" line after the report.
df.info()

# Summary stats
display(df.describe(include='all').T)

In [None]:
# MISSINGNESS ANALYSIS
null_mask = df.isnull()
missing = null_mask.mean().sort_values(ascending=False)
print('Missing fraction per column:')
print(missing.loc[missing > 0])


# Visual missingness (simple): the mask is transposed so each dataframe column
# becomes one heatmap row.
fig, ax = plt.subplots(figsize=(12, 4))
sns.heatmap(null_mask.T, cbar=False, ax=ax)
ax.set_title('Missingness map (columns x rows)')
fig.savefig("missingness_map(columns_x_rows).png")
plt.show()

In [None]:
# DISTRIBUTIONS - numeric cols
# numeric_cols is reused by later cells (correlation, scatter plots, model).
numeric_cols = (
    df.select_dtypes(include=[np.number])
    .columns
    .tolist()
)
print('Numeric columns:', numeric_cols)


# NOTE(review): the histogram grid below is disabled. As written it calls
# tight_layout/show/savefig on every loop iteration, with show() before
# savefig() - review that ordering before re-enabling it.
# plt.figure(figsize=(14, 6))
# for i, col in enumerate(numeric_cols):
#     plt.subplot(2, int(np.ceil(len(numeric_cols)/2)), i+1)
#     sns.histplot(df[col].dropna(), kde=True)
#     plt.title(col)
#     plt.tight_layout()
#     plt.show()
#     plt.savefig("numeric_cols_distribution.png")

In [None]:
# TIME SERIES PLOTS - AQI over time
if 'ow_aqi_index' not in df.columns:
    print('AQI column not found - skip AQI time plots')
else:
    # Raw observations against their timestamps.
    fig, ax = plt.subplots(figsize=(15, 4))
    ax.plot(df['timestamp_utc'], df['ow_aqi_index'], marker='.', linewidth=0.5)
    ax.set_title('AQI over time')
    ax.set_xlabel('timestamp')
    ax.set_ylabel('AQI')
    fig.savefig("aqi_over_time.png")
    plt.show()

    # Rolling averages: mean over a window of 24 consecutive rows.
    fig, ax = plt.subplots(figsize=(15, 4))
    aqi_by_time = df.set_index('timestamp_utc')['ow_aqi_index']
    aqi_by_time.rolling(window=24).mean().plot(ax=ax)
    ax.set_title('AQI - 24-point rolling mean')
    fig.savefig("aqi_rolling_avgs.png")
    plt.show()

In [None]:
# SEASONAL DECOMPOSITION (daily cycle) - requires a regular frequency.
# The series is resampled to hourly bins before decomposing:
#   * resample(...).mean() tolerates duplicate timestamps and readings that are
#     not aligned to a common hourly grid (asfreq raises on duplicate index
#     labels and drops anything off the grid it generates);
#   * the lowercase 'h' alias is used because the uppercase 'H' alias is
#     deprecated in recent pandas releases.
# For data that is already unique and exactly hourly on the hour, the resampled
# values match what asfreq would produce.
try:
    aqi_series = df.set_index('timestamp_utc')['ow_aqi_index']
    ts = aqi_series.resample('h').mean()
    # Hours without any observation become NaN above; fill them linearly.
    ts_interpolated = ts.interpolate()
    decomposition = seasonal_decompose(ts_interpolated, model='additive', period=24)  # daily
    fig = decomposition.plot()
    fig.set_size_inches(12, 8)
    plt.suptitle('Seasonal Decompose (period=24)')
    plt.savefig("seasonal_decomposition.png")
    plt.show()
except Exception as e:
    # Best-effort step: report the problem and let the rest of the notebook run.
    print('Seasonal decomposition failed:', e)

In [None]:
# CORRELATION HEATMAP (pollutants vs AQI)
# Pairwise correlations between all numeric columns, annotated to 2 decimals.
corr = df[numeric_cols].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix')
fig.savefig("correlation_heatmap_vsAQI.png")
plt.show()

In [None]:
# Target label distribution:
# check whether the AQI target values are balanced or spread out.
ax = sns.histplot(data=df, x='ow_aqi_index', kde=True)  # type: ignore
ax.set_title("Distribution of AQI Index")
ax.figure.savefig("aqi_distribution.png")
plt.show()

In [None]:
# Focused scatter plots vs AQI: one stacked panel per numeric feature.
if 'ow_aqi_index' in df.columns:
    # Every numeric column except the target and a column literally named "index".
    pollutants = [c for c in numeric_cols if c != 'ow_aqi_index' and c.lower() not in ['index']]
    if not pollutants:
        # With no panels the figure height below would be 0, which matplotlib rejects.
        print('No numeric feature columns found - skip AQI scatter plots')
    else:
        plt.figure(figsize=(14, 4*len(pollutants)))
        for i, col in enumerate(pollutants, start=1):
            plt.subplot(len(pollutants), 1, i)
            plt.scatter(df[col], df['ow_aqi_index'], alpha=0.3, s=8)
            plt.xlabel(col)
            plt.ylabel('AQI')
            plt.title(f'AQI vs {col}')
        # Lay out once, after all panels exist, instead of on every iteration.
        plt.tight_layout()
        plt.savefig("aqi_vs_pollutants.png")
        plt.show()
else:
    print('AQI column not found - skip AQI scatter plots')

In [None]:
# checking which features have the strongest correlation with the target label
corr = df.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", center=0, annot=False)
plt.title("Feature Correlation Heatmap")
plt.savefig("correlation_heatmap_2.png")
plt.show()


# Correlation of every numeric column with the AQI target, most positive first.
# Computed once and reused for both the CSV export and the on-screen top 10.
# (The target itself is part of `corr`, so it appears in this ranking too.)
aqi_corr = corr['ow_aqi_index'].sort_values(ascending=False)
aqi_corr.to_csv("feature_correlations.csv")

# the top 10 correlated features
# display() is used so the table is rendered regardless of its position in the cell.
display(aqi_corr.head(10))

In [None]:
# identifying trends over time
# Re-parse the timestamp column so this cell does not depend on earlier parsing.
df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'])
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df['timestamp_utc'], df['ow_aqi_index'], marker='o')
ax.set_title("AQI Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("AQI")
plt.xticks(rotation=45)
fig.savefig("aqi_over_time_2.png")
plt.show()

In [None]:
# seeing how the pollutants relate to the AQI
pollutants = ['pm2_5', 'pm10', 'co', 'no', 'no2', 'o3', 'so2', 'nh3']
# savefig does not create missing parent directories, so make sure the output
# folder exists before the first figure is written.
os.makedirs("pollutants_correlation", exist_ok=True)
for col in pollutants:
    # The list above is hard-coded; skip names the loaded data does not contain.
    if col not in df.columns:
        print(f'{col} column not found - skip scatter plot')
        continue
    plt.figure(figsize=(6, 4))
    sns.scatterplot(data=df, x=col, y='ow_aqi_index')
    plt.title(f"{col.upper()} vs AQI")
    plt.savefig(f"pollutants_correlation/{col}_vs_aqi.png")
    plt.show()

In [None]:

# Prepare data for a quick importance check.
# Features: every numeric column except the target and anything with 'lag' in its name.
feature_cols = [c for c in numeric_cols if c != 'ow_aqi_index' and 'lag' not in c]
# Rows without a target value cannot be used for fitting (NaN in y is rejected
# by the estimator), so they are excluded up front.
has_target = df['ow_aqi_index'].notna()
X = df.loc[has_target, feature_cols]
# Missing feature values are imputed with the column median (not dropped).
X = X.fillna(X.median())
y = df.loc[has_target, 'ow_aqi_index']

# Quick RF model (fixed seed for reproducible importances; all cores)
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X, y)

# Plot feature importance: keep the 15 highest-ranked features
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False).head(15)

plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature')
plt.title('Top 15 Feature Importances (Random Forest)')
plt.xlabel('Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

print("\nTop Features:")
print(importance_df)