In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pymongo
from dotenv import load_dotenv

# Read key/value pairs from a .env file (if one is found) into the process environment
load_dotenv()

# MongoDB connection settings
MONGO_URI = os.environ.get('MONGO_URI')  # None when the variable is not defined
COLLECTION_NAME = "processed_data"
DB_NAME = "aqi_db"

# Default plot styling; individual cells may still pass their own figsize
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [2]:
# Connect to MongoDB and fetch data
print("⏳ Fetching data from MongoDB...")

# Use the client as a context manager so its connections are closed as soon as
# the documents have been materialised into a list. Nothing later in the
# notebook needs a live connection, and re-running this cell no longer leaves
# an extra open client behind each time.
with pymongo.MongoClient(MONGO_URI) as client:
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]
    data = list(collection.find())

df = pd.DataFrame(data)

# Convert dates and sort
if not df.empty:
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').reset_index(drop=True)

    # Drop MongoDB's ObjectId column; errors='ignore' makes this a no-op when
    # the column is absent.
    df = df.drop(columns=['_id'], errors='ignore')

    print(f"✅ Loaded {len(df)} records.")
    # `display` is injected into the namespace by IPython/Jupyter. pandas
    # truncates long frames when rendering, so only the first and last few of
    # these 500 rows are actually shown.
    display(df.tail(500))
else:
    # NOTE: the plotting cells below assume `df` has data and will fail if this
    # branch is taken.
    print("❌ No data found. Please run your preprocessing pipeline first.")

⏳ Fetching data from MongoDB...
✅ Loaded 4371 records.


Unnamed: 0,date,aqi,clouds,co,day,day_of_week,hour,humidity,month,no2,...,aqi_lag_1,aqi_lag_6,aqi_lag_24,aqi_roll_mean_24,hour_sin,hour_cos,month_sin,month_cos,wind_sin,wind_cos
3871,2026-01-03 07:00:00,275.403604,15.872482,2195.66,3,5,7,50.397653,1,20.82,...,270.131532,238.974775,316.647748,232.036656,0.965926,-0.258819,0.500000,0.866025,-0.704202,0.710000
3872,2026-01-03 08:00:00,278.376577,15.880502,2248.56,3,5,8,50.385221,1,22.01,...,275.403604,245.713514,314.378378,230.536581,0.866025,-0.500000,0.500000,0.866025,-0.703605,0.710591
3873,2026-01-03 09:00:00,256.039640,15.888523,2028.12,3,5,9,50.372790,1,18.44,...,278.376577,253.116216,299.781982,228.713983,0.707107,-0.707107,0.500000,0.866025,-0.703008,0.711182
3874,2026-01-03 10:00:00,248.131532,15.896543,1990.00,3,5,10,50.360358,1,16.43,...,256.039640,260.528829,275.562162,227.571040,0.500000,-0.866025,0.500000,0.866025,-0.702411,0.711772
3875,2026-01-03 11:00:00,242.869369,15.904564,1965.35,3,5,11,50.347926,1,14.17,...,248.131532,266.108108,249.518919,227.293976,0.258819,-0.965926,0.500000,0.866025,-0.701813,0.712362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4366,2026-01-23 22:00:00,115.256784,19.842607,651.98,23,4,22,44.243958,1,14.67,...,97.455365,59.222747,194.919283,98.126118,-0.500000,0.866025,0.500000,0.866025,-0.357461,0.933928
4367,2026-01-23 23:00:00,131.754271,19.850628,732.43,23,4,23,44.231527,1,16.33,...,115.256784,61.493991,193.024341,95.573199,-0.258819,0.965926,0.500000,0.866025,-0.356677,0.934228
4368,2026-01-24 00:00:00,143.056281,19.858648,784.01,24,5,0,44.219095,1,17.61,...,131.754271,65.636910,187.489252,93.721825,0.000000,1.000000,0.500000,0.866025,-0.355892,0.934527
4369,2026-01-24 17:37:26,165.849737,20.000000,736.54,24,5,17,44.000000,1,10.24,...,143.056281,74.848069,178.701264,93.186345,-0.965926,-0.258819,0.500000,0.866025,-0.342020,0.939693


In [None]:
# Long-run view of AQI over time: is the smog getting worse, and do the
# seasonal spikes (Nov-Jan) stand out?
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(x='date', y='aqi', data=df, linewidth=1, color='crimson', ax=ax)
ax.set_title('AQI Trend in Lahore (Time Series)')
ax.set_xlabel('Date')
ax.set_ylabel('AQI')
plt.show()

In [None]:
# When is smog the worst? (Night vs Afternoon)
# One box per hour of the day shows how the AQI distribution shifts over 24h.
plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 and slated for
# removal in 0.14. Assigning the x variable to `hue` and hiding the redundant
# legend is the replacement the deprecation notice recommends.
# Requires seaborn >= 0.13 (the `legend` parameter was added there).
sns.boxplot(data=df, x='hour', y='aqi', hue='hour', palette="coolwarm", legend=False)
plt.title('AQI Distribution by Hour of the Day')
plt.xlabel('Hour (0-23)')
plt.ylabel('AQI')
plt.show()

In [None]:
# What drives the AQI? (Red = Strong Positive, Blue = Strong Negative)
corr_cols = ['aqi', 'temperature', 'humidity', 'wind_speed', 'pressure', 'aqi_lag_24']

# Collect the absent columns up front so the fallback can name them.
missing_cols = [col for col in corr_cols if col not in df.columns]

if not missing_cols:
    plt.figure(figsize=(10, 8))
    corr_matrix = df[corr_cols].corr()
    # Anchor the diverging colormap to the full correlation range [-1, 1] with
    # 0 at its midpoint. Without this the scale follows the data's min/max, so
    # "blue" could be a weak positive value and the red/blue reading promised
    # in the header comment would not hold.
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        linewidths=0.5,
        vmin=-1,
        vmax=1,
        center=0,
    )
    plt.title('Correlation Heatmap')
    plt.show()
else:
    print("⚠️ Missing columns for correlation. Check your dataframe.")
    # Say exactly which columns are absent so the reader knows what to fix.
    print(f"   Missing: {missing_cols}")

In [None]:
# Side-by-side scatter plots of AQI against individual weather features
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (feature column, marker colour, panel title) for each subplot, left to right
panels = [
    ('temperature', 'orange', 'Temperature vs AQI'),
    ('wind_speed', 'teal', 'Wind Speed vs AQI'),
]

for panel_ax, (feature, colour, title) in zip(axes, panels):
    sns.scatterplot(data=df, x=feature, y='aqi', alpha=0.5, color=colour, ax=panel_ax)
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()