In [2]:
import pandas as pd

df = pd.read_csv("air_quality_2011_cities.csv",
                encoding="latin1")

print(df.shape)
print(df.columns)
df.head()


(217, 10)
Index(['State ', 'City', 'Type', 'Category of ES',
       'SO2 Annual average (µg/m3)', 'Air quality of SO2',
       'NO2 Annual average (µg/m3)', 'Air quality of NO2',
       'PM10 Annual average (µg/m3)', 'Air quality of PM10'],
      dtype='object')


Unnamed: 0,State,City,Type,Category of ES,SO2 Annual average (µg/m3),Air quality of SO2,NO2 Annual average (µg/m3),Air quality of NO2,PM10 Annual average (µg/m3),Air quality of PM10
0,Andhra Pradesh,Chitoor,RIRuO,,4,Low,9,Low,39,Moderate
1,Andhra Pradesh,Guntur,RIRuO,,4,Low,10,Low,74,High
2,Andhra Pradesh,Hydrabad,RIRuO,,5,Low,29,Moderate,86,High
3,Andhra Pradesh,Kakinada,RIRuO,,4,Null,9,Null,59,Null
4,Andhra Pradesh,Kothagudem,RIRuO,,4,Low,9,Low,81,High


In [3]:
import pandas as pd

df_rt = pd.read_csv("realtime_aqi_india.csv", encoding="latin1")

print(df_rt.shape)
print(df_rt.columns)
df_rt.head()


(3297, 11)
Index(['country', 'state', 'city', 'station', 'last_update', 'latitude',
       'longitude', 'pollutant_id', 'pollutant_min', 'pollutant_max',
       'pollutant_avg'],
      dtype='object')


Unnamed: 0,country,state,city,station,last_update,latitude,longitude,pollutant_id,pollutant_min,pollutant_max,pollutant_avg
0,India,Andhra_Pradesh,Tirupati,"Vaikuntapuram, Tirupati - APPCB",06-01-2026 11:00,13.615387,79.40923,NH3,4.0,5.0,4.0
1,India,Andhra_Pradesh,Vijayawada,"HB Colony, Vijayawada - APPCB",06-01-2026 11:00,16.536107,80.594233,PM10,71.0,136.0,96.0
2,India,Andhra_Pradesh,Vijayawada,"HB Colony, Vijayawada - APPCB",06-01-2026 11:00,16.536107,80.594233,NH3,3.0,5.0,3.0
3,India,Andhra_Pradesh,Vijayawada,"Kanuru, Vijayawada - APPCB",06-01-2026 11:00,16.486692,80.699436,PM10,54.0,137.0,78.0
4,India,Andhra_Pradesh,Vijayawada,"Kanuru, Vijayawada - APPCB",06-01-2026 11:00,16.486692,80.699436,NO2,32.0,43.0,35.0


In [4]:
df_rt['pollutant_id'].value_counts()


pollutant_id
PM2.5    485
PM10     484
NO2      483
CO       476
SO2      472
OZONE    471
NH3      426
Name: count, dtype: int64

In [5]:
df_pivot = df_rt.pivot_table(
    index=['state', 'city', 'latitude', 'longitude', 'last_update'],
    columns='pollutant_id',
    values='pollutant_avg',
    aggfunc='mean'
).reset_index()

df_pivot.head()


pollutant_id,state,city,latitude,longitude,last_update,CO,NH3,NO2,OZONE,PM10,PM2.5,SO2
0,Andaman and Nicobar,Sri Vijaya Puram,11.654054,92.734055,06-01-2026 11:00,22.0,,,22.0,,,
1,Andhra_Pradesh,Amaravati,16.515083,80.518167,06-01-2026 11:00,35.0,7.0,48.0,20.0,144.0,150.0,10.0
2,Andhra_Pradesh,Anantapur,14.675886,77.593027,06-01-2026 11:00,28.0,5.0,18.0,54.0,103.0,103.0,9.0
3,Andhra_Pradesh,Chittoor,13.20488,79.097889,06-01-2026 11:00,21.0,5.0,41.0,52.0,80.0,84.0,15.0
4,Andhra_Pradesh,Kadapa,14.465052,78.824187,06-01-2026 11:00,30.0,3.0,21.0,26.0,123.0,147.0,14.0


In [6]:
import numpy as np

def compute_subindex(C, breakpoints):
    if np.isnan(C):
        return np.nan
    
    for bp_lo, bp_hi, I_lo, I_hi in breakpoints:
        if bp_lo <= C <= bp_hi:
            return ((I_hi - I_lo) / (bp_hi - bp_lo)) * (C - bp_lo) + I_lo
    
    # If concentration exceeds highest breakpoint
    bp_lo, bp_hi, I_lo, I_hi = breakpoints[-1]
    if C > bp_hi:
        return I_hi
    
    return np.nan


In [7]:
PM25_BP = [
    (0, 30, 0, 50),
    (31, 60, 51, 100),
    (61, 90, 101, 200),
    (91, 120, 201, 300),
    (121, 250, 301, 400),
    (251, 500, 401, 500)
]

PM10_BP = [
    (0, 50, 0, 50),
    (51, 100, 51, 100),
    (101, 250, 101, 200),
    (251, 350, 201, 300),
    (351, 430, 301, 400),
    (431, 1000, 401, 500)
]

NO2_BP = [
    (0, 40, 0, 50),
    (41, 80, 51, 100),
    (81, 180, 101, 200),
    (181, 280, 201, 300),
    (281, 400, 301, 400),
    (401, 1000, 401, 500)
]


In [8]:
df_aqi = df_pivot.copy()

df_aqi['PM25_SI'] = df_aqi['PM2.5'].apply(lambda x: compute_subindex(x, PM25_BP))
df_aqi['PM10_SI'] = df_aqi['PM10'].apply(lambda x: compute_subindex(x, PM10_BP))
df_aqi['NO2_SI']  = df_aqi['NO2'].apply(lambda x: compute_subindex(x, NO2_BP))


In [11]:
df_aqi['AQI'] = df_aqi[['PM25_SI', 'PM10_SI', 'NO2_SI']].max(axis=1)
df_aqi[['state', 'city', 'AQI']].head()


pollutant_id,state,city,AQI
0,Andaman and Nicobar,Sri Vijaya Puram,
1,Andhra_Pradesh,Amaravati,323.255814
2,Andhra_Pradesh,Anantapur,241.965517
3,Andhra_Pradesh,Chittoor,179.517241
4,Andhra_Pradesh,Kadapa,320.953488


In [13]:
df_aqi.isna().sum()

pollutant_id
state           0
city            0
latitude        0
longitude       0
last_update     0
CO             23
NH3            81
NO2            25
OZONE          33
PM10           33
PM2.5          36
SO2            40
PM25_SI        36
PM10_SI        33
NO2_SI         25
AQI             8
dtype: int64

In [14]:
df_aqi = df_aqi.dropna(subset=['AQI'])
print(df_aqi.shape)


(485, 16)


In [16]:
features = ['latitude', 'longitude', 'PM2.5', 'PM10', 'NO2']
target = 'AQI'

ml_df = df_aqi[features + [target]].copy()
ml_df

pollutant_id,latitude,longitude,PM2.5,PM10,NO2,AQI
1,16.515083,80.518167,150.0,144.0,48.0,323.255814
2,14.675886,77.593027,103.0,103.0,18.0,241.965517
3,13.204880,79.097889,84.0,80.0,41.0,179.517241
4,14.465052,78.824187,147.0,123.0,21.0,320.953488
5,16.987287,81.736318,204.0,138.0,33.0,364.697674
...,...,...,...,...,...,...
488,22.544808,88.340369,169.0,117.0,36.0,337.837209
489,22.556640,88.342674,138.0,117.0,16.0,314.046512
490,22.581570,88.410025,95.0,98.0,25.0,214.655172
491,22.627847,88.380669,256.0,136.0,61.0,402.987952


In [17]:
for col in ['PM2.5', 'PM10', 'NO2']:
    median_val = ml_df[col].median()
    ml_df[col] = ml_df[col].fillna(median_val)


In [18]:
ml_df.isna().sum()


pollutant_id
latitude     0
longitude    0
PM2.5        0
PM10         0
NO2          0
AQI          0
dtype: int64

## Model training

In [19]:
X = ml_df.drop(columns=['AQI'])
y = ml_df['AQI']

print(X.shape, y.shape)


(485, 5) (485,)


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print(X_train.shape, X_test.shape)


(388, 5) (97, 5)


In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [22]:
X_train_scaled[:5]


array([[ 0.88501039, -0.15748953, -1.17172627, -1.51607623, -0.81128758],
       [ 0.74161156,  1.29460451,  1.300596  ,  1.30030508, -0.98609229],
       [-0.55459299, -0.65138827, -0.46534848, -0.72112989,  0.45604657],
       [-0.55630012, -0.52768854, -0.21307069, -0.13059833, -0.85498876],
       [ 0.63174347, -1.12761401, -0.69239848, -0.22144934, -0.68018405]])

In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [25]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output AQI value
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


In [27]:
history = model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    verbose=1
)


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - loss: 75506.9219 - root_mean_squared_error: 274.7852 - val_loss: 84571.1641 - val_root_mean_squared_error: 290.8112
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 75278.7109 - root_mean_squared_error: 274.3697 - val_loss: 84294.8438 - val_root_mean_squared_error: 290.3358
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 75014.4922 - root_mean_squared_error: 273.8877 - val_loss: 83955.3906 - val_root_mean_squared_error: 289.7506
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 74677.7344 - root_mean_squared_error: 273.2723 - val_loss: 83522.5391 - val_root_mean_squared_error: 289.0027
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 74241.4766 - root_mean_squared_error: 272.4729 - val_loss: 82971.9844 - val_root_mean_squared_e

In [28]:
test_loss, test_rmse = model.evaluate(X_test_scaled, y_test)
print("Test RMSE:", test_rmse)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 8685.4531 - root_mean_squared_error: 93.1958
Test RMSE: 93.1957778930664


In [29]:
y_pred = model.predict(X_test_scaled)

comparison = pd.DataFrame({
    'Actual AQI': y_test.values[:10],
    'Predicted AQI': y_pred.flatten()[:10]
})

comparison


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step 


Unnamed: 0,Actual AQI,Predicted AQI
0,100.0,133.871185
1,369.302326,234.445572
2,74.655172,159.128067
3,317.883721,246.054352
4,141.965517,156.867035
5,310.209302,239.366501
6,293.172414,206.783707
7,367.0,374.487183
8,337.837209,222.558517
9,301.0,222.852737


In [30]:
features_si = ['latitude', 'longitude', 'PM25_SI', 'PM10_SI', 'NO2_SI']
target = 'AQI'

ml_df_si = df_aqi[features_si + [target]].copy()

ml_df_si.head()


pollutant_id,latitude,longitude,PM25_SI,PM10_SI,NO2_SI,AQI
1,16.515083,80.518167,323.255814,129.57047,59.794872,323.255814
2,14.675886,77.593027,241.965517,102.328859,22.5,241.965517
3,13.20488,79.097889,179.517241,80.0,51.0,179.517241
4,14.465052,78.824187,320.953488,115.61745,26.25,320.953488
5,16.987287,81.736318,364.697674,125.583893,41.25,364.697674


In [31]:
ml_df_si.isna().sum()


pollutant_id
latitude      0
longitude     0
PM25_SI      28
PM10_SI      25
NO2_SI       17
AQI           0
dtype: int64

In [32]:
for col in ['PM25_SI', 'PM10_SI', 'NO2_SI']:
    ml_df_si[col] = ml_df_si[col].fillna(ml_df_si[col].median())


In [33]:
X = ml_df_si.drop(columns=['AQI'])
y = ml_df_si['AQI']

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [34]:
model_si = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

model_si.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [35]:
history_si = model_si.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    verbose=1
)


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - loss: 75244.8750 - root_mean_squared_error: 274.3080 - val_loss: 84225.8906 - val_root_mean_squared_error: 290.2170
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 74979.1641 - root_mean_squared_error: 273.8232 - val_loss: 83894.2578 - val_root_mean_squared_error: 289.6451
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 74672.1641 - root_mean_squared_error: 273.2621 - val_loss: 83497.2500 - val_root_mean_squared_error: 288.9589
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 74291.6016 - root_mean_squared_error: 272.5648 - val_loss: 82987.3203 - val_root_mean_squared_error: 288.0752
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 73797.2578 - root_mean_squared_error: 271.6565 - val_loss: 82327.0859 - val_root_mean_squared_e

In [36]:
loss_si, rmse_si = model_si.evaluate(X_test_scaled, y_test)
print("Sub-index model RMSE:", rmse_si)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 4974.6255 - root_mean_squared_error: 70.5310
Sub-index model RMSE: 70.5310287475586


In [39]:
df_aqi[['OZONE', 'SO2']].head()

pollutant_id,OZONE,SO2
1,20.0,10.0
2,54.0,9.0
3,52.0,15.0
4,26.0,14.0
5,11.0,14.0


In [40]:
def ozone_si(val):
    if pd.isna(val): return None
    if val <= 50: return (val/50)*50
    elif val <= 100: return ((val-51)/(100-51))*50 + 51
    else: return 200  # cap

def so2_si(val):
    if pd.isna(val): return None
    if val <= 40: return (val/40)*50
    elif val <= 80: return ((val-41)/(80-41))*50 + 51
    else: return 200


In [41]:
df_aqi['OZONE_SI'] = df_aqi['OZONE'].apply(ozone_si)
df_aqi['SO2_SI']   = df_aqi['SO2'].apply(so2_si)


In [42]:
df_aqi[['OZONE_SI', 'SO2_SI']].head()


pollutant_id,OZONE_SI,SO2_SI
1,20.0,12.5
2,54.061224,11.25
3,52.020408,18.75
4,26.0,17.5
5,11.0,17.5


In [43]:
features_final = [
    'latitude',
    'longitude',
    'PM25_SI',
    'PM10_SI',
    'NO2_SI',
    'OZONE_SI',
    'SO2_SI'
]

ml_df_final = df_aqi[features_final + ['AQI']].copy()

for col in features_final:
    ml_df_final[col] = ml_df_final[col].fillna(ml_df_final[col].median())

ml_df_final.isna().sum()


pollutant_id
latitude     0
longitude    0
PM25_SI      0
PM10_SI      0
NO2_SI       0
OZONE_SI     0
SO2_SI       0
AQI          0
dtype: int64

In [44]:
X = ml_df_final.drop(columns=['AQI'])
y = ml_df_final['AQI']

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [45]:
model_final = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

model_final.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [46]:
history_final = model_final.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    verbose=1
)


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - loss: 75602.1641 - root_mean_squared_error: 274.9585 - val_loss: 84629.3594 - val_root_mean_squared_error: 290.9113
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 75311.0703 - root_mean_squared_error: 274.4286 - val_loss: 84304.2578 - val_root_mean_squared_error: 290.3520
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 74998.7500 - root_mean_squared_error: 273.8590 - val_loss: 83937.3359 - val_root_mean_squared_error: 289.7194
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 74645.0000 - root_mean_squared_error: 273.2124 - val_loss: 83493.2578 - val_root_mean_squared_error: 288.9520
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 74195.1562 - root_mean_squared_error: 272.3879 - val_loss: 82937.9609 - val_root_mean_squared_e

In [47]:
loss_final, rmse_final = model_final.evaluate(X_test_scaled, y_test)
print("FINAL MODEL RMSE:", rmse_final)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 7733.8608 - root_mean_squared_error: 87.9424
FINAL MODEL RMSE: 87.94237518310547


In [48]:
si_cols = ['PM25_SI', 'PM10_SI', 'NO2_SI', 'SO2_SI', 'OZONE_SI']

df_aqi['Dominant_SI'] = df_aqi[si_cols].max(axis=1)

df_aqi[['state', 'city', 'Dominant_SI']].head()


pollutant_id,state,city,Dominant_SI
1,Andhra_Pradesh,Amaravati,323.255814
2,Andhra_Pradesh,Anantapur,241.965517
3,Andhra_Pradesh,Chittoor,179.517241
4,Andhra_Pradesh,Kadapa,320.953488
5,Andhra_Pradesh,Rajamahendravaram,364.697674


In [49]:
df_aqi = df_aqi.dropna(subset=['Dominant_SI'])


In [50]:
feature_cols = [
    'PM2.5', 'PM10', 'NO2', 'SO2', 'OZONE'
]

X = df_aqi[feature_cols]
y = df_aqi['Dominant_SI']


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [52]:
import tensorflow as tf
from tensorflow.keras import layers, models

model_dom = models.Sequential([
    layers.Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)
])

model_dom.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [53]:
history = model_dom.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)


Epoch 1/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 75599.7812 - root_mean_squared_error: 274.9541 - val_loss: 84755.6562 - val_root_mean_squared_error: 291.1282
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 75565.3203 - root_mean_squared_error: 274.8915 - val_loss: 84715.0156 - val_root_mean_squared_error: 291.0584
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 75525.2812 - root_mean_squared_error: 274.8186 - val_loss: 84669.6719 - val_root_mean_squared_error: 290.9805
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 75480.5938 - root_mean_squared_error: 274.7373 - val_loss: 84617.9375 - val_root_mean_squared_error: 290.8916
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 75429.8516 - root_mean_squared_error: 274.6450 - val_loss: 84559.7969 - val_root_mean_squared_er

In [54]:
loss_dom, rmse_dom = model_dom.evaluate(X_test_scaled, y_test)
print("Dominant SI Model RMSE:", rmse_dom)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 71140.9141 - root_mean_squared_error: 266.7225
Dominant SI Model RMSE: 266.7225341796875


In [55]:
y_pred_dom = model_dom.predict(X_test_scaled)

comparison = pd.DataFrame({
    'Actual Dominant SI': y_test.values[:10],
    'Predicted Dominant SI': y_pred_dom.flatten()[:10]
})

comparison


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


Unnamed: 0,Actual Dominant SI,Predicted Dominant SI
0,100.0,18.741335
1,369.302326,18.741335
2,74.655172,18.741335
3,317.883721,18.741335
4,141.965517,18.741335
5,310.209302,18.741335
6,293.172414,18.741335
7,367.0,18.741335
8,337.837209,18.741335
9,301.0,18.741335


In [56]:
import numpy as np

y = np.log1p(df_aqi['Dominant_SI'])


In [57]:
model_dom = tf.keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(1)
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [58]:
model_dom.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.Huber(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


In [59]:
history = model_dom.fit(
    X_train_scaled,
    y_train,   # log-transformed target
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)


Epoch 1/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 248.1156 - root_mean_squared_error: 274.9423 - val_loss: 266.6628 - val_root_mean_squared_error: 291.1743
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 248.1192 - root_mean_squared_error: 274.9596 - val_loss: 266.6428 - val_root_mean_squared_error: 291.1560
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 248.0993 - root_mean_squared_error: 274.9414 - val_loss: 266.6228 - val_root_mean_squared_error: 291.1377
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 248.0793 - root_mean_squared_error: 274.9236 - val_loss: 266.6028 - val_root_mean_squared_error: 291.1193
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 248.0592 - root_mean_squared_error: 274.9054 - val_loss: 266.5828 - val_root_mean_squared_error: 291.1010
Epoch 6

In [63]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Predict (log scale)
y_pred_log = model_dom.predict(X_test_scaled).flatten()

# 🔒 Clip LOG predictions
y_pred_log = np.clip(y_pred_log, 0, np.log1p(500))

# Inverse transform
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test.values)

# 🔒 Clip AQI values (very important)
y_pred = np.clip(y_pred, 0, 500)
y_true = np.clip(y_true, 0, 500)

# RMSE
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)

print("Dominant SI RMSE (actual scale):", rmse)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Dominant SI RMSE (actual scale): 498.2817507378127


In [64]:
pd.DataFrame({
    'Actual Dominant SI': y_true[:10],
    'Predicted Dominant SI': y_pred[:10]
})


Unnamed: 0,Actual Dominant SI,Predicted Dominant SI
0,500.0,1.718249
1,500.0,1.718249
2,500.0,1.718249
3,500.0,1.718249
4,500.0,1.718249
5,500.0,1.718249
6,500.0,1.718249
7,500.0,1.718249
8,500.0,1.718249
9,500.0,1.718249


## Predict pollutant concentration instead

In [79]:
y_train.isna().sum(), y_test.isna().sum()


(np.int64(24), np.int64(4))

In [80]:
# Create modeling dataframe from AQI dataframe
df_model = df_aqi.copy()

print(df_model.shape)
df_model.head()


(485, 19)


pollutant_id,state,city,latitude,longitude,last_update,CO,NH3,NO2,OZONE,PM10,PM2.5,SO2,PM25_SI,PM10_SI,NO2_SI,AQI,OZONE_SI,SO2_SI,Dominant_SI
1,Andhra_Pradesh,Amaravati,16.515083,80.518167,06-01-2026 11:00,35.0,7.0,48.0,20.0,144.0,150.0,10.0,323.255814,129.57047,59.794872,323.255814,20.0,12.5,323.255814
2,Andhra_Pradesh,Anantapur,14.675886,77.593027,06-01-2026 11:00,28.0,5.0,18.0,54.0,103.0,103.0,9.0,241.965517,102.328859,22.5,241.965517,54.061224,11.25,241.965517
3,Andhra_Pradesh,Chittoor,13.20488,79.097889,06-01-2026 11:00,21.0,5.0,41.0,52.0,80.0,84.0,15.0,179.517241,80.0,51.0,179.517241,52.020408,18.75,179.517241
4,Andhra_Pradesh,Kadapa,14.465052,78.824187,06-01-2026 11:00,30.0,3.0,21.0,26.0,123.0,147.0,14.0,320.953488,115.61745,26.25,320.953488,26.0,17.5,320.953488
5,Andhra_Pradesh,Rajamahendravaram,16.987287,81.736318,06-01-2026 11:00,40.0,5.0,33.0,11.0,138.0,204.0,14.0,364.697674,125.583893,41.25,364.697674,11.0,17.5,364.697674


In [81]:
# Make sure PM2.5 is numeric
df_model['PM2.5'] = pd.to_numeric(df_model['PM2.5'], errors='coerce')

# Drop rows where target is missing
df_model = df_model.dropna(subset=['PM2.5'])

print("Remaining rows:", df_model.shape[0])


Remaining rows: 457


In [82]:
df_model['PM2.5'].isna().sum()


np.int64(0)

In [85]:
feature_cols = [
    'latitude', 'longitude',
    'PM10', 'NO2', 'SO2', 'OZONE',
    'CO', 'NH3'
]

X = df_model[feature_cols]
y = df_model['PM2.5']

X.shape, y.shape


((457, 8), (457,))

In [88]:
# Impute missing values with median (safe & standard)
for col in X.columns:
    X[col] = X[col].fillna(X[col].median())

X.isna().sum()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].median())


pollutant_id
latitude     0
longitude    0
PM10         0
NO2          0
SO2          0
OZONE        0
CO           0
NH3          0
dtype: int64

In [89]:
y.isna().sum()


np.int64(0)

In [90]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [91]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [92]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model_pm25 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(16, activation='relu'),
    Dense(1)
])

model_pm25.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

history = model_pm25.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - loss: 25515.8535 - root_mean_squared_error: 159.7368 - val_loss: 28978.3223 - val_root_mean_squared_error: 170.2302
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 25360.6602 - root_mean_squared_error: 159.2503 - val_loss: 28797.3633 - val_root_mean_squared_error: 169.6979
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 25170.1680 - root_mean_squared_error: 158.6511 - val_loss: 28553.7129 - val_root_mean_squared_error: 168.9784
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 24914.5391 - root_mean_squared_error: 157.8434 - val_loss: 28221.8770 - val_root_mean_squared_error: 167.9937
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 24566.6504 - root_mean_squared_error: 156.7375 - val_loss: 27760.6680 - val_root_mean_squared_e

In [93]:
loss, rmse = model_pm25.evaluate(X_test_scaled, y_test)
print("PM2.5 Model RMSE:", rmse)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 1923.0005 - root_mean_squared_error: 43.8520
PM2.5 Model RMSE: 43.852027893066406


In [94]:
y_pred = model_pm25.predict(X_test_scaled)

import pandas as pd
pd.DataFrame({
    'Actual PM2.5': y_test.values[:10],
    'Predicted PM2.5': y_pred.flatten()[:10]
})


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step 


Unnamed: 0,Actual PM2.5,Predicted PM2.5
0,109.0,136.295425
1,101.0,95.535973
2,126.0,58.381916
3,47.0,130.501495
4,91.0,96.794121
5,121.0,68.205894
6,50.0,91.217209
7,206.0,237.607452
8,142.0,96.543884
9,75.0,133.847717


In [95]:
import numpy as np

y_log = np.log1p(y)


In [96]:
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)


In [97]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [98]:
model_pm25_log = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

model_pm25_log.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [99]:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

model_pm25_log.fit(
    X_train_scaled,
    y_train_log,
    validation_split=0.2,
    epochs=100,
    batch_size=16,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - loss: 17.8026 - root_mean_squared_error: 4.2193 - val_loss: 14.3355 - val_root_mean_squared_error: 3.7862
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 10.4838 - root_mean_squared_error: 3.2379 - val_loss: 7.2556 - val_root_mean_squared_error: 2.6936
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 5.1600 - root_mean_squared_error: 2.2716 - val_loss: 2.9537 - val_root_mean_squared_error: 1.7186
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 2.9727 - root_mean_squared_error: 1.7242 - val_loss: 1.7294 - val_root_mean_squared_error: 1.3151
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 2.1466 - root_mean_squared_error: 1.4651 - val_loss: 1.4160 - val_root_mean_squared_error: 1.1900
Epoch 6/100
[1m19/19[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x203d10ea360>

In [100]:
y_pred_log = model_pm25_log.predict(X_test_scaled)
y_pred = np.expm1(y_pred_log).flatten()
y_true = np.expm1(y_test_log)

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print("PM2.5 RMSE (log model):", rmse)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
PM2.5 RMSE (log model): 74.96323550566704


In [101]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_true, y_pred)
print("PM2.5 MAE (log model):", mae)


PM2.5 MAE (log model): 45.88319732831872


In [102]:
# Make sure last_update is datetime
df_model['last_update'] = pd.to_datetime(df_model['last_update'])


In [103]:
df_model['last_update'].head()


1   2026-06-01 11:00:00
2   2026-06-01 11:00:00
3   2026-06-01 11:00:00
4   2026-06-01 11:00:00
5   2026-06-01 11:00:00
Name: last_update, dtype: datetime64[ns]

In [104]:
# Hour of day (0–23)
df_model['hour'] = df_model['last_update'].dt.hour

# Day of week (0=Mon, 6=Sun)
df_model['dayofweek'] = df_model['last_update'].dt.dayofweek

# Month (1–12)
df_model['month'] = df_model['last_update'].dt.month


In [105]:
# Weekend flag
df_model['is_weekend'] = df_model['dayofweek'].isin([5, 6]).astype(int)


In [106]:
df_model[['hour', 'dayofweek', 'month', 'is_weekend']].head()


pollutant_id,hour,dayofweek,month,is_weekend
1,11,0,6,0
2,11,0,6,0
3,11,0,6,0
4,11,0,6,0
5,11,0,6,0


In [107]:
feature_cols = [
    'latitude', 'longitude',
    'PM10', 'NO2', 'SO2', 'OZONE', 'CO', 'NH3',
    'hour', 'dayofweek', 'month', 'is_weekend'
]

X = df_model[feature_cols]
y = df_model['PM2.5']


In [108]:
X.shape, y.shape


((457, 12), (457,))

In [109]:
# Fill missing values with median
for col in X.columns:
    X[col] = X[col].fillna(X[col].median())

X.isna().sum(), y.isna().sum()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].median())


(pollutant_id
 latitude      0
 longitude     0
 PM10          0
 NO2           0
 SO2           0
 OZONE         0
 CO            0
 NH3           0
 hour          0
 dayofweek     0
 month         0
 is_weekend    0
 dtype: int64,
 np.int64(0))

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [111]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['RootMeanSquaredError']
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 50ms/step - RootMeanSquaredError: 159.9296 - loss: 25577.4707 - val_RootMeanSquaredError: 170.4371 - val_loss: 29048.8008
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - RootMeanSquaredError: 159.4986 - loss: 25439.7988 - val_RootMeanSquaredError: 169.9697 - val_loss: 28889.6992
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - RootMeanSquaredError: 159.0132 - loss: 25285.1875 - val_RootMeanSquaredError: 169.4119 - val_loss: 28700.3906
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - RootMeanSquaredError: 158.4409 - loss: 25103.5078 - val_RootMeanSquaredError: 168.7372 - val_loss: 28472.2266
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - RootMeanSquaredError: 157.7302 - loss: 24878.8301 - val_RootMeanSquaredError: 167.8708 - val_loss: 28180.6035
Epoch 6/10

In [112]:
loss, rmse = model.evaluate(X_test_scaled, y_test)
print("FINAL PM2.5 RMSE:", rmse)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - RootMeanSquaredError: 45.3638 - loss: 2057.8723
FINAL PM2.5 RMSE: 45.36377716064453


In [113]:
y_pred = model.predict(X_test_scaled)

import pandas as pd
pd.DataFrame({
    'Actual PM2.5': y_test.values[:10],
    'Predicted PM2.5': y_pred.flatten()[:10]
})


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


Unnamed: 0,Actual PM2.5,Predicted PM2.5
0,109.0,132.876419
1,101.0,94.154648
2,126.0,57.056908
3,47.0,157.111984
4,91.0,92.62471
5,121.0,79.75898
6,50.0,75.659149
7,206.0,241.16597
8,142.0,92.062195
9,75.0,138.73317


In [114]:
model.save("pm25_model.h5")




In [115]:
def pm25_to_aqi_category(pm25):
    if pm25 <= 30:
        return "Good"
    elif pm25 <= 60:
        return "Satisfactory"
    elif pm25 <= 90:
        return "Moderate"
    elif pm25 <= 120:
        return "Poor"
    elif pm25 <= 250:
        return "Very Poor"
    else:
        return "Severe"


In [116]:
df_model['AQI_Category'] = df_model['PM2.5'].apply(pm25_to_aqi_category)


In [117]:
df_model[['PM2.5', 'AQI_Category']].head(10)


pollutant_id,PM2.5,AQI_Category
1,150.0,Very Poor
2,103.0,Poor
3,84.0,Moderate
4,147.0,Very Poor
5,204.0,Very Poor
6,198.0,Very Poor
7,137.0,Very Poor
8,76.0,Moderate
9,163.0,Very Poor
10,89.0,Moderate


In [118]:
df_model['AQI_Category'].value_counts()


AQI_Category
Very Poor       172
Poor             81
Moderate         81
Severe           60
Satisfactory     51
Good             12
Name: count, dtype: int64

In [119]:
model_cols = [
    'state','city','latitude','longitude',
    'PM2.5','PM10','NO2','SO2','OZONE','CO','NH3',
    'hour','dayofweek','month','is_weekend'
]

df_model[model_cols].to_csv("aqi_model_ready.csv", index=False)


In [120]:
dashboard_cols = [
    'state','city','latitude','longitude',
    'PM2.5','AQI_Category','last_update'
]

df_model[dashboard_cols].to_csv("aqi_dashboard_ready.csv", index=False)


In [121]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_model['AQI_Label'] = le.fit_transform(df_model['AQI_Category'])

df_model[['AQI_Category', 'AQI_Label']].head()


pollutant_id,AQI_Category,AQI_Label
1,Very Poor,5
2,Poor,2
3,Moderate,1
4,Very Poor,5
5,Very Poor,5


In [122]:
feature_cols_cls = [
    'latitude', 'longitude',
    'PM2.5', 'PM10', 'NO2', 'SO2', 'OZONE',
    'hour', 'dayofweek', 'month', 'is_weekend'
]

X_cls = df_model[feature_cols_cls]
y_cls = df_model['AQI_Label']

X_cls.shape, y_cls.shape


((457, 11), (457,))

In [123]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42, stratify=y_cls
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [124]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

num_classes = len(le.classes_)

model_cls = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model_cls.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = model_cls.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    verbose=1
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 64ms/step - accuracy: 0.3699 - loss: 1.7768 - val_accuracy: 0.3425 - val_loss: 1.7825
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.3836 - loss: 1.7764 - val_accuracy: 0.3425 - val_loss: 1.7732
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.3836 - loss: 1.7650 - val_accuracy: 0.3425 - val_loss: 1.7639
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.3836 - loss: 1.7542 - val_accuracy: 0.3425 - val_loss: 1.7542
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.3836 - loss: 1.7426 - val_accuracy: 0.3425 - val_loss: 1.7445
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.3836 - loss: 1.7313 - val_accuracy: 0.3425 - val_loss: 1.7354
Epoch 7/30
[1m10/10[0m [32m━━━━

In [125]:
loss, acc = model_cls.evaluate(X_test_scaled, y_test)
print("AQI Category Accuracy:", acc)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.3804 - loss: 1.5989
AQI Category Accuracy: 0.3804347813129425


In [126]:
y_pred = model_cls.predict(X_test_scaled)
y_pred_labels = y_pred.argmax(axis=1)

comparison = pd.DataFrame({
    'Actual AQI': le.inverse_transform(y_test[:10]),
    'Predicted AQI': le.inverse_transform(y_pred_labels[:10])
})

comparison


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step 


Unnamed: 0,Actual AQI,Predicted AQI
0,Very Poor,Very Poor
1,Moderate,Very Poor
2,Poor,Very Poor
3,Satisfactory,Very Poor
4,Severe,Very Poor
5,Very Poor,Very Poor
6,Moderate,Very Poor
7,Very Poor,Very Poor
8,Very Poor,Very Poor
9,Poor,Very Poor


In [133]:
# Predict PM2.5 for all rows

feature_cols = [
    'latitude', 'longitude',
    'PM10', 'NO2', 'SO2', 'OZONE', 'CO', 'NH3',
    'hour', 'dayofweek', 'month', 'is_weekend'
]
X = df_model[feature_cols].copy()
y = df_model['PM2.5'].copy()


for col in feature_cols:
    X[col] = X[col].fillna(X[col].median())

y = y.fillna(y.median())

X.isna().sum(), y.isna().sum()

(pollutant_id
 latitude      0
 longitude     0
 PM10          0
 NO2           0
 SO2           0
 OZONE         0
 CO            0
 NH3           0
 hour          0
 dayofweek     0
 month         0
 is_weekend    0
 dtype: int64,
 np.int64(0))

In [134]:
X.isna().sum(), y.isna().sum()


(pollutant_id
 latitude      0
 longitude     0
 PM10          0
 NO2           0
 SO2           0
 OZONE         0
 CO            0
 NH3           0
 hour          0
 dayofweek     0
 month         0
 is_weekend    0
 dtype: int64,
 np.int64(0))

In [135]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


In [137]:
model_pm25.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=1
)


Epoch 1/100


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 with name 'None' of layer 'dense_17' is incompatible with the layer: expected axis -1 of input shape to have value 8, but received input with shape (None, 11)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 11), dtype=float32)
  • training=True
  • mask=None
  • kwargs=<class 'inspect._empty'>

In [138]:
X_train_scaled.shape


(365, 11)

In [139]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model_pm25 = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)  # PM2.5 regression output
])

model_pm25.compile(
    optimizer='adam',
    loss='mse',
    metrics=['RootMeanSquaredError']
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [140]:
model_pm25.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=1
)


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - RootMeanSquaredError: 3.6694 - loss: 13.4642 - val_RootMeanSquaredError: 3.5886 - val_loss: 12.8782
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - RootMeanSquaredError: 3.6265 - loss: 13.1514 - val_RootMeanSquaredError: 3.5566 - val_loss: 12.6497
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - RootMeanSquaredError: 3.5940 - loss: 12.9170 - val_RootMeanSquaredError: 3.5226 - val_loss: 12.4085
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - RootMeanSquaredError: 3.5590 - loss: 12.6663 - val_RootMeanSquaredError: 3.4854 - val_loss: 12.1483
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - RootMeanSquaredError: 3.5210 - loss: 12.3972 - val_RootMeanSquaredError: 3.4449 - val_loss: 11.8672
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x203cb6a9e50>

In [141]:
loss, rmse = model_pm25.evaluate(X_test_scaled, y_test)
print("PM2.5 RMSE:", rmse)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - RootMeanSquaredError: 1.6563 - loss: 2.7435
PM2.5 RMSE: 1.6563467979431152


In [147]:
X_train_scaled.shape


(365, 10)

In [148]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

model_pm25 = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

model_pm25.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=['RootMeanSquaredError']
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [149]:
model_pm25.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=1
)


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - RootMeanSquaredError: 159.3497 - loss: 25392.3164 - val_RootMeanSquaredError: 169.7718 - val_loss: 28822.4727
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - RootMeanSquaredError: 158.8100 - loss: 25220.6250 - val_RootMeanSquaredError: 169.1890 - val_loss: 28624.9102
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - RootMeanSquaredError: 158.1687 - loss: 25017.3320 - val_RootMeanSquaredError: 168.4895 - val_loss: 28388.7188
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - RootMeanSquaredError: 157.4282 - loss: 24783.6328 - val_RootMeanSquaredError: 167.6103 - val_loss: 28093.2051
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - RootMeanSquaredError: 156.4534 - loss: 24477.6641 - val_RootMeanSquaredError: 166.4610 - val_loss: 27709.2578
Epoch 6/10

<keras.src.callbacks.history.History at 0x203d6c2d710>

In [150]:
loss, rmse = model_pm25.evaluate(X_test_scaled, y_test)
print("PM2.5 RMSE:", rmse)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - RootMeanSquaredError: 43.7036 - loss: 1910.0027
PM2.5 RMSE: 43.70357894897461


## Prediction

In [151]:
# Use EXACT same features used in training
X_all = df_model[feature_cols].copy()

# Fill missing values
for col in X_all.columns:
    X_all[col] = X_all[col].fillna(X_train[col].median())

# Scale using TRAINED scaler
X_all_scaled = scaler.transform(X_all)

# Predict
df_model['PM25_Predicted'] = model_pm25.predict(X_all_scaled).flatten()


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


In [152]:
def pm25_to_aqi(pm25):
    if pm25 <= 30: return pm25 * 50 / 30
    elif pm25 <= 60: return 50 + (pm25-30)*50/30
    elif pm25 <= 90: return 100 + (pm25-60)*100/30
    elif pm25 <= 120: return 200 + (pm25-90)*100/30
    elif pm25 <= 250: return 300 + (pm25-120)*100/130
    else: return 400 + (pm25-250)*100/130

df_model['AQI_Predicted'] = df_model['PM25_Predicted'].apply(pm25_to_aqi)


In [153]:
def aqi_category(aqi):
    if aqi <= 50: return "Good"
    elif aqi <= 100: return "Satisfactory"
    elif aqi <= 200: return "Moderate"
    elif aqi <= 300: return "Poor"
    elif aqi <= 400: return "Very Poor"
    else: return "Severe"

df_model['AQI_Category_Predicted'] = df_model['AQI_Predicted'].apply(aqi_category)


In [154]:
df_model.to_csv("aqi_predictions_india.csv", index=False)


In [160]:
scaler.feature_names_in_

feature_cols = list(scaler.feature_names_in_)
feature_cols



['latitude',
 'longitude',
 'PM10',
 'NO2',
 'SO2',
 'OZONE',
 'hour',
 'dayofweek',
 'month',
 'is_weekend']

In [172]:
import numpy as np
import pandas as pd

demo_data = pd.DataFrame({
    'latitude':    np.random.uniform(8, 35, 5),
    'longitude':   np.random.uniform(68, 97, 5),
    'PM10':        np.random.uniform(40, 300, 5),
    'NO2':         np.random.uniform(5, 120, 5),
    'SO2':         np.random.uniform(2, 80, 5),
    'OZONE':       np.random.uniform(10, 120, 5),
    'hour':        np.random.randint(0, 24, 5),
    'dayofweek':   np.random.randint(0, 7, 5),
    'month':       np.random.randint(1, 13, 5),
    'is_weekend':  np.random.randint(0, 2, 5)
})

# Enforce correct order
demo_data = demo_data[feature_cols]

demo_data

Unnamed: 0,latitude,longitude,PM10,NO2,SO2,OZONE,hour,dayofweek,month,is_weekend
0,20.627737,74.0512,155.441781,57.224234,23.795392,66.315033,13,6,6,0
1,22.921213,78.869466,247.166105,30.056344,38.137428,35.218744,15,4,11,0
2,23.713448,85.841783,51.78318,13.0464,41.7389,36.644675,16,3,11,0
3,24.512334,96.277231,95.110919,8.273945,70.216972,23.788673,4,4,4,1
4,33.564153,86.615119,162.514728,42.194656,63.910867,48.013062,10,2,3,0


In [173]:
demo_scaled = scaler.transform(demo_data)

demo_data['Predicted_PM2.5'] = model_pm25.predict(demo_scaled).flatten()
demo_data['AQI_Category'] = demo_data['Predicted_PM2.5'].apply(pm25_to_aqi_category)

demo_data[['Predicted_PM2.5', 'AQI_Category']]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step


Unnamed: 0,Predicted_PM2.5,AQI_Category
0,203.208725,Very Poor
1,370.966675,Severe
2,147.391953,Very Poor
3,128.158295,Very Poor
4,200.543198,Very Poor


In [174]:
import joblib
joblib.dump(scaler, "scaler_pm25_v1.pkl")


['scaler_pm25_v1.pkl']

In [175]:
df_model.to_csv("pm25_training_data_v1.csv", index=False)
