# AQI Prediction App

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [5]:
data = pd.read_csv("air_quality_and_public_health_data.csv")

In [7]:
data.head()

Unnamed: 0,RecordID,AQI,PM10,PM2_5,NO2,SO2,O3,Temperature,Humidity,WindSpeed,RespiratoryCases,CardiovascularCases,HospitalAdmissions,HealthImpactScore,HealthImpactClass
0,1,187.270059,295.853039,13.03856,6.639263,66.16115,54.62428,5.150335,84.424344,6.137755,7,5,1,97.244041,0.0
1,2,475.357153,246.254703,9.984497,16.318326,90.499523,169.621728,1.543378,46.851415,4.521422,10,2,0,100.0,0.0
2,3,365.996971,84.443191,23.11134,96.317811,17.87585,9.006794,1.169483,17.806977,11.157384,13,3,0,100.0,0.0
3,4,299.329242,21.020609,14.273403,81.234403,48.323616,93.161033,21.925276,99.473373,15.3025,8,8,1,100.0,0.0
4,5,78.00932,16.987667,152.111623,121.235461,90.866167,241.795138,9.217517,24.906837,14.534733,9,0,1,95.182643,0.0


In [9]:
data.isnull().sum()

RecordID               0
AQI                    0
PM10                   0
PM2_5                  0
NO2                    0
SO2                    0
O3                     0
Temperature            0
Humidity               0
WindSpeed              0
RespiratoryCases       0
CardiovascularCases    0
HospitalAdmissions     0
HealthImpactScore      0
HealthImpactClass      0
dtype: int64

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5811 entries, 0 to 5810
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   RecordID             5811 non-null   int64  
 1   AQI                  5811 non-null   float64
 2   PM10                 5811 non-null   float64
 3   PM2_5                5811 non-null   float64
 4   NO2                  5811 non-null   float64
 5   SO2                  5811 non-null   float64
 6   O3                   5811 non-null   float64
 7   Temperature          5811 non-null   float64
 8   Humidity             5811 non-null   float64
 9   WindSpeed            5811 non-null   float64
 10  RespiratoryCases     5811 non-null   int64  
 11  CardiovascularCases  5811 non-null   int64  
 12  HospitalAdmissions   5811 non-null   int64  
 13  HealthImpactScore    5811 non-null   float64
 14  HealthImpactClass    5811 non-null   float64
dtypes: float64(11), int64(4)
memory usage:

In [13]:
X = data.iloc[:, 1:-2].values
Y = data.iloc[:, -2].values

In [15]:
X

array([[187.27005942, 295.85303919,  13.03856044, ...,   7.        ,
          5.        ,   1.        ],
       [475.3571532 , 246.25470278,   9.98449713, ...,  10.        ,
          2.        ,   0.        ],
       [365.99697091,  84.44319074,  23.11133977, ...,  13.        ,
          3.        ,   0.        ],
       ...,
       [314.84179763,  41.89269906, 184.70855139, ...,  12.        ,
          2.        ,   3.        ],
       [208.0804732 , 165.53378512, 199.17725515, ...,   6.        ,
          2.        ,   3.        ],
       [ 83.26925675,  82.21626223, 119.96824423, ...,  14.        ,
          2.        ,   2.        ]])

In [17]:
Y

array([ 97.24404109, 100.        , 100.        , ..., 100.        ,
       100.        ,  81.66829811])

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [21]:
X_train.shape, X_test.shape

((4648, 12), (1163, 12))

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
Y_train.shape

(4648,)

# LINEAR REGRESSION Model

In [27]:
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, Y_train)

In [29]:
linear_pred = linear_model.predict(X_test_scaled)

In [31]:
linear_mae = mean_absolute_error(Y_test, linear_pred)
print("Mean Absolute Error = ", linear_mae)

Mean Absolute Error =  7.354111310534881


In [33]:
linear_mse = mean_squared_error(Y_test, linear_pred)
print("Mean Squared Error = ", linear_mse)

Mean Squared Error =  92.6828792598329


In [35]:
linear_r2 = r2_score(Y_test, linear_pred)
print("R2 Score = ", linear_r2)

R2 Score =  0.5053377752606643


In [37]:
#Building the SVR
from sklearn.svm import SVR
SVR = SVR()
SVR.fit(X_train_scaled, Y_train)

In [39]:
SVR_pred = SVR.predict(X_test_scaled)

In [41]:
SVR_mae = mean_absolute_error(Y_test, SVR_pred)
print("Mean Absolute Error = ", SVR_mae)

Mean Absolute Error =  4.708029871981457


In [43]:
SVR_mse = mean_squared_error(Y_test, SVR_pred)
print("Mean Squared Error = ",SVR_mae)

Mean Squared Error =  4.708029871981457


In [45]:
SVR_r2 = r2_score(Y_test, SVR_pred)
print("R2 Score = ", SVR_r2)

R2 Score =  0.5575655272162192


# **Random forest regression**

In [47]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, Y_train)

In [49]:
rf_pred = rf_model.predict(X_test_scaled)

In [51]:
rf_mae = mean_absolute_error(Y_test, rf_pred)
print("Mean Absolute Error = ", rf_mae)

Mean Absolute Error =  1.569198567637857


In [53]:
rf_mse = mean_squared_error(Y_test, rf_pred)
print("Mean Squared Error = ",rf_mae)

Mean Squared Error =  1.569198567637857


In [55]:
rf_r2 = r2_score(Y_test, rf_pred)
print("R2 Score", rf_r2)

R2 Score 0.9454070921231519


# Saving the RF model

In [57]:
import joblib
joblib.dump(rf_model, 'Air_Quality_And_Public_Health_Model.joblib')

['Air_Quality_And_Public_Health_Model.joblib']