In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/aqi-air-quality-index-scheduled-daily-update/data_date.csv


In [2]:
# Load the daily AQI dataset; the bare `df` on the last line renders it.
csv_path = "/kaggle/input/aqi-air-quality-index-scheduled-daily-update/data_date.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Date,Country,Status,AQI Value
0,2022-07-21,Albania,Good,14
1,2022-07-21,Algeria,Moderate,65
2,2022-07-21,Andorra,Moderate,55
3,2022-07-21,Angola,Unhealthy for Sensitive Groups,113
4,2022-07-21,Argentina,Moderate,63
...,...,...,...,...
21848,2025-10-30,United Kingdom of Great Britain and Northern I...,Good,47
21849,2025-10-30,United States of America,Moderate,73
21850,2025-10-30,Uzbekistan,Good,25
21851,2025-10-30,Vatican,Good,42


Exploratory Data Analysis (EDA)

In [3]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21853 entries, 0 to 21852
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       21853 non-null  object
 1   Country    21853 non-null  object
 2   Status     21853 non-null  object
 3   AQI Value  21853 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 683.0+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,AQI Value
count,21853.0
mean,62.337391
std,49.686767
min,1.0
25%,29.0
50%,52.0
75%,82.0
max,963.0


In [5]:
# Count missing values per column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

Date         0
Country      0
Status       0
AQI Value    0
dtype: int64


Defining the problem:
Two modelling tasks are possible from here:
1. Predict the AQI value (regression)
2. Predict the air quality status (classification)

In [6]:
# (rows, columns) of the frame as loaded.
df.shape

(21853, 4)

In [7]:
# Drop every row that has at least one missing value (the dropna defaults,
# spelled out), then display the result.
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,Date,Country,Status,AQI Value
0,2022-07-21,Albania,Good,14
1,2022-07-21,Algeria,Moderate,65
2,2022-07-21,Andorra,Moderate,55
3,2022-07-21,Angola,Unhealthy for Sensitive Groups,113
4,2022-07-21,Argentina,Moderate,63
...,...,...,...,...
21848,2025-10-30,United Kingdom of Great Britain and Northern I...,Good,47
21849,2025-10-30,United States of America,Moderate,73
21850,2025-10-30,Uzbekistan,Good,25
21851,2025-10-30,Vatican,Good,42


In [8]:
# Parse the date strings once, then derive the calendar parts from the
# parsed values.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df['Year'] = parsed_dates.dt.year
df['Month'] = parsed_dates.dt.month
df['Day'] = parsed_dates.dt.day

In [9]:
# Encoder for the Status target; kept in a variable so its classes_ can be
# used later to map the integer codes back to names.
label_encoder = LabelEncoder()

# One-hot encode Country. drop_first=True omits the indicator column for the
# first country, which is then represented by all Country_* columns being False.
df2 = pd.get_dummies(data=df, columns=['Country'], drop_first=True)

# Integer-encode Status into a new Status2 column.
status_codes = label_encoder.fit_transform(df2['Status'])
df2['Status2'] = status_codes

In [10]:
# Display the encoded frame: one-hot Country_* columns plus the integer Status2 column.
df2

Unnamed: 0,Date,Status,AQI Value,Year,Month,Day,Country_Algeria,Country_Andorra,Country_Angola,Country_Argentina,...,Country_Ukraine,Country_United Arab Emirates,Country_United Kingdom of Great Britain and Northern Ireland,Country_United States of America,Country_Uzbekistan,Country_Vatican,Country_Venezuela,Country_Vietnam,Country_Zambia,Status2
0,2022-07-21,Good,14,2022,7,21,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
1,2022-07-21,Moderate,65,2022,7,21,True,False,False,False,...,False,False,False,False,False,False,False,False,False,2
2,2022-07-21,Moderate,55,2022,7,21,False,True,False,False,...,False,False,False,False,False,False,False,False,False,2
3,2022-07-21,Unhealthy for Sensitive Groups,113,2022,7,21,False,False,True,False,...,False,False,False,False,False,False,False,False,False,4
4,2022-07-21,Moderate,63,2022,7,21,False,False,False,True,...,False,False,False,False,False,False,False,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21848,2025-10-30,Good,47,2025,10,30,False,False,False,False,...,False,False,True,False,False,False,False,False,False,0
21849,2025-10-30,Moderate,73,2025,10,30,False,False,False,False,...,False,False,False,True,False,False,False,False,False,2
21850,2025-10-30,Good,25,2025,10,30,False,False,False,False,...,False,False,False,False,True,False,False,False,False,0
21851,2025-10-30,Good,42,2025,10,30,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0


In [11]:
# Show the class names in the order of their integer codes.
categories = list(label_encoder.classes_)
print(f"Categories: {categories}")

Categories: ['Good', 'Hazardous', 'Moderate', 'Unhealthy', 'Unhealthy for Sensitive Groups', 'Very Unhealthy']


In [12]:
# Compare the original Status labels with their encoded values on the first rows.
status_preview = df2[['Status', 'Status2']].head()
print(status_preview)

                           Status  Status2
0                            Good        0
1                        Moderate        2
2                        Moderate        2
3  Unhealthy for Sensitive Groups        4
4                        Moderate        2


Now we define the target y and the feature matrix X

In [13]:
# Target: the integer-encoded air-quality status.
y = df2['Status2']

In [14]:
# Features are everything except the raw date, the target in both its text
# and encoded forms, and the AQI value.
# NOTE(review): AQI Value is presumably excluded because Status is derived
# from it (target leakage) — confirm.
non_feature_columns = ['Date', 'Status', 'AQI Value', 'Status2']
X = df2.drop(columns=non_feature_columns)

In [15]:
# Preview the first 10 rows of the feature matrix.
X.head(10)

Unnamed: 0,Year,Month,Day,Country_Algeria,Country_Andorra,Country_Angola,Country_Argentina,Country_Armenia,Country_Australia,Country_Austria,...,Country_Uganda,Country_Ukraine,Country_United Arab Emirates,Country_United Kingdom of Great Britain and Northern Ireland,Country_United States of America,Country_Uzbekistan,Country_Vatican,Country_Venezuela,Country_Vietnam,Country_Zambia
0,2022,7,21,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2022,7,21,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2022,7,21,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2022,7,21,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2022,7,21,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,2022,7,21,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
6,2022,7,21,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
7,2022,7,21,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
8,2022,7,21,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,2022,7,21,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Now to split the data into training and testing sets

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

This is an 80:20 train/test split.

In [17]:
# (rows, features) of the training split.
X_train.shape

(17482, 144)

In [18]:
# (rows, features) of the test split.
X_test.shape

(4371, 144)

For starters, we'll go with a Decision Tree Classifier.

In [19]:
# Baseline model. DecisionTreeClassifier, accuracy_score and
# classification_report are imported in the notebook's top import cell.
# The fixed random_state keeps the fitted tree reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {acc * 100:.2f}%")

# Pass the encoded labels explicitly so every report row is tied to its class
# name. Without `labels`, classification_report infers them from
# y_test/y_pred and raises ValueError when a rare class (e.g. Hazardous) is
# missing from both, because the count no longer matches target_names.
target = label_encoder.classes_
report_labels = label_encoder.transform(target)
print(classification_report(y_test, y_pred, labels=report_labels, target_names=target))


Model Accuracy: 70.58%
                                precision    recall  f1-score   support

                          Good       0.83      0.81      0.82      2158
                     Hazardous       0.44      0.20      0.28        20
                      Moderate       0.67      0.67      0.67      1525
                     Unhealthy       0.42      0.51      0.46       181
Unhealthy for Sensitive Groups       0.46      0.48      0.47       445
                Very Unhealthy       0.17      0.14      0.15        42

                      accuracy                           0.71      4371
                     macro avg       0.50      0.47      0.47      4371
                  weighted avg       0.71      0.71      0.71      4371

