In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the air-quality CSV into a DataFrame. The path is relative, so the file
# has to be in the notebook's working directory.
df = pd.read_csv("air-quality-india.csv")

In [3]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,Timestamp,Year,Month,Day,Hour,PM2.5
0,2017-11-07 12:00:00,2017,11,7,12,64.51
1,2017-11-07 13:00:00,2017,11,7,13,69.95
2,2017-11-07 14:00:00,2017,11,7,14,92.79
3,2017-11-07 15:00:00,2017,11,7,15,109.66
4,2017-11-07 16:00:00,2017,11,7,16,116.5


In [4]:
# Discard every row that has no PM2.5 reading; all other columns are untouched.
df = df.dropna(subset=["PM2.5"])

In [5]:
# Look at the head again after removing rows without a PM2.5 value.
df.head()

Unnamed: 0,Timestamp,Year,Month,Day,Hour,PM2.5
0,2017-11-07 12:00:00,2017,11,7,12,64.51
1,2017-11-07 13:00:00,2017,11,7,13,69.95
2,2017-11-07 14:00:00,2017,11,7,14,92.79
3,2017-11-07 15:00:00,2017,11,7,15,109.66
4,2017-11-07 16:00:00,2017,11,7,16,116.5


In [6]:
# Summarise the column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36192 entries, 0 to 36191
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  36192 non-null  object 
 1   Year       36192 non-null  int64  
 2   Month      36192 non-null  int64  
 3   Day        36192 non-null  int64  
 4   Hour       36192 non-null  int64  
 5   PM2.5      36192 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 1.9+ MB


In [7]:
# Keep only the calendar fields and the PM2.5 target column; any other column
# (such as Timestamp) is left out of the working frame from here on.
df = df.loc[:, ["Year", "Month", "Day", "Hour", "PM2.5"]]
df.head()

Unnamed: 0,Year,Month,Day,Hour,PM2.5
0,2017,11,7,12,64.51
1,2017,11,7,13,69.95
2,2017,11,7,14,92.79
3,2017,11,7,15,109.66
4,2017,11,7,16,116.5


In [8]:
# Drop any row that still contains a missing value in any column, then show
# the per-column null counts as a check.
df = df.dropna()
df.isnull().sum()

Year     0
Month    0
Day      0
Hour     0
PM2.5    0
dtype: int64

In [9]:
# Number of rows per year (sorted by count, most frequent first).
df['Year'].value_counts()

2020    8356
2021    8283
2019    7685
2018    7537
2022    3194
2017    1137
Name: Year, dtype: int64

In [10]:
# Number of rows per calendar month.
df['Month'].value_counts()

1     3546
3     3529
12    3428
2     3250
5     3212
11    3203
4     3083
10    2814
6     2743
9     2495
8     2492
7     2397
Name: Month, dtype: int64

In [11]:
# Number of rows per day of the month.
df['Day'].value_counts()

2     1249
3     1230
1     1229
24    1225
26    1224
21    1222
22    1221
11    1221
27    1215
4     1214
23    1212
25    1206
12    1192
5     1189
7     1188
20    1185
6     1184
10    1182
28    1181
13    1175
9     1169
19    1168
14    1163
15    1150
8     1141
17    1123
18    1120
29    1120
16    1116
30    1085
31     693
Name: Day, dtype: int64

In [12]:
# Number of rows per hour of the day.
df['Hour'].value_counts()

8     1524
6     1523
5     1519
1     1518
23    1517
2     1515
9     1514
4     1514
7     1512
20    1512
10    1511
3     1510
22    1509
0     1506
11    1506
14    1504
12    1502
13    1501
19    1499
17    1499
15    1499
16    1495
21    1492
18    1491
Name: Hour, dtype: int64

In [13]:
# Distinct month values present in the data, in order of first appearance.
df['Month'].unique()

array([11, 12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [14]:
# Distinct day-of-month values present in the data, in order of first appearance.
df['Day'].unique()

array([ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
       24, 25, 26, 27, 28, 29, 30,  1,  2,  3,  4,  5,  6, 31],
      dtype=int64)

In [15]:
def organize_days_in_a_month(x):
    """Map a day-of-month number to a coarse period-of-month label.

    Parameters
    ----------
    x : number
        Day of the month.

    Returns
    -------
    str or None
        'Start days of the Month' for 1-10, 'Mid days of the Month' for
        11-20 and 'Late days of the Month' for 21-31 (all bounds inclusive).
        Any value outside these ranges yields ``None``.
    """
    periods = (
        (1, 10, 'Start days of the Month'),
        (11, 20, 'Mid days of the Month'),
        (21, 31, 'Late days of the Month'),
    )
    for first, last, label in periods:
        if first <= x <= last:
            return label
    # No range matched: same implicit result as falling off the if/elif chain.
    return None
    
# Replace the numeric day-of-month with its period label. This overwrites the
# column in place and the helper compares its input against integers, so the
# cell is meant to be run once on the numeric column.
df['Day'] = df['Day'].apply(organize_days_in_a_month)

In [16]:
# Check which period labels the Day column now contains.
df['Day'].unique()

array(['Start days of the Month', 'Mid days of the Month',
       'Late days of the Month'], dtype=object)

In [17]:
# Distinct hour values present in the data, in order of first appearance.
df['Hour'].unique()

array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,
        5,  6,  7,  8,  9, 10, 11], dtype=int64)

In [19]:
def organize_hours_in_a_day(x):
    """Map an hour-of-day number to a named part of the day.

    Parameters
    ----------
    x : number
        Hour of the day.

    Returns
    -------
    str or None
        'Late Night' (0-3), 'Early Morning' (4-6), 'Morning' (7-11),
        'Afternoon' (12-15), 'Evening' (16-19) or 'Night' (20-23); all bounds
        are inclusive. Any value outside these ranges yields ``None``.
    """
    day_parts = (
        (7, 11, 'Morning'),
        (12, 15, 'Afternoon'),
        (16, 19, 'Evening'),
        (20, 23, 'Night'),
        (0, 3, 'Late Night'),
        (4, 6, 'Early Morning'),
    )
    for start_hour, end_hour, label in day_parts:
        if start_hour <= x <= end_hour:
            return label
    # No range matched: same implicit result as falling off the if/elif chain.
    return None
    
# Replace the numeric hour with its part-of-day label. This overwrites the
# column in place and the helper compares its input against integers, so the
# cell is meant to be run once on the numeric column.
df['Hour'] = df['Hour'].apply(organize_hours_in_a_day)

In [20]:
# Check which part-of-day labels the Hour column now contains.
df['Hour'].unique()

array(['Afternoon', 'Evening', 'Night', 'Late Night', 'Early Morning',
       'Morning'], dtype=object)

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Turn the day-period labels into integer codes. In the recorded run the
# 'Start', 'Mid' and 'Late' labels come out as 2, 1 and 0 respectively, so the
# codes do not follow calendar order. The fitted encoder is reused further
# down to encode new inputs and is saved alongside the model.
le_day = LabelEncoder()
df['Day'] = le_day.fit_transform(df['Day'])
df['Day'].unique()

array([2, 1, 0])

In [23]:
# Turn the part-of-day labels into integer codes. In the recorded run:
# Afternoon=0, Early Morning=1, Evening=2, Late Night=3, Morning=4, Night=5,
# so the codes do not follow clock order. The fitted encoder is reused further
# down to encode new inputs and is saved alongside the model.
le_hour = LabelEncoder()
df['Hour'] = le_hour.fit_transform(df['Hour'])
df['Hour'].unique()

array([0, 2, 5, 3, 1, 4])

In [24]:
# Features are every column except the target; the target is the PM2.5 column.
x = df.drop(columns="PM2.5")
y = df["PM2.5"]

In [25]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression fitted on every available row
# (no hold-out split is made in the cells above).
linear_reg = LinearRegression()
linear_reg.fit(x, y.values)

In [26]:
# Predict on the same rows the model was fitted on (in-sample predictions).
y_pred = linear_reg.predict(x)

In [27]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# Root-mean-squared error between the targets and the current y_pred.
# NOTE: y_pred was produced from the fitting data, so this is a training
# error rather than a held-out estimate.
error = np.sqrt(mean_squared_error(y, y_pred))

In [28]:
# Show the RMSE of the linear regression.
error

24.204773819186624

In [29]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with a fixed seed and no depth limit specified, fitted on
# every available row.
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(x, y.values)

In [30]:
# In-sample predictions from the decision tree (same rows used for fitting).
y_pred = dec_tree_reg.predict(x)

In [31]:
# RMSE of the decision tree on its own fitting data (training error).
error = np.sqrt(mean_squared_error(y, y_pred))

In [32]:
# Show the RMSE of the decision tree.
error

8.965272990614206

In [33]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with a fixed seed and otherwise default settings, fitted on
# every available row.
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(x, y.values)

In [34]:
# In-sample predictions from the random forest (same rows used for fitting).
y_pred = random_forest_reg.predict(x)

In [35]:
# RMSE of the random forest on its own fitting data (training error).
error = np.sqrt(mean_squared_error(y, y_pred))
error

8.966709722344904

In [36]:
# Display the feature matrix: Year, Month and the encoded Day and Hour codes.
x

Unnamed: 0,Year,Month,Day,Hour
0,2017,11,2,0
1,2017,11,2,0
2,2017,11,2,0
3,2017,11,2,0
4,2017,11,2,2
...,...,...,...,...
36187,2022,6,2,4
36188,2022,6,2,0
36189,2022,6,2,0
36190,2022,6,2,0


In [37]:
from sklearn.model_selection import GridSearchCV

# Candidate depth limits for the tree; None leaves the depth unrestricted.
max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

# Grid search over those depths. The scorer is the negated mean squared error
# because GridSearchCV selects the highest score; the cross-validation scheme
# is left at the library default.
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(x, y.values)

In [38]:
# GridSearchCV was created without overriding `refit`, so after gs.fit() the
# best parameter combination has already been refitted on all of (x, y).
# best_estimator_ is therefore ready to use; fitting it again on the same data
# would only repeat that work.
regressor = gs.best_estimator_

# RMSE of the selected tree on the rows it was fitted on. This is a training
# error, not a held-out estimate.
y_pred = regressor.predict(x)
error = np.sqrt(mean_squared_error(y, y_pred))
error

9.92365060132157

In [39]:
# Display the feature matrix again; its column order is Year, Month, Day, Hour.
x

Unnamed: 0,Year,Month,Day,Hour
0,2017,11,2,0
1,2017,11,2,0
2,2017,11,2,0
3,2017,11,2,0
4,2017,11,2,2
...,...,...,...,...
36187,2022,6,2,4
36188,2022,6,2,0
36189,2022,6,2,0
36190,2022,6,2,0


In [47]:
# One query row in the feature order Year, Month, Day label, Hour label.
# Mixing numbers and strings makes NumPy store every entry as a string
# (see the '<U21' dtype in the output).
z = np.array([[2019, 6, "Mid days of the Month", "Morning"]])
z

array([['2019', '6', 'Mid days of the Month', 'Morning']], dtype='<U21')

In [48]:
# Replace the two label columns with the integer codes of the fitted encoders
# (written back into the string-typed array), then cast the whole row to int
# so it matches the numeric layout of the feature matrix.
z[:, 2] = le_day.transform(z[:,2])
z[:, 3] = le_hour.transform(z[:,3])
z = z.astype(int)
z

array([[2019,    6,    1,    4]])

In [49]:
# Predict PM2.5 for the single encoded query row. The model was fitted on the
# DataFrame `x`, so the row is passed with the same column labels. That lets
# scikit-learn check feature names and order, and avoids the mismatch warning
# a bare ndarray triggers on versions that record feature names.
# The predicted value is unchanged.
y_pred = regressor.predict(pd.DataFrame(z, columns=x.columns))
y_pred



array([29.69709184])

In [50]:
import pickle

In [51]:
# Bundle the selected model with both fitted label encoders and write them to
# a single pickle file, so prediction code can restore all three together.
data = {"model": regressor, "le_day": le_day, "le_hour": le_hour}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [52]:
# Read the bundle back from disk.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# only load pickle files that you created yourself or otherwise trust.
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

# Unpack the bundle. le_day and le_hour are rebound to the unpickled copies.
regressor_loaded = data["model"]
le_day = data["le_day"]
le_hour = data["le_hour"]

In [53]:
# Check that the reloaded model can predict on the full feature matrix.
y_pred = regressor_loaded.predict(x)
y_pred

array([100.55   , 100.55   , 100.55   , ...,  41.61125,  41.61125,
        41.61125])

In [54]:
# Predict the single encoded query row with the reloaded model. The model was
# fitted on the DataFrame `x`, so the row is passed with the same column
# labels. That lets scikit-learn check feature names and order, and avoids the
# mismatch warning a bare ndarray triggers on versions that record feature
# names. The predicted value is unchanged.
y_pred = regressor_loaded.predict(pd.DataFrame(z, columns=x.columns))
y_pred



array([29.69709184])