In [1]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot
from pandas.api import types
from catboost import CatBoostRegressor
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
data = pd.read_excel('pollution_data.xlsx')
data.drop(columns=['No'], axis=0, inplace=True)
data = data[24:]
print(data.head())
print(data.shape)

    year  month  day  hour  pollution  DEWP  TEMP    PRES cbwd   Iws  Is  Ir
24  2010      1    2     0      129.0   -16  -4.0  1020.0   SE  1.79   0   0
25  2010      1    2     1      148.0   -15  -4.0  1020.0   SE  2.68   0   0
26  2010      1    2     2      159.0   -11  -5.0  1021.0   SE  3.57   0   0
27  2010      1    2     3      181.0    -7  -5.0  1022.0   SE  5.36   1   0
28  2010      1    2     4      138.0    -7  -5.0  1022.0   SE  6.25   2   0
(43800, 12)


In [3]:
"""
Imputing Data
"""

data['pollution'].fillna(data['pollution'].mean(), inplace=True)
data.isna().sum()

year         0
month        0
day          0
hour         0
pollution    0
DEWP         0
TEMP         0
PRES         0
cbwd         0
Iws          0
Is           0
Ir           0
dtype: int64

In [6]:
"""
Defining Response and Target Variable
"""

covariates = [column for column in data.columns if column not in ['pollution']]
print(covariates)

response = 'pollution_next_hour'

data[response] = data['pollution'].shift(-1)
data.dropna(inplace=True)
data.head()

['year', 'month', 'day', 'hour', 'DEWP', 'TEMP', 'PRES', 'cbwd', 'Iws', 'Is', 'Ir', 'pollution_next_hour']


Unnamed: 0,year,month,day,hour,pollution,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir,pollution_next_hour
25,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0,159.0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0,181.0
27,2010,1,2,3,181.0,-7,-5.0,1022.0,SE,5.36,1,0,138.0
28,2010,1,2,4,138.0,-7,-5.0,1022.0,SE,6.25,2,0,109.0
29,2010,1,2,5,109.0,-7,-6.0,1022.0,SE,7.14,3,0,105.0


In [5]:
"""
Dividing Data Into Train and Validation
"""

records = data.shape[0]
train_index = int(records*0.6)
val_index = train_index + int(records*0.2)
print(f'Train Data Index: {train_index}...Validation Data Index: {val_index}')

data['cbwd'] = data['cbwd'].astype('category')

train_data = data[:train_index].reset_index()
val_data = data[train_index+1: val_index].reset_index()
test_data = data[val_index+1:].reset_index()
print(f'Train Data Shape {train_data.shape}...Validation Data Shape {val_data.shape}...Test Data Shape {test_data.shape}')

Train Data Index: 26279...Validation Data Index: 35038
Train Data Shape (26279, 14)...Validation Data Shape (8758, 14)...Test Data Shape (8760, 14)


In [9]:
"""
Reformatting Data
"""

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()

	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]

	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]

	agg = pd.concat(cols, axis=1)
	agg.columns = names

	if dropnan:
		agg.dropna(inplace=True)
	return agg

train_data_reformatted = series_to_supervised(train_data)
train_data_reformatted

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var9(t-1),var10(t-1),...,var5(t),var6(t),var7(t),var8(t),var9(t),var10(t),var11(t),var12(t),var13(t),var14(t)
1,25.0,2010.0,1.0,2.0,1.0,148.0,-15.0,-4.0,1020.0,SE,...,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0,148.0
2,26.0,2010.0,1.0,2.0,2.0,159.0,-11.0,-5.0,1021.0,SE,...,3,181.0,-7,-5.0,1022.0,SE,5.36,1,0,159.0
3,27.0,2010.0,1.0,2.0,3.0,181.0,-7.0,-5.0,1022.0,SE,...,4,138.0,-7,-5.0,1022.0,SE,6.25,2,0,181.0
4,28.0,2010.0,1.0,2.0,4.0,138.0,-7.0,-5.0,1022.0,SE,...,5,109.0,-7,-6.0,1022.0,SE,7.14,3,0,138.0
5,29.0,2010.0,1.0,2.0,5.0,109.0,-7.0,-6.0,1022.0,SE,...,6,105.0,-7,-6.0,1023.0,SE,8.93,4,0,109.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26274,26298.0,2012.0,12.0,31.0,18.0,103.0,-15.0,-2.0,1016.0,NW,...,19,104.0,-13,-5.0,1017.0,SE,0.89,0,0,103.0
26275,26299.0,2012.0,12.0,31.0,19.0,104.0,-13.0,-5.0,1017.0,SE,...,20,131.0,-14,-6.0,1017.0,cv,0.89,0,0,104.0
26276,26300.0,2012.0,12.0,31.0,20.0,131.0,-14.0,-6.0,1017.0,cv,...,21,113.0,-14,-9.0,1018.0,NW,1.79,0,0,131.0
26277,26301.0,2012.0,12.0,31.0,21.0,113.0,-14.0,-9.0,1018.0,NW,...,22,45.0,-12,-8.0,1018.0,cv,0.89,0,0,113.0
