In [81]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Merging Data

In [82]:
# Load the Asansol 1-hour readings, one CSV per calendar year (2018-2024).
# The seven paths differ only by the year, so they are built from a single
# template instead of repeating the read_csv call seven times.
frames_by_year = {
    year: pd.read_csv(f"data/City_wise_raw_data_1Hr_{year}_Asansol_1Hr.csv")
    for year in range(2018, 2025)
}

# Keep the per-year names used by the concatenation cell further down.
# (dicts preserve insertion order, so the values come out 2018 -> 2024.)
df_2018, df_2019, df_2020, df_2021, df_2022, df_2023, df_2024 = frames_by_year.values()

<b>Null Value Percentage of each year's dataset</b>
- 2018: 23%
- 2019 : 6%
- 2020 : 17%
- 2021 : 11%
- 2022 : 5%
- 2023 : 9%
- 2024 : 0.8%

According to CPCB, data with fewer than 30% null values can be used for forecasting; every year listed above is below that threshold.

In [83]:
# Stack the seven yearly frames in chronological order. ignore_index=True
# discards each file's own row labels and renumbers the result from 0.
frames_in_order = [df_2018, df_2019, df_2020, df_2021, df_2022, df_2023, df_2024]
df_concat = pd.concat(frames_in_order, ignore_index=True)

<b>We will be using 8 pollutants : PM2.5, PM10, NO, NO2, NOx, NH3, SO2, CO</b>

In [84]:
# Keep the timestamp plus the eight pollutant columns used in this analysis.
selected_columns = [
    "Timestamp",
    "PM2.5 (µg/m³)",
    "PM10 (µg/m³)",
    "NO (µg/m³)",
    "NO2 (µg/m³)",
    "NOx (ppb)",
    "NH3 (µg/m³)",
    "SO2 (µg/m³)",
    "CO (mg/m³)",
]
df_merged = df_concat[selected_columns]

In [85]:
# Show dtypes and non-null counts for the selected columns of the merged frame.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61368 entries, 0 to 61367
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Timestamp      61368 non-null  object 
 1   PM2.5 (µg/m³)  55570 non-null  float64
 2   PM10 (µg/m³)   55892 non-null  float64
 3   NO (µg/m³)     54961 non-null  float64
 4   NO2 (µg/m³)    54911 non-null  float64
 5   NOx (ppb)      54987 non-null  float64
 6   NH3 (µg/m³)    54880 non-null  float64
 7   SO2 (µg/m³)    55725 non-null  float64
 8   CO (mg/m³)     56194 non-null  float64
dtypes: float64(8), object(1)
memory usage: 4.2+ MB


In [86]:
# Preview the first three rows of the merged frame.
df_merged.head(3)

Unnamed: 0,Timestamp,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³)
0,2018-01-01 00:00:00,,,,,,,,
1,2018-01-01 01:00:00,,,,,,,,
2,2018-01-01 02:00:00,,,,,,,,


In [87]:
# Preview the last five rows (the default for tail()) of the merged frame.
df_merged.tail()

Unnamed: 0,Timestamp,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³)
61363,2024-12-31 19:00:00,76.46,155.91,7.84,31.64,22.93,22.94,7.44,0.89
61364,2024-12-31 20:00:00,76.9,153.72,7.24,30.77,22.0,22.46,7.28,0.74
61365,2024-12-31 21:00:00,69.65,141.51,6.76,22.8,17.39,20.95,7.69,0.76
61366,2024-12-31 22:00:00,61.91,128.83,6.14,18.78,14.76,20.32,7.04,0.62
61367,2024-12-31 23:00:00,60.65,118.69,5.87,19.58,14.97,20.58,6.62,0.57


<b>Unit of Pollutants</b>  

NOx : ppb - parts per billion  
CO : mg/m³ - milligram per cubic meter  
Remaining 6 pollutants : µg/m³ - microgram per cubic meter  

1 mg = 1000 µg

In [88]:
# Write the merged frame to disk.
# NOTE(review): with the default index=True the RangeIndex is also written as an
# unnamed first column — confirm whether readers of this file rely on that;
# otherwise pass index=False.
df_merged.to_csv("data/Asansol Merged.csv")

# Basic Preprocessing

In [116]:
# Work on a copy so that preprocessing steps on `df` do not modify df_merged.
df = df_merged.copy()

In [117]:
# Parse the timestamps and use them as the index.
# `df` is rebuilt from `df_merged` in a single chain so this cell can be re-run
# safely: the previous in-place version raised KeyError on a second run, because
# "Timestamp" had already been moved out of the columns and into the index.
df = (
    df_merged
    .assign(Timestamp=pd.to_datetime(df_merged["Timestamp"]))
    .set_index("Timestamp")
)

# duplicated() compares column values only (the index is not considered), so
# this counts rows whose eight readings repeat those of an earlier row.
df.duplicated().sum()

4550

All 4550 duplicated rows are rows in which every pollutant column is null (`duplicated()` compares column values only, not the Timestamp index).

In [118]:
# Show the index range, dtypes and non-null counts after the timestamp became the index.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 61368 entries, 2018-01-01 00:00:00 to 2024-12-31 23:00:00
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PM2.5 (µg/m³)  55570 non-null  float64
 1   PM10 (µg/m³)   55892 non-null  float64
 2   NO (µg/m³)     54961 non-null  float64
 3   NO2 (µg/m³)    54911 non-null  float64
 4   NOx (ppb)      54987 non-null  float64
 5   NH3 (µg/m³)    54880 non-null  float64
 6   SO2 (µg/m³)    55725 non-null  float64
 7   CO (mg/m³)     56194 non-null  float64
dtypes: float64(8)
memory usage: 4.2 MB


In [125]:
# Replace the unit-suffixed column headers with the bare pollutant names.
short_names = {
    "PM2.5 (µg/m³)": "PM2.5",
    "PM10 (µg/m³)": "PM10",
    "NO (µg/m³)": "NO",
    "NO2 (µg/m³)": "NO2",
    "NOx (ppb)": "NOx",
    "NH3 (µg/m³)": "NH3",
    "SO2 (µg/m³)": "SO2",
    "CO (mg/m³)": "CO",
}
df = df.rename(columns=short_names)

In [126]:
# Preview the first five rows (the default for head()) with the shortened column names.
df.head()

Unnamed: 0_level_0,PM2.5,PM10,NO,NO2,NOx,NH3,SO2,CO
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01 00:00:00,,,,,,,,
2018-01-01 01:00:00,,,,,,,,
2018-01-01 02:00:00,,,,,,,,
2018-01-01 03:00:00,,,,,,,,
2018-01-01 04:00:00,,,,,,,,
