In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
%matplotlib inline
tf.__version__

'2.18.0'

In [2]:
air_index = pd.read_csv(r"Datasets\city_day.csv")
air_index.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [3]:
air_index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [5]:
air_index.isnull().sum()

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64

In [6]:
air_index.corr(numeric_only= True)['AQI']

PM2.5      0.659181
PM10       0.803313
NO         0.452191
NO2        0.537071
NOx        0.486450
NH3        0.252019
CO         0.683346
SO2        0.490586
O3         0.198991
Benzene    0.044407
Toluene    0.279992
Xylene     0.165532
AQI        1.000000
Name: AQI, dtype: float64

In [15]:
## Higly correlated columns with correlation greater than 2
columns_lessthan_point_two = []
columns_greaterthan_point_two = []
for column, value in dict(air_index.corr(numeric_only=True)['AQI']).items():
    if value > 0.2:
        columns_greaterthan_point_two.append(column)
    else:
        columns_lessthan_point_two.append(column)
columns_lessthan_point_two.append('Date') ## Added Date column to drop the Date column from the dataset
columns_greaterthan_point_two, columns_lessthan_point_two

(['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'Toluene', 'AQI'],
 ['O3', 'Benzene', 'Xylene', 'Date'])

In [17]:
air_index_corr = air_index.drop(columns= columns_lessthan_point_two, axis=1)
air_index_corr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   PM2.5       24933 non-null  float64
 2   PM10        18391 non-null  float64
 3   NO          25949 non-null  float64
 4   NO2         25946 non-null  float64
 5   NOx         25346 non-null  float64
 6   NH3         19203 non-null  float64
 7   CO          27472 non-null  float64
 8   SO2         25677 non-null  float64
 9   Toluene     21490 non-null  float64
 10  AQI         24850 non-null  float64
 11  AQI_Bucket  24850 non-null  object 
dtypes: float64(10), object(2)
memory usage: 2.7+ MB


In [18]:
air_index_corr.isnull().sum()

City              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
Toluene        8041
AQI            4681
AQI_Bucket     4681
dtype: int64

In [19]:
## since NH3 has low correlation with AQI dropping NH3 might not be a problem
air_index_corr.drop(columns=['NH3'], axis=1, inplace= True)

In [23]:
## We need the rows where AQI is available 
air_index_corr = air_index_corr.dropna(subset=['AQI'])
air_index_corr.isnull().sum()

City             0
PM2.5          678
PM10          7086
NO             387
NO2            391
NOx           1857
CO             445
SO2            605
Toluene       5826
AQI              0
AQI_Bucket       0
dtype: int64

#### Filling missing values for columns otherthan PM10 with mean or mode in both missing and train data

In [24]:
columns_mean = ['PM2.5', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'Toluene']
for column in columns_mean:
    air_index_corr[column] = air_index_corr[column].mean()
air_index_corr.isnull().sum()

City             0
PM2.5            0
PM10          7086
NO               0
NO2              0
NOx              0
CO               0
SO2              0
Toluene          0
AQI              0
AQI_Bucket       0
dtype: int64

#### Encoding City and AQI_Bucket columns

In [30]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for column in ['City', 'AQI_Bucket']:
    air_index_corr[column] = le.fit_transform(air_index_corr[column])
air_index_corr.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,CO,SO2,Toluene,AQI,AQI_Bucket
28,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,209.0,2
29,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,328.0,5
30,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,514.0,4
31,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,782.0,4
32,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,914.0,4


### PM10 has high correlation with AQI dropping not the correct way
#### Building the model and fill the NAN values with predictions

In [31]:
PM10_df_train = air_index_corr.dropna(subset=['PM10'])  # Rows where PM10 is available
PM10_df_missing = air_index_corr[air_index_corr['PM10'].isna()]  # Rows where PM10 is missing
PM10_df_train.shape, PM10_df_missing.shape

((17764, 11), (7086, 11))

In [32]:
PM10_df_train.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,CO,SO2,Toluene,AQI,AQI_Bucket
1595,0,67.476613,122.41,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,281.0,2
1596,0,67.476613,116.32,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,330.0,5
1597,0,67.476613,130.07,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,356.0,5
1598,0,67.476613,138.31,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,359.0,5
1599,0,67.476613,111.73,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,547.0,4


In [33]:
PM10_df_missing.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,CO,SO2,Toluene,AQI,AQI_Bucket
28,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,209.0,2
29,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,328.0,5
30,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,514.0,4
31,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,782.0,4
32,0,67.476613,,17.622421,28.978391,32.289012,2.345267,14.362933,9.525714,914.0,4


#### Building a random forrest regression for predicting the PM10

In [34]:
from sklearn.ensemble import RandomForestRegressor 
X_train = PM10_df_train.drop(columns=['PM10'])
y_train = PM10_df_train['PM10']
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
X_missing = PM10_df_missing.drop(columns=['PM10'])
predicted_PM10 = model.predict(X_missing)

In [38]:
air_index_corr.loc[air_index_corr['PM10'].isna(), 'PM10'] = predicted_PM10
air_index_corr.isnull().sum()

City          0
PM2.5         0
PM10          0
NO            0
NO2           0
NOx           0
CO            0
SO2           0
Toluene       0
AQI           0
AQI_Bucket    0
dtype: int64

In [41]:
air_index_corr.corr()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,CO,SO2,Toluene,AQI,AQI_Bucket
City,1.0,,0.098108,,,,,,,-0.123456,-0.026826
PM2.5,,,,,,,,,,,
PM10,0.098108,,1.0,,,,,,,0.68213,0.382135
NO,,,,,,,,,,,
NO2,,,,,,,,,,,
NOx,,,,,,,,,,,
CO,,,,,,,,,,,
SO2,,,,,,,,,,,
Toluene,,,,,,,,,,,
AQI,-0.123456,,0.68213,,,,,,,1.0,0.451764
