## EDA and classification prediction on AQI India data

This notebook is maintained as a solution to the Kaggle dataset: https://www.kaggle.com/rohanrao/air-quality-data-in-india.

In [26]:
#import standard data sci libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

!pip install chart_studio
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')



For convenience purposes, I have saved a copy of the dataset in my github repo: https://github.com/adityarc19/aqi-india/blob/main/city_day.csv

In [65]:
df = pd.read_csv('https://github.com/adityarc19/aqi-india/blob/main/city_day.csv?raw=true')
df

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.00,0.02,0.00,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.50,3.77,,
2,Ahmedabad,2015-01-03,,,17.40,19.30,29.70,,17.40,29.07,30.70,6.80,16.40,2.25,,
3,Ahmedabad,2015-01-04,,,1.70,18.48,17.97,,1.70,18.59,36.08,4.43,10.14,1.00,,
4,Ahmedabad,2015-01-05,,,22.10,21.42,37.76,,22.10,39.33,39.31,7.01,18.89,2.78,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.00,0.00,54.0,Satisfactory


In [3]:
# show columns
df.columns

Index(['City', 'Date', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2',
       'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket'],
      dtype='object')

In [4]:
df.info()  # more info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [5]:
# null counts
df.isnull().sum()

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64

In [6]:
df.describe()  # data statistics

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
count,24933.0,18391.0,25949.0,25946.0,25346.0,19203.0,27472.0,25677.0,25509.0,23908.0,21490.0,11422.0,24850.0
mean,67.450578,118.127103,17.57473,28.560659,32.309123,23.483476,2.248598,14.531977,34.49143,3.28084,8.700972,3.070128,166.463581
std,64.661449,90.60511,22.785846,24.474746,31.646011,25.684275,6.962884,18.133775,21.694928,15.811136,19.969164,6.323247,140.696585
min,0.04,0.01,0.02,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,13.0
25%,28.82,56.255,5.63,11.75,12.82,8.58,0.51,5.67,18.86,0.12,0.6,0.14,81.0
50%,48.57,95.68,9.89,21.69,23.52,15.85,0.89,9.16,30.84,1.07,2.97,0.98,118.0
75%,80.59,149.745,19.95,37.62,40.1275,30.02,1.45,15.22,45.57,3.08,9.15,3.35,208.0
max,949.99,1000.0,390.68,362.21,467.63,352.89,175.81,193.86,257.73,455.03,454.85,170.37,2049.0


In [21]:
# convert 'Date' to datetime format
df['Date'] = pd.to_datetime(df['Date'])
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


#### Let's look at the missing values

In [20]:
# Missing values
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

missing_values= missing_values_table(df)
missing_values.style.background_gradient(cmap='Reds')

Your selected dataframe has 16 columns.
There are 14 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Xylene,18109,61.3
PM10,11140,37.7
NH3,10328,35.0
Toluene,8041,27.2
Benzene,5623,19.0
AQI,4681,15.9
AQI_Bucket,4681,15.9
PM2.5,4598,15.6
NOx,4185,14.2
O3,4022,13.6


#### Now lets look at a few city wise pollution stats

In [33]:
df1 = df.copy()
df1['PM2.5']=df1['PM2.5'].fillna((df1['PM2.5'].median()))
df1['PM10']=df1['PM10'].fillna((df1['PM10'].median()))
df1['NO']=df1['NO'].fillna((df1['NO'].median()))
df1['NO2']=df1['NO2'].fillna((df1['NO2'].median()))
df1['NOx']=df1['NOx'].fillna((df1['NOx'].median()))
df1['NH3']=df1['NH3'].fillna((df1['NH3'].median()))
df1['CO']=df1['CO'].fillna((df1['CO'].median()))
df1['SO2']=df1['SO2'].fillna((df1['SO2'].median()))
df1['O3']=df1['O3'].fillna((df1['O3'].median()))
df1['Benzene']=df1['Benzene'].fillna((df1['Benzene'].median()))
df1['Toluene']=df1['Toluene'].fillna((df1['Toluene'].median()))
df1['Xylene']=df1['Xylene'].fillna((df1['Xylene'].median()))
df1['AQI']=df1['AQI'].fillna((df1['AQI'].median()))
df1['AQI_Bucket']=df1['AQI_Bucket'].fillna('Moderate')

In [34]:
df = df1.copy()

In [35]:
df['City'].unique()

array(['Ahmedabad', 'Aizawl', 'Amaravati', 'Amritsar', 'Bengaluru',
       'Bhopal', 'Brajrajnagar', 'Chandigarh', 'Chennai', 'Coimbatore',
       'Delhi', 'Ernakulam', 'Gurugram', 'Guwahati', 'Hyderabad',
       'Jaipur', 'Jorapokhar', 'Kochi', 'Kolkata', 'Lucknow', 'Mumbai',
       'Patna', 'Shillong', 'Talcher', 'Thiruvananthapuram',
       'Visakhapatnam'], dtype=object)

In [36]:
df['City'].value_counts()

Ahmedabad             2009
Chennai               2009
Bengaluru             2009
Delhi                 2009
Mumbai                2009
Lucknow               2009
Hyderabad             2006
Patna                 1858
Gurugram              1679
Visakhapatnam         1462
Amritsar              1221
Jorapokhar            1169
Jaipur                1114
Thiruvananthapuram    1112
Amaravati              951
Brajrajnagar           938
Talcher                925
Kolkata                814
Guwahati               502
Coimbatore             386
Shillong               310
Chandigarh             304
Bhopal                 289
Ernakulam              162
Kochi                  162
Aizawl                 113
Name: City, dtype: int64

In [37]:
df['Pollution content'] = df['PM2.5']+df['PM10']+df['NO']+df['NO2']+df['NOx']+df['NH3']+df['CO']+df['SO2']+df['O3']+df['Benzene']+df['Toluene']+df['Xylene']

In [38]:
def ploting(var):
    df[var].iplot(title=var,xTitle='Cities',yTitle=var, linecolor='black', )
    plt.show()
ploting('Pollution content')

In [40]:
def max_bar_plot(var):
    x1 = df[['City',var]].groupby(["City"]).median().sort_values(by = var,
    ascending = True).tail(10).iplot(kind='bar', xTitle='Cities',yTitle=var, 
                                     linecolor='black', title='{2} {1} {0}'.format(")",var,' Most polluted cities('))

p = max_bar_plot('Pollution content')

In [43]:
def min_bar_plot(var):
    x1 = df[['City',var]].groupby(["City"]).mean().sort_values(by = var,
    ascending = True).head(10).iplot(kind='bar', xTitle='Cities',yTitle=var, linecolor='black',title='{2} {1} {0}'.format(")",var,' Minimum polluted cities('))
p1 = min_bar_plot('Pollution content')

### EDA using Pandas Profiling 

In [44]:
import pandas_profiling

In [66]:
data = pd.read_csv('https://github.com/adityarc19/aqi-india/blob/main/city_day.csv?raw=true')
data

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.00,0.02,0.00,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.50,3.77,,
2,Ahmedabad,2015-01-03,,,17.40,19.30,29.70,,17.40,29.07,30.70,6.80,16.40,2.25,,
3,Ahmedabad,2015-01-04,,,1.70,18.48,17.97,,1.70,18.59,36.08,4.43,10.14,1.00,,
4,Ahmedabad,2015-01-05,,,22.10,21.42,37.76,,22.10,39.33,39.31,7.01,18.89,2.78,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.00,0.00,54.0,Satisfactory


In [62]:
profile = data.profile_report(title="EDA", explorative = True)
profile.to_file(output_file="EDA.html")

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=30.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [67]:
#run the code below to view the EDA report
#profile

### Prediction of AQI Bucket using PyCaret

In [52]:
from pycaret.classification import *

In [53]:
reg = setup(data = data, 
             target = 'AQI_Bucket',
             silent = True)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,3456
1,Target Type,Multiclass
2,Label Encoded,"Good: 0, Moderate: 1, Poor: 2, Satisfactory: 3, Severe: 4, Very Poor: 5"
3,Original Data,"(29531, 16)"
4,Missing Values,True
5,Numeric Features,13
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [54]:
compare_models()   #compare all models

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Decision Tree Classifier,1.0,0.0,1.0,1.0,1.0,1.0
1,Ada Boost Classifier,1.0,0.0,1.0,1.0,1.0,1.0
2,Gradient Boosting Classifier,1.0,0.0,1.0,1.0,1.0,1.0
3,Extreme Gradient Boosting,1.0,0.0,1.0,1.0,1.0,1.0
4,Light Gradient Boosting Machine,0.998,0.0,0.9969,0.9981,0.998,0.9973
5,Random Forest Classifier,0.9976,0.0,0.9956,0.9977,0.9976,0.9968
6,CatBoost Classifier,0.9966,0.0,0.9944,0.9966,0.9966,0.9953
7,K Neighbors Classifier,0.9342,0.0,0.9188,0.9344,0.9341,0.9106
8,Extra Trees Classifier,0.8828,0.0,0.8355,0.8826,0.8812,0.8396
9,Logistic Regression,0.7424,0.0,0.6109,0.7263,0.7138,0.6357


In [56]:
dt_model = create_model('dt')   #Model creation

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,1.0,0.0,1.0,1.0,1.0,1.0
1,1.0,0.0,1.0,1.0,1.0,1.0
2,1.0,0.0,1.0,1.0,1.0,1.0
3,1.0,0.0,1.0,1.0,1.0,1.0
4,1.0,0.0,1.0,1.0,1.0,1.0
5,1.0,0.0,1.0,1.0,1.0,1.0
6,1.0,0.0,1.0,1.0,1.0,1.0
7,1.0,0.0,1.0,1.0,1.0,1.0
8,1.0,0.0,1.0,1.0,1.0,1.0
9,1.0,0.0,1.0,1.0,1.0,1.0


In [68]:
#run the code below to evaluate model:
#evaluate_model(dt_model)

In [60]:
# save the model
save_model(dt_model, 'decision_tree_model')

Transformation Pipeline and Model Succesfully Saved


In [61]:
# # load model
# dt_model = load_model(model_name='decision_tree_model')