# Pollution In India

## Importing Relevant Libraries

In [190]:
#Standard Library
import os

#Relevant Analysis Libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm

#Relevant Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Applying Seaborn's default theme so that subsequent plots use Seaborn styling
sns.set()

## Loading Data

In [191]:
#Reading the .csv file and saving it as data_raw
#The file location can be overridden through the POLLUTION_DATA_PATH environment variable,
#so the notebook is not tied to a single machine; the original local path stays as the default
DATA_PATH = os.environ.get('POLLUTION_DATA_PATH', 'C:\\Users\\DELL\\Desktop\\Data_Akshay\\pollution_india_2010.csv')
data_raw = pd.read_csv(DATA_PATH)

# 1. Exploring and Visualizing the Loaded Data

## 1.1. Data Exploration

In [192]:
#Previewing the first 10 rows of the raw Data Set
data_raw.head(n=10)

Unnamed: 0,City,NO2,PM10,SO2,State
0,Chitoor,9,39,4,Andhra Pradesh
1,Guntur,11,81,2,Andhra Pradesh
2,Hydrabad,24,79,5,Andhra Pradesh
3,Kothagudem,11,62,2,Andhra Pradesh
4,Kurnool,9,85,4,Andhra Pradesh
5,Nalgonda,23,85,5,Andhra Pradesh
6,Nellore,12,65,2,Andhra Pradesh
7,Patencheru,23,76,11,Andhra Pradesh
8,Ramagundam,12,68,4,Andhra Pradesh
9,Tirupati,9,37,4,Andhra Pradesh


In [193]:
#Previewing the last 10 rows of the raw Data Set
data_raw.tail(n=10)

Unnamed: 0,City,NO2,PM10,SO2,State
171,Rishikesh,Null,212,Null,Uttarakhand
172,Asansol,66,141,8,West Bengal
173,Barrackpore,74,121,12,West Bengal
174,Durgapur,66,141,8,West Bengal
175,Haldia,52,57,14,West Bengal
176,Howrah,75,118,12,West Bengal
177,Kolkata,62,99,11,West Bengal
178,Raniganj,63,159,8,West Bengal
179,Sankrail,65,100,10,West Bengal
180,South Suburban,56,82,7,West Bengal


In [194]:
#Listing the column labels of the raw Data Set
data_raw.columns

Index(['City', 'NO2', 'PM10', 'SO2', 'State'], dtype='object')

In [195]:
#Inspecting the data type pandas inferred for each column of the raw Data Set
data_raw.dtypes

City     object
NO2      object
PM10     object
SO2      object
State    object
dtype: object

In [196]:
#Exploring the Descriptive Statistics of every column in the Data Set
#include= 'all' summarises non-numeric columns as well (count, unique, top, freq)
data_raw.describe(include= 'all')

Unnamed: 0,City,NO2,PM10,SO2,State
count,181,181,181,181,181
unique,181,54,113,29,29
top,Aurangabad,15,58,2,Maharashtra
freq,1,10,6,24,19


# 2. Feature Engineering

## 2.1. Dealing with Missing Values

In [197]:
#Counting the missing (NaN) entries in each column of the Data Set
data_raw.isna().sum()

City     0
NO2      0
PM10     0
SO2      0
State    0
dtype: int64

In [198]:
#Looking at the last 10 rows again, where values are entered as the text 'Null'
data_raw.tail(n=10)

Unnamed: 0,City,NO2,PM10,SO2,State
171,Rishikesh,Null,212,Null,Uttarakhand
172,Asansol,66,141,8,West Bengal
173,Barrackpore,74,121,12,West Bengal
174,Durgapur,66,141,8,West Bengal
175,Haldia,52,57,14,West Bengal
176,Howrah,75,118,12,West Bengal
177,Kolkata,62,99,11,West Bengal
178,Raniganj,63,159,8,West Bengal
179,Sankrail,65,100,10,West Bengal
180,South Suburban,56,82,7,West Bengal


### Observations
1. A few missing values are entered as the text "Null"; these must be addressed in order to obtain clean data.

In [199]:
#Addressing all the missing values entered as Null:
#each pollutant column is parsed as numeric, and any entry that cannot be parsed becomes NaN
data_raw[['NO2', 'PM10', 'SO2']] = data_raw[['NO2', 'PM10', 'SO2']].apply(pd.to_numeric, errors='coerce')

In [200]:
#Re-counting the missing entries per column after the numeric conversion
data_raw.isna().sum()

City     0
NO2      4
PM10     1
SO2      5
State    0
dtype: int64

In [201]:
#Checking the last 10 rows after the numeric conversion
data_raw.tail(n=10)

Unnamed: 0,City,NO2,PM10,SO2,State
171,Rishikesh,,212.0,,Uttarakhand
172,Asansol,66.0,141.0,8.0,West Bengal
173,Barrackpore,74.0,121.0,12.0,West Bengal
174,Durgapur,66.0,141.0,8.0,West Bengal
175,Haldia,52.0,57.0,14.0,West Bengal
176,Howrah,75.0,118.0,12.0,West Bengal
177,Kolkata,62.0,99.0,11.0,West Bengal
178,Raniganj,63.0,159.0,8.0,West Bengal
179,Sankrail,65.0,100.0,10.0,West Bengal
180,South Suburban,56.0,82.0,7.0,West Bengal


In [202]:
#Mean of each pollutant column
#numeric_only=True skips the text columns (City, State); without it pandas 2.0+ raises a TypeError
data_raw.mean(numeric_only=True)

NO2      24.112994
PM10    108.111111
SO2       9.971591
dtype: float64

In [203]:
#Median of each pollutant column
#numeric_only=True skips the text columns (City, State); without it pandas 2.0+ raises a TypeError
data_raw.median(numeric_only=True)

NO2     20.0
PM10    89.5
SO2      7.5
dtype: float64

In [204]:
#Filling the missing pollutant readings with the median of their own column;
#the result is stored in a new frame so that data_raw keeps its NaN entries
data_with_no_mv = data_raw.fillna(data_raw[['NO2', 'PM10', 'SO2']].median())

In [205]:
#Confirming that no missing entries remain after the median fill
data_with_no_mv.isna().sum()

City     0
NO2      0
PM10     0
SO2      0
State    0
dtype: int64

In [206]:
#Checking the last 10 rows after the median fill
data_with_no_mv.tail(n=10)

Unnamed: 0,City,NO2,PM10,SO2,State
171,Rishikesh,20.0,212.0,7.5,Uttarakhand
172,Asansol,66.0,141.0,8.0,West Bengal
173,Barrackpore,74.0,121.0,12.0,West Bengal
174,Durgapur,66.0,141.0,8.0,West Bengal
175,Haldia,52.0,57.0,14.0,West Bengal
176,Howrah,75.0,118.0,12.0,West Bengal
177,Kolkata,62.0,99.0,11.0,West Bengal
178,Raniganj,63.0,159.0,8.0,West Bengal
179,Sankrail,65.0,100.0,10.0,West Bengal
180,South Suburban,56.0,82.0,7.0,West Bengal


In [241]:
#Counting the distinct values in the State column
#(only the last expression of a cell is displayed, so the unique values are computed once, inside len)
len(pd.unique(data_with_no_mv['State']))

29

In [239]:
#Counting the total number of rows in the Data Set
len(data_with_no_mv.index)

181

In [207]:
#Descriptive Statistics of every column after the median fill
#include= 'all' summarises the text columns (City, State) alongside the numeric ones
data_with_no_mv.describe(include= 'all')

Unnamed: 0,City,NO2,PM10,SO2,State
count,181,181.0,181.0,181.0,181
unique,181,,,,29
top,Aurangabad,,,,Maharashtra
freq,1,,,,19
mean,,24.022099,108.008287,9.903315,
std,,14.54646,60.762115,8.186221,
min,,5.0,27.0,2.0,
25%,,15.0,65.0,5.0,
50%,,20.0,89.5,7.5,
75%,,29.0,135.0,13.0,


In [208]:
#Casting the pollutant columns to integers and the label columns to strings, one column at a time
#Note: the integer cast drops the fractional part, so a filled-in median such as 7.5 is stored as 7
target_dtypes = {'NO2': 'int', 'PM10': 'int', 'SO2': 'int', 'City': 'str', 'State': 'str'}
for column_name, dtype_name in target_dtypes.items():
    data_with_no_mv[column_name] = data_with_no_mv[column_name].astype(dtype_name)

In [209]:
#Verifying the data type of each column after the casts
data_with_no_mv.dtypes

City     object
NO2       int32
PM10      int32
SO2       int32
State    object
dtype: object

In [242]:
#Displaying the cleaned Data Set (pandas abbreviates long frames to the first and last rows)
data_with_no_mv

Unnamed: 0,City,NO2,PM10,SO2,State
0,Chitoor,9,39,4,Andhra Pradesh
1,Guntur,11,81,2,Andhra Pradesh
2,Hydrabad,24,79,5,Andhra Pradesh
3,Kothagudem,11,62,2,Andhra Pradesh
4,Kurnool,9,85,4,Andhra Pradesh
...,...,...,...,...,...
176,Howrah,75,118,12,West Bengal
177,Kolkata,62,99,11,West Bengal
178,Raniganj,63,159,8,West Bengal
179,Sankrail,65,100,10,West Bengal
