# Format CPCB data - Delhi CAAQM stations

This notebook normalises and cleans data from the CAAQM stations. 
Data downloaded from:https://app.cpcbccr.com/ccr/#/caaqm-dashboard-all/caaqm-landing/caaqm-comparison-data

NOTE: all thepollutants quantities are in ug m3 except for NOx which is in ppb.  

## Import raw data

In [1]:
# import libraries.
import pandas as pd
import numpy as np

In [2]:
# import Delhi data spredsheet.
filepth = 'raw_data_2019/CPCB_oct2019.xlsx'
data = pd.read_excel(filepth)

In [3]:
data.head(12)

Unnamed: 0,CENTRAL POLLUTION CONTROL BOARD,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,CONTINUOUS AMBIENT AIR QUALITY,,,,,,,,
1,"Date: Wednesday, Jun 09 2021",,,,,,,,
2,Time: 01:14:50 PM,,,,,,,,
3,Station,"Alipur, Delhi - DPCC, Anand Vihar, Delhi - ...",,,,,,,
4,Parameter,"PM2.5,PM10,NO,NO2,NOx,NH3,SO2,Temp,RH,Ozone,WS...",,,,,,,
5,AvgPeriod,1 Hours,,,,,,,
6,From,01-10-2019 T00:00:00Z 00:00,,,,,,,
7,To,01-11-2019 T00:00:59Z 00:00,,,,,,,
8,,,,,,,,,
9,"Alipur, Delhi - DPCC, Anand Vihar, Delhi - ...",,,,,,,,


## Normalise dataframe 

Put data with station and polluntants as columns.

In [4]:

# given shape of data, get normalised data for a given station
def station_dataframe(line,ds_len):
    
    df = data.iloc[line:line+ds_len, :8].reset_index(drop=True)
    df.iloc[0,0] = 'start_date'
    df.iloc[0,1]= 'end_date'
    df.columns =df.loc[0]
    line = line+ds_len
    
    #for each stations I have two blocks of data since number of paramenters is high.
    df2 = data.iloc[line : line+ds_len, :8].reset_index(drop=True)
    df2.iloc[0, 0] = 'start_date'
    df2.iloc[0,1]= 'end_date'
    df2.columns = df2.iloc[0]
    df = df.merge(df2, left_on=['start_date', 'end_date'], right_on=['start_date', 'end_date'], how='outer')
    
    # reshape new data   
    df_clean = df.drop(df.index[[0,0]])
    df_clean['station']=df.iloc[0,2] 
    # put first raw as column
    new_cols = df_clean.iloc[0] 
    df_clean = df_clean[1:] 
    df_clean.columns = new_cols
    # rename columns
    df_clean.rename(columns={'From Date':'date','Black Carbon':'BC','Ozone':'O3','PM2.5':'pm25', df_clean.iloc[0,14] :'station'}, inplace=True)
    df_clean.drop('To Date',axis=1, inplace=True)
    return df_clean

In [5]:
# create initial dataframe with first station to append all the others onto.
line=10
ds_len=2+24*31+1
df=station_dataframe(line,ds_len)

In [6]:
df

1,date,pm25,PM10,NO,NO2,NOx,NH3,SO2,Temp,RH,O3,WS,BC,station
2,01-10-2019 00:00,54.5,113,5.25,38.78,24.9,15.4,11.58,,89.85,2.1,1.05,,Alipur Delhi - DPCC
3,01-10-2019 01:00,44.5,85,5.2,37.5,24.2,15.18,11.12,,91.93,4.45,1.1,,Alipur Delhi - DPCC
4,01-10-2019 02:00,40.5,76,3.35,31.4,19.4,19.6,13.25,,93.2,3.72,1,,Alipur Delhi - DPCC
5,01-10-2019 03:00,43.5,81.5,3.45,32.67,20.23,17.62,11.97,,94.05,3.8,1.02,,Alipur Delhi - DPCC
6,01-10-2019 04:00,51.5,89,3.4,32.57,20.07,18.38,12.57,,94.3,3.85,0.88,,Alipur Delhi - DPCC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,31-10-2019 20:00,402,618.5,51.15,154.72,123.88,64.62,18.88,,66.25,1.38,1.6,,Alipur Delhi - DPCC
743,31-10-2019 21:00,422,667,81.75,151.7,147.4,67.62,19.6,,79.03,0.35,1.27,,Alipur Delhi - DPCC
744,31-10-2019 22:00,431.5,693.5,88.65,141.43,147.55,66.67,19.23,,76.53,0.83,1.23,,Alipur Delhi - DPCC
745,31-10-2019 23:00,426,668.5,56.08,135.1,117.53,68.35,16.35,,73.92,2.17,1.4,,Alipur Delhi - DPCC


In [7]:
# get number of stations from dataframe lenght
n_iterations = int(data.shape[0]/(line+ds_len*2))

In [8]:
n_iterations    # number of stations

39

In [9]:
# create full dataset: cycle update dataframe with all stations.
for x in range(1,n_iterations): #skip first
    line = line+ds_len*2
    df2 = station_dataframe(line,ds_len)
    print(x, df2.station.unique())
    df = pd.concat([df,df2])

1 ['Anand Vihar  Delhi - DPCC ']
2 ['Ashok Vihar  Delhi - DPCC ']
3 ['Aya Nagar  Delhi - IMD ']
4 ['Bawana  Delhi - DPCC ']
5 ['Burari Crossing  Delhi - IMD ']
6 ['CRRI Mathura Road  Delhi - IMD ']
7 ['Chandni Chowk  Delhi - IITM ']
8 ['DTU  Delhi - CPCB ']
9 ['Dr. Karni Singh Shooting Range  Delhi - DPCC ']
10 ['Dwarka-Sector 8  Delhi - DPCC  ']
11 ['East Arjun Nagar  Delhi - CPCB ']
12 ['IGI Airport (T3)  Delhi - IMD ']
13 ['IHBAS  Dilshad Garden  Delhi - CPCB ']
14 ['ITO  Delhi - CPCB ']
15 ['Jahangirpuri  Delhi - DPCC ']
16 ['Jawaharlal Nehru Stadium  Delhi - DPCC ']
17 ['Lodhi Road  Delhi - IITM ']
18 ['Lodhi Road  Delhi - IMD ']
19 ['Major Dhyan Chand National Stadium  Delhi - DPCC ']
20 ['Mandir Marg  Delhi - DPCC ']
21 ['Mundka  Delhi - DPCC ']
22 ['NSIT Dwarka  Delhi - CPCB ']
23 ['Najafgarh  Delhi - DPCC ']
24 ['Narela  Delhi - DPCC ']
25 ['Nehru Nagar  Delhi - DPCC ']
26 ['North Campus  DU  Delhi - IMD ']
27 ['Okhla Phase-2  Delhi - DPCC ']
28 ['Patparganj  Delhi - DPCC ']
2

In [10]:
#check
df[df.station=='Vivek Vihar  Delhi - DPCC ']

1,date,pm25,PM10,NO,NO2,NOx,NH3,SO2,Temp,RH,O3,WS,BC,station
2,01-10-2019 00:00,37,87.5,15.88,12.78,19.75,17.2,13.55,,84.75,24.45,1.18,,Vivek Vihar Delhi - DPCC
3,01-10-2019 01:00,28.5,75,13.45,14.12,18.48,16.48,11.05,,86.15,13.4,0.95,,Vivek Vihar Delhi - DPCC
4,01-10-2019 02:00,21.5,67,13.47,13.1,17.98,17.45,10.6,,87.78,22.4,1.33,,Vivek Vihar Delhi - DPCC
5,01-10-2019 03:00,26,59.5,12.25,13.25,17.02,19.15,10.92,,88.3,11.5,1.23,,Vivek Vihar Delhi - DPCC
6,01-10-2019 04:00,22.5,52,12.47,13.47,17.32,21.52,10.47,,90.88,24.75,0.68,,Vivek Vihar Delhi - DPCC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,31-10-2019 20:00,447,705,34.15,6.73,31.45,63.95,39.15,,89,47.47,0.3,,Vivek Vihar Delhi - DPCC
743,31-10-2019 21:00,446,670.5,35.05,8.2,32.92,66.75,41.38,,90.35,45.08,0.3,,Vivek Vihar Delhi - DPCC
744,31-10-2019 22:00,451.5,638.5,36.43,7.03,33.45,72.08,42.05,,91.05,42.42,0.3,,Vivek Vihar Delhi - DPCC
745,31-10-2019 23:00,437.5,608.5,32.8,6.1,30,69.62,41.22,,87.42,38.85,0.3,,Vivek Vihar Delhi - DPCC


In [11]:
#rename column names with lowercase
df.columns = map(str.lower, df.columns)

In [12]:
df.head()

Unnamed: 0,date,pm25,pm10,no,no2,nox,nh3,so2,temp,rh,o3,ws,bc,station
2,01-10-2019 00:00,54.5,113.0,5.25,38.78,24.9,15.4,11.58,,89.85,2.1,1.05,,Alipur Delhi - DPCC
3,01-10-2019 01:00,44.5,85.0,5.2,37.5,24.2,15.18,11.12,,91.93,4.45,1.1,,Alipur Delhi - DPCC
4,01-10-2019 02:00,40.5,76.0,3.35,31.4,19.4,19.6,13.25,,93.2,3.72,1.0,,Alipur Delhi - DPCC
5,01-10-2019 03:00,43.5,81.5,3.45,32.67,20.23,17.62,11.97,,94.05,3.8,1.02,,Alipur Delhi - DPCC
6,01-10-2019 04:00,51.5,89.0,3.4,32.57,20.07,18.38,12.57,,94.3,3.85,0.88,,Alipur Delhi - DPCC


In [13]:
df

Unnamed: 0,date,pm25,pm10,no,no2,nox,nh3,so2,temp,rh,o3,ws,bc,station
2,01-10-2019 00:00,54.5,113,5.25,38.78,24.9,15.4,11.58,,89.85,2.1,1.05,,Alipur Delhi - DPCC
3,01-10-2019 01:00,44.5,85,5.2,37.5,24.2,15.18,11.12,,91.93,4.45,1.1,,Alipur Delhi - DPCC
4,01-10-2019 02:00,40.5,76,3.35,31.4,19.4,19.6,13.25,,93.2,3.72,1,,Alipur Delhi - DPCC
5,01-10-2019 03:00,43.5,81.5,3.45,32.67,20.23,17.62,11.97,,94.05,3.8,1.02,,Alipur Delhi - DPCC
6,01-10-2019 04:00,51.5,89,3.4,32.57,20.07,18.38,12.57,,94.3,3.85,0.88,,Alipur Delhi - DPCC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,31-10-2019 20:00,447,705,34.15,6.73,31.45,63.95,39.15,,89,47.47,0.3,,Vivek Vihar Delhi - DPCC
743,31-10-2019 21:00,446,670.5,35.05,8.2,32.92,66.75,41.38,,90.35,45.08,0.3,,Vivek Vihar Delhi - DPCC
744,31-10-2019 22:00,451.5,638.5,36.43,7.03,33.45,72.08,42.05,,91.05,42.42,0.3,,Vivek Vihar Delhi - DPCC
745,31-10-2019 23:00,437.5,608.5,32.8,6.1,30,69.62,41.22,,87.42,38.85,0.3,,Vivek Vihar Delhi - DPCC


## Add lat lon values to stations data

to get lat lon data, use an openaq dataset over Delhi that has station location.

In [14]:
#load opeaq dataset.
st = pd.read_csv('raw_data_2019/openaq_oct2019.csv')

In [15]:
st

Unnamed: 0.1,Unnamed: 0,location,parameter,value,unit,country,city,date.utc,coordinates.latitude,coordinates.longitude
0,0,"IHBAS, Dilshad Garden, Delhi - CPCB",so2,12.83,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-30 06:15:00+00:00,28.681174,77.302523
1,1,"NSIT Dwarka, Delhi - CPCB",so2,20.87,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-30 06:15:00+00:00,28.609090,77.032541
2,2,"DTU, Delhi - CPCB",o3,52.80,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-30 06:15:00+00:00,28.750050,77.111261
3,3,"DTU, Delhi - CPCB",no2,31.70,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-30 06:15:00+00:00,28.750050,77.111261
4,4,"NSIT Dwarka, Delhi - CPCB",pm25,125.53,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-30 06:15:00+00:00,28.609090,77.032541
...,...,...,...,...,...,...,...,...,...,...
278331,278331,"Sri Aurobindo Marg, Delhi - DPCC",no2,17.20,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-01 00:00:00+00:00,28.531346,77.190156
278332,278332,"NSIT Dwarka, Delhi - CPCB",so2,14.86,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-01 00:00:00+00:00,28.609090,77.032541
278333,278333,"Mandir Marg, Delhi - DPCC",o3,14.90,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-01 00:00:00+00:00,28.636429,77.201067
278334,278334,"North Campus, DU, Delhi - IMD",pm25,19.21,b'\xc2\xb5g/m\xc2\xb3',IN,Delhi,2019-10-01 00:00:00+00:00,28.657381,77.158545


In [16]:
st.location.unique()

array(['IHBAS, Dilshad Garden, Delhi - CPCB', 'NSIT Dwarka, Delhi - CPCB',
       'DTU, Delhi - CPCB', 'ITO, Delhi - CPCB', 'Shadipur, Delhi - CPCB',
       'Patparganj, Delhi - DPCC', 'Punjabi Bagh, Delhi - DPCC',
       'Sonia Vihar, Delhi - DPCC', 'R K Puram, Delhi - DPCC',
       'Major Dhyan Chand National Stadium, Delhi - DPCC',
       'Dwarka-Sector 8, Delhi - DPCC ', 'Sirifort, Delhi - CPCB',
       'Jawaharlal Nehru Stadium, Delhi - DPCC', 'Bawana, Delhi - DPCC',
       'Rohini, Delhi - DPCC', 'Jahangirpuri, Delhi - DPCC',
       'Anand Vihar, Delhi - DPCC', 'Mandir Marg, Delhi - DPCC',
       'Dr. Karni Singh Shooting Range, Delhi - DPCC',
       'Vivek Vihar, Delhi - DPCC', 'Najafgarh, Delhi - DPCC',
       'Okhla Phase-2, Delhi - DPCC', 'Alipur, Delhi - DPCC',
       'Narela, Delhi - DPCC', 'Mundka, Delhi - DPCC',
       'Pusa, Delhi - DPCC', 'Ashok Vihar, Delhi - DPCC',
       'Wazirpur, Delhi - DPCC', 'CRRI Mathura Road, Delhi - IMD',
       'Aya Nagar, Delhi - IMD', 'Nor

In [17]:
st.location.unique().size

37

In [18]:
df.station.unique()

array(['Alipur  Delhi - DPCC ', 'Anand Vihar  Delhi - DPCC ',
       'Ashok Vihar  Delhi - DPCC ', 'Aya Nagar  Delhi - IMD ',
       'Bawana  Delhi - DPCC ', 'Burari Crossing  Delhi - IMD ',
       'CRRI Mathura Road  Delhi - IMD ', 'Chandni Chowk  Delhi - IITM ',
       'DTU  Delhi - CPCB ',
       'Dr. Karni Singh Shooting Range  Delhi - DPCC ',
       'Dwarka-Sector 8  Delhi - DPCC  ',
       'East Arjun Nagar  Delhi - CPCB ',
       'IGI Airport (T3)  Delhi - IMD ',
       'IHBAS  Dilshad Garden  Delhi - CPCB ', 'ITO  Delhi - CPCB ',
       'Jahangirpuri  Delhi - DPCC ',
       'Jawaharlal Nehru Stadium  Delhi - DPCC ',
       'Lodhi Road  Delhi - IITM ', 'Lodhi Road  Delhi - IMD ',
       'Major Dhyan Chand National Stadium  Delhi - DPCC ',
       'Mandir Marg  Delhi - DPCC ', 'Mundka  Delhi - DPCC ',
       'NSIT Dwarka  Delhi - CPCB ', 'Najafgarh  Delhi - DPCC ',
       'Narela  Delhi - DPCC ', 'Nehru Nagar  Delhi - DPCC ',
       'North Campus  DU  Delhi - IMD ', 'Okhla Phase-2

In [19]:
df.station.unique().size

39

In [20]:
# make stations the same between the two dataset

# remove last white space in station name (to be comparable with other dataset)

df['station']=df.station.str.rstrip()
st['location']=st.location.str.replace(","," ")

# get only location, lat lon from openaq dataset.
st =st[['location','coordinates.latitude', 'coordinates.longitude']].drop_duplicates()

In [21]:
st.rename(columns={'location':'station','coordinates.latitude':'lat','coordinates.longitude':'lon'},inplace=True)

In [22]:
st=st.reset_index(drop=True)

In [23]:
st

Unnamed: 0,station,lat,lon
0,IHBAS Dilshad Garden Delhi - CPCB,28.681174,77.302523
1,NSIT Dwarka Delhi - CPCB,28.60909,77.032541
2,DTU Delhi - CPCB,28.75005,77.111261
3,ITO Delhi - CPCB,28.631695,77.249439
4,Shadipur Delhi - CPCB,28.651478,77.147311
5,Patparganj Delhi - DPCC,28.623748,77.287205
6,Punjabi Bagh Delhi - DPCC,28.674045,77.131023
7,Sonia Vihar Delhi - DPCC,28.710508,77.249485
8,R K Puram Delhi - DPCC,28.563262,77.186937
9,Major Dhyan Chand National Stadium Delhi - DPCC,28.611281,77.237738


In [24]:
#check which stations are in the main df and new stations.
intr= list(set(df.station.unique()).intersection(st.station.unique()))

In [25]:
intr

['Shadipur  Delhi - CPCB',
 'NSIT Dwarka  Delhi - CPCB',
 'Nehru Nagar  Delhi - DPCC',
 'CRRI Mathura Road  Delhi - IMD',
 'Major Dhyan Chand National Stadium  Delhi - DPCC',
 'Patparganj  Delhi - DPCC',
 'Burari Crossing  Delhi - IMD',
 'Narela  Delhi - DPCC',
 'Aya Nagar  Delhi - IMD',
 'ITO  Delhi - CPCB',
 'IHBAS  Dilshad Garden  Delhi - CPCB',
 'East Arjun Nagar  Delhi - CPCB',
 'Vivek Vihar  Delhi - DPCC',
 'Mundka  Delhi - DPCC',
 'Alipur  Delhi - DPCC',
 'Punjabi Bagh  Delhi - DPCC',
 'Mandir Marg  Delhi - DPCC',
 'Pusa  Delhi - DPCC',
 'DTU  Delhi - CPCB',
 'Bawana  Delhi - DPCC',
 'Dr. Karni Singh Shooting Range  Delhi - DPCC',
 'Sirifort  Delhi - CPCB',
 'Sonia Vihar  Delhi - DPCC',
 'Lodhi Road  Delhi - IMD',
 'Sri Aurobindo Marg  Delhi - DPCC',
 'Rohini  Delhi - DPCC',
 'Jahangirpuri  Delhi - DPCC',
 'Jawaharlal Nehru Stadium  Delhi - DPCC',
 'North Campus  DU  Delhi - IMD',
 'Ashok Vihar  Delhi - DPCC',
 'R K Puram  Delhi - DPCC',
 'Okhla Phase-2  Delhi - DPCC',
 'Anand V

In [26]:
len(intr)

35

In [27]:
# add lat lon values to station name
df=pd.merge(df,st, on='station',how='left')

In [28]:
df

Unnamed: 0,date,pm25,pm10,no,no2,nox,nh3,so2,temp,rh,o3,ws,bc,station,lat,lon
0,01-10-2019 00:00,54.5,113,5.25,38.78,24.9,15.4,11.58,,89.85,2.1,1.05,,Alipur Delhi - DPCC,28.815329,77.15301
1,01-10-2019 01:00,44.5,85,5.2,37.5,24.2,15.18,11.12,,91.93,4.45,1.1,,Alipur Delhi - DPCC,28.815329,77.15301
2,01-10-2019 02:00,40.5,76,3.35,31.4,19.4,19.6,13.25,,93.2,3.72,1,,Alipur Delhi - DPCC,28.815329,77.15301
3,01-10-2019 03:00,43.5,81.5,3.45,32.67,20.23,17.62,11.97,,94.05,3.8,1.02,,Alipur Delhi - DPCC,28.815329,77.15301
4,01-10-2019 04:00,51.5,89,3.4,32.57,20.07,18.38,12.57,,94.3,3.85,0.88,,Alipur Delhi - DPCC,28.815329,77.15301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29050,31-10-2019 20:00,447,705,34.15,6.73,31.45,63.95,39.15,,89,47.47,0.3,,Vivek Vihar Delhi - DPCC,28.672342,77.31526
29051,31-10-2019 21:00,446,670.5,35.05,8.2,32.92,66.75,41.38,,90.35,45.08,0.3,,Vivek Vihar Delhi - DPCC,28.672342,77.31526
29052,31-10-2019 22:00,451.5,638.5,36.43,7.03,33.45,72.08,42.05,,91.05,42.42,0.3,,Vivek Vihar Delhi - DPCC,28.672342,77.31526
29053,31-10-2019 23:00,437.5,608.5,32.8,6.1,30,69.62,41.22,,87.42,38.85,0.3,,Vivek Vihar Delhi - DPCC,28.672342,77.31526


In [29]:
#check
df[df['station']=='Pusa  Delhi - DPCC'].head()

Unnamed: 0,date,pm25,pm10,no,no2,nox,nh3,so2,temp,rh,o3,ws,bc,station,lat,lon
22350,01-10-2019 00:00,29.5,61.25,10.15,32.27,25.45,20.8,13.07,,88.95,0.2,1.35,,Pusa Delhi - DPCC,28.639645,77.146262
22351,01-10-2019 01:00,23.25,50.75,3.4,27.18,17.2,19.62,12.42,,89.58,4.45,1.27,,Pusa Delhi - DPCC,28.639645,77.146262
22352,01-10-2019 02:00,19.25,36.0,2.85,22.7,14.4,19.75,12.0,,89.58,4.15,1.3,,Pusa Delhi - DPCC,28.639645,77.146262
22353,01-10-2019 03:00,25.0,40.5,2.5,21.6,13.5,18.25,12.97,,90.05,4.47,1.42,,Pusa Delhi - DPCC,28.639645,77.146262
22354,01-10-2019 04:00,30.0,43.5,6.95,19.2,15.82,16.4,13.28,,92.25,4.6,0.8,,Pusa Delhi - DPCC,28.639645,77.146262


In [30]:
# remove stations with nan values:

# 1) check which stations have no lat lon values.
df.groupby('station').mean().reset_index()[['station','lat','lon']]

Unnamed: 0,station,lat,lon
0,Alipur Delhi - DPCC,28.815329,77.15301
1,Anand Vihar Delhi - DPCC,28.646835,77.316032
2,Ashok Vihar Delhi - DPCC,28.695381,77.181665
3,Aya Nagar Delhi - IMD,28.470691,77.109936
4,Bawana Delhi - DPCC,28.7762,77.051074
5,Burari Crossing Delhi - IMD,28.72565,77.201157
6,CRRI Mathura Road Delhi - IMD,28.5512,77.273574
7,Chandni Chowk Delhi - IITM,,
8,DTU Delhi - CPCB,28.75005,77.111262
9,Dr. Karni Singh Shooting Range Delhi - DPCC,28.498571,77.26484


In [31]:
# 2) remove stations with NaN lat lon
df.drop(df[df['lat'].isna()].index ,inplace=True)

In [32]:
df.groupby('station').mean().reset_index()[['station','lat','lon']]

Unnamed: 0,station,lat,lon
0,Alipur Delhi - DPCC,28.815329,77.15301
1,Anand Vihar Delhi - DPCC,28.646835,77.316032
2,Ashok Vihar Delhi - DPCC,28.695381,77.181665
3,Aya Nagar Delhi - IMD,28.470691,77.109936
4,Bawana Delhi - DPCC,28.7762,77.051074
5,Burari Crossing Delhi - IMD,28.72565,77.201157
6,CRRI Mathura Road Delhi - IMD,28.5512,77.273574
7,DTU Delhi - CPCB,28.75005,77.111262
8,Dr. Karni Singh Shooting Range Delhi - DPCC,28.498571,77.26484
9,East Arjun Nagar Delhi - CPCB,28.655602,77.285932


### Set dates in correct format

In [33]:
from datetime import timedelta

# local datetime
df['date_LT'] = pd.to_datetime(df['date'], format='%d-%m-%Y %H:%M')
df['year_LT'], df['month_LT'], df['day_LT'],df['hour_LT'] = df['date_LT'].dt.year, df['date_LT'].dt.month, df['date_LT'].dt.day,df['date_LT'].dt.hour
df.drop('date',axis=1, inplace=True)

#UTC time (for comparing with model)
df['date_UTC'] = df['date_LT']-timedelta(hours=5.00) # Delhi time is IST that is GMT+ 5h.30min. (round to 5)
df['year_UTC'], df['month_UTC'], df['day_UTC'],df['hour_UTC'] = df['date_UTC'].dt.year, df['date_UTC'].dt.month, df['date_UTC'].dt.day,df['date_UTC'].dt.hour

In [34]:
df.columns

Index(['pm25', 'pm10', 'no', 'no2', 'nox', 'nh3', 'so2', 'temp', 'rh', 'o3',
       'ws', 'bc', 'station', 'lat', 'lon', 'date_LT', 'year_LT', 'month_LT',
       'day_LT', 'hour_LT', 'date_UTC', 'year_UTC', 'month_UTC', 'day_UTC',
       'hour_UTC'],
      dtype='object')

In [35]:
df.shape

(26075, 25)

### Check on NaN and data types

In [36]:
#Replacing None with NaN
df.isin(['None']).sum()

pm25          775
pm10          497
no           1229
no2          1012
nox           912
nh3           891
so2           765
temp           53
rh            263
o3           1380
ws            317
bc             82
station         0
lat             0
lon             0
date_LT         0
year_LT         0
month_LT        0
day_LT          0
hour_LT         0
date_UTC        0
year_UTC        0
month_UTC       0
day_UTC         0
hour_UTC        0
dtype: int64

In [37]:
df.replace('None', np.nan, inplace=True)

In [38]:
df.isin(['None']).sum()

pm25         0
pm10         0
no           0
no2          0
nox          0
nh3          0
so2          0
temp         0
rh           0
o3           0
ws           0
bc           0
station      0
lat          0
lon          0
date_LT      0
year_LT      0
month_LT     0
day_LT       0
hour_LT      0
date_UTC     0
year_UTC     0
month_UTC    0
day_UTC      0
hour_UTC     0
dtype: int64

In [39]:
#check data types
df.dtypes

pm25                float64
pm10                float64
no                  float64
no2                 float64
nox                 float64
nh3                 float64
so2                 float64
temp                float64
rh                  float64
o3                  float64
ws                  float64
bc                  float64
station              object
lat                 float64
lon                 float64
date_LT      datetime64[ns]
year_LT               int64
month_LT              int64
day_LT                int64
hour_LT               int64
date_UTC     datetime64[ns]
year_UTC              int64
month_UTC             int64
day_UTC               int64
hour_UTC              int64
dtype: object

## Saving new data

In [40]:
# saving in a xlsx dataframe
filepth = './cpcb_oct2019_formatted.csv'
df.to_csv(filepth) 

In [41]:
df

Unnamed: 0,pm25,pm10,no,no2,nox,nh3,so2,temp,rh,o3,...,date_LT,year_LT,month_LT,day_LT,hour_LT,date_UTC,year_UTC,month_UTC,day_UTC,hour_UTC
0,54.5,113.0,5.25,38.78,24.90,15.40,11.58,,89.85,2.10,...,2019-10-01 00:00:00,2019,10,1,0,2019-09-30 19:00:00,2019,9,30,19
1,44.5,85.0,5.20,37.50,24.20,15.18,11.12,,91.93,4.45,...,2019-10-01 01:00:00,2019,10,1,1,2019-09-30 20:00:00,2019,9,30,20
2,40.5,76.0,3.35,31.40,19.40,19.60,13.25,,93.20,3.72,...,2019-10-01 02:00:00,2019,10,1,2,2019-09-30 21:00:00,2019,9,30,21
3,43.5,81.5,3.45,32.67,20.23,17.62,11.97,,94.05,3.80,...,2019-10-01 03:00:00,2019,10,1,3,2019-09-30 22:00:00,2019,9,30,22
4,51.5,89.0,3.40,32.57,20.07,18.38,12.57,,94.30,3.85,...,2019-10-01 04:00:00,2019,10,1,4,2019-09-30 23:00:00,2019,9,30,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29050,447.0,705.0,34.15,6.73,31.45,63.95,39.15,,89.00,47.47,...,2019-10-31 20:00:00,2019,10,31,20,2019-10-31 15:00:00,2019,10,31,15
29051,446.0,670.5,35.05,8.20,32.92,66.75,41.38,,90.35,45.08,...,2019-10-31 21:00:00,2019,10,31,21,2019-10-31 16:00:00,2019,10,31,16
29052,451.5,638.5,36.43,7.03,33.45,72.08,42.05,,91.05,42.42,...,2019-10-31 22:00:00,2019,10,31,22,2019-10-31 17:00:00,2019,10,31,17
29053,437.5,608.5,32.80,6.10,30.00,69.62,41.22,,87.42,38.85,...,2019-10-31 23:00:00,2019,10,31,23,2019-10-31 18:00:00,2019,10,31,18


In [42]:
df.columns

Index(['pm25', 'pm10', 'no', 'no2', 'nox', 'nh3', 'so2', 'temp', 'rh', 'o3',
       'ws', 'bc', 'station', 'lat', 'lon', 'date_LT', 'year_LT', 'month_LT',
       'day_LT', 'hour_LT', 'date_UTC', 'year_UTC', 'month_UTC', 'day_UTC',
       'hour_UTC'],
      dtype='object')