In [164]:
import camelot as camelot
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gop
from geopy.geocoders import Nominatim 
import folium as fo
from folium.plugins import MarkerCluster
from folium import plugins
from folium.plugins import HeatMap



In [42]:
# Extract tables 

file = "https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20220307.pdf"
tables = camelot.read_pdf(file, pages='all', strip_text='\n', flag_size=True)

In [43]:
# Define headers
def filter_df(df):
    df.columns = ['City', 'Air Quality', 'Index Value', 'Prominent Pollutant', 'No. of Monitoring Stations']
    df.reset_index().drop(columns= 'index', axis=1, inplace=True)
    return df

In [44]:
# First table to dataframe
tables[0].df

Unnamed: 0,0,1,2,3,4,5
0,S.No,City,Air Quality,Index Value,Prominent Pollutant,Based on Numberof MonitoringStations
1,1,Agartala,Poor,225,PM2.5,1
2,2,Agra,Moderate,140,"PM, O, PM1032.5",6
3,3,Ahmedabad,Poor,239,PM2.5,8
4,4,Aizawl,Satisfactory,70,PM10,1
5,5,Ajmer,Satisfactory,91,PM10,1
6,6,Alwar,Satisfactory,83,PM10,1
7,7,Amaravati,Satisfactory,56,PM10,1
8,8,Amritsar,Satisfactory,77,PM10,1
9,9,Ankleshwar,Moderate,196,PM2.5,1


In [45]:
# Even number of dataframes
table_list = []
for num, table in enumerate(tables):
    if  num % 2 == 0:
        table_df = table.df
        table_list.append(table_df.drop(0).drop(0, axis=1))

In [46]:
# Concatenate all dateframes
df = pd.concat(table_list)

In [47]:
# Add headers
df.pipe(filter_df)

Unnamed: 0,City,Air Quality,Index Value,Prominent Pollutant,No. of Monitoring Stations
1,Agartala,Poor,225,PM2.5,1
2,Agra,Moderate,140,"PM, O, PM1032.5",6
3,Ahmedabad,Poor,239,PM2.5,8
4,Aizawl,Satisfactory,70,PM10,1
5,Ajmer,Satisfactory,91,PM10,1
...,...,...,...,...,...
7,Vatva,Poor,203,PM2.5,1
8,Vijayapura,Satisfactory,51,PM10,1
9,Visakhapatnam,Moderate,130,PM10,1
10,Yadgir,Satisfactory,53,PM10,1


In [48]:
# Add date 

#df['Date'] = pd.to_datetime('today').strftime("%d/%m/%Y")

In [91]:
# To csv
df.to_csv("AQI.csv",index = False)

In [92]:
# load previous data

df_base = pd.read_csv("AQI.csv")

#df_base.head()


# Append latest data

frames = [df, df_base]
df1 = pd.concat(frames)

# Export to csv
df1.to_csv("AQI.csv",index = False)



In [93]:

# Geocode addresses 
 


def my_geocoder(row):
    try:
        point = geolocator.geocode(row).point
        return pd.Series({'Latitude': point.latitude, 'Longitude': point.longitude})
    except:
        return None

df1[['Latitude', 'Longitude']] = df1.apply(lambda x: my_geocoder(x['City']), axis=1)

In [129]:
# Check percentage of rows geocoded
print("{}% of addresses were geocoded!".format(
    (1 - sum(np.isnan(df1["Latitude"])) / len(df1)) * 100))



100.0% of addresses were geocoded!


In [127]:
# Field with NaN
df1[df1.isna().any(axis=1)]

Unnamed: 0,City,Air Quality,Index Value,Prominent Pollutant,No. of Monitoring Stations,Date,Latitude,Longitude


In [170]:
# Create a map
m = fo.Map(location=[20, 80], tiles='openstreetmap', zoom_start=2)

# add marker one by one on the map
#for i in range(0,len(df1)):
#   fo.Marker(
#     location=[df1.iloc[i]['Latitude'], df1.iloc[i]['Longitude']],
#      popup=df1.iloc[i]['City'],
#   ).add_to(m)




locations = list(zip(df1["Latitude"], df1["Longitude"]))

icons = [fo.Icon(icon="car", prefix="fa") for _ in range(len(locations))]

# Create a folium marker cluster


cluster = MarkerCluster(locations=locations, icons=icons)

# Add marker cluster to map
cluster.add_to(m)


# Display the map
m

In [175]:
# Add heatmap

HeatMap(locations).add_to(m)

m