# **1. Import the required libraries and mount Google Drive**

In [6]:
import requests
import numpy as np
import pandas as pd
import regex as re
import glob
import os
from bs4 import BeautifulSoup
from google.colab import drive
# Mount Google Drive at /content/gdrive (Colab only); the cells below create
# folders and save the downloaded CSV files under /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **2. Create one folder per year to store the downloaded datasets**

In [7]:
# Create one folder per year (2010-2021) for the downloaded CSV files.
# os.makedirs(..., exist_ok=True) replaces the shell `mkdir`: it also creates
# missing parent folders and does not fail when the cell is re-run and the
# folders already exist.
for x in range(2010,2022):
  os.makedirs(f"/content/gdrive/MyDrive/dokumentasi/dataset/{x}", exist_ok=True) #Use your own path

# **3. Fetch all URLs that contain air quality data**

In [13]:
#Search all ISPU data available
#Dataset source link; {page} is replaced by the number of the result page
SEARCH_URL = "https://data.jakarta.go.id/dataset?q=Indeks+Standar+Pencemar+Udara+ISPU&sort=3&page={page}"
#The page count below assumes the portal lists 10 datasets per result page
RESULTS_PER_PAGE = 10
#Seconds to wait for the server; without a timeout requests can block forever
REQUEST_TIMEOUT = 30

url = SEARCH_URL.format(page=1)
jakarta_response = requests.get(url, timeout=REQUEST_TIMEOUT)
#Stop early on an HTTP error instead of parsing an error page
jakarta_response.raise_for_status()

#Store every link that points to an ISPU dataset page
all_link = []
jakarta_soup = BeautifulSoup(jakarta_response.text, 'html.parser')

#Read the result counter shown on the search page
#(presumably the number of matching datasets, one per year -- verify on the site)
jumlah_tag = jakarta_soup.find('span', {'class': 'panton-bold'})
if jumlah_tag is None:
  raise ValueError("Result counter (span with class 'panton-bold') not found on the search page")
jumlah = int(jumlah_tag.text)

#Ceiling division: e.g. 11 results -> 2 pages, 20 results -> 2 pages (not 3)
total_pages = (jumlah + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE

for halaman in range(1, total_pages + 1):
  if halaman == 1:
    #Page 1 was already downloaded above to read the counter; reuse it
    halaman_soup = jakarta_soup
  else:
    url = SEARCH_URL.format(page=halaman)
    halaman_response = requests.get(url, timeout=REQUEST_TIMEOUT)
    halaman_response.raise_for_status()
    halaman_soup = BeautifulSoup(halaman_response.text, 'html.parser')
  halaman_link = halaman_soup.find_all('a', {'class': 'text-decoration-none text-reset'})
  for link in halaman_link:
    #Raw string so that \d is a regex token, not an invalid Python escape.
    #Anchors that contain no matching URL are skipped instead of raising
    #AttributeError on None.
    link_match = re.search(r"https.*[\d]", str(link))
    if link_match is not None:
      all_link.append(link_match.group())

all_link

['https://data.jakarta.go.id/dataset/indeks-standar-pencemar-udara-di-provinsi-dki-jakarta-tahun-2018',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2010',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2011',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2012',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2013',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2014',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2015',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2016',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2017',
 'https://data.jakarta.go.id/dataset/data-indeks-standar-pencemar-udara-ispu-di-provinsi-dki-jakarta-tahun-2019',
 'https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2020',

# **4. Download the datasets in CSV format from the website**

In [18]:
#Indonesian month name (as it appears in the CSV link) -> two-digit prefix used
#in the saved file name. All codes are strings so the mapping is consistent.
month={"Januari": "01", "Februari": "02", "Maret": "03", "April": "04", "Mei": "05", "Juni": "06",
       "Juli": "07", "Agustus": "08", "September": "09", "Oktober": "10", "November": "11", "Desember": "12"}
#Upper bound on download attempts per file, so a permanently broken link
#cannot keep the cell running forever
MAX_ATTEMPTS = 5
#Seconds to wait for the server; without a timeout requests can block forever
REQUEST_TIMEOUT = 30
#CSV links that still failed after MAX_ATTEMPTS; inspect this after the run
failed_downloads = []

for url in all_link:
  #Extract link to download csv files
  data_response = requests.get(url, timeout=REQUEST_TIMEOUT)
  if not data_response.ok:
    print(f"Could not open {url} (HTTP {data_response.status_code}). Skipped")
    continue
  data_soup = BeautifulSoup(data_response.text, 'html.parser')
  dataset = data_soup.find_all('a', {'class': 'btn btn-success resource-url-analytics'})

  #Store all csv link to down_list; buttons without a csv link are ignored
  #instead of raising AttributeError on a failed match
  down_list = []
  for link in dataset:
    csv_match = re.search(r"https.*csv", str(link))
    if csv_match is not None:
      down_list.append(csv_match.group())

  #Download files and save to folders by year
  year_match = re.search(r"\d{4}", url)
  if year_match is None:
    print(f"No year found in {url}. Skipped")
    continue
  year = year_match.group()
  path = f'/content/gdrive/MyDrive/dokumentasi/dataset/{year}'

  for data_url in down_list:
    #Skip spku file
    if "SPKU" in data_url:
      continue
    #Give code in the file name to make sorting easier in the future
    for bulan,kode in month.items():
      if bulan not in data_url:
        continue
      new_name = path +"/"+ f"{kode}. ISPU Provinsi-DKI-Jakarta-Bulan-{bulan}-Tahun-{year}.csv"
      if os.path.isfile(new_name):
        print(f"Bulan: {bulan} {year} Already Downloaded")
        continue
      #Retry a limited number of times. `except Exception` (not a bare except)
      #lets KeyboardInterrupt through, so the cell can still be stopped by hand.
      last_error = None
      for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
          data = pd.read_csv(data_url)
          data.to_csv(new_name, index=False)
          print(f"Bulan: {bulan} {year} Done")
          break
        except Exception as err:
          #Keep a reference: Python unbinds `err` when the except block ends
          last_error = err
          if attempt < MAX_ATTEMPTS:
            print(f"Bulan: {bulan} {year} Failed. Retrying...")
      else:
        #Reached only when no attempt succeeded (the loop never hit `break`)
        failed_downloads.append(data_url)
        print(f"Bulan: {bulan} {year} Failed after {MAX_ATTEMPTS} attempts: {last_error}")

Bulan: Januari 2018 Already Downloaded
Bulan: Maret 2018 Already Downloaded
Bulan: Februari 2018 Already Downloaded
Bulan: April 2018 Already Downloaded
Bulan: Mei 2018 Already Downloaded
Bulan: Juni 2018 Already Downloaded
Bulan: Juli 2018 Already Downloaded
Bulan: Agustus 2018 Done
Bulan: September 2018 Done
Bulan: Oktober 2018 Done
Bulan: November 2018 Done
Bulan: Desember 2018 Done
Bulan: Desember 2010 Done
Bulan: November 2010 Done
Bulan: Oktober 2010 Done
Bulan: September 2010 Done
Bulan: Agustus 2010 Done
Bulan: Juli 2010 Done
Bulan: Juni 2010 Done
Bulan: Mei 2010 Done
Bulan: April 2010 Done
Bulan: Maret 2010 Done
Bulan: Februari 2010 Done
Bulan: Januari 2010 Done
Bulan: Desember 2011 Done
Bulan: November 2011 Done
Bulan: Oktober 2011 Done
Bulan: September 2011 Done
Bulan: Agustus 2011 Done
Bulan: Juli 2011 Done
Bulan: Juni 2011 Done
Bulan: Mei 2011 Done
Bulan: April 2011 Done
Bulan: Maret 2011 Done
Bulan: Februari 2011 Done
Bulan: Januari 2011 Done
Bulan: Desember 2012 Done
Bul