In [58]:
import csv
import os
import uuid
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [118]:
# Root of the English Wikipedia site; "wiki/<article>" paths are appended to it.
BASE_URL = "https://en.wikipedia.org"

In [4]:
# Short lookup key -> Wikipedia article title (the last URL path segment).
# Ambon needs the disambiguated title "Ambon,_Maluku".
city = dict(
  jakarta="Jakarta",
  medan="Medan",
  makassar="Makassar",
  denpasar="Denpasar",
  ambon="Ambon,_Maluku",
  jayapura="Jayapura",
)

## Scrape Jakarta

In [None]:
# Fetch the Jakarta article. URLs always use "/" as the separator, so the
# address is built with an f-string; os.path.join would insert "\" on Windows.
# The timeout (in seconds) stops the cell from hanging if the server never answers.
response = requests.get(
  f"{BASE_URL}/wiki/{city['jakarta']}",
  timeout=30
)

response

<Response [200]>

In [8]:
# Preview only the first 500 characters; displaying the whole page floods the output.
response.text[:500]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>Jakarta - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-featu

In [10]:
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed, and Beautiful Soup does not warn about guessing one.
soup = BeautifulSoup(response.text, "html.parser")

In [15]:
# The page's <title> element, reached by using the tag name as an attribute.
soup.title

<title>Jakarta - Wikipedia</title>

In [17]:
# .name is the tag's own name (here "title"), not its content.
soup.title.name

'title'

In [18]:
# .string is the text held directly inside the tag.
soup.title.string

'Jakarta - Wikipedia'

In [19]:
# .text returns the tag's text content; for this tag it matches .string.
soup.title.text

'Jakarta - Wikipedia'

In [21]:
# .parent walks one level up the tree: <title> sits inside <head>.
soup.title.parent.name

'head'

In [22]:
# Attribute access returns only the FIRST <p> in the document.
soup.p

<p class="cdx-dialog__header__subtitle">This is an accepted version of this page</p>

In [65]:
# City name: text of the <div> whose class attribute is "fn org".
name = soup.find("div", class_="fn org").get_text()

name

'Jakarta'

In [86]:
# Official name: first <span> in the first cell of the "official name" infobox row.
full_name = (
  soup
  .find("tr", class_="mergedtoprow ib-settlement-official")
  .td
  .span
  .get_text()
)

full_name

'Daerah Khusus Ibukota Jakarta'

In [50]:
# Settlement category: text of the first element matching "div.category".
category = soup.select("div.category")[0].get_text()

category

'Special capital region'

In [55]:
# Longitude as displayed on the page: first "span.longitude" match.
longitude = soup.select("span.longitude")[0].get_text()

longitude

'106°49′39″E'

In [56]:
# Latitude as displayed on the page: first "span.latitude" match.
latitude = soup.select("span.latitude")[0].get_text()

latitude

'6°10′30″S'

In [87]:
# One output record; the keys match the CSV column names used when writing.
# Each record gets a freshly generated random UUID as its id.
new_data = dict(
  id=uuid.uuid4(),
  name=name,
  full_name=full_name,
  category=category,
  longitude=longitude,
  latitude=latitude,
  url=response.url,
)

In [88]:
# Write the single Jakarta record to its own CSV file.
output_path = "../output/web_wikipedia_jakarta.csv"
field_names = ["id", "name", "full_name", "category", "longitude", "latitude", "url"]

# Create the output folder if it is missing so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Mode "w" truncates the file, so it always starts empty and always needs a header.
with open(output_path, "w", newline="", encoding="utf-8") as f:
  writer = csv.DictWriter(f, fieldnames=field_names)
  writer.writeheader()
  writer.writerow(new_data)

## Scrape all cities

In [None]:
# Scrape every city in `city` and save the results as one CSV file.
output_path = "../output/web_wikipedia_city.csv"
field_names = ["id", "name", "full_name", "category", "longitude", "latitude", "url"]

rows = []
for key, item in city.items():
  print("Processing", key.upper())

  # URLs always use "/", so build the address directly; os.path.join would
  # insert "\" on Windows. The timeout (in seconds) prevents an endless wait.
  response = requests.get(f"{BASE_URL}/wiki/{item}", timeout=30)
  # Stop on an HTTP error instead of failing later with a confusing
  # AttributeError when the expected infobox elements are missing.
  response.raise_for_status()

  # Name the parser so results do not depend on which parsers are installed.
  soup = BeautifulSoup(response.text, "html.parser")

  # Get data
  name = soup.find("div", class_="fn org").text
  full_name = soup.find("tr", class_="mergedtoprow ib-settlement-official").td.span.text
  category = soup.css.select("div.category")[0].text
  longitude = soup.css.select("span.longitude")[0].text
  latitude = soup.css.select("span.latitude")[0].text

  # Data configuration
  rows.append({
    "id": uuid.uuid4(),
    "name": name,
    "full_name": full_name,
    "category": category,
    "longitude": longitude,
    "latitude": latitude,
    "url": response.url
  })

# Create the output folder if it is missing so open() does not fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# All rows are collected first and the file is opened once in "w" mode, so
# re-running the cell replaces the file instead of appending duplicate rows.
with open(output_path, "w", newline="", encoding="utf-8") as f:
  writer = csv.DictWriter(f, fieldnames=field_names)
  writer.writeheader()
  writer.writerows(rows)

Processing JAKARTA
Processing MEDAN
Processing MAKASSAR
Processing DENPASAR
Processing AMBON
Processing JAYAPURA


## Scrape cities in Sulawesi

In [95]:
# Fetch the Sulawesi article, reusing BASE_URL like the other requests.
# The timeout (in seconds) stops the cell from hanging if the server never answers.
response = requests.get(f"{BASE_URL}/wiki/Sulawesi", timeout=30)

response

<Response [200]>

In [96]:
# Preview only the first 500 characters; displaying the whole page floods the output.
response.text[:500]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>Sulawesi - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feat

In [97]:
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed, and Beautiful Soup does not warn about guessing one.
soup = BeautifulSoup(response.text, "html.parser")

In [102]:
# Third table with class "wikitable" on the page (index 2), used as the city list.
# NOTE(review): the index is positional; if the article gains or loses a table,
# this will pick a different one. Check the header row after re-running.
sulawesi_cities = soup.css.select("table.wikitable")[2]

In [104]:
# Row 0 is the header row; it shows which column holds which value.
sulawesi_cities.find_all("tr")[0]

<tr>
<th>City
</th>
<th>Province containing the city
</th>
<th>Population (2010 Census)
</th>
<th>Population<br/>(2020 Census)
</th>
<th>Population<br/>(mid 2023<br/>estimate)
</th></tr>

In [116]:
# Row 0 is the header, so row 1 is the first city. Its first cell holds a
# link whose text is the city name.
name = sulawesi_cities.find_all("tr")[1].td.a.get_text()

name

'Makassar'

In [123]:
# The same link's href is a site-relative path to the city's own article.
url = sulawesi_cities.find_all("tr")[1].td.a.get("href")

url

'/wiki/Makassar'

In [134]:
# Splitting on "/" yields an empty first element because the path starts with "/".
parts = url.split("/")

parts

['', 'wiki', 'Makassar']

In [136]:
# Rebuild the absolute URL. URLs always use "/", so join with that character
# explicitly; os.path.join would insert "\" on Windows.
full_path = "/".join((BASE_URL, parts[1], parts[2]))

full_path

'https://en.wikipedia.org/wiki/Makassar'

In [170]:
# Second cell (index 1) of the first city row is the province; drop the
# newline that the table markup leaves in the cell text.
province = sulawesi_cities.find_all("tr")[1].find_all("td")[1].get_text().replace("\n", "")

province

'South Sulawesi'

In [171]:
# Fourth cell (index 3) is the 2020 Census population, per the header row;
# drop the newline that the table markup leaves in the cell text.
population = sulawesi_cities.find_all("tr")[1].find_all("td")[3].get_text().replace("\n", "")

population

'1,423,877'

In [165]:
# Number of city rows, i.e. every row after the header.
len(sulawesi_cities.find_all("tr")[1:])

11

In [None]:
# Scrape every city listed in the Sulawesi table and save the results as one CSV file.
output_path = "../output/web_wikipedia_sulawesi.csv"
field_names = [
  "id",
  "name",
  "full_name",
  "province",
  "category",
  "population",
  "longitude",
  "latitude",
  "url"
]

rows = []
for item in sulawesi_cities.find_all("tr")[1:]:  # [1:] skips the header row
  link = item.td.a
  name = link.text
  print("Processing", name)
  url = link.get("href")

  # Look the cells up once per row instead of once per column.
  cells = item.find_all("td")
  province = cells[1].text.replace("\n", "")
  population = cells[3].text.replace("\n", "")  # index 3 = 2020 Census column

  # Navigate to the city's own page. urljoin resolves the site-relative href
  # against BASE_URL and keeps the whole path; os.path.join is meant for
  # filesystem paths and would insert "\" on Windows.
  full_path = urljoin(BASE_URL, url)
  r = requests.get(full_path, timeout=30)  # seconds; prevents an endless wait
  # Stop on an HTTP error instead of failing later with a confusing
  # AttributeError when the expected infobox elements are missing.
  r.raise_for_status()
  # Name the parser so results do not depend on which parsers are installed.
  s = BeautifulSoup(r.text, "html.parser")

  # Get data
  full_name = s.find("tr", class_="mergedtoprow ib-settlement-official").td.span.text
  category = s.css.select("div.category")[0].text
  longitude = s.css.select("span.longitude")[0].text
  latitude = s.css.select("span.latitude")[0].text

  # Data configuration
  rows.append({
    "id": uuid.uuid4(),
    "name": name,
    "full_name": full_name,
    "province": province,
    "category": category,
    "population": population,
    "longitude": longitude,
    "latitude": latitude,
    "url": r.url
  })

# Create the output folder if it is missing so open() does not fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# All rows are collected first and the file is opened once in "w" mode, so
# re-running the cell replaces the file instead of appending duplicate rows.
with open(output_path, "w", newline="", encoding="utf-8") as f:
  writer = csv.DictWriter(f, fieldnames=field_names)
  writer.writeheader()
  writer.writerows(rows)
  

Processing Makassar
Processing Manado
Processing Palu
Processing Kendari
Processing Bitung
Processing Gorontalo
Processing Palopo
Processing Baubau
Processing Parepare
Processing Kotamobagu
Processing Tomohon
