### Web Scraping
Retrieve the table of the largest wastewater treatment plants from Wikipedia and save it as a raw CSV file.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Download the Wikipedia article and parse its HTML.
wiki_url = "https://en.wikipedia.org/wiki/List_of_largest_wastewater_treatment_plants"
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
webpage = response.text
doc = BeautifulSoup(webpage, "html.parser")
# Display every <table> element on the page to locate the one to scrape.
doc.find_all('table')

In [12]:
# Select the first <table> whose class attribute is "wikitable sortable".
table = doc.find('table', class_='wikitable sortable')
if table is None:
    # doc.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("No <table class='wikitable sortable'> found in the parsed page")

# Collect the header cells, then reduce each one to its text.
header_cells = table.find_all('th')
# Newlines are removed without inserting a space, so multi-line headers are
# joined as-is (e.g. 'OpeningYear'); the rename cell keys on these exact strings.
main_header = [title.get_text().replace("\n", "") for title in header_cells]

In [13]:
# Start from an empty frame whose columns are the scraped header titles;
# the data rows are filled in by a later cell.
df = pd.DataFrame(data=None, columns=main_header)
# Show the column labels exactly as scraped so they can be renamed next.
df.columns


Index(['Plant name', 'City', 'Country', 'OpeningYear',
       'Dry-weathercapacity (m³ per day) ', 'Wet-weathercapacity (m³ per day)',
       'Area(km²)', 'Notes'],
      dtype='object')

In [14]:
# Map the scraped header text to shorter column names.
# Keys must match the scraped labels exactly, including the trailing space in
# the dry-weather header; labels missing from the frame are left untouched.
column_renames = {
    'Plant name': 'Plant_Name',
    'OpeningYear': 'Opening_Year',
    'Dry-weathercapacity (m³ per day) ': 'DryWeather(CMD)',
    'Wet-weathercapacity (m³ per day)': 'WetWeather(CMD)',
    'Area(km²)': 'Area(sqkm)',
}
df = df.rename(columns=column_renames)
df.columns

Index(['Plant_Name', 'City', 'Country', 'Opening_Year', 'DryWeather(CMD)',
       'WetWeather(CMD)', 'Area(sqkm)', 'Notes'],
      dtype='object')

In [17]:
# Fill the frame with one record per table row.
# The frame is rebuilt from its own column labels in a single step, so
# re-running this cell yields the same rows instead of appending duplicates.
column_names = list(df.columns)
main_data = table.find_all('tr')

records = []
# The first <tr> is skipped: it is the header row already used for the columns.
for row_number, row in enumerate(main_data[1:], start=1):
    cell_texts = [data.text.strip() for data in row.find_all('td')]
    if not cell_texts:
        # A row without any <td> cells carries no data; skip it.
        continue
    if len(cell_texts) != len(column_names):
        # Refuse to pad or truncate silently: a mismatch means the table layout
        # no longer lines up with the header.
        raise ValueError(
            f"Row {row_number} has {len(cell_texts)} cells, "
            f"expected {len(column_names)}: {cell_texts!r}"
        )
    records.append(cell_texts)

df = pd.DataFrame(records, columns=column_names)
df

Unnamed: 0,Plant_Name,City,Country,Opening_Year,DryWeather(CMD),WetWeather(CMD),Area(sqkm),Notes
0,Jean-R.-Marcotte Wastewater Treatment Plant[1],Montreal,Canada,1984.0,2780000,7600000,0.67,Secondary treatment planned for 2023.[2]
1,Detroit Wastewater Treatment Plant[3],Detroit,USA,1940.0,2 460 000,6 435 000,0.53,Wet-weather secondary treatment capacity limit...
2,Stickney Water Reclamation Plant[4],Chicago,USA,1930.0,2 665 000[5],5 450 000[6],1.67,
3,Blue Plains Advanced Wastewater Treatment Plan...,Washington D.C.,USA,1937.0,1 450 000,4 073 000,0.62,Secondary treatment since 1959. Enhanced nutri...
4,Deer Island Waste Water Treatment Plant,Boston,USA,1968.0,1 438 000,4 542 000,0.6[8],Full secondary treatment since 1995.
5,Abu Rawash Wastewater treatment plant,Giza,Egypt,2021.0,1600000,,1.39,Secondary treatment since 2021.
6,Atotonilco de Tula Plant[9],Mexico City,Mexico,2015.0,2 000 000,3 000 000,,
7,Hyperion Water Reclamation Plant[10],Los Angeles,USA,1925.0,1 041 000,3 000 000,0.81,Wet-weather secondary treatment (since 1950) c...
8,Kuryanovo wastewater treatment facilities[11],Moscow,Russia,1950.0,2 200 000,,,
9,Lyuberetskiye wastewater treatment facilities[12],Moscow,Russia,1963.0,3 000 000,,,


In [18]:
# Write the scraped table to disk without the index column.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will fail on any machine where this directory does not exist.
raw_csv_path = r'C:\Users\User\Downloads\Documents\Largest WWTP\raw_largewwtp.csv'
df.to_csv(raw_csv_path, index=False)