---
## 1.&nbsp; Import libraries 💾

In [1]:
import re
#To get today's date
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

#Converts latitude and longitude to decimal:
from lat_lon_parser import parse

# install if needed
#!pip install sqlalchemy
#!pip install pymysql

## Challenge 2 😀

Utilise your web scraping skills to gather information about three German cities – Berlin, Hamburg, and Munich – from Wikipedia. You will start by extracting basic information: the country, the latitude and the longitude of each city and then expand to more dynamic data such as the population.

### 1. Scraping Basic Information


  1.1. Begin by scraping the country, the latitude and the longitude of each city from their respective Wikipedia pages:

 - Berlin: https://en.wikipedia.org/wiki/Berlin
 - Hamburg: https://en.wikipedia.org/wiki/Hamburg
 - Munich: https://en.wikipedia.org/wiki/Munich

In [None]:
# Wikipedia article URLs for the three cities.
Berlin_url, Hamburg_url, Munich_url = (
    "https://en.wikipedia.org/wiki/Berlin",
    "https://en.wikipedia.org/wiki/Hamburg",
    "https://en.wikipedia.org/wiki/Munich",
)

# Download each page, in the same order as the URLs are listed.
Berlin_response, Hamburg_response, Munich_response = (
    requests.get(page_url) for page_url in (Berlin_url, Hamburg_url, Munich_url)
)

# Cell output: the three response objects (their repr shows the HTTP status).
Berlin_response, Hamburg_response, Munich_response

(<Response [200]>, <Response [200]>, <Response [200]>)

In [None]:
# Parse each downloaded page into a BeautifulSoup tree (one soup per city).
Berlin_soup, Hamburg_soup, Munich_soup = (
    BeautifulSoup(page.content, "html.parser")
    for page in (Berlin_response, Hamburg_response, Munich_response)
)

In [None]:
# Collect every link titled "Germany" that sits inside an infobox data cell,
# then keep the first match.
germany_links = Berlin_soup.select('td.infobox-data a[title="Germany"]')
country_Germany = germany_links[0]

# Cell output: the visible text of that link.
country_Germany.text

'Germany'

In [None]:
# Berlin
# The first "infobox-data" cell on the page is taken as the country; the
# coordinates come from the first "latitude" / "longitude" spans.
country_Germany = Berlin_soup.find("td", {"class": "infobox-data"}).text
Berlin_latitude = Berlin_soup.find("span", {"class": "latitude"}).text
Berlin_longitude = Berlin_soup.find("span", {"class": "longitude"}).text

print("Basic information about Berlin")
print(
    f"Country: {country_Germany}\ncoordinates of Berlin:\nlatitude = {Berlin_latitude}\tlongitude = {Berlin_longitude}"
)

Basic information about Berlin
Country: Germany
coordinates of Berlin:
latitude = 52°31′12″N	longitude = 13°24′18″E


In [None]:
# Hamburg
# Same lookups as for Berlin: first "infobox-data" cell for the country,
# first "latitude" / "longitude" spans for the coordinates.
country_Germany1 = Hamburg_soup.find("td", {"class": "infobox-data"}).text
Hamburg_latitude = Hamburg_soup.find("span", {"class": "latitude"}).text
Hamburg_longitude = Hamburg_soup.find("span", {"class": "longitude"}).text

print("Basic information about Hamburg")
print(
    f"Country: {country_Germany1}\ncoordinates of Hamburg:\nlatitude = {Hamburg_latitude}\tlongitude = {Hamburg_longitude}"
)

Basic information about Hamburg
Country: Germany
coordinates of Hamburg:
latitude = 53°33′N	longitude = 10°00′E


In [None]:
# Munich
# Same lookups as for Berlin: first "infobox-data" cell for the country,
# first "latitude" / "longitude" spans for the coordinates.
country_Germany2 = Munich_soup.find("td", {"class": "infobox-data"}).text
Munich_latitude = Munich_soup.find("span", {"class": "latitude"}).text
Munich_longitude = Munich_soup.find("span", {"class": "longitude"}).text

print("Basic information about Munich")
print(
    f"Country: {country_Germany2}\ncoordinates of Munich:\nlatitude = {Munich_latitude}\tlongitude = {Munich_longitude}"
)

Basic information about Munich
Country: Germany
coordinates of Munich:
latitude = 48°08′15″N	longitude = 11°34′30″E


1.2. Once you have scraped the basic information of each city, reflect on the similarities and patterns in accessing them across the three pages. Also, analyse the URLs to identify any commonalities. Make a loop that executes once and retrieves the country, latitude, and longitude for all three cities.

In [None]:
cities = ["Berlin", "Hamburg", "Munich"]
countries, latitudes, longitudes = [], [], []

# The article URLs differ only in the city name, so one loop body serves all
# three pages.
for city in cities:
    url = f"https://en.wikipedia.org/wiki/{city}"
    response = requests.get(url)
    soup_city = BeautifulSoup(response.content, "html.parser")

    # Pull the three values for this city before storing any of them.
    country = soup_city.find("td", {"class": "infobox-data"}).text
    latitude = soup_city.find("span", {"class": "latitude"}).text
    longitude = soup_city.find("span", {"class": "longitude"}).text

    # The three result lists stay aligned by position with `cities`.
    countries.append(country)
    latitudes.append(latitude)
    longitudes.append(longitude)

print(f"The cities are in the following countries: {countries}")
print(f"The cities have the following latitudes: {latitudes}")
print(f"The cities have the following longitudes: {longitudes}")

The cities are in the following countries: ['Germany', 'Germany', 'Germany']
The cities have the following latitudes: ['52°31′12″N', '53°33′N', '48°08′15″N']
The cities have the following longitudes: ['13°24′18″E', '10°00′E', '11°34′30″E']


### 2. Data Organisation

  2.1 Utilise pandas DataFrame to effectively store the extracted information. This DataFrame should have a row for each city, and columns for each type of information (cityname, country, latitude, longitude). If you feel brave, change latitude and longitude into decimal format.




In [None]:
# Column name -> list of values; the lists are aligned by position, one entry
# per city.
city_dict = dict(
    City=cities,
    Country=countries,
    Latitude=latitudes,
    Longitude=longitudes,
)

city_df = pd.DataFrame(data=city_dict)
city_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Hamburg,Germany,53°33′N,10°00′E
2,Munich,Germany,48°08′15″N,11°34′30″E


- We collected our information in separate lists and then used them to create our DataFrame. However, as we add more and more columns, we would need more and more separate lists. Instead, use a single list that holds one dictionary per city.

In [None]:
# Solution: keep a single list and append one record (dict) per city.
cities = ["Berlin", "Hamburg", "Munich"]
city_dict = []

for city in cities:
    url = f"https://en.wikipedia.org/wiki/{city}"
    response = requests.get(url)
    soup_city = BeautifulSoup(response.content, "html.parser")

    # Raw text values scraped from the page.
    country = soup_city.find("td", {"class": "infobox-data"}).text
    latitude = soup_city.find("span", {"class": "latitude"}).text
    longitude = soup_city.find("span", {"class": "longitude"}).text

    # Decimal versions of the coordinate strings.
    latitude_decimal = parse(latitude)
    longitude_decimal = parse(longitude)

    # Keyword order fixes the column order of the resulting DataFrame.
    city_dict.append(
        dict(
            City=city,
            Country=country,
            Latitude=latitude,
            Lat_decimal=latitude_decimal,
            Longitude=longitude,
            Lon_decimal=longitude_decimal,
        )
    )

city_df = pd.DataFrame(city_dict)
city_df

Unnamed: 0,City,Country,Latitude,Lat_decimal,Longitude,Lon_decimal
0,Berlin,Germany,52°31′12″N,52.52,13°24′18″E,13.405
1,Hamburg,Germany,53°33′N,53.55,10°00′E,10.0
2,Munich,Germany,48°08′15″N,48.1375,11°34′30″E,11.575


  2.2 Looking ahead (optional): Create a function from the loop and DataFrame to encapsulate the scraping process. This function can be used repeatedly to fetch updated data whenever necessary. It should return a clean, properly formatted DataFrame.

In [None]:
#Create a function

def cities_data(cities, timeout=10):
    """Scrape the country and decimal coordinates of cities from English Wikipedia.

    Parameters
    ----------
    cities : iterable of str
        Article names as they appear after ``/wiki/`` in the page URL,
        e.g. ``"Berlin"``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns ``City``, ``Country``, ``Latitude``
        and ``Longitude``. Coordinates are the decimal values produced by
        ``lat_lon_parser.parse``. The columns are present even when ``cities``
        is empty.

    Raises
    ------
    requests.HTTPError
        If a page request returns an error status.
    AttributeError
        If a page has no ``infobox-data`` cell or no ``latitude`` /
        ``longitude`` span (``find`` returns ``None`` in that case).
    """
    columns = ["City", "Country", "Latitude", "Longitude"]
    records = []

    for city in cities:
        url = f"https://en.wikipedia.org/wiki/{city}"

        # Without a timeout a stalled connection would block the notebook
        # indefinitely.
        response = requests.get(url, timeout=timeout)
        # Stop on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        soup_city = BeautifulSoup(response.content, "html.parser")

        # The first "infobox-data" cell on the page is taken as the country.
        country = soup_city.find("td", class_="infobox-data").get_text()
        latitude = soup_city.find("span", class_="latitude").get_text()
        longitude = soup_city.find("span", class_="longitude").get_text()

        records.append({"City": city,
                        "Country": country,
                        "Latitude": parse(latitude),
                        "Longitude": parse(longitude)})

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=columns)

# Run the scraper for the three German cities and display the result.
cities_name = [
    "Berlin",
    "Hamburg",
    "Munich",
]
city_df = cities_data(cities=cities_name)
city_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575


2.3 Hey, this worked great! Let's remember why we use functions: We want to be able to repeat the code many times. When we call our function with different cities, the relevant information will be extracted. This will help Gans to expand in the future.

In [None]:


# Walk from the settlement infobox to the "Population" label, then to the next
# "infobox-data" element after it, and show that element's text.
(
    Berlin_soup
    .find("table", class_="infobox ib-settlement vcard")
    .find(string="Population")
    .find_next(class_="infobox-data")
    .get_text()
)

In [None]:
# Today's date rendered as day.month.year.
format(datetime.today(), "%d.%m.%Y")

In [None]:
# How the population is located on each page:
# 1. Find the infobox table (shown on the right-hand side of the article).
# 2. Find the "Population" string inside that table.
# 3. Use find_next to move to the following "infobox-data" element.
# The function below extracts the population of each city from Wikipedia and
# adds today's date as the retrieval timestamp.

def _population_to_int(text):
    """Return the first comma-grouped number in ``text`` as an int."""
    # Take only the leading figure so footnote markers or trailing notes in the
    # cell (e.g. "3,878,100[1]") do not break the conversion.
    match = re.search(r"\d[\d,]*", text)
    if match is None:
        raise ValueError(f"No population figure found in {text!r}")
    return int(match.group().replace(",", ""))


def cities_population(cities, timeout=10):
    """Scrape the population of cities from English Wikipedia.

    Parameters
    ----------
    cities : iterable of str
        Article names as they appear after ``/wiki/`` in the page URL,
        e.g. ``"Berlin"``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns ``City``, ``Population`` (int) and
        ``Population_retrieved_date`` (today's date as a pandas timestamp).
        The columns are present even when ``cities`` is empty.

    Raises
    ------
    requests.HTTPError
        If a page request returns an error status.
    ValueError
        If the population cell contains no number.
    AttributeError
        If the infobox table, the "Population" label or the following data
        cell is missing (``find`` / ``find_next`` return ``None``).
    """
    columns = ["City", "Population", "Population_retrieved_date"]
    city_population = []

    # The retrieval date is the same for every row, so build it once.
    date_today = datetime.today().strftime("%d.%m.%Y")
    date_today_clean = pd.to_datetime(date_today, dayfirst=True)  # day comes first in the string

    for city in cities:
        url = f"https://en.wikipedia.org/wiki/{city}"

        # Without a timeout a stalled connection would block the notebook
        # indefinitely.
        response = requests.get(url, timeout=timeout)
        # Stop on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        soup_city = BeautifulSoup(response.content, "html.parser")

        # Infobox table -> "Population" label -> next data cell.
        infobox = soup_city.find("table", class_="infobox ib-settlement vcard")
        population = (
            infobox
            .find(string="Population")
            .find_next(class_="infobox-data")
            .get_text()
        )

        city_population.append({"City": city,
                                "Population": _population_to_int(population),
                                "Population_retrieved_date": date_today_clean})

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(city_population, columns=columns)

# Fetch the current population figures for the three German cities.
cities_name = [
    "Berlin",
    "Hamburg",
    "Munich",
]
city_population_df = cities_population(cities=cities_name)
city_population_df

Unnamed: 0,City,Population,Population_retrieved_date
0,Berlin,3878100,2024-11-28
1,Hamburg,1964021,2024-11-28
2,Munich,1510378,2024-11-28


## BONUS Challenge 4: Global Data Scraping

  With your robust scraping skills now honed, venture beyond the confines of Germany and explore other cities around the world. While the extraction methodology for German cities may follow a consistent pattern, this may not be the case for cities from different countries. Can you make a function that returns a clean DataFrame of information for cities worldwide?

In [None]:
# Try the same scraper on cities outside the original three.
# NOTE(review): the saved output under this cell lists Lat_decimal/Lon_decimal
# columns, which cities_data as defined in this notebook does not return;
# re-run the cell to refresh the output.
new_cities = [
    "Cologne",
    "Amsterdam",
    "Pune",
    "Nashik",
]

cities_data(cities=new_cities)

Unnamed: 0,City,Country,Latitude,Lat_decimal,Longitude,Lon_decimal
0,Cologne,Germany,50°56′11″N,50.936389,6°57′10″E,6.952778
1,Amsterdam,Netherlands,52°22′22″N,52.372778,04°53′37″E,4.893611
2,Pune,India,18°31′13″N,18.520278,73°51′24″E,73.856667
3,Nashik,India,19°59′51.0″N,19.9975,73°47′23.3″E,73.789806
