---
## 1.&nbsp; Import libraries 💾

In [235]:
import re #To pull the population figure out of the infobox text
from datetime import datetime, timedelta #To get today's date and time
from urllib.parse import quote_plus #To escape special characters in the SQL password

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse  #Converts latitude and longitude to decimal
from timezonefinder import TimezoneFinder #To find timezone

#Please create python file to save your sql password and user API Key
from safe_key import safe_key

# install if needed
#!pip install sqlalchemy
#!pip install pymysql

## 2. Scrape information from a URL using web scraping 😀 and create a function
### Function for cities and population table

Utilise web scraping skills to gather information about three German cities – Berlin, Hamburg, and Munich – from Wikipedia.

 - Berlin: https://en.wikipedia.org/wiki/Berlin
 - Hamburg: https://en.wikipedia.org/wiki/Hamburg
 - Munich: https://en.wikipedia.org/wiki/Munich

In [252]:
#Create a function to gather cities basic information from wikipedia

def CityData(cities, timeout=30):
    """Scrape basic facts about each city from its English Wikipedia page.

    Parameters
    ----------
    cities : list of str
        City names. Each name is appended to ``https://en.wikipedia.org/wiki/``
        to build the page URL.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped city with the columns ``City``,
        ``Country``, ``Latitude``, ``Longitude``, ``Population`` (int) and
        ``Population_retrieved_date`` (timestamp truncated to seconds).
        Cities whose page could not be downloaded are reported with ``print``
        and left out. The frame has no columns when no city was scraped.

    Raises
    ------
    AttributeError
        If a downloaded page lacks one of the expected infobox elements.
    ValueError
        If the population cell contains no number.
    """
    city_list = []

    #Creating for loop to gather information of all cities given as a list
    for city_name in cities:

        url = f"https://en.wikipedia.org/wiki/{city_name}"

        # A timeout keeps one stalled request from hanging the whole notebook
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as error:
            print(f"Request for the Wikipedia page of {city_name} failed: {error}")
            continue

        # Same report-and-skip convention as the API helpers in this notebook
        if response.status_code != 200:
            print(f"Failed to fetch Wikipedia page for {city_name}. HTTP Status: {response.status_code}")
            continue

        soup_city = BeautifulSoup(response.content, "html.parser")

        #Extract country: text of the first infobox data cell on the page
        # NOTE(review): presumably the "Country" row - confirm when adding new cities
        country = soup_city.find("td", class_="infobox-data").get_text()

        #Extract latitude and longitude and convert them to decimal values
        #(the page's span with class "geo" reportedly holds the decimal pair directly)
        latitude_decimal = parse(soup_city.find("span", class_="latitude").get_text())
        longitude_decimal = parse(soup_city.find("span", class_="longitude").get_text())

        #Extract Population: first data cell following the "Population" label
        population = (
            soup_city.find("table", class_="infobox ib-settlement vcard")
            .find(string="Population")
            .find_next(class_="infobox-data")
            .get_text()
        )

        # Keep only the first number so footnote markers such as "[3]" or
        # trailing text do not break the integer conversion
        population_match = re.search(r"\d[\d,]*", population)
        if population_match is None:
            raise ValueError(f"No population figure found for {city_name}: {population!r}")
        population_clean = int(population_match.group().replace(",", ""))

        #Extract today's date & convert in datetime format
        date_today = datetime.today().strftime("%d.%m.%Y %H:%M:%S")
        date_today_clean = pd.to_datetime(date_today, dayfirst=True) #Specify a date parse order

        city_list.append({"City" : city_name,
                          "Country" : country,
                          "Latitude" : latitude_decimal,
                          "Longitude" : longitude_decimal,
                          "Population" : population_clean,
                          "Population_retrieved_date" : date_today_clean
                          })
    return pd.DataFrame(city_list)

---
## 3. Create a local pipeline, i.e. a connection between Python and MySQL 😃

In [None]:
#Create connection string between python and MySQL
def create_connection_string():
    """Build the SQLAlchemy connection URL for the local MySQL database.

    The host and password are read from ``safe_key`` (keys ``"Host_IP"`` and
    ``"SQL_password"``); the schema, user and port are fixed below.

    Returns
    -------
    str
        URL of the form ``mysql+pymysql://user:password@host:port/schema``.

    Raises
    ------
    KeyError
        If ``safe_key`` lacks ``"Host_IP"`` or ``"SQL_password"``.
    """
    schema = "sql_gans"
    host = safe_key["Host_IP"]
    user = "root"
    port = 3306

    # Percent-encode the password: characters such as "@", ":" or "/" would
    # otherwise be read as URL delimiters and corrupt the connection string.
    # Passwords without special characters come out unchanged.
    password = quote_plus(str(safe_key["SQL_password"]))

    return f"mysql+pymysql://{user}:{password}@{host}:{port}/{schema}"

---
## 4. Creating the matching table with SQL 💻

Now we're ready to store the DataFrame in SQL. Before we can send the information to SQL, we need to create a table with the same column names and data types to receive the data. While we are creating the table for cities, we can also create the population table.

Open MySQL Workbench, open a local connection, and open the "sql_table_creation_gans.sql" file. 

In [238]:
#Sending python data to sql table
def send_data(df, connection_string, sql_table):
    """Append the rows of ``df`` to the SQL table ``sql_table``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write.
    connection_string : str
        Database connection handed to ``DataFrame.to_sql`` as ``con``.
    sql_table : str
        Name of the target table.

    Returns
    -------
    Whatever ``DataFrame.to_sql`` returns.
    """
    write_options = {
        "name": sql_table,
        "con": connection_string,
        # Add to the rows already in the table instead of replacing them
        "if_exists": "append",
        # Ensures the DataFrame's index is not written as a separate column
        "index": False,
    }
    return df.to_sql(**write_options)
    

#Retrieving information from sql to python - this notebook
def fetch_data(sql_table, connection_string):
    """Load data from SQL into a DataFrame.

    Parameters
    ----------
    sql_table : str
        Passed to ``pandas.read_sql`` as its first argument.
    connection_string : str
        Database connection handed to ``pandas.read_sql`` as ``con``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pandas.read_sql``.
    """
    table_df = pd.read_sql(sql_table, con=connection_string)
    return table_df

In [253]:
def update_cities_data(cities_name):
    """Scrape the given cities and store their basic information in SQL.

    Parameters
    ----------
    cities_name : list of str
        City names handed to ``CityData``.

    Returns
    -------
    tuple
        ``(city_data_df, connection_string, cities_info_from_sql)``: the full
        scraped frame, the database connection string, and the content of the
        ``cities_info`` table read back after the insert.
    """
    scraped_df = CityData(cities_name)

    # Only the basic city columns go into "cities_info"; the population
    # columns stay in the scraped frame that is returned to the caller
    basic_columns = ["City", "Country", "Latitude", "Longitude"]
    basic_info_df = scraped_df.loc[:, basic_columns]

    db_url = create_connection_string()

    send_data(basic_info_df, db_url, "cities_info")
    stored_cities_df = fetch_data("cities_info", db_url)

    return scraped_df, db_url, stored_cities_df

def update_population_data(city_data_df, connection_string, cities_info_from_sql):
    """Store the scraped population figures in the ``cities_population`` table.

    Parameters
    ----------
    city_data_df : pandas.DataFrame
        Scraped city data; must contain ``City``, ``Population`` and
        ``Population_retrieved_date``.
    connection_string : str
        Database connection string used for writing and reading.
    cities_info_from_sql : pandas.DataFrame
        Content of the ``cities_info`` table; must contain ``City`` and
        ``City_id``.

    Returns
    -------
    pandas.DataFrame
        Content of ``cities_population`` read back after the insert.
    """
    # Look the id up by city name rather than by row position: "cities_info"
    # is appended to, so its rows need not line up with city_data_df.
    # If a city occurs more than once, the last listed id is used
    # (presumably the most recently inserted one - TODO confirm).
    city_ids = (
        cities_info_from_sql.loc[:, ["City", "City_id"]]
        .drop_duplicates(subset="City", keep="last")
    )

    # Inner join: a city without a stored id has no key to be filed under,
    # so it is left out instead of being written with a missing City_id
    population_df = (
        city_data_df.loc[:, ["City", "Population", "Population_retrieved_date"]]
        .merge(city_ids, on="City", how="inner")
        .loc[:, ["Population", "Population_retrieved_date", "City_id"]]
    )

    #Transferring the population info to sql table - cities_population
    send_data(population_df, connection_string, "cities_population")

    # Retrieving the cities population information from the sql table
    cities_population_from_sql = fetch_data("cities_population", connection_string)

    return cities_population_from_sql



### 5. Weather Data - Creating a function for 5 days/ 3 Hour weather forecast using API 
- https://openweathermap.org/forecast5

In [240]:
def WeatherData(cities_info_from_sql, temp_unit, timeout=30):
    """Fetch the OpenWeatherMap 5 day / 3 hour forecast for every city.

    Parameters
    ----------
    cities_info_from_sql : pandas.DataFrame
        One row per city with the columns ``City``, ``City_id``, ``Latitude``
        and ``Longitude``.
    temp_unit : str
        Sent to the API as its ``units`` parameter (e.g. ``"metric"``).
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per city and forecast timestamp with the columns ``City_id``,
        ``Forecast_utc_time``, ``City_time``, ``Part_of_day``,
        ``Weather_condition``, ``Cloudiness_percent``, ``Temp_deg``,
        ``Wind_speed_M_per_sec`` and ``Rain_probability_percent``.
        Cities whose request fails are reported with ``print`` and skipped.
        The frame has no columns when nothing was collected.
    """
    #store one record per forecast entry of the API's 'list' value
    weather_data = []

    # Nothing to look up: return before loading timezone data or reading the API key
    if cities_info_from_sql.empty:
        return pd.DataFrame(weather_data)

    # Built once and reused for every city instead of once per row
    tf = TimezoneFinder()

    #Define base url
    base_url = "https://api.openweathermap.org/data/2.5/forecast"

    #This loop iterates over the cities table row by row
    for row in cities_info_from_sql.itertuples(index=False):

        #Extracts city details
        city_name = row.City
        city_id = row.City_id

        #Finds the city's timezone using the TimezoneFinder library
        timezone_str = tf.timezone_at(lng=row.Longitude, lat=row.Latitude) or "UTC"
        city_timezone = pytz.timezone(timezone_str)

        # Current time in the city's local timezone, stored without timezone info
        city_time = datetime.now(pytz.utc).astimezone(city_timezone).strftime("%Y-%m-%d %H:%M:%S")
        city_time_clean = pd.to_datetime(city_time)

        #Define parameters
        parameters = {"q" : city_name,
                      "appid" : safe_key["API_key"],
                      "units" : temp_unit}

        # A timeout keeps one stalled request from hanging the whole run
        try:
            response = requests.get(base_url, params=parameters, timeout=timeout)
        except requests.exceptions.RequestException as error:
            print(f"Request for weather data for {city_name} failed: {error}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch weather data for {city_name}. HTTP Status: {response.status_code}")
            continue

        try:
            weather = response.json()
        except requests.exceptions.JSONDecodeError:
            print(f"Invalid JSON response for weather data for {city_name}")
            continue

        # Walk the forecast entries themselves rather than trusting the
        # separate 'cnt' counter to match the length of 'list'
        for all_data in weather.get("list", []):

            #Time of data forecasted, ISO, UTC
            date_time_clean = pd.to_datetime(all_data['dt_txt'])

            weather_data.append({"City_id" : city_id,
                                 "Forecast_utc_time" : date_time_clean,
                                 "City_time": city_time_clean,
                                 #Part of the day (n-night, d-day)
                                 "Part_of_day" : all_data['sys']['pod'],
                                 #weather condition within the group
                                 "Weather_condition" : all_data['weather'][0]['description'],
                                 #cloudiness in percentage
                                 "Cloudiness_percent" : all_data['clouds']['all'],
                                 #"feels like" temperature, in the unit selected by temp_unit
                                 "Temp_deg" : all_data['main']['feels_like'],
                                 #wind speed; metre/sec when temp_unit is "metric"
                                 "Wind_speed_M_per_sec" : all_data['wind']['speed'],
                                 #pop - probability of precipitation, between 0 and 1 in the API
                                 "Rain_probability_percent" : all_data['pop'] * 100
                                 })
    return pd.DataFrame(weather_data)


### 6. Airport data for given city function 
- Use API -  "https://aerodatabox.p.rapidapi.com/airports/search/location"

In [241]:
def icao_airport_codes(cities_info_from_sql, timeout=30):
    """Search AeroDataBox for airports within 100 km of each city.

    Parameters
    ----------
    cities_info_from_sql : pandas.DataFrame
        One row per city with the columns ``City``, ``Latitude`` and
        ``Longitude``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        The flattened ``items`` of all successful responses, concatenated
        with a fresh index. Cities whose request fails are reported with
        ``print`` and skipped. When no airport was found at all, an empty
        frame with the columns ``icao``, ``name`` and ``municipalityName``
        (the ones ``update_airport_data`` reads) is returned.
    """
    base_url = "https://aerodatabox.p.rapidapi.com/airports/search/location"

    list_for_df = []

    for row in cities_info_from_sql.itertuples(index=False):

        city_name = row.City

        querystring = {
            "lat": row.Latitude,
            "lon": row.Longitude,
            "radiusKm": "100",
            "limit": "16",
            "withFlightInfoOnly": "true",
        }

        headers = {
            "x-rapidapi-key": safe_key["x_rapidapi_key"],
            "x-rapidapi-host": "aerodatabox.p.rapidapi.com",
        }

        # A timeout keeps one stalled request from hanging the whole run
        try:
            response = requests.get(base_url, headers=headers, params=querystring, timeout=timeout)
        except requests.exceptions.RequestException as error:
            print(f"Request for airport data for {city_name} failed: {error}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch airport data for {city_name}. HTTP Status: {response.status_code}")
            continue

        try:
            payload = response.json()
        except requests.exceptions.JSONDecodeError:
            print(f"Invalid JSON response for airport data for {city_name}")
            continue

        # A response without "items" counts as "no airports" instead of
        # raising a KeyError that would abort the remaining cities
        airport_data = payload.get("items", [])
        if airport_data:
            list_for_df.append(pd.json_normalize(airport_data))

    # pd.concat raises ValueError on an empty list, so hand back an empty
    # frame when there were no cities or no request produced any airport
    if not list_for_df:
        return pd.DataFrame(columns=["icao", "name", "municipalityName"])

    return pd.concat(list_for_df, ignore_index=True)

## 7. Flight data function
- API for timezone - "https://aerodatabox.p.rapidapi.com/airports/icao/{icao_code}"
- API for flight - "https://aerodatabox.p.rapidapi.com/flights/airports/icao/{code}/{tomorrow}T{time[0]}/{tomorrow}T{time[1]}"

In [242]:
def get_timezone_by_icao(icao_code, timeout=30):
    """Look up an airport's timezone name via the AeroDataBox airport endpoint.

    Parameters
    ----------
    icao_code : str
        ICAO code of the airport.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    str
        The ``timeZone`` value of the response, or ``"UTC"`` when the request
        fails, times out, returns a non-200 status, or the response cannot be
        read. Every failure is reported with ``print``.
    """
    # Deliberately best-effort: any failure falls back to "UTC" so that one
    # bad airport does not abort the caller's loop
    try:
        url = f"https://aerodatabox.p.rapidapi.com/airports/icao/{icao_code}"

        headers = {
            "x-rapidapi-key": safe_key["x_rapidapi_key"],
            "x-rapidapi-host": "aerodatabox.p.rapidapi.com"
            }

        # Without a timeout a stalled connection would block forever and the
        # fallback below would never be reached
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to fetch timezone for ICAO: {icao_code}")
            return "UTC"

        return response.json()["timeZone"]
    except Exception as e:
        print(f"Error in timezone lookup for ICAO: {icao_code}: {e}")
        return "UTC"

In [243]:
def FlightData(airports_from_db, timeout=30):
    """Collect tomorrow's scheduled arrivals for every airport from AeroDataBox.

    Parameters
    ----------
    airports_from_db : pandas.DataFrame
        One row per airport with the columns ``icao``, ``Airport_name`` and
        ``City_id``.
    timeout : float, optional
        Seconds to wait for each flight request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per arrival with the columns ``Arrival_airport_icao``,
        ``Arrival_airport_name``, ``Departure_airport_icao``,
        ``Flight_number``, ``Arrival_time``, ``Data_retrieved_time`` and
        ``City_id``. Both time columns are converted to datetimes; the last
        six characters of the local arrival time (presumably the UTC offset -
        confirm against the API) are cut off first. Requests that fail are
        reported with ``print`` and skipped. The frame has no columns when
        nothing was collected.
    """
    # Identical for every request, so built once
    querystring = {
        "withLeg": "true",
        "direction": "Arrival",
        "withCancelled": "false",
        "withCodeshared": "true",
        "withCargo": "false",
        "withPrivate": "false"
    }

    # API requires two 12-hour calls
    time_windows = [("00:00", "11:59"), ("12:00", "23:59")]

    flight_data = []

    for code_row in airports_from_db.itertuples(index=False):
        code = code_row.icao
        Airport_name = code_row.Airport_name
        City_id = code_row.City_id

        # Find the city timezone based on icao code API
        timezone_str = get_timezone_by_icao(code) or "UTC"
        city_timezone = pytz.timezone(timezone_str)

        # "Tomorrow" is taken from the airport's local date, not the notebook's
        tomorrow = datetime.now(city_timezone).date() + timedelta(days=1)

        headers = {
            "x-rapidapi-key": safe_key["x_rapidapi_key"],
            "x-rapidapi-host": "aerodatabox.p.rapidapi.com"
        }

        for window_start, window_end in time_windows:

            url = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{code}/{tomorrow}T{window_start}/{tomorrow}T{window_end}"

            # A timeout keeps one stalled request from hanging the whole run
            try:
                response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
            except requests.exceptions.RequestException as error:
                print(f"Request for flight data for ICAO: {code} failed: {error}")
                continue

            if response.status_code != 200:
                print(f"Error fetching flight data for ICAO: {code}, Status Code: {response.status_code}")
                continue

            try:
                flight = response.json()
            except requests.exceptions.JSONDecodeError:
                print(f"Invalid JSON response for flight data for ICAO: {code}")
                continue

            # Data retrieval timestamp
            retrieval_time = datetime.now(city_timezone).strftime("%Y-%m-%d %H:%M:%S")

            # Parse arrivals
            for item in flight.get("arrivals", []):

                # Skip the flight when no scheduled local arrival time is
                # available; a missing "arrival"/"scheduledTime" entry is
                # treated the same way instead of raising KeyError
                scheduled = (item.get("arrival") or {}).get("scheduledTime") or {}
                arrival_time = scheduled.get("local")
                if not arrival_time:
                    continue

                # The departure airport may be absent; store None in that case
                departure_airport = (item.get("departure") or {}).get("airport") or {}

                flight_data.append({
                    "Arrival_airport_icao": code,
                    "Arrival_airport_name": Airport_name,
                    "Departure_airport_icao": departure_airport.get("icao"),
                    "Flight_number": item.get("number"),
                    "Arrival_time": arrival_time,
                    "Data_retrieved_time": retrieval_time,
                    "City_id" : City_id
                })

    #Create DataFrame
    flight_df = pd.DataFrame(flight_data)

    # The conversions need the columns to exist, which an empty frame lacks
    if not flight_df.empty:
        flight_df["Arrival_time"] = pd.to_datetime(flight_df["Arrival_time"].str[:-6])
        flight_df["Data_retrieved_time"] = pd.to_datetime(flight_df["Data_retrieved_time"])

    return flight_df


## Create one final function to send and retrieve data i.e. update data

In [244]:
# Cities to process; each name is used by CityData as the Wikipedia page title
cities_name = ["Berlin", "Munich", "Pune", "Nasik"]
# Scrape the cities, append them to the "cities_info" table and read the table back
city_data_df, connection_string, cities_info_from_sql = update_cities_data(cities_name)

In [245]:
# Append the scraped population figures to "cities_population" and read the table back
cities_population_from_sql = update_population_data(city_data_df, connection_string, cities_info_from_sql)

In [246]:
def update_weather_data(cities_info_from_sql, connection_string):
    """Fetch the forecast for all cities and append it to ``cities_weather``.

    Parameters
    ----------
    cities_info_from_sql : pandas.DataFrame
        City rows handed to ``WeatherData``.
    connection_string : str
        Database connection string used for the insert.

    Returns
    -------
    str
        A fixed confirmation message.
    """
    # Forecast values are requested in metric units
    forecast_df = WeatherData(cities_info_from_sql, temp_unit='metric')
    send_data(forecast_df, connection_string, "cities_weather")

    confirmation = "Weather Data has been updated"
    return confirmation

# Fetch the forecast for the stored cities and append it to "cities_weather"
update_weather_data(cities_info_from_sql, connection_string)


'Weather Data has been updated'

In [247]:
def update_airport_data(cities_info_from_sql, connection_string):
    """Find the airports of the stored cities and append them to ``airports``.

    Airports returned by ``icao_airport_codes`` are matched to a city when
    their ``municipalityName`` equals the city's ``City`` value.

    Parameters
    ----------
    cities_info_from_sql : pandas.DataFrame
        City rows; must contain ``City`` and ``City_id`` plus the columns
        ``icao_airport_codes`` needs.
    connection_string : str
        Database connection string used for the insert.

    Returns
    -------
    str
        A fixed confirmation message.
    """
    airport_df = icao_airport_codes(cities_info_from_sql)

    # Neighbouring cities can return the same airport; keep each one once
    icao_airport_names_df = airport_df.drop_duplicates(subset=["icao"])[["icao", "name", "municipalityName"]]

    # Inner join: a city without a matching airport yields no row. A left
    # join would emit a row with a missing icao for it, which would then be
    # stored and later sent to the flight API as an airport code.
    airports_to_db = (
        cities_info_from_sql
        .merge(icao_airport_names_df, left_on="City", right_on="municipalityName", how="inner")
        # Selecting only the columns we need
        .loc[:, ["icao", "name", "City_id"]]
        .rename(columns={"name": "Airport_name"})
    )

    send_data(airports_to_db, connection_string, "airports")

    return "Airport data has been updated"

# Look up the airports of the stored cities and append them to "airports"
update_airport_data(cities_info_from_sql, connection_string)

'Airport data has been updated'

In [248]:
def update_flight_data(connection_string):
    """Fetch arrivals for all stored airports and append them to ``flights``.

    Parameters
    ----------
    connection_string : str
        Database connection string used for reading ``airports`` and for
        the insert.

    Returns
    -------
    str
        A fixed confirmation message.
    """
    stored_airports_df = fetch_data("airports", connection_string)
    arrivals_df = FlightData(stored_airports_df)
    send_data(arrivals_df, connection_string, "flights")

    confirmation = "Flights data has been updated"
    return confirmation

# Read the stored airports, fetch their arrivals and append them to "flights"
update_flight_data(connection_string)

'Flights data has been updated'