In [None]:
%run "./weather_ETL_logger"

In [None]:
! pip install python-dotenv
import os
import uuid
from datetime import datetime

import requests
from dotenv import load_dotenv
from pyspark.sql.functions import col, date_format, from_unixtime, lit, to_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType, ArrayType

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-5c5aba28-be3d-4c81-91e3-3594bd16a92c/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# Fetches the city names stored in the dim_city table
def get_city_list():
    """Return the values of the ``name`` column of ``dim_city`` as a list.

    Reads the ``dim_city`` table through the notebook's ``spark`` session
    using the Delta format and collects the single selected column to the
    driver.

    Returns:
        list: One entry per row of ``dim_city``, holding that row's ``name``.
    """
    name_rows = (
        spark.read.format('delta')
        .table('dim_city')
        .select('name')
        .collect()
    )
    # Each collected Row has exactly one field (name); unwrap it.
    return [row[0] for row in name_rows]

In [None]:
# Returns a list of JSON responses from the API, one per city in dim_city
def get_raw_data(api_key=None, timeout=30):
    """Query the OpenWeatherMap current-weather endpoint for every city.

    Args:
        api_key: API key sent as the ``appid`` query parameter. When omitted,
            the ``OPENWEATHER_API_KEY`` environment variable is used, and if
            that is unset the original ``'<your api key>'`` placeholder is
            kept so existing behaviour does not change.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error. Without a timeout a stalled connection
            would block the notebook indefinitely.

    Returns:
        list: The decoded JSON body of each response, in the order the
        cities are returned by ``get_city_list()``. The HTTP status is not
        checked, so whatever JSON the API returns is included as-is.
    """
    if api_key is None:
        api_key = os.environ.get('OPENWEATHER_API_KEY', '<your api key>')

    endpoint = "https://api.openweathermap.org/data/2.5/weather"
    response = []
    # One session for all cities so the underlying connection is reused.
    with requests.Session() as session:
        for city in get_city_list():
            # Pass the query through `params` so city names containing
            # spaces, '&', '#', etc. are URL-encoded correctly.
            reply = session.get(
                endpoint,
                params={'q': city, 'appid': api_key},
                timeout=timeout,
            )
            response.append(reply.json())

    return response
        

In [None]:
# Builds the raw-data DataFrame from the API responses
@update_log
def get_raw(city):
    """Fetch the raw API responses and load them into a Spark DataFrame.

    The function is wrapped by ``update_log``, which is defined outside this
    section (presumably in the ``weather_ETL_logger`` notebook run at the
    top — confirm there what it does with the returned values).

    Args:
        city: Not used by the body; kept so the existing call signature
            stays unchanged.

    Returns:
        tuple: ``(df, start_datetime, end_datetime)`` where ``df`` is built
        from ``get_raw_data()`` with the schema from
        ``get_raw_json_schema()``, and the two datetimes are the minimum and
        maximum ``dt`` values converted with ``datetime.fromtimestamp``
        (i.e. in the driver's local timezone).
    """
    response = get_raw_data()
    df = spark.createDataFrame(response, get_raw_json_schema())
    # Compute both bounds in a single aggregation instead of two Spark jobs.
    bounds = df.selectExpr('min(dt)', 'max(dt)').first()
    start_datetime = datetime.fromtimestamp(bounds[0])
    end_datetime = datetime.fromtimestamp(bounds[1])
    return df, start_datetime, end_datetime

In [None]:
# Builds the flattened processed-data DataFrame from the raw DataFrame
@update_log
def get_processed_data(df):
    """Project the raw weather DataFrame into the processed column layout.

    The function is wrapped by ``update_log``, which is defined outside this
    section (presumably in the ``weather_ETL_logger`` notebook run at the
    top — confirm there what it does with the returned values).

    Args:
        df: Raw DataFrame; the select below requires the columns ``dt``,
            ``id``, ``name``, ``timezone``, ``visibility`` and the nested
            fields referenced under ``sys``, ``coord``, ``main``, ``wind``
            and ``clouds``.

    Returns:
        tuple: ``(df, start_datetime, end_datetime)`` where ``df`` is the
        projected DataFrame and the two datetimes are the minimum and
        maximum ``dt`` values converted with ``datetime.fromtimestamp``
        (i.e. in the driver's local timezone).
    """
    # Build the timestamp expression once and reuse it. Referring to the
    # 'date_time' alias from inside the same select only resolves on Spark
    # versions with lateral column alias support; using the expression
    # directly works everywhere and yields the same values.
    date_time = from_unixtime(col('dt'))

    df = df.select(
        col('dt'),
        date_time.alias('date_time'),
        to_date(date_time).alias('date'),
        date_format(date_time, 'HH:mm:ss').alias('time'),
        col('id').alias('city_id'),
        col('name').alias('city_name'),
        col('timezone'),
        col('sys.country'),
        col('coord.lat'),
        col('coord.lon'),
        col('main.temp'),
        col('main.temp_min'),
        col('main.temp_max'),
        col('main.pressure'),
        col('main.humidity'),
        col('visibility'),
        col('wind.speed').alias('wind_speed'),
        col('wind.deg').alias('wind_deg'),
        col('wind.gust').alias('wind_gust'),
        col('clouds.all').alias('clouds_all')
    )

    # Compute both bounds in a single aggregation instead of two Spark jobs.
    bounds = df.selectExpr('min(dt)', 'max(dt)').first()
    start_datetime = datetime.fromtimestamp(bounds[0])
    end_datetime = datetime.fromtimestamp(bounds[1])

    return df, start_datetime, end_datetime

