In [0]:
from pyspark.sql import SparkSession
# Shared Spark session used by every cell below; getOrCreate() reuses an
# already-running session when one exists.
session = (
    SparkSession.builder
    .appName('sparksession')
    .getOrCreate()
)

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
import uuid

# Zero-argument UDF producing a fresh random UUID string for each row.
# PySpark assumes UDFs are deterministic unless told otherwise, which lets the
# optimizer drop or repeat invocations; a random-ID generator must therefore be
# flagged as non-deterministic so each row keeps the single value it was given.
IDudf = udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()

In [0]:
%run /Users/tha075bei026@tcioe.edu.np/actionLogger

In [0]:

# Schema of a load-log record: identifiers and status as strings, every
# time-related column as a timestamp. All fields use the default nullability.
log_schema = (
    StructType()
    .add("id", StringType())
    .add("load_type", StringType())
    .add("table_name", StringType())
    .add("process_start_time", TimestampType())
    .add("process_end_time", TimestampType())
    .add("status", StringType())
    .add("comments", StringType())
    .add("start_date_time", TimestampType())
    .add("end_date_time", TimestampType())
    .add("created_on", TimestampType())
    .add("created_by", StringType())
)

In [0]:
from pyspark.sql.window import Window

def forecast(hr_data):
    """Append one forecast row per city to the hourly fact table.

    The forecast is built from the rows of the latest ``DateID`` in
    ``hr_data``: for every ``(DateID, CityID)`` the rows whose ``HourID`` ranks
    in the top 4 (descending) are kept, their numeric measurements are averaged
    (rounded to 2 decimals), and the most frequent ``Clouds`` value among them
    is attached. The result is tagged ``is_forecasted = 'Y'``, given a new
    ``Fact_HourID`` and an ``HourID`` of (max selected HourID + 1), then
    appended to the Delta table at ``dbfs:/databases/weather/fact_hourly``.

    Args:
        hr_data: Spark DataFrame with at least the columns DateID, CityID,
            HourID, Clouds, Temperature, Pressure, Humidity, Visibility,
            Wind_speed, Wind_degree and Wind_gust.

    Returns:
        None. The only effect is the append to the Delta table.

    Raises:
        TypeError: if ``hr_data`` has no rows, because the max ``HourID`` is
            then ``None`` and cannot be incremented.
    """
    latest_date_id = hr_data.agg({'DateID': 'max'}).collect()[0][0]
    data_latest = hr_data.filter(hr_data.DateID == latest_date_id)

    # Keep the most recent hours per city. rank() assigns equal ranks to equal
    # HourID values, so more than 4 rows can survive when HourIDs repeat.
    window_spec1 = Window.partitionBy(['DateID', 'CityID']).orderBy(col('HourID').desc())
    ranked_data = data_latest.withColumn('rank', rank().over(window_spec1))
    ranked_data = ranked_data.filter(col('rank') <= 4)

    # Most frequent Clouds value per (DateID, CityID).
    # The window must NOT include "Clouds" in its partition: the preceding
    # groupBy already yields one row per (DateID, CityID, Clouds), so
    # partitioning by all three would make every row number 1 and keep every
    # cloud value. Ties on count are broken by Clouds so the pick is stable.
    window_spec2 = Window.partitionBy("DateID", "CityID") \
        .orderBy(col("count").desc(), col("Clouds").asc())

    cloud_data = ranked_data.groupBy("DateID", "CityID", "Clouds").agg(count("*").alias("count")) \
        .withColumn("row_number", row_number().over(window_spec2)) \
        .filter(col("row_number") == 1) \
        .select("DateID", "Clouds", "CityID")

    # Per-city averages of the numeric measurements, rounded to 2 decimals.
    avg_df = ranked_data.groupBy("DateID", "CityID").agg(round(avg(col("Temperature")), 2).alias('Temperature'),
                                                         round(avg(col("Pressure")), 2).alias('Pressure'),
                                                         round(avg(col("Humidity")), 2).alias('Humidity'),
                                                         round(avg(col("Visibility")), 2).alias('Visibility'),
                                                         round(avg(col("Wind_speed")), 2).alias('Wind_speed'),
                                                         round(avg(col("Wind_degree")), 2).alias('Wind_degree'),
                                                         round(avg(col("Wind_gust")), 2).alias('Wind_gust'))

    # The timestamp is fetched once on the driver so every row shares it.
    avg_df = avg_df.withColumn('created_on', lit(session.sql("SELECT current_timestamp()").collect()[0][0]))
    # cloud_data now holds exactly one row per (DateID, CityID), so this join is
    # one-to-one and no de-duplication pass is needed afterwards.
    avg_df = avg_df.join(cloud_data, how='inner', on=['DateID', 'CityID'])
    avg_df = avg_df.withColumn("Fact_HourID", IDudf())
    avg_df = avg_df.withColumn("is_forecasted", lit('Y'))

    # The forecast hour is one past the latest hour among the selected rows;
    # it is a single value shared by all cities.
    # NOTE(review): nothing here wraps the hour at a day boundary - confirm
    # that downstream consumers accept HourID values past the last hour.
    next_hour_id = ranked_data.agg({'HourID': 'max'}).collect()[0][0] + 1
    avg_df = avg_df.withColumn('HourID', lit(next_hour_id))

    # Fix column order and types to match the fact table layout.
    avg_df = avg_df.select(col('HourID').cast('long').alias('HourID'),
                           col("DateID").cast('long').alias('DateID'),
                           col("CityID").cast('int').alias('CityID'),
                           col("Temperature").cast('double').alias('Temperature'),
                           col("Pressure").cast('double').alias('Pressure'),
                           col("Humidity").cast('double').alias('Humidity'),
                           col("Clouds"),
                           col("Visibility").cast('double').alias('Visibility'),
                           col("Wind_speed").cast('double').alias('Wind_speed'),
                           col("Wind_degree").cast('double').alias('Wind_degree'),
                           col("Wind_gust").cast('double').alias('Wind_gust'),
                           col("created_on").cast('timestamp').alias('created_on'),
                           col("Fact_HourID"),
                           col("is_forecasted"),
                           )

    # NOTE(review): "overwriteSchema" is normally paired with mode('overwrite');
    # it is kept unchanged here - confirm whether it is needed for an append.
    avg_df.write.option("overwriteSchema", "true")\
            .format('delta')\
            .mode('append')\
            .save('dbfs:/databases/weather/fact_hourly')
    
    
    

In [0]:
def load_forecast_hourly():
    """Run the hourly forecast four times and record the outcome in the load log.

    A log entry is opened through ``action_logger`` (not defined in the visible
    cells; presumably supplied by the ``%run`` of the actionLogger notebook).
    The fact table is then read and ``forecast`` is invoked four times on it.
    On success a 'completed' record carrying the min/max ``created_on`` of the
    loaded data is logged; on any exception an 'error' record is logged
    instead and the exception is not re-raised.
    """
    def _now():
        # Current timestamp as reported by Spark, fetched to the driver.
        return session.sql("SELECT current_timestamp()").collect()[0][0]

    started_at = _now()

    opening_entry = {
        'id': str(uuid.uuid4().hex),
        'load_type': 'forecast_load_hourly',
        'table_name': 'fact_hourly',
        'process_start_time': _now(),
        'process_end_time': _now(),
        'start_date_time': _now(),
        'end_date_time': _now(),
    }
    log = action_logger(opening_entry)

    try:
        hr_data = session.read.format('delta').load('dbfs:/databases/weather/fact_hourly')

        for _ in range(4):
            forecast(hr_data)

        # Evaluate in the same order as the record is laid out: end time first,
        # then the created_on bounds of the loaded data.
        finished_at = _now()
        earliest_created = hr_data.select(min('created_on')).first()[0]
        latest_created = hr_data.select(max('created_on')).first()[0]

        success_entry = {
            'process_start_time': started_at,
            'process_end_time': finished_at,
            'status': 'completed',
            'start_date_time': earliest_created,
            'end_date_time': latest_created,
        }
        log.action(success_entry)

    except Exception as exc:
        # Best-effort: the failure is recorded in the log rather than propagated.
        failed_at = _now()
        failure_entry = {
            'process_start_time': started_at,
            'process_end_time': failed_at,
            'status': 'error',
            'error_data': exc,
            'start_date_time': _now(),
            'end_date_time': _now(),
        }
        log.action(failure_entry)


In [0]:
# load_forecast_hourly()

In [0]:
# a = session.read.format('delta').load('dbfs:/databases/weather/fact_hourly')

In [0]:
# display(a)

HourID,DateID,CityID,Temperature,Pressure,Humidity,Clouds,Visibility,Wind_speed,Wind_degree,Wind_gust,created_on,Fact_HourID,is_forecasted
3,159,1282616,310.46,1002.0,17.0,scattered clouds,10000.0,2.34,163.0,2.05,2023-06-09T04:38:46.253+0000,d222d0d3-6bc1-4a8e-83b0-b20e603f8954,N
3,159,1282621,308.76,1001.0,17.0,clear sky,10000.0,2.43,227.0,1.77,2023-06-09T04:38:46.253+0000,eb6acaf7-1547-41ce-9d13-3a203cdc558e,N
3,159,1282635,310.67,1002.0,15.0,clear sky,10000.0,4.82,228.0,5.36,2023-06-09T04:38:46.253+0000,961a8873-2fb3-4f5e-8af6-be2a62eb7df2,N
3,159,1282665,314.34,1000.0,18.0,clear sky,10000.0,2.6,191.0,1.99,2023-06-09T04:38:46.253+0000,0e4adb0f-b6b4-487d-8dd9-1f925301ff7a,N
3,159,1282670,283.2,1018.0,50.0,clear sky,10000.0,3.38,196.0,2.53,2023-06-09T04:38:46.253+0000,97b9f8e7-dcd3-46b4-82dd-ee1d85b2c6a5,N
4,159,1282616,311.49,1001.0,16.0,few clouds,10000.0,2.39,167.0,2.07,2023-06-09T05:59:34.746+0000,62057c83-6f00-4466-be9f-1f0a372c4ba4,N
4,159,1282621,309.95,1000.0,14.0,clear sky,10000.0,2.47,232.0,2.16,2023-06-09T05:59:34.746+0000,d8f8598d-fd01-4acb-a066-e686fdc849bd,N
4,159,1282635,311.22,1001.0,15.0,clear sky,10000.0,5.46,222.0,5.22,2023-06-09T05:59:34.746+0000,9213d7a0-0e6a-428d-9181-761a0d0c6381,N
4,159,1282665,315.76,999.0,14.0,clear sky,10000.0,2.63,226.0,2.2,2023-06-09T05:59:34.746+0000,c893e8b5-53e7-4a38-aa3e-cad8906dc76f,N
4,159,1282670,283.63,1018.0,56.0,clear sky,10000.0,3.82,192.0,2.92,2023-06-09T05:59:34.746+0000,6428746e-4bb9-43e1-b3ed-62e5f17dbb7a,N
