# Setup

In [0]:
!pip install timezonefinder

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col,isnan, when, count, concat_ws, countDistinct, collect_set, rank, window, avg, hour, udf
import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np
from pyspark.sql import types
from pyspark.sql.types import *
from pyspark.sql import Window
from itertools import combinations
from timezonefinder import TimezoneFinder
import pytz
from datetime import datetime


# Root of the mounted course storage. Listing it shows which other samples
# are available to work with (for example a 1-day one).
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
display(dbutils.fs.ls(data_BASE_DIR))

In [0]:

# Load the weather parquet dataset (the path name suggests a one-year sample).
ydf = spark.read.parquet("dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_1y/")

# The station metadata is not loaded here; the read is kept for reference only.
#stations = spark.read.parquet(f"dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/*")


# Location Nulls

In [0]:
# Derive two helper columns from existing string fields:
#   WBAN    - the last 5 characters of STATION
#   COUNTRY - the last 2 characters of NAME
ydf = (
    ydf
    .withColumn("WBAN", F.col("STATION").substr(-5, 5))
    .withColumn("COUNTRY", F.col("NAME").substr(-2, 2))
)

In [0]:
# Keep only rows whose derived COUNTRY code is 'US'.
ydf = ydf.where(F.col("COUNTRY") == "US")

In [0]:

# ICAO lookup table for extra IDs: one row per STATION, carrying the token that
# follows the METAR/SPECI keyword in the REM column as the ICAO column.
ICAO_lookup = (
    ydf
    # Cheap pre-filter. A null REM makes the predicate null, so those rows are
    # dropped here as well and no separate isNotNull() filter is needed.
    .filter(F.col("REM").contains("METAR") | F.col("REM").contains("SPECI"))
    .withColumn(
        "ICAO",
        F.regexp_extract(F.col("REM"), r"(?:METAR|SPECI)\s(\S+)", 1),
    )
    # regexp_extract yields "" (not NULL) when the pattern does not match, so
    # an isNotNull() check alone would let empty identifiers through.
    .filter(F.col("ICAO").isNotNull() & (F.col("ICAO") != ""))
    .withColumn(
        "row_num",
        F.row_number().over(
            Window.partitionBy("STATION")
            # Ordering by the partition key itself left the chosen row
            # arbitrary. Use the latest DATE, with ICAO as a tie-breaker, so
            # re-runs pick the same identifier for each station.
            .orderBy(F.col("DATE").desc(), F.col("ICAO").asc())
        ),
    )
    .filter(F.col("row_num") == 1)
    .drop("row_num")
)

display(ICAO_lookup)

In [0]:
# Attach the per-station ICAO identifier to every weather row (left join, so
# stations without a match keep a null ICAO).
# Any ICAO column already present on ydf is dropped first (a no-op when it is
# absent) so that re-running this cell does not produce two ICAO columns.
ydf = ydf.drop("ICAO").join(
    ICAO_lookup.select("STATION", "ICAO"),
    on="STATION",
    how="left_outer",
)
display(ydf)

In [0]:
# Number of rows with no LONGITUDE value.
ydf.where(F.col("LONGITUDE").isNull()).count()

In [0]:
# Total number of rows in ydf, for comparison with the null counts in the neighbouring cells.
ydf.count()

In [0]:
# Number of rows left without an ICAO after the join.
# Author's note: only about 2/3 of the ICAO IDs could be matched, but that should be OK.
ydf.where(F.col("ICAO").isNull()).count()

In [0]:
# Columns whose null-ness is tallied per row when choosing between duplicate
# observations: the hourly measurement fields plus the REM column.
features = [
    "HourlyDewPointTemperature",
    "HourlyDryBulbTemperature",
    "HourlyPrecipitation",
    "HourlyPresentWeatherType",
    "HourlyPressureChange",
    "HourlyPressureTendency",
    "HourlyRelativeHumidity",
    "HourlySkyConditions",
    "HourlySeaLevelPressure",
    "HourlyStationPressure",
    "HourlyVisibility",
    "HourlyWetBulbTemperature",
    "HourlyWindDirection",
    "HourlyWindGustSpeed",
    "HourlyWindSpeed",
    "REM",
]

In [0]:
from functools import reduce
from operator import add



# Per-row tally of missing values: each feature column contributes 1 when it
# is null and 0 otherwise, and the contributions are added together.
sum_expression = reduce(
    add,
    [F.when(F.col(name).isNull(), 1).otherwise(0) for name in features],
)

# Keep a single row per (STATION, DATE): among duplicates, the row with the
# fewest null features is ranked first and retained. Rows without a LATITUDE
# or with a COUNTRY other than 'US' are excluded beforehand.
# NOTE(review): rows tied on null_count are not ordered any further, so which
# of them is kept is left to Spark.
ydf_dedup = (
    ydf
    .where(F.col("LATITUDE").isNotNull() & (F.col("COUNTRY") == "US"))
    .withColumn("null_count", sum_expression)
    .withColumn(
        "row_num",
        F.row_number().over(
            Window.partitionBy("STATION", "DATE").orderBy(F.col("null_count").asc())
        ),
    )
    .where(F.col("row_num") == 1)
    # The helper columns are only needed for ranking.
    .drop("row_num", "null_count")
)

display(ydf_dedup)

## Checks

In [0]:
# Row count of ydf (before de-duplication), to compare against ydf_dedup.count().
ydf.count()

In [0]:
# Count rows per (STATION, DATE), with DATE cast to a timestamp, and show the
# keys that occur more than once, most duplicated first.
display(
    ydf
    .withColumn("DATE", F.col("DATE").cast('timestamp'))
    .groupBy('STATION', 'DATE')
    .count()
    .where(F.col('count') > 1)
    .orderBy(F.col('count').desc())
)

In [0]:
# Row count of ydf_dedup (after de-duplication), to compare against ydf.count().
ydf_dedup.count()

In [0]:
# Same duplicate-key check on the de-duplicated frame: any (STATION, DATE)
# key, with DATE cast to a timestamp, that still occurs more than once is shown.
display(
    ydf_dedup
    .withColumn("DATE", F.col("DATE").cast('timestamp'))
    .groupBy('STATION', 'DATE')
    .count()
    .where(F.col('count') > 1)
    .orderBy(F.col('count').desc())
)