# Setup

In [0]:
!pip install timezonefinder

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col,isnan, when, count, concat_ws, countDistinct, collect_set, rank, window, avg, hour, udf
import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np
from pyspark.sql import types
from pyspark.sql.types import *
from pyspark.sql import Window
from itertools import combinations
from timezonefinder import TimezoneFinder
import pytz
from datetime import datetime


data_BASE_DIR = "dbfs:/mnt/mids-w261/"
display(dbutils.fs.ls(f"{data_BASE_DIR}")) #note the other possible samples we can use like 1 day

In [0]:
display(dbutils.fs.ls(f"{data_BASE_DIR}/datasets_final_project_2022/stations_data"))

In [0]:

qdf = spark.read.parquet(f"dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_3m/")
ydf = spark.read.parquet(f"dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_1y/")


In [0]:
stations = spark.read.parquet(f"dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/*")

# Initial exploration

## stations

In [0]:
stations.columns

In [0]:
display(stations)

In [0]:
display(stations.select('station_id').distinct().count())
ydf.select('STATION').distinct().count()

In [0]:
display(stations.select('wban').distinct().count())


## MSHR

Oddly, the `stations` table does not seem to have complete data either, so should we use the MSHR dataset instead?

In [0]:

mshr = pd.read_csv('mshr_standard.txt',sep='\t') #via https://www.ncei.noaa.gov/access/homr/reports

In [0]:
def parse_fixed_width(row):
    """Slice one fixed-width MSHR line into a dict of whitespace-stripped fields.

    Every entry of ``layout`` is ``(field_name, start, stop)`` and is read as
    ``row[start:stop].strip()``; a ``stop`` of ``None`` means "through the end
    of the line". The returned dict keeps the fields in layout order, and every
    value is a string (possibly empty when the line is shorter than the slice).
    """
    layout = (
        ("station_id", 0, 8),
        ("record_type", 9, 11),
        ("coop_station_id", 12, 18),
        ("climate_division", 19, 21),
        ("WBAN_ID", 22, 27),
        ("WMO_ID", 28, 33),
        ("FAA_ID", 34, 39),
        ("NWS_ID", 40, 44),
        ("ICAO_ID", 45, 49),
        ("country", 50, 70),
        ("state_FIPS", 71, 73),
        ("county", 74, 104),
        ("time_zone", 105, 110),
        ("coop_station_name", 111, 141),
        ("principal_station_name", 142, 172),
        ("begin_date", 173, 181),
        ("end_date", 182, 191),
        ("lat_deg", 192, 194),
        ("lat_min", 195, 197),
        ("lat_sec", 198, 200),
        ("lon_deg", 201, 205),
        ("lon_min", 206, 208),
        ("lon_sec", 209, 211),
        ("latlon_precision", 212, 219),
        ("ground_elevation", 219, 225),
        ("elevation_other", 226, 229),
        ("elevation_other_type", 230, 231),
        ("station_relocation", 232, 243),
        ("station_types", 244, None),
    )
    return {field: row[start:stop].strip() for field, start, stop in layout}


parsed_rows = [parse_fixed_width(mshr.iloc[i, 0]) for i in range(len(mshr))]

max_cols = max(len(row) for row in parsed_rows)

mshr_parse = pd.DataFrame(parsed_rows)

In [0]:
territory_FIPS = ['AS','GU','MP','PR','UM','VI']  #via https://www.census.gov/library/reference/code-lists/ansi.html#states


def _dms_to_decimal(deg, minutes, seconds, is_negative):
    """Combine degree/minute/second Series into signed decimal degrees.

    The magnitude is built from absolute values and the sign is applied once at
    the end, so a negative degree value pushes the result *away* from zero
    (-98 deg 30 min -> -98.5) instead of toward it (-98 + 0.5 = -97.5).
    ``is_negative`` is a boolean Series aligned with the other three.
    """
    magnitude = deg.abs() + minutes.abs()/60 + seconds.abs()/3600
    return magnitude.where(~is_negative, -magnitude)


# Records whose end year is 9999 are rewritten to 2025 before parsing
# (presumably because year 9999 is outside the pandas Timestamp range).
mshr_parse['end_date'] = mshr_parse['end_date'].apply(
    lambda x: pd.to_datetime('2025' + x[4:]) if x.startswith('9999') else pd.to_datetime(x)
)

mshr_parse['begin_date']=pd.to_datetime(mshr_parse['begin_date'], errors='coerce')

mshr_parse['latlon_precision']=mshr_parse['latlon_precision'].apply(lambda x: x[:3].strip(' '))

# Read the hemisphere sign from the raw text before numeric conversion so that a
# "-00" degree field does not lose its sign when it becomes the number 0.
# NOTE(review): lat_deg is sliced as only 2 characters in parse_fixed_width, so a
# leading '-' on latitude may never reach this point -- confirm against the MSHR layout.
lat_is_negative = mshr_parse['lat_deg'].astype(str).str.strip().str.startswith('-')
lon_is_negative = mshr_parse['lon_deg'].astype(str).str.strip().str.startswith('-')

mshr_parse['lat_deg']=pd.to_numeric(mshr_parse['lat_deg'])
mshr_parse['lat_min']=pd.to_numeric(mshr_parse['lat_min'],errors='raise')
mshr_parse['lat_sec']=pd.to_numeric(mshr_parse['lat_sec'],errors='raise')

mshr_parse['lon_deg']=pd.to_numeric(mshr_parse['lon_deg'],errors='raise')
mshr_parse['lon_min']=pd.to_numeric(mshr_parse['lon_min'],errors='raise')
mshr_parse['lon_sec']=pd.to_numeric(mshr_parse['lon_sec'],errors='raise')

# Vectorised degrees/minutes/seconds -> decimal degrees (replaces a row-wise apply).
mshr_parse['lat_dd'] = _dms_to_decimal(
    mshr_parse['lat_deg'], mshr_parse['lat_min'], mshr_parse['lat_sec'], lat_is_negative)

mshr_parse['lon_dd'] = _dms_to_decimal(
    mshr_parse['lon_deg'], mshr_parse['lon_min'], mshr_parse['lon_sec'], lon_is_negative)


# Keep rows flagged as UNITED STATES plus rows whose state code is one of the territory codes above.
mshr_us= mshr_parse[(mshr_parse['state_FIPS'].isin(territory_FIPS)) | (mshr_parse['country'] == 'UNITED STATES')]

mshr_us = mshr_us[mshr_us['end_date'] >= pd.to_datetime('20150101')] #only stations that were functioning at the time of flights dataset

mshr_df = spark.createDataFrame(mshr_us)

#rename MSHR columns to match YDF where relevant
mshr_df=mshr_df.withColumnsRenamed({'lat_dd':'LATITUDE', 'lon_dd':'LONGITUDE', 'principal_station_name':'NAME'})





In [0]:
mshr_df.columns

## weather

%md
| Code  | Description                                                                 |
|-------|-----------------------------------------------------------------------------|
| CRB   | Climate Reference Book data from CDMP                                       |
| CRN05 | Climate Reference Network report, with 5-minute reporting interval          |
| CRN15 | Climate Reference Network report, with 15-minute reporting interval         |
| FM-12 | SYNOP Report of surface observation from a fixed land station               |
| FM-13 | SHIP Report of surface observation from a sea station                       |
| FM-14 | SYNOP MOBIL Report of surface observation from a mobile land station        |
| FM-15 | METAR Aviation routine weather report                                       |
| FM-16 | SPECI Aviation selected special weather report                              |
| FM-18 | BUOY Report of a buoy observation                                           |

In [0]:
qdf.select('SOURCE').distinct().show()

In [0]:
qdf.select('NAME').distinct().show()

In [0]:
qdf.select('STATION').distinct().show()

# Nulls & Duplicates

## Weather df

In [0]:
null_counts = qdf.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in qdf.columns])
display(null_counts)


In [0]:
# Per-station counts of null vs. non-null LATITUDE, NAME and REM values, sorted so
# that stations with the fewest non-null REM/NAME values come first.
# Latitude_Non_Null now reads LATITUDE (it previously counted LONGITUDE by mistake).
ydf.groupBy('STATION').agg(
        F.count(F.when(F.col('LATITUDE').isNull(), 1)).alias('Latitude_Null'),
        F.count(F.when(F.col('LATITUDE').isNotNull(), 1)).alias('Latitude_Non_Null'),
        F.count(F.when(F.col('NAME').isNull(), 1)).alias('Name_Null'),
        F.count(F.when(F.col('NAME').isNotNull(), 1)).alias('Name_Not_Null'),
        F.count(F.when(F.col('REM').isNull(), 1)).alias('REM_Null'),
        F.count(F.when(F.col('REM').isNotNull(), 1)).alias('REM_Not_Null')
        ) \
        .orderBy('REM_Not_Null','Name_Not_Null').show()



#for the first 5 stations don't have a way to extract location; no name and no REM


via https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt:  ID is the station identification code (WBAN station ID after cross ref).  Note that the first two
           characters denote the FIPS  country code, the third character 
           is a network code that identifies the station numbering system 
           used, and the remaining eight characters contain the actual 
           station ID. 


but doesn't seem to match example format like "US1AZMR0156"


https://www.ncei.noaa.gov/pub/data/noaa/isd-history.txt can match some

https://www.ncei.noaa.gov/access/homr/reports mshr_standard.txt can match some (ie 94045 is Ft Peck Surfrad) via WBAN station Id 


In [0]:
# Null/NaN tally per column for station 99999953182. The column list comes from ydf
# itself (not qdf), so the select always matches the frame being queried.
display(ydf.filter(F.col('STATION')==99999953182).select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in ydf.columns]))

In [0]:
display(ydf.filter((F.col('LATITUDE').isNotNull()) & F.col('STATION').startswith(str(999999))))

In [0]:
ydf.filter(F.col('STATION').startswith(str(999999))).count() 

In [0]:
ydf.filter(~F.col('STATION').startswith('999999')).count()

In [0]:
display(ydf.filter(F.col('LATITUDE').isNull() & (F.col('REM').isNull()) & (F.col('NAME').isNull())) \
            .groupBy('STATION','REPORT_TYPE') \
            .agg(F.count('*').alias('count')) \
            .orderBy(F.desc('count'))
    )

In [0]:


# For each code fragment, show the distinct STATION ids in ydf that contain it.
for station_fragment in ('94044', '94045', '53182', '54918', '04835', '53830'):
    matching_stations = (
        ydf.filter(F.col('STATION').contains(station_fragment))
           .select("STATION")
           .distinct()
    )
    display(matching_stations)

#99404599999 = Apalachicola, FL


In [0]:
display(
    ydf.filter(F.col('STATION').contains('94045'))
       .select("STATION")
       .distinct()
)

In [0]:
display(ydf.filter(F.col('STATION') == '99999923583'))

## clean MSHR

In [0]:
qdf=qdf.withColumn("WBAN",F.col('STATION').substr(-5, 5))

In [0]:
ydf=ydf.withColumn("WBAN",F.col('STATION').substr(-5, 5))

In [0]:
mshr_parse['begin_date']

In [0]:
display(ydf.filter(F.col('WBAN')=='63868'))

In [0]:
mshr_parse[mshr_parse['WBAN_ID']=='63868']

| Abbreviation | Description |
|--------------|-------------|
| ASOS         | Automated Surface Observing System |
| AWOS         | Airway Weather Observation |
| AMOS         | Automated Observing Station |
| COOP         | COOPerative Station |
| USHCN        | U.S. Historical Climatology Network |
| MILITARY     | Military |
| SYNOPTIC     | (Undocumented) |
| USCRN        | U.S. Climate Reference Network |
| USRCRN       | U.S. Regional Climate Reference Network |
| AL USRCRN    | Alabama U.S. Regional Climate Reference Network |
| WXSVC        | Weather Service |
| NEXRAD       | NEXt generation RADar |
| RADAR        | (Undocumented) |
| UPPERAIR     | Upper Air |

In [0]:
mshr_us['station_types'].value_counts().head(20)




In [0]:
mshr_us

## duplicates exploration

In [0]:
ydf_country=ydf.withColumn("COUNTRY",F.col('NAME').substr(-2, 2))

In [0]:
ydf_country = ydf_country.withColumn("DATE", F.col("DATE").cast("timestamp"))


In [0]:
display(ydf_country.filter(F.col('COUNTRY').isin(territory_FIPS))
        .withColumn("row_num", 
                    F.row_number().over(
                        Window.partitionBy('COUNTRY')
                        .orderBy(F.col("COUNTRY").desc())
                    ))
    .filter(F.col("row_num") == 1)
    .drop("row_num"))


#confirmed, all of these are non- US territories:
#AS=Australia, FM=Micronesia, MH=Marshall Islands, MP=Mauritius (not mariana islands), and the VIs are British VI (?)

In [0]:
ydf_uniq = ydf_country  \
            .withColumn("hour", hour(F.col("DATE"))) \
            .filter(F.col('COUNTRY')=='US') \
            .withColumn("row_num", 
                    F.row_number().over(
                        Window.partitionBy('WBAN','DATE')
                        .orderBy(F.col("WBAN").desc())
                    )) \
    .filter(F.col("row_num") == 1)

In [0]:
display(ydf_uniq)

In [0]:
display(ydf_uniq.count())

#52876330 should be the number of YDF USA unique records after dedup

In [0]:
#no duplicate name/date/source combos for report_type = FM15, seems like an ok rule (?)
display(ydf_country.withColumn("hour", hour(F.col("DATE")))
    .filter(F.col('REPORT_TYPE') == 'FM-15')
    .filter(F.col('COUNTRY') == 'US')
    .groupBy('NAME','DATE','SOURCE')
    .count()
    .filter(F.col('count') > 1)
    )

In [0]:
#no duplicate name/date/source combos for report_type = FM15, seems like an ok rule (?)
display(ydf_country.withColumn("hour", hour(F.col("DATE")))
    .filter(F.col('REPORT_TYPE') == 'FM-15')
    .filter(F.col('COUNTRY') == 'US')
    .groupBy('NAME','DATE','SOURCE')
    .count()
    .filter(F.col('count') > 1)
    )

4 = USAF SURFACE HOURLY observation 

6 = ASOS/AWOS observation from NCEI              

7 = ASOS/AWOS observation merged with USAF SURFACE HOURLY observation 

O = Summary observation created by NCEI using hourly observations that may not share the same data source flag. 

In [0]:


#duplicate NAME-DATE pairs
duplicates = (
    ydf_country
    .filter(F.col('REPORT_TYPE') == 'FM-15')
    .filter(F.col('COUNTRY') == 'US')
    .groupBy('NAME', 'DATE')
    .count()
    .filter(F.col("count") > 1)
    .select("NAME", "DATE")
)

#sources for each duplicate NAME-DATE as an array
source_combinations = (
    ydf_country
    .join(duplicates, on=["NAME", "DATE"], how="inner")
    .groupBy("NAME", "DATE")
    .agg(F.collect_set("SOURCE").alias("source_list"))
)

#get unique (SOURCE1, SOURCE2) pairs

def generate_pairs(source_list):
    """Return every 2-element combination of ``source_list`` as a sorted tuple.

    Combinations are produced in ``itertools.combinations`` order; the two
    members of each pair are then sorted so (a, b) and (b, a) look the same.
    A list with fewer than two entries yields an empty list.
    """
    if len(source_list) <= 1:
        return []
    pairs = []
    for combo in combinations(source_list, 2):
        pairs.append(tuple(sorted(combo)))
    return pairs

generate_pairs_udf = F.udf(generate_pairs, "array<struct<source1:string, source2:string>>")

source_pairs = source_combinations.withColumn("source_pairs", generate_pairs_udf(F.col("source_list")))

#explode the pairs & count occurrences per NAME
pair_counts = (
    source_pairs
    .select("NAME", F.explode("source_pairs").alias("pair"))
    .groupBy("NAME", "pair")
    .count()
)

#collapse to show only name & pair tally
final_result = (
    pair_counts
    .groupBy("NAME")
    .agg(F.collect_list(F.struct("pair", "count")).alias("source_tally"))
)

display(final_result)


In [0]:

hourly_columns = [col for col in ydf.columns if col.startswith('Hourly')]
# result = (
#     ydf_country
#     .filter(F.col('COUNTRY') == 'US')
#     .filter(F.col('REPORT_TYPE') == 'FM-15')
#     .groupBy("WBAN", "DATE", "SOURCE") 
#     .agg(
#         *[F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in hourly_columns]  # Count nulls per column
#     )
#     .withColumn("row_num", F.row_number().over(
#         Window.partitionBy("WBAN", "DATE").orderBy(F.col("WBAN").desc())
#     ))
#     .filter(F.col("row_num") == 1)
#     .drop("row_num")
# )

# # Display the result
# display(result)


In [0]:
# For every (WBAN, DATE, SOURCE) group of US FM-15 rows, count how many rows are
# null in each Hourly* column. Each output column keeps the source column's name
# and holds an integer count (never null, because it is a sum of 0/1 flags).
agg_result =  (ydf_country
    .filter(F.col('COUNTRY') == 'US')
    .filter(F.col('REPORT_TYPE') == 'FM-15')
    .groupBy("WBAN", "DATE", "SOURCE") 
    .agg(
        *[F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in hourly_columns])  # Count nulls per column
    )

# Add the per-column null counts together, then total them per (WBAN, SOURCE).
# The counts themselves are summed: testing them with isNull() (as before) always
# yields 0 because the aggregated counts are never null.
tally_result = (
    agg_result
    .withColumn("null_column_count", 
        # F.lit(0) as the start value keeps the expression a Column even if hourly_columns is empty.
        sum((F.col(c) for c in hourly_columns), F.lit(0)))
    .groupBy("WBAN", "SOURCE")  # Group by WBAN and SOURCE
    .agg(
        F.sum("null_column_count").alias("total_nulls")  # Tally null counts across WBAN and SOURCE
    )
)

# Display the result
display(tally_result.orderBy(F.col('total_nulls')))

Dup rule:

- Report type = FM-15
- average across duplicate sources if exists

In [0]:
# rest = ydf \
#         .filter(F.col("LATITUDE").isNotNull())


# join_condition = (
#     (rest["WBAN"] == mshr_df["WBAN_ID"]) &
#     (rest["DATE"] >= mshr_df["begin_date"]) &
#     (rest["DATE"] <= mshr_df["end_date"])
# )

# window_spec = Window \
#                     .partitionBy(missing_loc["WBAN"], missing_loc["DATE"]) \
#                     .orderBy(F.col("begin_date").desc() #most recent begin date
# )
                    
# #deduplicated 
# dedup = (rest.groupBy("WBAN", "DATE").count().filter(F.col("count") > 1)) \
#             .join(mshr_df.select(F.expr("* EXCEPT(LATITUDE, LONGITUDE, NAME)")), 
#                   on=["WBAN", "DATE"], 
#                   how="inner") \
#             .filter(~F.col("REPORT_TYPE").contains("SOD"))

# nondup = missing_loc.join(dedup, on=["WBAN", "DATE"], how="left_anti")

# missing_loc_clean = dedup.select(F.expr("* EXCEPT(COUNT)")).union(nondup)


# result = missing_loc_clean \
#     .join(mshr_df, join_condition, "left_outer") \
#     .withColumn("row_num", F.row_number().over(window_spec)) \
#     .filter(F.col("row_num") == 1) \
#     .drop("row_num")


# display(result)

## clean location nulls

mostly just dedup

In [0]:
# Rows of ydf that have no LATITUDE. LATITUDE, LONGITUDE and NAME are dropped here
# so the MSHR columns of the same names can be brought in by the join below.
missing_loc = ydf \
        .filter(F.col("LATITUDE").isNull()) \
        .select(F.expr("* EXCEPT(LATITUDE, LONGITUDE, NAME)")) #will replace those cols w MSHR fill ins


# A weather row matches an MSHR record when the WBAN codes are equal and the row's
# DATE falls inside the record's [begin_date, end_date] interval.
join_condition = (
    (missing_loc["WBAN"] == mshr_df["WBAN_ID"]) &
    (missing_loc["DATE"] >= mshr_df["begin_date"]) &
    (missing_loc["DATE"] <= mshr_df["end_date"])
)

# Ranks the MSHR matches of one (WBAN, DATE) key by begin_date, newest first;
# row_num == 1 below therefore keeps a single row per key.
window_spec = Window \
                    .partitionBy(missing_loc["WBAN"], missing_loc["DATE"]) \
                    .orderBy(F.col("begin_date").desc() #most recent begin date
)
                    
#deduplicated missing loc rows: based on EDA showing all dups in these null rows are SOD with nulls

# (WBAN, DATE) keys that occur more than once, joined back to their rows, keeping
# only the rows whose REPORT_TYPE does not contain "SOD".
dedup = (missing_loc.groupBy("WBAN", "DATE").count().filter(F.col("count") > 1)) \
            .join(missing_loc, on=["WBAN", "DATE"], how="inner") \
            .filter(~F.col("REPORT_TYPE").contains("SOD"))

# Every row whose (WBAN, DATE) key has no row in dedup.
# NOTE(review): a duplicated key whose rows are ALL "SOD" has nothing left in dedup,
# so all of its rows pass this anti-join still duplicated -- confirm none exist.
nondup = missing_loc.join(dedup, on=["WBAN", "DATE"], how="left_anti")

# union() matches columns by position, not by name.
# NOTE(review): this relies on both sides listing WBAN, DATE first and the remaining
# columns in the same order -- verify, or consider unionByName.
missing_loc_clean = dedup.select(F.expr("* EXCEPT(COUNT)")).union(nondup)


# Attach MSHR station metadata (left outer, so unmatched weather rows are kept with
# null MSHR columns) and keep one row per (WBAN, DATE) using window_spec.
missing_loc_result = missing_loc_clean \
    .join(mshr_df, join_condition, "left_outer") \
    .withColumn("row_num", F.row_number().over(window_spec)) \
    .filter(F.col("row_num") == 1) \
    .drop("row_num")


display(missing_loc_result)

In [0]:
loc_result = missing_loc_result.filter(F.col('country') == 'UNITED STATES')

In [0]:
display(loc_result.filter(F.col('WBAN') == '99999').groupBy('coop_station_name').count())

In [0]:
display(loc_result.groupBy('WBAN', 'DATE').count().filter(F.col('count') > 1)) 
#deduped!

## clean location non nulls

In [0]:
numeric_cols = ['HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyPrecipitation',
 'HourlyPresentWeatherType',
 'HourlyPressureChange',
 'HourlyPressureTendency',
 'HourlyRelativeHumidity',
 'HourlySkyConditions',
 'HourlySeaLevelPressure',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindGustSpeed',
 'HourlyWindSpeed',
 'DailyAverageDewPointTemperature',
 'DailyAverageDryBulbTemperature',
 'DailyAverageRelativeHumidity',
 'DailyAverageSeaLevelPressure',
 'DailyAverageStationPressure',
 'DailyAverageWetBulbTemperature',
 'DailyAverageWindSpeed',
 'DailyCoolingDegreeDays',
 'DailyDepartureFromNormalAverageTemperature',
 'DailyHeatingDegreeDays',
 'DailyMaximumDryBulbTemperature',
 'DailyMinimumDryBulbTemperature',
 'DailyPeakWindDirection',
 'DailyPeakWindSpeed',
 'DailyPrecipitation',
 'DailySnowDepth',
 'DailySnowfall',
 'DailySustainedWindDirection',
 'DailySustainedWindSpeed',
 'DailyWeather',
 'MonthlyAverageRH',
 'MonthlyDaysWithGT001Precip',
 'MonthlyDaysWithGT010Precip',
 'MonthlyDaysWithGT32Temp',
 'MonthlyDaysWithGT90Temp',
 'MonthlyDaysWithLT0Temp',
 'MonthlyDaysWithLT32Temp',
 'MonthlyDepartureFromNormalAverageTemperature',
 'MonthlyDepartureFromNormalCoolingDegreeDays',
 'MonthlyDepartureFromNormalHeatingDegreeDays',
 'MonthlyDepartureFromNormalMaximumTemperature',
 'MonthlyDepartureFromNormalMinimumTemperature',
 'MonthlyDepartureFromNormalPrecipitation',
 'MonthlyDewpointTemperature',
 'MonthlyGreatestPrecip',
 'MonthlyGreatestPrecipDate',
 'MonthlyGreatestSnowDepth',
 'MonthlyGreatestSnowDepthDate',
 'MonthlyGreatestSnowfall',
 'MonthlyGreatestSnowfallDate',
 'MonthlyMaxSeaLevelPressureValue',
 'MonthlyMaxSeaLevelPressureValueDate',
 'MonthlyMaxSeaLevelPressureValueTime',
 'MonthlyMaximumTemperature',
 'MonthlyMeanTemperature',
 'MonthlyMinSeaLevelPressureValue',
 'MonthlyMinSeaLevelPressureValueDate',
 'MonthlyMinSeaLevelPressureValueTime',
 'MonthlyMinimumTemperature',
 'MonthlySeaLevelPressure',
 'MonthlyStationPressure',
 'MonthlyTotalLiquidPrecipitation',
 'MonthlyTotalSnowfall',
 'MonthlyWetBulb',
  'ShortDurationEndDate005',
 'ShortDurationEndDate010',
 'ShortDurationEndDate015',
 'ShortDurationEndDate020',
 'ShortDurationEndDate030',
 'ShortDurationEndDate045',
 'ShortDurationEndDate060',
 'ShortDurationEndDate080',
 'ShortDurationEndDate100',
 'ShortDurationEndDate120',
 'ShortDurationEndDate150',
 'ShortDurationEndDate180',
 'ShortDurationPrecipitationValue005',
 'ShortDurationPrecipitationValue010',
 'ShortDurationPrecipitationValue015',
 'ShortDurationPrecipitationValue020',
 'ShortDurationPrecipitationValue030',
 'ShortDurationPrecipitationValue045',
 'ShortDurationPrecipitationValue060',
 'ShortDurationPrecipitationValue080',
 'ShortDurationPrecipitationValue100',
 'ShortDurationPrecipitationValue120',
 'ShortDurationPrecipitationValue150',
 'ShortDurationPrecipitationValue180']

In [0]:
display(ydf_country \
        .filter(F.col("LATITUDE").isNotNull()) \
        .filter(F.col('COUNTRY') == 'US') \
        .filter(F.col('REPORT_TYPE') == 'FM-15') \
        .filter(F.col('SOURCE') == 4) \
        .groupBy('WBAN', 'DATE') \
        .count() \
        .filter(F.col('count') > 1)
)

In [0]:
display(ydf_country \
        .filter(F.col("LATITUDE").isNotNull()) \
        .filter(F.col('COUNTRY') == 'US') \
        .filter(F.col('REPORT_TYPE') == 'FM-15') \
        .filter(F.col('WBAN') == '99999') \
        .groupBy('NAME').count()
)

In [0]:
display(stations.filter(stations.station_id.contains(720425)))

In [0]:
# Inspect the HUGOTON rows. The filters are applied to ydf_country inline rather than
# through `rest`, which is only assigned further down the notebook and would raise
# NameError on a fresh kernel. These are the same three filters `rest` uses; the only
# difference is that the derived null_count column is not present here.
display(
    ydf_country
    .filter(F.col("LATITUDE").isNotNull())
    .filter(F.col('COUNTRY') == 'US')
    .filter(F.col('REPORT_TYPE') == 'FM-15')
    .filter(F.col('NAME').contains('HUGOTON'))
)

In [0]:
ydf_country.filter(F.col('NAME').isNull()).filter(F.col('LATITUDE').isNotNull()).count()

In [0]:
display(mshr_df.filter(F.col('NAME').contains('MONROE')))

In [0]:
display(mshr_df.filter(F.col('NAME').contains('HUGOTON')))

In [0]:
display(ydf_country \
        .filter(F.col("LATITUDE").isNotNull()) \
        .filter(F.col('COUNTRY') == 'US') \
        .filter(F.col('REPORT_TYPE') == 'FM-15') \
        .filter(F.col('SOURCE') == 4) \
        .groupBy('NAME', 'DATE') \
        .count() \
        .filter(F.col('count') > 1)
)

In [0]:


rest = ydf_country \
        .filter(F.col("LATITUDE").isNotNull()) \
        .filter(F.col('COUNTRY') == 'US') \
        .filter(F.col('REPORT_TYPE') == 'FM-15') \
        .withColumn(
            "null_count", 
            sum(
                [F.when(F.col(c).isNull(), 1).otherwise(0) for c in numeric_cols]
            )
    )
        

In [0]:
rest.filter(F.col('NAME').isNull()).count()

In [0]:
#can't join to MSHR because we don't have unique WBAN numbers , and don't need to pull location info from there anyway 
rest_result = rest \
    .withColumn("row_num", F.row_number().over(
        Window \
            .partitionBy("NAME", "DATE") \
            .orderBy(F.col("null_count").asc())
        )
                ) \
    .filter(F.col("row_num") == 1)
        
display(rest_result)

In [0]:
# rest_hourdupes = rest \
#             .groupBy("WBAN", "DATE") \
#             .count() \
#             .filter(F.col("count") > 1) \
#             .withColumn("row_num", 
#                     F.row_number().over(
#                         Window.partitionBy('WBAN')
#                         .orderBy(F.col("count").desc())
#                     )) \
#     .filter(F.col("row_num") == 1) \
#     .drop("row_num")


# rest_hourdupes = rest_hourdupes.join(
#         ydf.withColumn("hour", hour(F.col("DATE"))), 
#         on=["WBAN", 'DATE', 'HOUR'], 
#         how="left_outer")

In [0]:
display(rest_result.count())

In [0]:
loc_result.count()

In [0]:
#scrapped - try to avg numeric cols - too complicated considering also non numeric cols
# agg_result =  (rest
#     .filter(F.col('COUNTRY') == 'US')
#     .filter(F.col('REPORT_TYPE') == 'FM-15')
#     .groupBy("WBAN", "DATE", "SOURCE") 
#     .agg(
#         *[F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in hourly_columns])  # Count nulls per column
#     )

# tally_result = (
#     agg_result
#     .withColumn("null_column_count", 
#         sum(F.when(F.col(c).isNull(), 1).otherwise(0) for c in hourly_columns))
#     .groupBy("WBAN", "SOURCE")  # Group by WBAN and SOURCE
#     .agg(
#         F.sum("null_column_count").alias("total_nulls")  # Tally null counts across WBAN and SOURCE
#     )
# )

# # Display the result
# display(tally_result.orderBy(F.col('total_nulls')))

## union

In [0]:
# Flag column names that occur twice in rest_result so that one copy can be dropped.
# df_cols is a plain Python list of the column names, edited in place below.
df_cols = rest_result.columns

# get index of the duplicate columns
# list.index() returns the FIRST position of a name, so for each name that occurs
# exactly twice this collects only the position of its first occurrence (the set
# removes the repeated hit coming from the second occurrence).
# Names occurring three or more times are not flagged (count(c) == 2).
duplicate_col_index = list(set([df_cols.index(c) for c in df_cols if df_cols.count(c) == 2]))

# rename by adding suffix '_duplicated'
# After this loop the first copy carries the suffix and the second copy keeps the original name.
for i in duplicate_col_index:
    df_cols[i] = df_cols[i] + '_duplicated'

# rename the column in DF
# toDF assigns the names positionally, which makes every column name unique.
rest_df = rest_result.toDF(*df_cols)

# remove flagged columns
# Substring match: any column whose name contains '_duplicated' is selected for removal.
cols_to_remove = [c for c in df_cols if '_duplicated' in c]


In [0]:
rest_df=rest_df.drop(*cols_to_remove)

In [0]:
full = loc_result \
        .select(F.expr('* EXCEPT(NAME)')) \
        .withColumnRenamed('coop_station_name', 'NAME') \
        .withColumnRenamed('country','COUNTRY') \
        .select(
            *[F.col(col) for col in rest_result.select(
                F.expr('* EXCEPT( null_count, row_num)')
                ).columns]
        ) \
        .union(rest_df.select(F.expr('* EXCEPT(null_count, row_num)')))

In [0]:
#YAY DE DUPED
display(full.groupBy('NAME','DATE').count().filter(F.col('count') > 1))

## checking exceptions

In [0]:
#double checking whats going on when it starts with a letter 

window_spec = Window.partitionBy("STATION").orderBy(F.col("DATE").desc())
display(
    ydf.filter(F.col('STATION').startswith('A'))
    .withColumn("row_num", F.row_number().over(window_spec))
    .filter(F.col("row_num") == 1)
    .drop("row_num")
    .join(mshr_df, ydf['WBAN'] == mshr_df['WBAN_ID'], "left_outer")
    .select(ydf['STATION'], ydf['WBAN'], mshr_df['WBAN_ID'], ydf['NAME'].alias('YDF_NAME'),mshr_df['NAME'].alias('MSHR_NAME'), mshr_df['coop_station_name'])
)

In [0]:
#double checking whats going on when it starts with a letter 

window_spec = Window.partitionBy("STATION").orderBy(F.col("DATE").desc())
display(
    ydf.filter(F.col("STATION").rlike("^[A-Z]"))
    .withColumn("row_num", F.row_number().over(window_spec))
    .filter(F.col("row_num") == 1)
    .drop("row_num")
    .join(mshr_df, ydf['WBAN'] == mshr_df['WBAN_ID'], "left_outer")
    .select(ydf['STATION'], ydf['WBAN'], mshr_df['WBAN_ID'], ydf['NAME'].alias('YDF_NAME'),mshr_df['NAME'].alias('MSHR_NAME'), mshr_df['coop_station_name'])
)

# Checkpoint

In [0]:
checkpoint_path = "dbfs:/student-groups/Group_4_1/interim/weather_1y_checkpoint"

# Spark's own checkpoint files go to a sibling directory, NOT to checkpoint_path.
# The parquet write below uses mode('overwrite') on checkpoint_path; if the
# checkpoint files lived under that same path, the overwrite could remove the very
# files the checkpointed DataFrame is being read from.
spark.sparkContext.setCheckpointDir(f"{checkpoint_path}_spark_ckpt")

# eager=True materialises `full` now and cuts its lineage before the write.
weather_ydf_checkpointed = full.checkpoint(eager=True)
# Persist the de-duplicated weather data; later cells read it back from checkpoint_path.
weather_ydf_checkpointed.write.mode('overwrite').parquet(checkpoint_path)


# Timezone Cleaning

In [0]:
checkpoint_path = "dbfs:/student-groups/Group_4_1/interim/weather_1y_checkpoint"

ydf_ = spark.read.parquet(checkpoint_path)
display(ydf_)

In [0]:

#prepare for time lookup, UTC conversion and time grid interpolation
ydf_ = ydf_.withColumn('LATITUDE', ydf_.LATITUDE.cast(types.DoubleType()))
ydf_ = ydf_.withColumn('LONGITUDE', ydf_.LONGITUDE.cast(types.DoubleType()))
ydf_ = ydf_.withColumn('DATETIME', F.regexp_replace(F.col('DATE'), 'T', ' '))
ydf_ = ydf_.withColumn('DATETIME', F.to_timestamp('DATETIME', 'yyyy-MM-dd HH:mm:ss'))

display(ydf_)

In [0]:
#get unique stations so we can use their lat/lon for a lookup table
ydf_uniq = ydf_  \
            .withColumn("row_num", 
                    F.row_number().over(
                        Window.partitionBy('STATION')
                        .orderBy(F.col("STATION").desc())
                    )) \
    .filter(F.col('row_num') == 1) \
    .drop(F.col('row_num'))

In [0]:
ydf_uniq.count()

In [0]:
#code was used to create a lookup table, but this was saved to parquet and should not be run again on the 1y dataset

# def find_timezone(lat, lng):
#     tf = TimezoneFinder()
#     timezone_str = tf.timezone_at(lat=lat, lng=lng)
#     return timezone_str if timezone_str else "Unknown"

# find_timezone_udf = udf(find_timezone, StringType())


# ydf_tz = ydf_uniq.withColumn("timezone", find_timezone_udf(col("LATITUDE"), col("LONGITUDE")))
# folder_path = "dbfs:/student-groups/Group_4_1"
# ydf_tz.write.parquet(f"{folder_path}/external/weather_tz_lookup.parquet")

In [0]:
tz_lookup = spark.read.parquet("dbfs:/student-groups/Group_4_1/external/weather_tz_lookup.parquet")

In [0]:
display(tz_lookup)

In [0]:
ydf_time = ydf_.join(tz_lookup.select('STATION','timezone'), ['STATION'], 'left_outer')

In [0]:
display(ydf_time)

In [0]:


def get_utc(datetime_str, timezone_str):
    '''Localize a naive 'YYYY-MM-DD HH:MM:SS' string in ``timezone_str`` and convert it to UTC.

    Returns a timezone-aware UTC datetime, or None when either argument is None
    or ``timezone_str`` is not a zone name pytz recognises (for example the
    "Unknown" placeholder). Returning None keeps one bad row from failing the
    whole Spark job when this function is used as a UDF; such rows simply get a
    null UTC timestamp.
    '''
    # Null inputs occur when the timezone lookup join finds no match for a station.
    if datetime_str is None or timezone_str is None:
        return None
    try:
        t = pytz.timezone(timezone_str)
    except pytz.UnknownTimeZoneError:
        return None
    dt = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    # localize() attaches the zone to the naive datetime (pytz default DST handling).
    local_dt = t.localize(dt)
    utc_dt = local_dt.astimezone(pytz.utc)
    return utc_dt

get_utc_udf = udf(get_utc, TimestampType())

ydf_time = ydf_time.withColumn('DATETIME', F.to_timestamp(F.col('DATETIME').cast('string')))
ydf_time = ydf_time.withColumn('utc_datetime', get_utc_udf(F.col('DATETIME').cast('string'), F.col('timezone')))

display(ydf_time)

In [0]:
ydf_time = ydf_time.drop(F.col('DATE'))

In [0]:
ydf_time = ydf_time.drop(F.col('COUNTRY'))

In [0]:
ydf_.count()

# Interpolation

## Step 1: Create Time Grid [checkpointed]

In [0]:
# Count missing values per column. isnan() is applied only to float/double columns;
# every other column (strings, the DATETIME timestamp, ...) is checked for null only,
# since NaN is a floating-point concept and isnan() on a timestamp column is a type error.
null_counts = ydf_.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    if isinstance(ydf_.schema[c].dataType, (DoubleType, FloatType))
    else count(when(col(c).isNull(), c)).alias(c)
    for c in ydf_.columns
])
display(null_counts) #brief look at how much interpolation we need to do 


In [0]:
#get to the nearest hour in case measurements are not precisely on the minute

ydf_time = ydf_time.withColumn(
    "DATETIME_HOURLY",
    F.date_trunc("hour", F.col("utc_datetime"))
)

In [0]:
ydf_time.count()

In [0]:
display(ydf_time.limit(10))

In [0]:

def date_range(t1, t2, step=60*60): #hourly data
    """Return a list of equally spaced points between t1 and t2 with stepsize step.

    The list starts at ``t1`` and holds ``int((t2 - t1) / step) + 1`` points, so
    ``t2`` is included when it lies exactly on the grid. If either bound is
    None (e.g. a group whose timestamps are all null) an empty list is
    returned instead of raising.
    """
    if t1 is None or t2 is None:
        return []
    return [t1 + step*x for x in range(int((t2-t1)/step)+1)]

date_range_udf = F.udf(date_range, ArrayType(LongType()))


# Build one row per (STATION, hour) covering each station's observed time span.
grid = (
    ydf_time
    .groupBy('STATION')
    .agg(
        # First and last hourly timestamps per station, cast to integer for date_range_udf.
        F.min('DATETIME_HOURLY').cast('integer').alias('datetime_min'),
        F.max('DATETIME_HOURLY').cast('integer').alias('datetime_max'),
    )
    # One output row per point returned by date_range_udf for the station.
    .withColumn(
        "DATETIME_HOURLY",
        F.explode(date_range_udf("datetime_min", "datetime_max")),
    )
)

# Drop the helper bounds and turn the integer grid points back into timestamps.
grid = (
    grid
    .drop('datetime_min', 'datetime_max')
    .withColumn("DATETIME_HOURLY", F.col("DATETIME_HOURLY").cast('timestamp'))
)

In [0]:
display(grid.limit(10))

In [0]:
display(ydf_time.withColumn(
    "formatted_datetime",
    F.date_format(F.col("DATETIME_HOURLY"), "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
).select('DATETIME','timezone','utc_datetime','DATETIME_HOURLY','formatted_datetime')
)

In [0]:
ydf_grid = grid.join(ydf_time, ["STATION", "DATETIME_HOURLY"], "left_outer")


In [0]:
display(ydf_grid.limit(10))

In [0]:

### checkpoint

checkpoint_path = "dbfs:/student-groups/Group_4_1/interim/weather_1y_grid"

# Spark's own checkpoint files go to a sibling directory, NOT to checkpoint_path.
# The parquet write below uses mode('overwrite') on checkpoint_path; if the
# checkpoint files lived under that same path, the overwrite could remove the very
# files the checkpointed DataFrame is being read from.
spark.sparkContext.setCheckpointDir(f"{checkpoint_path}_spark_ckpt")

# eager=True materialises the station/hour grid now and cuts its lineage before the write.
ydf_grid_ = ydf_grid.checkpoint(eager=True)
# Persist the gridded weather data; later cells read it back from checkpoint_path.
ydf_grid_.write.mode('overwrite').parquet(checkpoint_path)


## Step 2: Nearest station data

[FAA](https://www.faa.gov/air_traffic/publications/atpubs/aim_html/chap7_section_1.html) 

[AERMET user guide](https://gaftp.epa.gov/Air/aqmg/SCRAM/models/met/aermet/aermet_userguide.pdf):
- If the dry-bulb or dew-point temperature is missing at some level, then an estimate for the missing temperature is made by linearly interpolating to the level in question. The data from the level immediately below and above the level in question are used. 
  - If the data that are required for the interpolation are also missing, then no interpolation is performed.

- the AERMET program includes substitutions for missing cloud cover and temperature data based on linear interpolation across gaps of one or two hours. 
  - Linear interpolation across short gaps is a reasonable approach for these variables since ambient temperatures tend to follow a diurnal cycle and do not vary significantly from hour to hour. 
  - AERMOD is relatively insensitive to hourly fluctuations in cloud cover, especially during convective hours, since the heat flux is integrated across the day.
  - Gaps of 1-2 hrs in these parameters near the early-morning transition to a convective boundary layer may result in all convective hours for that day being missing, since key parameters needed for the convective height processing are not calculated. 
    - Substitution of temperature and cloud cover may allow for the needed parameters to be calculated and subsequently, the convective parameters to be calculated. 

- Substitutions are based on linear interpolation across gaps of 1-2 hrs. Interpolations are only made based on non-interpolated values on both sides of the data gap

brainstorming:
- for small gaps: temporal interpolation thru usual cubic spline (don't want to make interpolations off interpolations)
- for large gaps: spatial interpolation ie cressman scheme, with/without modified idw for elevation affected vars


In [0]:
checkpoint_path = "dbfs:/student-groups/Group_4_1/interim/weather_1y_grid"

ydf_grid = spark.read.parquet(checkpoint_path)
display(ydf_grid)

In [0]:
display(ydf_grid.filter(F.col('DATETIME').isNull()))

In [0]:
display(mshr_df.filter(F.col('WBAN_ID') == '27503'))

In [0]:
# Count missing values per column: null-or-NaN for float/double columns, null only
# for everything else. The type test uses isinstance on the schema's DataType object;
# comparing that object against the strings 'DoubleType'/'FloatType' is never true,
# which silently disabled the NaN branch.
null_counts = ydf_grid.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    if isinstance(ydf_grid.schema[c].dataType, (DoubleType, FloatType))
    else count(when(col(c).isNull(), c)).alias(c)
    for c in ydf_grid.columns
])
display(null_counts)
 #brief look at how much interpolation we need to do 


In [0]:


def get_utc(datetime_str, timezone_str):
    '''Localize a naive 'YYYY-MM-DD HH:MM:SS' string in ``timezone_str`` and convert it to UTC.

    Returns a timezone-aware UTC datetime, or None when either argument is None
    or ``timezone_str`` is not a zone name pytz recognises (for example the
    "Unknown" placeholder). Returning None keeps one bad row from failing the
    whole Spark job when this function is used as a UDF; such rows simply get a
    null UTC timestamp.
    '''
    # Null inputs occur when the timezone lookup join finds no match for a station.
    if datetime_str is None or timezone_str is None:
        return None
    try:
        t = pytz.timezone(timezone_str)
    except pytz.UnknownTimeZoneError:
        return None
    dt = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    # localize() attaches the zone to the naive datetime (pytz default DST handling).
    local_dt = t.localize(dt)
    utc_dt = local_dt.astimezone(pytz.utc)
    return utc_dt

get_utc_udf = udf(get_utc, TimestampType())

ydf_time = ydf_time.withColumn('DATETIME', F.to_timestamp(F.col('DATETIME').cast('string')))
ydf_time = ydf_time.withColumn('utc_datetime', get_utc_udf(F.col('DATETIME').cast('string'), F.col('timezone')))
