In [0]:
from pyspark.sql.functions import col
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pyspark.sql.functions import col,isnan, when, count, concat_ws, countDistinct, collect_set
# Root of the course data mount. Listing it shows which dataset samples are
# available (note the other possible samples we can use, like 1 day).
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
mount_listing = dbutils.fs.ls(data_BASE_DIR)
display(mount_listing)

In [0]:

# Weather observations: qdf is read from the 3-month parquet folder,
# ydf from the 1-year parquet folder.
weather_3m_path = "dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_3m/"
weather_1y_path = "dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_1y/"
qdf = spark.read.parquet(weather_3m_path)
ydf = spark.read.parquet(weather_1y_path)

In [0]:
# List the column names of the 3-month weather sample.
qdf.columns

In [0]:
# Distinct SOURCE values present in the 3-month weather sample.
distinct_sources = qdf.select('SOURCE').distinct()
distinct_sources.show()

In [0]:
# Distinct NAME values present in the 3-month weather sample.
distinct_names = qdf.select('NAME').distinct()
distinct_names.show()

In [0]:
# Distinct STATION values present in the 3-month weather sample.
distinct_stations = qdf.select('STATION').distinct()
distinct_stations.show()

# Nulls

## MSHR supplement

In [0]:
import pandas as pd
import re
# MSHR "standard" station history report, via https://www.ncei.noaa.gov/access/homr/reports
# Each line is one fixed-width record, so the whole line lands in a single column
# (the tab separator is not expected to occur inside a record).
# header=None keeps the first record as data instead of consuming it as the column
# name. NOTE(review): assumes the file has no header row -- confirm against the file.
mshr = pd.read_csv('mshr_standard.txt', sep='\t', header=None)

In [0]:
# Collapse the runs of padding spaces in record 21 for a quick look at its fields.
# The cell is addressed directly with .iloc[row, col]: no throw-away DataFrame copy,
# and no reliance on positional Series[0] lookup, which pandas has deprecated for
# non-integer labels.
' '.join(mshr.iloc[21, 0].split())


In [0]:
# Display the raw MSHR frame as loaded (one fixed-width record per row).
mshr

In [0]:




def parse_row(row):
    """Split a raw record into fields wherever two or more whitespace characters occur.

    Leading and trailing whitespace is removed first. Single spaces are kept
    inside a field, so multi-word values such as "UNITED STATES" stay together.

    Args:
        row: The raw text of one record.

    Returns:
        list[str]: The fields in their original order.
    """
    trimmed = row.strip()
    gap_pattern = re.compile(r'\s{2,}')  # Split on 2+ spaces
    return gap_pattern.split(trimmed)

# First attempt: split every record (first column of `mshr`) on runs of 2+ spaces.
# Records can yield different numbers of fields, so the frame is made as wide as
# the longest one and the columns get generic names.
raw_records = mshr.iloc[:, 0]
parsed_rows = [parse_row(record) for record in raw_records]
max_cols = max(map(len, parsed_rows))
col_names = [f"Column_{position}" for position in range(1, max_cols + 1)]

df_parsed = pd.DataFrame(parsed_rows, columns=col_names)




In [0]:
# One raw record from mshr_standard.txt, used to eyeball the fixed-width column offsets.
sample_row = '10000202 01 504590 01 25325 70395 KTN  KTN   PAKT UNITED STATES        AK KETCHIKAN GATEWAY BOROUGH      +9    KETCHIKAN                      KETCHIKAN INTL AP              19890301 19961209  55 21 34 -131 43 15 60     80     95  2 5000 FT NW  AIRWAYS COOP                                      '

# (description, start, stop): 0-based, half-open slices into a record. Fields are
# separated by a single blank column. Offsets were checked against sample_row:
#   * FAA is 4 wide and NWS is 5 wide (the NWS slice used to start one column late
#     and printed "TN" instead of "KTN");
#   * precision code is 2 wide, both elevations are 6 wide and right-justified, and
#     the elevation type code is 2 wide (the old windows overlapped neighbouring
#     fields and cut off wider values);
#   * latitude degrees include the leading sign column, like longitude degrees do
#     -- NOTE(review): the sign column is blank in this sample; confirm with a
#     southern-hemisphere record.
sample_field_slices = [
    ('station ID number', 0, 8),
    ('record type number', 9, 11),
    ('coop station identifier', 12, 18),
    ('climate division', 19, 21),
    ('wban station identifier', 22, 27),
    ('wmo station identifier', 28, 33),
    ('FAA location identifier', 34, 38),
    ('NWS location identifier', 39, 44),
    ('ICAO location identifier', 45, 49),
    ('country name', 50, 70),
    ('state FIPS abbrev', 71, 73),
    ('county', 74, 104),
    ('time zone', 105, 110),
    ('coop station name', 111, 141),
    ('principal station name', 142, 172),
    ('beginning date of record', 173, 181),
    ('end date of record', 182, 190),
    ('latitude degrees', 191, 194),
    ('latitude minutes', 195, 197),
    ('latitude seconds', 198, 200),
    ('longitude degrees', 201, 205),
    ('longitude minutes', 206, 208),
    ('longitude seconds', 209, 211),
    ('lat/lon precision code', 212, 214),
    ('ground elevation', 215, 221),
    ('elevation - other', 222, 228),
    ('elevation - other type code', 229, 231),
    ('station relocation', 232, 243),
    ('station types', 244, None),
]

# Print each raw slice in record order, one per line.
for field_label, field_start, field_stop in sample_field_slices:
    print(sample_row[field_start:field_stop])


In [0]:
def parse_fixed_width(row):
    """Cut one fixed-width MSHR "standard" record into named, whitespace-stripped fields.

    Offsets are 0-based, half-open slices; fields are separated by a single blank
    column. Compared with the earlier version of this function:

    * ``FAA_id`` is 4 columns wide and ``NWS_id`` 5, so ``NWS_id`` no longer loses
      its first character.
    * ``latlon_precision`` (2 wide), ``ground_elevation`` and ``elevation_other``
      (6 wide, right-justified) and ``elevation_other_type`` (2 wide) each cover
      exactly their own columns, so values of three or more digits are no longer
      truncated or mixed with a neighbouring field.
    * ``end_date`` stops before the separator column.
    * ``lat_deg`` includes the leading sign column, as ``lon_deg`` already did.
      NOTE(review): inferred from the blank column in front of the latitude;
      confirm with a southern-hemisphere record.

    Args:
        row: The text of one record. Rows shorter than the full width yield
            empty strings for the fields they do not reach.

    Returns:
        dict[str, str]: Field name to stripped value, in record order.
    """
    layout = (
        ("station_id", 0, 8),
        ("record_type", 9, 11),
        ("coop_station_id", 12, 18),
        ("climate_division", 19, 21),
        ("wban_station_id", 22, 27),
        ("wmo_station_id", 28, 33),
        ("FAA_id", 34, 38),
        ("NWS_id", 39, 44),
        ("ICAO_id", 45, 49),
        ("country", 50, 70),
        ("state_FIPS", 71, 73),
        ("county", 74, 104),
        ("time_zone", 105, 110),
        ("coop_station_name", 111, 141),
        ("principal_station_name", 142, 172),
        ("begin_date", 173, 181),
        ("end_date", 182, 190),
        ("lat_deg", 191, 194),
        ("lat_min", 195, 197),
        ("lat_sec", 198, 200),
        ("lon_deg", 201, 205),
        ("lon_min", 206, 208),
        ("lon_sec", 209, 211),
        ("latlon_precision", 212, 214),
        ("ground_elevation", 215, 221),
        ("elevation_other", 222, 228),
        ("elevation_other_type", 229, 231),
        ("station_relocation", 232, 243),
        ("station_types", 244, None),  # everything to the end of the record
    )
    return {name: row[start:stop].strip() for name, start, stop in layout}


# Second attempt: cut every record (first column of `mshr`) at fixed offsets and
# build a frame whose columns are the named fields.
raw_records = mshr.iloc[:, 0]
parsed_rows = [parse_fixed_width(record) for record in raw_records]

# Number of fields in the widest parsed record.
max_cols = max(map(len, parsed_rows))

df_parsed = pd.DataFrame(parsed_rows)

In [0]:
# Display the parsed MSHR frame (one named column per fixed-width field).
df_parsed

In [0]:
# MSHR records whose WBAN station identifier is 53182.
is_wban_53182 = df_parsed['wban_station_id'] == '53182'
df_parsed[is_wban_53182]

In [0]:
# Rows of the 1-year weather sample whose STATION value contains 53182.
station_has_53182 = F.col('STATION').contains('53182')
display(ydf.filter(station_has_53182))

## Weather df

In [0]:
# Per-column tally of missing values in qdf: a value counts as missing when it is
# NaN or NULL. The result is a single row with one count per column.
missing_value_counts = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in qdf.columns
]
null_counts = qdf.select(missing_value_counts)
display(null_counts)


In [0]:
# Per-station counts of null / non-null LATITUDE, NAME and REM in ydf, ordered so
# that stations with the fewest non-null REM (then NAME) values come first.
# Latitude_Non_Null now counts LATITUDE; it previously counted LONGITUDE by mistake.
(
    ydf.groupBy('STATION')
    .agg(
        F.count(F.when(F.col('LATITUDE').isNull(), 1)).alias('Latitude_Null'),
        F.count(F.when(F.col('LATITUDE').isNotNull(), 1)).alias('Latitude_Non_Null'),
        F.count(F.when(F.col('NAME').isNull(), 1)).alias('Name_Null'),
        F.count(F.when(F.col('NAME').isNotNull(), 1)).alias('Name_Not_Null'),
        F.count(F.when(F.col('REM').isNull(), 1)).alias('REM_Null'),
        F.count(F.when(F.col('REM').isNotNull(), 1)).alias('REM_Not_Null'),
    )
    .orderBy('REM_Not_Null', 'Name_Not_Null')
    .show()
)



# The first 5 stations give us no way to extract a location: they have no NAME and no REM.


Via https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt: ID is the station identification code (the WBAN station ID, after cross-referencing). Note that the first two
           characters denote the FIPS country code, the third character
           is a network code that identifies the station numbering system
           used, and the remaining eight characters contain the actual
           station ID.


However, the STATION values in our data don't seem to match that readme's example format (e.g. "US1AZMR0156").


https://www.ncei.noaa.gov/pub/data/noaa/isd-history.txt can be used to match some of these stations.

mshr_standard.txt (from https://www.ncei.noaa.gov/access/homr/reports) can be used to match some others via the WBAN station ID (e.g. 94045 is Ft Peck SURFRAD).


In [0]:
# Missing-value tally (NaN or NULL) per column for station 99999953182 in ydf.
# STATION is compared as a string, as in the other cells, instead of relying on an
# implicit cast to a number; the column list is taken from ydf itself, not qdf.
station_99999953182 = ydf.filter(F.col('STATION') == '99999953182')
display(
    station_99999953182.select(
        [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in ydf.columns]
    )
)

In [0]:
# Rows of ydf that have a LATITUDE and whose STATION starts with 999999.
has_latitude = F.col('LATITUDE').isNotNull()
has_999999_prefix = F.col('STATION').startswith('999999')
display(ydf.filter(has_latitude & has_999999_prefix))

In [0]:
# Number of ydf rows whose STATION starts with 999999.
rows_with_999999_prefix = ydf.filter(F.col('STATION').startswith('999999'))
rows_with_999999_prefix.count()

In [0]:
# Number of ydf rows whose STATION does not start with 999999.
lacks_999999_prefix = ~F.col('STATION').startswith('999999')
ydf.filter(lacks_999999_prefix).count()

In [0]:
# Row counts per (STATION, REPORT_TYPE) for rows where LATITUDE, REM and NAME are
# all null, largest count first.
no_location_info = (
    F.col('LATITUDE').isNull() & F.col('REM').isNull() & F.col('NAME').isNull()
)
unlocatable_counts = (
    ydf.filter(no_location_info)
    .groupBy('STATION', 'REPORT_TYPE')
    .agg(F.count('*').alias('count'))
    .orderBy(F.desc('count'))
)
display(unlocatable_counts)

In [0]:


# For each WBAN code of interest, show the distinct STATION values in ydf that
# contain it (one display per code, in this order).
for wban_code in ('94044', '94045', '53182', '54918', '04835', '53830'):
    stations_containing_code = (
        ydf.filter(F.col('STATION').contains(wban_code))
        .select("STATION")
        .distinct()
    )
    display(stations_containing_code)

# 99404599999 = Apalachicola, FL


In [0]:
# Distinct STATION values in ydf that contain 94045.
stations_with_94045 = (
    ydf.filter(F.col('STATION').contains('94045'))
    .select("STATION")
    .distinct()
)
display(stations_with_94045)

In [0]:
# All ydf rows reported by station 99999923583.
is_station_99999923583 = F.col('STATION') == '99999923583'
display(ydf.filter(is_station_99999923583))

## cross ref

In [0]:
# Add a WBAN column to qdf holding the last 5 characters of STATION
# (a negative start position counts from the end of the string).
station_tail = F.col('STATION').substr(-5, 5)
qdf = qdf.withColumn("WBAN", station_tail)

In [0]:
# Add a WBAN column to ydf holding the last 5 characters of STATION
# (a negative start position counts from the end of the string).
station_tail = F.col('STATION').substr(-5, 5)
ydf = ydf.withColumn("WBAN", station_tail)