In [1]:
import requests
import pandas as pd 
import numpy as np
import os
import csv
import re
import time
import datetime as dt
from bs4 import BeautifulSoup

In [2]:
def get_soup(url:str, delay=0, timeout=30) -> BeautifulSoup:
  """Fetch a URL and parse the response body into a BeautifulSoup object.

  Args:

  url (str): url you're scraping

  delay (int | float): seconds to sleep after the request completes, to space
    out consecutive requests (default 0)

  timeout (int | float): seconds to wait for the server before giving up
    (default 30). Without a timeout a stalled connection blocks indefinitely.

  Returns:
  BeautifulSoup: the response content parsed with "html.parser".

  Raises:
  requests.HTTPError: if the server answers with a 4xx/5xx status.
  requests.Timeout: if the server does not answer within `timeout` seconds.
  """
  result = requests.get(url, timeout=timeout)
  # Fail loudly on error responses instead of parsing an error page as if it were data.
  result.raise_for_status()
  time.sleep(delay)
  return BeautifulSoup(result.content, "html.parser")

def get_year_urls(uscrn_directory:str, end_year=None) -> list: 
  """
  Retrieves the URLs for every year's page in the given USCRN directory.
  
  Arguments:
  uscrn_directory (str): Either 'hourly02' or 'subhourly01' (i.e. source of wind data)
  end_year (int, optional): Last year (inclusive) to look for. Defaults to the
    current calendar year so newly published year pages are picked up.

  Returns:
  year_urls (list): A list of URLs for every year's page.

  Raises:
  ValueError: if `uscrn_directory` is not one of the two supported names.
  """

  if uscrn_directory not in ("hourly02", 'subhourly01'):
    raise ValueError(f"Invalid directory given: {uscrn_directory} -- give 'hourly02' or 'subhourly01'")
  
  url = f"https://www.ncei.noaa.gov/pub/data/uscrn/products/{uscrn_directory}/"
  soup = get_soup(url, 1)

  # Wind data is first available in 2012 (subhourly01); hourly02 pages are taken from 2000 onwards
  start_year = 2012 if uscrn_directory == "subhourly01" else 2000
  if end_year is None:
    end_year = dt.date.today().year

  # Year directories are linked as e.g. "2012/", so compare with the trailing slash removed.
  wanted_years = {str(year) for year in range(start_year, end_year + 1)}

  year_urls = []
  for link in soup.find_all("a"):
    # Anchors without an href (e.g. named anchors) are ignored rather than raising KeyError.
    href = link.get("href") or ""
    if href.rstrip("/") in wanted_years:
      year_urls.append(url + href)
  return year_urls

def get_file_urls(uscrn_directory:str) -> list: 
  """
  Retrieves the URLs for every file contained on each year's page in the given USCRN directory

  Only links whose href matches 'AK.*\\.txt' are kept.

  Arguments:
  uscrn_directory (str): Either 'hourly02' or 'subhourly01'

  Returns: 
  file_urls (list): A list of file URLs.

  Raises:
  ValueError: if `uscrn_directory` is not one of the two supported names.
  """

  # Validate before issuing any request.
  if uscrn_directory not in ("hourly02", 'subhourly01'):
    raise ValueError(f"Invalid directory given: {uscrn_directory} -- give 'hourly02' or 'subhourly01'")

  file_pattern = re.compile(r'AK.*\.txt')

  file_urls = []
  for year_url in get_year_urls(uscrn_directory):
    soup = get_soup(year_url)
    for link in soup.find_all('a', href=file_pattern):
      # Build the URL from the href that was matched, not from the link's display
      # text: the text is not guaranteed to equal the link target.
      file_urls.append(year_url + link['href'])
  return file_urls

def get_station_location(url) -> str: 
  """
  Extracts the name of the station from a given URL.

  The name is taken from the first capitalised word in the URL (optionally
  prefixed by 'St.') up to the '.txt' suffix; a trailing '_formerly_Barrow...'
  part or the first '_<digit>...' part is then stripped off.
  
  Args:
  url (str): The URL to extract the station name from.
  
  Returns:
  station_location (str): The name of the station.

  Raises:
  ValueError: if no station-like file name can be found in `url`.
  """
  regex = r"([St.]*[A-Z][a-z]+_*[A-Za-z]*).*.txt" 
  match = re.search(regex, url)
  if match is None:
    # Give a clear error instead of "'NoneType' object has no attribute 'group'".
    raise ValueError(f"Could not find a station name in URL: {url!r}")
  # group(0) is the whole match, i.e. station name plus everything up to '.txt'
  file_name = match.group(0)
  station_location = re.sub("(_formerly_Barrow.*|_[0-9].*)", "", file_name)
  return station_location

In [3]:
# Collect the URL of every 'AK*.txt' file listed under the subhourly01 product
# directory. This issues one HTTP request for the index plus one per year page.
file_urls = get_file_urls("subhourly01")

In [24]:
def get_raw_rows(file_urls, output_file) -> None:
  """
  Download each text file and append its rows that carry wind data to a CSV.

  For every kept line the written row is: station name, the first five
  whitespace-separated fields, and the last two fields. No header row is written.

  Args:
    file_urls (list): List of text file urls. 
    output_file (str): The path to the output CSV file.

  Returns:
    None
  """
  for url in file_urls:
    # Get location from url
    station_location = get_station_location(url)
    # Get new rows 
    soup = get_soup(url, delay=.5)

    wind_rows = []
    for raw_line in str(soup).splitlines():
      # str.split() ignores leading/trailing whitespace, so padded lines do not
      # produce empty tokens that would shift the negative indexes below.
      fields = raw_line.split()
      if not fields:
        continue  # skip blank lines
      # We're only scraping this data for the wind information, so we ignore rows
      # that don't have any (i.e wind < 0). The wind value is read from the
      # second-to-last field.
      if float(fields[-2]) >= 0:
        wind_rows.append([station_location] + fields[:5] + fields[-2:])

    # Write rows to CSV (appending after each file so progress survives a later failure)
    if wind_rows:
      # newline="" is what the csv module expects; without it extra blank
      # lines can appear on platforms that translate line endings.
      with open(output_file, "a", newline="") as f:
        csv.writer(f).writerows(wind_rows)

In [5]:
# Column labels applied to the raw CSV, in file order:
# station, station id, UTC date/time, local date/time, wind value and its flag.
colnames = [
  "station_location",
  "wbanno",
  "utc_date", "utc_time",
  "lst_date", "lst_time",
  "wind_1_5", "wind_flag",
]

# names= supplies the labels, so the first line of the file is read as data.
df = pd.read_csv("../../data/uscrn_wind_raw.csv", names=colnames)

In [7]:
def _floor_to_hour(date_col, time_col):
  """Combine numeric YYYYMMDD / HHMM columns into timestamps truncated to the hour."""
  # zfill(4) restores the leading zeros an integer time has lost (e.g. 5 -> "0005")
  stamps = date_col.astype(int).astype(str) + time_col.astype(int).astype(str).str.zfill(4)
  # floor, not round: every reading is assigned to the hour it falls in
  return pd.to_datetime(stamps, format='%Y%m%d%H%M').dt.floor("H")

# convert wind to float and build hour-truncated UTC / local timestamps
df = df.assign(
  wind_1_5=df['wind_1_5'].astype(float),
  utc_datetime=_floor_to_hour(df['utc_date'], df['utc_time']),
  lst_datetime=_floor_to_hour(df['lst_date'], df['lst_time']),
)

# drop poor quality data (wind_flag == 3: roughly 1.9% of rows) by keeping only
# wind_flag == 0, then remove the flag column. Done without inplace=True so no
# in-place change is made to a filtered slice (a source of SettingWithCopyWarning).
df = df.loc[df['wind_flag'] == 0].drop(columns="wind_flag")

# TODO: hourly averaging is not enabled yet. 'wind_flag' is no longer a column at
# this point, so remove it from the group keys below before re-enabling.
# # calculate hourly averages 
# df = df.groupby(['station_location','wbanno','utc_datetime','lst_datetime','wind_flag'])['wind_1_5'].mean().reset_index()

# rename wind column 
# df.rename({"wind_1_5":"wind_hr_avg"}, axis=1, inplace=True)

In [9]:
# Work on an independent copy so changes to df2 do not affect df.
df2 = df.copy()

In [11]:
# Keep only the identifying columns, the hourly timestamps and the wind value.
# The wind column is still called 'wind_1_5' (the rename to 'wind_hr_avg' is
# commented out) and 'wind_flag' has already been dropped from df, so selecting
# either of those names would raise KeyError.
df2 = df[['station_location', 'wbanno', 'utc_datetime', 'lst_datetime', 'wind_1_5']].copy()

In [13]:
# Group readings by station and hour. 'wind_flag' is not a grouping key because
# the cleaning step removes that column from df.
# NOTE(review): the GroupBy object is neither aggregated nor stored — presumably a
# mean of 'wind_1_5' was intended here; confirm.
df2.groupby(['station_location', 'wbanno', 'utc_datetime', 'lst_datetime'])

Unnamed: 0,station_location,wbanno,utc_datetime,lst_datetime,wind_1_5,wind_flag
0,Fairbanks,26494,2012-08-09 17:00:00,2012-08-09 08:00:00,0.00,3
1,Fairbanks,26494,2012-08-09 17:00:00,2012-08-09 08:00:00,0.00,3
2,Fairbanks,26494,2012-08-09 17:00:00,2012-08-09 08:00:00,0.00,3
3,Fairbanks,26494,2012-08-09 17:00:00,2012-08-09 08:00:00,0.00,3
4,Fairbanks,26494,2012-08-09 17:00:00,2012-08-09 08:00:00,0.00,3
...,...,...,...,...,...,...
20740031,Yakutat,25382,2023-03-04 20:00:00,2023-03-04 11:00:00,0.00,0
20740032,Yakutat,25382,2023-03-04 20:00:00,2023-03-04 11:00:00,0.00,0
20740033,Yakutat,25382,2023-03-04 20:00:00,2023-03-04 11:00:00,0.02,0
20740034,Yakutat,25382,2023-03-04 20:00:00,2023-03-04 11:00:00,0.24,0


In [14]:
# Count rows per wind_flag value.
# NOTE(review): the cleaning cell drops 'wind_flag' from df, so on a fresh
# Restart & Run All this lookup raises KeyError unless df2 still carries the
# column — confirm whether this check should run before the column is dropped.
df2['wind_flag'].value_counts()

0    20345198
3      394838
Name: wind_flag, dtype: int64

In [17]:
# Keep only rows whose wind_flag is 0.
# NOTE(review): the cleaning cell already applies this same filter to df and then
# drops 'wind_flag', so this looks redundant and raises KeyError on a fresh
# Restart & Run All — confirm whether the cell can be removed.
df2 = df2[df2['wind_flag'] == 0]